In [1]:
import pandas as pd
import numpy as np

import pickle

import gensim

## Import Data

In [2]:
df = pd.read_pickle('../Data/cleaned_lyrics')

In [3]:
df.shape

(18702, 18)

In [4]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))
        

clean_sents = list(sent_to_words(df.lyrics))

In [5]:
def bigrams(clean_sents, bi_min=100):
    bigram = gensim.models.Phrases(clean_sents, min_count = bi_min)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    return bigram_mod, bigram

In [6]:
def trigrams(bigram, clean_sents, tri_min=70):
    trigram = gensim.models.Phrases(bigram[clean_sents], min_count=2)  
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    return trigram_mod

In [7]:
def get_corpus(lyrics):
    """
    Get Bigram Model, Corpus, id2word mapping
    """
    
    clean_sents = list(sent_to_words(lyrics))
    bigram_mod, bigram = bigrams(clean_sents)
    trigram_mod = trigrams(bigram, clean_sents)
    clean_sents = [trigram_mod[bigram_mod[t]] for t in clean_sents]

    id2word = gensim.corpora.Dictionary(clean_sents)
    corpus = [id2word.doc2bow(t) for t in clean_sents]
    return corpus, id2word, clean_sents

In [8]:
corpus, id2word, clean_sents = get_corpus(df.lyrics)



In [10]:
pickle_out = open('../Data/corpus_ngramed', 'wb')
pickle.dump(corpus, pickle_out)
pickle_out.close()

pickle_out = open('../Data/id2word_ngramed', 'wb')
pickle.dump(id2word, pickle_out)
pickle_out.close()

In [10]:
for t in clean_sents[:5]:
    print(f'{" ".join(t)} \n')

old chanc get gonna marri_marri get act togeth star school put nice littl flat get financ togeth show old chanc get gonna buy stuff buy stuff new garag store car place us rehers wors never see need old chanc get gonna marri_marri get act togeth star school put nice littl flat get financ togeth show marri_marri marri_marri marri_marri want love 

bigger everyth ever done found reason stay home tonight self tonight gonna loop duplic heart million song found reason go tonight make tonight comput put guitar hold track come around realli mani interest effect wanna tri wanna tri bigger everyth ever done get sleep tonight feel good tonight damn feel sweet tonight neighbor complain caus got headphon get sleep tonight damn feel good tonight everyth alright comput gonna save million song hope enjoy realli mani interest effect wanna tri wanna tri understand repetet chatter bigger everyth ever done 

go back place born favorit hood believ found came use_roam street skateboard cheap_beer littl punk

In [11]:
clean_sents[0][:5]

['old', 'chanc', 'get', 'gonna', 'marri_marri']

## Checking if the artists match across datasets/

In [21]:
len(df.artist_name.unique().tolist())

403

In [22]:
import pickle

pickle_in = open('../Data/indie_pop_artists','rb')
queried_artists = pickle.load(pickle_in)
pickle_in.close()

In [24]:
len(list(set(queried_artists).intersection(set(df.Unaltered_artist_name.unique().tolist()))))

403

In [25]:
df.head()

Unnamed: 0,spotify_album_uri,spotify_artist_id,artist_name,spotify_artist_uri,duration_ms,explicit,spotify_song_id,song_title,song_spotify_page,track_number,spotify_song_uri,song_title.1,Unaltered_artist_name,lyrics,genius_song_id,geenius_song_url,genius_artist_id,lyrics_language
0,spotify:album:2h6MSR1rWemHOtmurgRq3T,7kAKqNxPBkfjgdHzUIdBtI,suburban kids with biblical names,spotify:artist:7kAKqNxPBkfjgdHzUIdBtI,138626,False,4okEZakOVppAtP4Dawd52x,marry me,https://open.spotify.com/track/4okEZakOVppAtP4...,1,spotify:track:4okEZakOVppAtP4Dawd52x,Marry Me,Suburban Kids With Biblical Names,old chanc get gonna marri marri get act togeth...,861607,https://genius.com/Suburban-kids-with-biblical...,353411,[en:0.9999975562687033]
1,spotify:album:2h6MSR1rWemHOtmurgRq3T,7kAKqNxPBkfjgdHzUIdBtI,suburban kids with biblical names,spotify:artist:7kAKqNxPBkfjgdHzUIdBtI,187106,False,2LV6sB5zTsu0R5r5kWohlD,loop duplicate my heart,https://open.spotify.com/track/2LV6sB5zTsu0R5r...,2,spotify:track:2LV6sB5zTsu0R5r5kWohlD,Loop Duplicate My Heart,Suburban Kids With Biblical Names,bigger everyth ever done x found reason stay h...,980120,https://genius.com/Suburban-kids-with-biblical...,353411,[en:0.999998823659918]
2,spotify:album:2h6MSR1rWemHOtmurgRq3T,7kAKqNxPBkfjgdHzUIdBtI,suburban kids with biblical names,spotify:artist:7kAKqNxPBkfjgdHzUIdBtI,176026,False,53uzYuKe433aXBjzBiuvqe,parakit,https://open.spotify.com/track/53uzYuKe433aXBj...,4,spotify:track:53uzYuKe433aXBjzBiuvqe,Parakit,Suburban Kids With Biblical Names,go back place born favorit hood believ found c...,1583231,https://genius.com/Suburban-kids-with-biblical...,353411,[en:0.9999962698239809]
3,spotify:album:2h6MSR1rWemHOtmurgRq3T,7kAKqNxPBkfjgdHzUIdBtI,suburban kids with biblical names,spotify:artist:7kAKqNxPBkfjgdHzUIdBtI,198013,False,7p0pJgizlHS5msrgDPU6li,trees and squirrels,https://open.spotify.com/track/7p0pJgizlHS5msr...,5,spotify:track:7p0pJgizlHS5msrgDPU6li,Trees And Squirrels,Suburban Kids With Biblical Names,tree wild undisput beauti today see squirrel b...,1340623,https://genius.com/Suburban-kids-with-biblical...,353411,[en:0.9999973351330183]
4,spotify:album:2h6MSR1rWemHOtmurgRq3T,7kAKqNxPBkfjgdHzUIdBtI,suburban kids with biblical names,spotify:artist:7kAKqNxPBkfjgdHzUIdBtI,178040,False,1drw02VGWNxVtZuF2Qjp8e,funeral face,https://open.spotify.com/track/1drw02VGWNxVtZu...,6,spotify:track:1drw02VGWNxVtZuF2Qjp8e,Funeral Face,Suburban Kids With Biblical Names,said love said like want see sun goe wrote tol...,1042112,https://genius.com/Suburban-kids-with-biblical...,353411,[en:0.9999975014770117]


In [26]:
df.to_pickle('../Data/n-gramed_lyrics')