## Purpose of first model
This first, simple, prelimiary model take the ~2700 songs from the US database and attempt to see which appoach leads to the most meaningful result.

First, clean up lyrics by removing non-alphanumeric values, punctation, and then performing lemmatizing

Try two different vectorizers
1) CountVectorizer
2) TF-TDF vectorizers

Try three different topic modeling techniques
1) Truncated SVD (LSA)
2) NMF
3) LDA


In [1]:
import pickle
import pandas as pd

import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
file = open('../../Webscraper/us_pops_raw', 'rb')
us_pops = pickle.load(file)
us_pops.dropna(inplace=True)
us_pops = us_pops['Lyrics']

In [3]:
us_pops

0       I know this pain (I know this pain)\nWhy do yo...
1       Lay a whisper on my pillow\nLeave the winter o...
2       It's been seven hours and fifteen days\nSince ...
3       Yeah, Spyderman and Freeze in full effect\nUh-...
4       Strike a pose\nStrike a pose\nVogue (vogue, vo...
                              ...                        
2750    We've been to both Carolinas\nSeen a big Monta...
2751    I'm jealous of the blue jeans that you're wear...
2752    I'm a motherfuckin' train wreck\nI don't wanna...
2753    There's somethin' in the way you roll your eye...
2754    Man, what? (Haha)\nThis shit funny, one sec\nO...
Name: Lyrics, Length: 2671, dtype: object

### 1. Clean up text for NLP

In [4]:
# only keep alphanumeric, remove newlines, and make everything lowercase
alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x)
rm_newline = lambda x: re.sub("\n", ' ',x)
punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())

us_pops_nopunc = us_pops.map(alphanumeric).map(rm_newline).map(punc_lower)

In [5]:
# lemmatize. do this by turning every line into a list of words, and concat back to full lyrics
us_pops_list = us_pops_nopunc.apply(lambda x: x.split(' '))

def lemmatize(list_of_words):
    lmt = WordNetLemmatizer()
    strg = ''
    for word in list_of_words:
        lem_word = lmt.lemmatize(word)
        if len(lem_word) > 2:
            strg += lem_word + ' '
    return strg
        

us_pops_clean = us_pops_list.apply(lemmatize)

### 2. Create two vectorized formats, CountVectorizer and TFIDF vectorizer

In [6]:
from sklearn.feature_extraction import text

In [7]:
stop_words_added = text.ENGLISH_STOP_WORDS.union(['huh','woo','whoa','hey','hold']) #these words added through iteration

In [8]:
cv = CountVectorizer(
    stop_words = 'english',
    max_df = 0.2,
    min_df = 0.0005
)
X = cv.fit_transform(us_pops_clean)
cv_doc_term_matrix = pd.DataFrame(X.toarray(), columns=cv.get_feature_names())

In [9]:
cv_doc_term_matrix

Unnamed: 0,aaa,aah,aaliyah,aback,abandoned,abashed,abc,abdul,abeat,abel,...,zulu,zurück,çünkü,étais,était,être,über,üstünde,şey,şimdi
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2666,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2667,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2668,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2669,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
tfidf =  TfidfVectorizer(
    stop_words = stop_words_added,
    max_df = 0.2,
    min_df = 0.0005
)
Y = tfidf.fit_transform(us_pops_clean)
tfidf_doc_term_matrix = pd.DataFrame(Y.toarray(), columns=tfidf.get_feature_names())


In [11]:
tfidf_doc_term_matrix

Unnamed: 0,aaa,aah,aaliyah,aback,abandoned,abashed,abc,abdul,abeat,abel,...,zulu,zurück,çünkü,étais,était,être,über,üstünde,şey,şimdi
0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.046599,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2666,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2667,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2668,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2669,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 3. Use the two vectorizers above to feed into LSA, NMF, and LDA
### LSA below

In [12]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [13]:
# LSA with CountVectorizer

lsa_cv = TruncatedSVD(5)
doc_topic = lsa_cv.fit_transform(cv_doc_term_matrix)
# lsa.explained_variance_ratio_

topic_word = pd.DataFrame(
    lsa_cv.components_.round(3), 
    index = ['topic1','topic2','topic3','topic4','topic5'],
    columns = cv.get_feature_names()
    )
topic_word

Unnamed: 0,aaa,aah,aaliyah,aback,abandoned,abashed,abc,abdul,abeat,abel,...,zulu,zurück,çünkü,étais,était,être,über,üstünde,şey,şimdi
topic1,0.0,0.0,0.0,0.001,0.003,0.001,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
topic2,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
topic3,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
topic4,0.0,0.001,0.0,-0.0,0.004,0.002,0.0,0.0,0.0,0.0,...,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
topic5,0.0,0.003,0.0,0.0,-0.002,-0.001,0.0,0.0,0.001,0.0,...,0.004,0.001,0.0,0.0,0.0,0.0,0.001,0.0,0.0,0.0


In [14]:
display_topics(lsa_cv, cv.get_feature_names(), 10)


Topic  0
saint, did, friend, moment, people, room, word, quite, grandmother, great

Topic  1
doo, shark, shoo, woman, alright, wild, juan, looking, run, doop

Topic  2
choo, ride, woo, train, juan, nigga, ana, hey, devil, statue

Topic  3
juan, ana, devil, statue, woman, heaven, tanner, old, hell, men

Topic  4
nigga, hey, bitch, shit, fuck, featuring, low, money, rock, gotta


In [15]:
# LSA with TF-IDF

lsa_tfidf = TruncatedSVD(5)
doc_topic = lsa_tfidf.fit_transform(tfidf_doc_term_matrix)
# lsa.explained_variance_ratio_

topic_word = pd.DataFrame(
    lsa_tfidf.components_.round(3), 
    index = ['topic1','topic2','topic3','topic4','topic5'],
    columns = tfidf.get_feature_names()
    )
topic_word

Unnamed: 0,aaa,aah,aaliyah,aback,abandoned,abashed,abc,abdul,abeat,abel,...,zulu,zurück,çünkü,étais,était,être,über,üstünde,şey,şimdi
topic1,0.001,0.006,0.001,0.0,0.001,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
topic2,-0.001,-0.003,0.001,-0.0,-0.001,-0.0,0.0,-0.0,-0.0,-0.0,...,0.0,0.001,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0
topic3,-0.001,-0.0,-0.0,-0.0,-0.001,-0.0,-0.0,-0.0,-0.0,-0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,-0.0
topic4,-0.0,-0.005,-0.001,-0.0,0.001,-0.0,-0.0,-0.0,-0.0,-0.0,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,-0.0
topic5,-0.002,-0.007,0.001,-0.0,-0.001,-0.0,0.0,-0.0,-0.0,-0.001,...,-0.0,-0.001,-0.0,-0.0,-0.0,-0.0,-0.001,-0.0,-0.0,-0.0


In [16]:
display_topics(lsa_tfidf, tfidf.get_feature_names(), 10)


Topic  0
nigga, tonight, gotta, really, long, bitch, better, shit, turn, friend

Topic  1
nigga, bitch, shit, fuck, ayy, money, gon, lil, hoe, hit

Topic  2
tonight, body, dance, party, alright, gotta, shake, rock, floor, everybody

Topic  3
tonight, nigga, bitch, fuck, dream, believe, forever, shit, promise, true

Topic  4
really, gotta, bad, better, promise, babe, friend, stay, real, sorry


### NMF below

In [17]:
# NMF with CountVectorizer

nmf_cv = NMF(5)
doc_topic = nmf_cv.fit_transform(cv_doc_term_matrix)

# doc_topic_mat = pd.DataFrame(
#     doc_topic.round(3),
#     columns = ['topic1','topic2','topic3','topic4','topic5']
# )

topic_word = pd.DataFrame(
    nmf_cv.components_.round(3),
    columns = cv.get_feature_names()
    )
topic_word

Unnamed: 0,aaa,aah,aaliyah,aback,abandoned,abashed,abc,abdul,abeat,abel,...,zulu,zurück,çünkü,étais,était,être,über,üstünde,şey,şimdi
0,0.0,0.001,0.0,0.029,0.11,0.027,0.0,0.001,0.002,0.001,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.001,0.001,0.0,0.002,0.098,0.049,0.001,0.003,0.004,0.002,...,0.0,0.0,0.0,0.0,0.0,0.0,0.002,0.0,0.0,0.0
4,0.001,0.048,0.005,0.005,0.0,0.0,0.003,0.007,0.011,0.007,...,0.075,0.02,0.0,0.0,0.0,0.001,0.014,0.0,0.001,0.001


In [18]:
display_topics(nmf_cv, cv.get_feature_names(), 15)


Topic  0
saint, did, friend, moment, people, room, word, quite, grandmother, great, house, face, thought, having, woman

Topic  1
doo, shark, shoo, alright, wild, woman, looking, doop, shooby, bay, run, end, west, morning, wop

Topic  2
choo, ride, woo, train, coming, sound, getting, dancing, walk, mon, jump, pack, drive, talk, hoo

Topic  3
juan, woman, ana, devil, statue, old, heaven, tanner, friend, men, lady, hell, did, social, force

Topic  4
nigga, hey, bitch, shit, fuck, featuring, money, rock, low, gotta, bad, big, gon, beat, doh


In [19]:
# NMF with TF-IDF

nmf_tfidf = NMF(5)
doc_topic = nmf_tfidf.fit_transform(tfidf_doc_term_matrix)

# doc_topic_mat = pd.DataFrame(
#     doc_topic.round(3),
#     columns = ['topic1','topic2','topic3','topic4','topic5']
# )

topic_word = pd.DataFrame(
    nmf_tfidf.components_.round(3),
    columns = tfidf.get_feature_names()
    )
topic_word

Unnamed: 0,aaa,aah,aaliyah,aback,abandoned,abashed,abc,abdul,abeat,abel,...,zulu,zurück,çünkü,étais,était,être,über,üstünde,şey,şimdi
0,0.004,0.019,0.0,0.0,0.004,0.0,0.0,0.001,0.001,0.001,...,0.0,0.0,0.0,0.0,0.0,0.0,0.002,0.0,0.0,0.0
1,0.0,0.0,0.003,0.0,0.0,0.0,0.001,0.0,0.0,0.0,...,0.0,0.003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.012,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.002,0.0,0.0,0.0,0.002,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
display_topics(nmf_tfidf, tfidf.get_feature_names(), 15)


Topic  0
dream, believe, gone, forever, long, remember, wish, true, fall, inside, home, face, place, better, stay

Topic  1
nigga, bitch, shit, fuck, ayy, money, gon, lil, hoe, real, big, hit, fuckin, pussy, dick

Topic  2
tonight, alright, waiting, tomorrow, fight, party, tight, kiss, inside, gotta, drink, dancing, boo, end, broken

Topic  3
dance, body, shake, rock, everybody, party, stop, turn, floor, play, music, round, roll, club, beat

Topic  4
really, gotta, bad, somebody, woman, real, friend, yes, lover, try, babe, stay, sorry, talk, care


### LDA Below

In [21]:
# for TF DTM
lda_tf = LatentDirichletAllocation(n_components=10, random_state=0)
lda_tf.fit(cv_doc_term_matrix)

# for TFIDF DTM
lda_tfidf = LatentDirichletAllocation(n_components=10, random_state=0)
lda_tfidf.fit(tfidf_doc_term_matrix)

LatentDirichletAllocation(random_state=0)

In [22]:
display_topics(lda_tf, cv.get_feature_names(), 15)


Topic  0
tonight, break, friend, fall, light, going, dem, gimme, end, power, black, turn, lie, blue, clean

Topic  1
doo, really, believe, dream, forever, hold, feeling, whoa, did, inside, better, miss, place, face, close

Topic  2
gotta, crazy, walk, kiss, talk, tonight, hold, try, yes, touch, lover, everybody, sorry, rock, hey

Topic  3
shake, gon, real, wit, step, nigga, run, long, gotta, throw, dat, hit, huh, booty, really

Topic  4
woo, ride, que, hot, party, fly, rock, choo, whoa, roll, bye, bit, lady, hip, stop

Topic  5
hey, woman, wild, juan, ready, head, old, wee, happy, lady, bring, dee, sweet, hoo, people

Topic  6
body, low, dance, turn, play, stop, song, music, floor, alright, fine, wish, babe, matter, hey

Topic  7
did, people, moment, saint, friend, word, room, great, quite, ich, house, having, face, thought, went

Topic  8
work, ayy, home, remember, gone, wake, stay, mmm, til, jump, beautiful, change, wait, doh, long

Topic  9
nigga, bitch, shit, fuck, bad, money, bet

In [23]:
display_topics(lda_tfidf, tfidf.get_feature_names(), 15)


Topic  0
runaway, beggin, dreamed, wha, brokenhearted, dah, exception, downtown, husband, anniversary, barefoot, funkdafied, international, passing, jagger

Topic  1
ich, shoo, doop, shoop, mmmmmm, vogue, clout, battlefield, nicht, woohoo, glamorous, monica, lounging, untouchable, nutty

Topic  2
bye, rockabye, dutty, diva, whoomp, kidd, mattered, ditty, soak, suicidal, reached, boi, tasted, delilah, eah

Topic  3
aah, bent, woop, romantic, ron, coco, dura, confidence, traded, ella, dougie, eruption, location, motherfucking, skater

Topic  4
nigga, bitch, shit, money, fuck, gon, body, gotta, rock, ayy, dance, hit, party, bout, shake

Topic  5
hypnotized, excited, sha, heyy, desert, unwind, upper, romeo, fighter, aaa, peat, hopeless, quan, timber, natural

Topic  6
diggin, kissin, coco, choo, circus, justify, ong, whistle, overwhelmed, daughter, insensitive, thurr, sneakin, titanium, draw

Topic  7
breathin, dangerous, duh, halo, haunt, understanding, insist, swang, getta, ayer, damage