In [1]:
import pickle as pkl
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity


In [17]:
# Load the three pickled inputs this notebook works from.
# Pickle files execute code on load, so only open files you produced yourself.
with open('all_lyrics_df.pkl', 'rb') as lyrics_file:
    all_lyrics_df = pkl.load(lyrics_file)

with open('taxis_df.pkl', 'rb') as taxis_file:
    taxis_df = pkl.load(taxis_file)

with open('prosody_df.pkl', 'rb') as prosody_file:
    prosody_df = pkl.load(prosody_file)

In [3]:
# Load a previously saved tf-idf matrix.
# NOTE(review): X_tfidf is reassigned by vectorizer.fit_transform(X) in a later
# cell, so this cached matrix is not the one the models below are fitted on.
with open('tfidf_v1.pkl', 'rb') as tfidf_file:
    X_tfidf = pkl.load(tfidf_file)

In [4]:

# X is the text fed to the vectorizer; y is the song column used later as the row index.
X = all_lyrics_df['lyrics']
y = all_lyrics_df['song']

In [5]:
# Build the tf-idf document-term matrix, dropping the built-in English stop words.
# The fitted vectorizer is kept because later cells read its vocabulary.
vectorizer = TfidfVectorizer(stop_words="english")
X_tfidf = vectorizer.fit_transform(raw_documents=X)

In [6]:
# LSA: project the tf-idf matrix onto 3 latent components.
# TruncatedSVD defaults to a randomized solver, so the seed is pinned to make
# the components repeatable across Restart & Run All.
lsa = TruncatedSVD(n_components=3, random_state=42)
song_topic = lsa.fit_transform(X_tfidf)


In [7]:
# Inspect the raw LSA loadings: one row per component, one column per vocabulary term.
lsa.components_

array([[ 3.13268289e-04,  2.92496530e-04,  3.74620438e-05, ...,
         1.52458754e-05,  1.72233910e-05,  6.86466341e-05],
       [-2.29357954e-04, -4.96095020e-04, -1.80382420e-05, ...,
        -3.26364398e-05, -5.29796716e-05,  7.38895047e-05],
       [-5.85078767e-04,  7.75026130e-04, -4.82829699e-05, ...,
        -3.55822884e-05, -2.72232314e-05, -1.18669622e-04]])

In [8]:
# Term loadings of each LSA component, rounded for display.
# get_feature_names() was removed in scikit-learn 1.2, so use its replacement
# get_feature_names_out() when the installed version provides it.
lsa_terms = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
topic_word = pd.DataFrame(
    lsa.components_.round(3),
    # Row labels follow the fitted component count rather than a hard-coded range.
    index=["component_{}".format(i) for i in range(1, lsa.components_.shape[0] + 1)],
    columns=lsa_terms,
)
topic_word

Unnamed: 0,aa,aaa,aaaa,aaaaa,aaaaaa,aaaaaaaaaaaah,aaaaaaaaaaaassssssss,aaaaaaaalchemist,aaaaaah,aaaaaaminé,...,ﬁghter,ﬁghtin,ﬁgure,ﬁnd,ﬁnger,ﬁshers,ﬁt,ﬁtted,ﬂirts,𝘐𝘵
component_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_2,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0
component_3,-0.001,0.001,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0


In [9]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object with a ``components_`` attribute
        ``components_`` is iterated row by row (one row per topic); each row
        must provide ``argsort()``, e.g. a NumPy array.
    feature_names : sequence of str
        Term for each column position of ``components_``.
    no_top_words : int
        How many of the largest-weight terms to print per topic.
    topic_names : sequence of str, optional
        Human-readable label per topic. A missing or falsy entry (including a
        sequence shorter than the number of topics) falls back to the numeric
        heading instead of raising.

    Returns
    -------
    None
        Output is written to stdout.
    """
    for ix, topic in enumerate(model.components_):
        label = None
        if topic_names:
            try:
                label = topic_names[ix]
            except (IndexError, KeyError):
                # Fewer labels than topics: treat the rest as unnamed.
                label = None
        if not label:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '", label, "'")
        # argsort is ascending, so walk it backwards to get the largest weights first.
        top_indices = topic.argsort()[:-no_top_words - 1:-1]
        print(", ".join([feature_names[i] for i in top_indices]))

In [10]:
# Print the 50 strongest terms of each LSA topic.
# get_feature_names() was removed in scikit-learn 1.2; fall back to it only
# when the newer get_feature_names_out() is unavailable.
display_topics(
    lsa,
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names(),
    50,
)


Topic  0
im, like, yeah, nigga, dont, got, know, aint, niggas, bitch, shit, just, fuck, cause, love, baby, want, wanna, man, thats, make, em, time, money, say, let, need, oh, right, girl, tell, way, ya, gon, come, life, real, gotta, feel, yo, youre, ill, think, bitches, ass, really, hit, yall, said, bout

Topic  1
nigga, bitch, niggas, fuck, shit, bitches, money, ass, lil, aint, hoes, ayy, gang, pussy, fuckin, dick, got, em, ho, yall, big, gon, yo, bout, motherfuckin, hit, talkin, pull, fucking, young, block, shoot, glock, rich, gangsta, wit, bag, bang, real, trap, bands, huh, hoe, broke, motherfucker, cash, pop, hood, dope, gettin

Topic  2
yeah, ayy, ooh, oh, baby, bitch, uh, woo, woah, girl, huh, shawty, lil, hey, gon, love, skrrt, want, ima, diamonds, racks, ah, okay, gang, bands, yuh, bad, whoa, bag, nah, tryna, pull, ho, wrist, bitches, uhhuh, drip, pussy, wanna, need, coupe, babe, slatt, trap, know, flex, dick, alright, yah, aw


In [11]:
# Factorise the tf-idf matrix into 4 non-negative topics.
# The seed is pinned so the factorisation is repeatable: the column labels
# assigned to doc_topic further down rely on the topics arriving in a fixed order.
# NOTE(review): after seeding, re-check that those hand-written labels still
# line up with the topics printed below.
nmf = NMF(n_components=4, random_state=42)
doc_topic = nmf.fit_transform(X_tfidf)

In [12]:
# Term weights of each NMF topic, rounded for display.
# get_feature_names() was removed in scikit-learn 1.2, so use its replacement
# get_feature_names_out() when the installed version provides it.
nmf_terms = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
topic_word = pd.DataFrame(
    nmf.components_.round(3),
    # Row labels follow the fitted component count rather than a hard-coded range.
    index=["component_{}".format(i) for i in range(1, nmf.components_.shape[0] + 1)],
    columns=nmf_terms,
)
topic_word

Unnamed: 0,aa,aaa,aaaa,aaaaa,aaaaaa,aaaaaaaaaaaah,aaaaaaaaaaaassssssss,aaaaaaaalchemist,aaaaaah,aaaaaaminé,...,ﬁghter,ﬁghtin,ﬁgure,ﬁnd,ﬁnger,ﬁshers,ﬁt,ﬁtted,ﬂirts,𝘐𝘵
component_1,0.003,0.0,0.0,0.0,0.001,0.001,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_3,0.0,0.002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_4,0.0,0.003,0.0,0.0,0.0,0.001,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Print the 50 strongest terms of each NMF topic.
# get_feature_names() was removed in scikit-learn 1.2; fall back to it only
# when the newer get_feature_names_out() is unavailable.
display_topics(
    nmf,
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names(),
    50,
)



Topic  0
im, like, dont, just, got, know, aint, cause, man, life, time, thats, make, em, say, shit, way, right, ill, think, ya, feel, yo, people, gonna, gotta, youre, let, tell, day, need, better, said, wanna, mind, yall, ive, come, look, world, black, good, really, little, hard, new, live, trying, real, god

Topic  1
nigga, niggas, bitch, fuck, shit, aint, got, bitches, money, like, ass, im, gon, hoes, em, dont, lil, pussy, real, fuckin, bout, know, hit, gang, thats, yo, dick, big, yall, ho, fucking, young, want, ya, boy, ain, man, don, pull, talkin, bad, wit, wanna, run, cause, broke, gotta, ima, block, make

Topic  2
yeah, ayy, uh, ooh, im, bitch, oh, got, woo, like, huh, woah, know, hey, ima, okay, gon, tryna, shawty, ah, diamonds, lil, skrrt, racks, aint, nah, bad, dont, pull, just, big, bag, need, yuh, fuck, whoa, gang, money, bands, wanna, uhhuh, party, new, look, wrist, way, ho, gettin, drip, thats

Topic  3
love, baby, girl, oh, know, want, dont, wanna, just, youre, let, need

#### Notes: 
Topic 0: Significantly fewer interjections ('ayy', 'ooh', etc.) and the clearest diversity of vocabulary. Terms appear comparatively more abstract. Likely capturing more lyrically dense songs relative to other categories. 

Topic 1: The most dominated by profanity and braggadocious terms ('b*tches', 'd*ck', 'pull', 'money') as well as an outsized number of first/second person pronouns ('im', 'em', 'yall', 'ya', 'ima') implying combative or comparative content. 

Topic 2: Dominated by interjections and materialistic nouns ('diamonds', 'racks', 'bag', 'money', 'bands', 'wrist', 'drip'). Likely to be catchier, less lyrically dense songs (i.e. 'club bangers').

Topic 3: Very clearly oriented around love and sex ('love', 'baby', 'girl' are the top 3 words), focusing on emotional terms ('wanna', 'feel', 'mind') and intimacy ('night', 'heart', 'hold', 'body', 'babe', 'eyes').  

In [14]:
# Per-song topic weights: rows are indexed by y and columns carry the
# hand-assigned topic labels; reset_index() then turns the index back into a column.
H = pd.DataFrame(
    doc_topic.round(5),
    index=y,
    columns=['Lyrical/Misc', 'Bragging/Combative', 'Money/Party', 'Romance/Sex'],
).reset_index()
H

Unnamed: 0,song,Lyrical/Misc,Bragging/Combative,Money/Party,Romance/Sex
0,/songs/3546392,0.01363,0.01723,0.00556,0.02202
1,/songs/3801456,0.01710,0.02290,0.00417,0.00493
2,/songs/3545295,0.00576,0.02158,0.02876,0.00000
3,/songs/4485607,0.01700,0.03912,0.00314,0.00820
4,/songs/3801450,0.00948,0.06000,0.01103,0.03082
...,...,...,...,...,...
17561,/songs/347240,0.03320,0.00000,0.00938,0.00697
17562,/songs/48980,0.04583,0.00000,0.00000,0.01404
17563,/songs/48968,0.03609,0.00000,0.00000,0.00568
17564,/songs/48960,0.03703,0.00000,0.00030,0.00000


In [15]:
# Attach each song's topic weights to its artist, then average per artist.
# - on='song' spells out the join key that merge() would otherwise infer from
#   the shared column name.
# - numeric_only=True keeps the string 'song' column out of the mean; without
#   it pandas >= 2.0 raises instead of silently dropping that column.
artist_mean_topic = (
    all_lyrics_df[['artist', 'song']]
    .merge(H, on='song')
    .groupby('artist')
    .mean(numeric_only=True)
)

#### Notable Artists 

*Lyrical/Misc*:
Eminem, Black Thought, Grandmaster Flash, Ab-Soul, Fort Minor, Logic

*Bragging/Combative*: 
Meek Mill, Jadakiss, Pusha T, Bobby Shmurda, Soulja Boy, Too Short

*Money/Party*:
Young Thug, Lil Uzi Vert, Nicki Minaj, Lil Pump, Playboi Carti, DaBaby 

*Romance/Sex*:
The Weeknd, Usher, Chris Brown, R. Kelly, Trey Songz



In [16]:
# Artists ranked by mean 'Romance/Sex' weight, highest first; keep the first 51 rows.
artist_mean_topic.sort_values(by='Romance/Sex', ascending=False).iloc[:51]

Unnamed: 0_level_0,Lyrical/Misc,Bragging/Combative,Money/Party,Romance/Sex
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
The-Dream,0.0102,0.01989,0.02368,0.0813
Taio Cruz,0.015455,0.0,0.00096,0.073485
Mo B. Dick,0.02533,0.0,0.0101,0.07092
Jennifer Lopez,0.023398,0.002757,0.01965,0.070156
Johntá Austin,0.01359,0.0,0.0,0.06708
J. Valentine,0.024882,0.00407,0.019495,0.06577
Aloe Blacc,0.000725,0.0,0.06328,0.065115
Benny Blanco,0.02272,0.0,0.02493,0.06493
Curtiss King,0.016532,0.011083,1.3e-05,0.064118
Focus...,0.006962,0.005955,0.042385,0.062872


In [25]:
# Average every numeric taxis measure per artist.
# numeric_only=True states explicitly that only numeric columns are averaged;
# pandas >= 2.0 raises on non-numeric columns instead of dropping them.
# NOTE(review): presumably taxis_df also carries a non-numeric song column — confirm.
artist_mean_taxis = taxis_df.groupby('artist').mean(numeric_only=True)
artist_mean_taxis

Unnamed: 0_level_0,lines,words,words_per_line,unique_words,unique_word_rate
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
03 Greedo,76.312500,2684.312500,35.125816,185.375000,0.070810
070 Shake,70.352941,2122.294118,30.507007,137.470588,0.071274
2 Chainz,68.333333,2240.666667,32.916186,194.111111,0.090778
2 Pistols,78.545455,2203.181818,28.376804,178.272727,0.085574
21 Savage,81.904762,2764.666667,33.424155,189.095238,0.070440
...,...,...,...,...,...
Zebra Katz,67.666667,1737.833333,25.122420,132.944444,0.087163
Zelooperz,57.636364,1636.636364,29.884914,151.060606,0.095839
Zion I,60.300000,2030.800000,34.300381,210.200000,0.104322
bbno$,53.000000,1655.666667,31.152735,92.333333,0.055000


In [26]:
# Average every numeric prosody measure per artist.
# numeric_only=True states explicitly that only numeric columns are averaged;
# pandas >= 2.0 raises on non-numeric columns instead of dropping them.
# NOTE(review): presumably prosody_df also carries a non-numeric song column — confirm.
artist_mean_prosody = prosody_df.groupby('artist').mean(numeric_only=True)
artist_mean_prosody

Unnamed: 0_level_0,syllables,syllables_per_line,syllables_per_word
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
03 Greedo,701.687500,9.211347,0.261876
070 Shake,561.941176,8.091570,0.264949
2 Chainz,590.000000,8.641371,0.262462
2 Pistols,562.272727,7.243027,0.255224
21 Savage,706.047619,8.484020,0.253769
...,...,...,...
Zebra Katz,451.888889,6.521429,0.259716
Zelooperz,422.696970,7.746176,0.258715
Zion I,532.600000,8.986945,0.262015
bbno$,456.000000,8.575163,0.275106


In [23]:
# Re-display the per-artist topic means before they are joined with the other per-artist tables.
artist_mean_topic

Unnamed: 0_level_0,Lyrical/Misc,Bragging/Combative,Money/Party,Romance/Sex
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
03 Greedo,0.012388,0.040464,0.021360,0.011842
070 Shake,0.015179,0.001804,0.021861,0.036665
2 Chainz,0.018329,0.023071,0.021967,0.006407
2 Pistols,0.013755,0.030431,0.018532,0.021855
21 Savage,0.007464,0.046448,0.046098,0.008801
...,...,...,...,...
Zebra Katz,0.025077,0.012421,0.018921,0.016527
Zelooperz,0.018682,0.037817,0.003213,0.006282
Zion I,0.031290,0.000425,0.003131,0.006091
bbno$,0.009183,0.010923,0.004047,0.009563


In [29]:
# Combine the three per-artist tables on their shared artist index.
# Inner joins keep only artists that appear in all three.
artist_features = (
    artist_mean_topic
    .join(artist_mean_taxis, how='inner')
    .join(artist_mean_prosody, how='inner')
)
artist_features

Unnamed: 0_level_0,Lyrical/Misc,Bragging/Combative,Money/Party,Romance/Sex,lines,words,words_per_line,unique_words,unique_word_rate,syllables,syllables_per_line,syllables_per_word
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
03 Greedo,0.012388,0.040464,0.021360,0.011842,76.312500,2684.312500,35.125816,185.375000,0.070810,701.687500,9.211347,0.261876
070 Shake,0.015179,0.001804,0.021861,0.036665,70.352941,2122.294118,30.507007,137.470588,0.071274,561.941176,8.091570,0.264949
2 Chainz,0.018329,0.023071,0.021967,0.006407,68.333333,2240.666667,32.916186,194.111111,0.090778,590.000000,8.641371,0.262462
2 Pistols,0.013755,0.030431,0.018532,0.021855,78.545455,2203.181818,28.376804,178.272727,0.085574,562.272727,7.243027,0.255224
21 Savage,0.007464,0.046448,0.046098,0.008801,81.904762,2764.666667,33.424155,189.095238,0.070440,706.047619,8.484020,0.253769
...,...,...,...,...,...,...,...,...,...,...,...,...
Zebra Katz,0.025077,0.012421,0.018921,0.016527,67.666667,1737.833333,25.122420,132.944444,0.087163,451.888889,6.521429,0.259716
Zelooperz,0.018682,0.037817,0.003213,0.006282,57.636364,1636.636364,29.884914,151.060606,0.095839,422.696970,7.746176,0.258715
Zion I,0.031290,0.000425,0.003131,0.006091,60.300000,2030.800000,34.300381,210.200000,0.104322,532.600000,8.986945,0.262015
bbno$,0.009183,0.010923,0.004047,0.009563,53.000000,1655.666667,31.152735,92.333333,0.055000,456.000000,8.575163,0.275106


In [30]:
# Persist the combined per-artist feature table to disk.
with open('artist_features.pkl', 'wb') as features_file:
    pkl.dump(artist_features, features_file)