# NMF and LDA for Topic Modelling

In [62]:
import sqlite3
from collections import Counter

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.decomposition import NMF
from unidecode import unidecode

In [2]:
# Location of the lyric word-count table; the first CSV column becomes the row index.
path = '../../data/lyrics/data/subset_bof.csv'
df = pd.read_csv(path, index_col=0)

In [5]:
# Preview the first rows of the word-count table.
df.head()

Unnamed: 0,i,the,you,to,and,a,me,it,not,in,...,writer,motivo,bake,insist,wel,santo,pe,gee,colleg,kad
TRAYYAU128F92D58D0,0.0,3.0,4.0,2.0,4.0,5.0,0.0,0.0,7.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TRARURM128F931A91B,5.0,5.0,9.0,2.0,0.0,4.0,0.0,1.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TRAMSSK128F92FDC80,8.0,13.0,5.0,1.0,6.0,4.0,1.0,0.0,5.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TRAAMPA128F92E7D0D,17.0,0.0,4.0,4.0,5.0,20.0,8.0,18.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TRAWHZK12903CB5A01,18.0,1.0,3.0,6.0,0.0,2.0,0.0,1.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### How Sparse Is Our Data?

In [12]:
# Fraction of cells in the word-count table that are exactly zero.
# `df.size` is rows * columns; it replaces `np.product`, which is deprecated and
# removed in NumPy 2.0. `float(...)` keeps true division on Python 2 as well.
(df == 0).sum().sum() / float(df.size)

0.99460578204103389

### Try NMF

In [142]:
# Number of topics (NMF components) to extract.
n = 30
# `init='random'` starts from randomly drawn factors, so pin `random_state`
# to make the factorisation repeatable from one run to the next.
nmf = NMF(n_components=n, init='random', random_state=0)

In [143]:
# Factorise the word-count table: W is what fit_transform returns for the
# fitted data, H is the fitted model's component matrix.
W = nmf.fit_transform(df)
H = nmf.components_

In [144]:
# For each row of H: column indices of its 100 largest entries, largest first
# (ascending argsort, reversed along the row, then truncated).
mask = np.argsort(H, axis=1)[:, ::-1][:, :100]

In [145]:
# Translate the top-entry column indices into the corresponding column labels (words).
possible = np.asarray(df.columns)[mask]

In [146]:
# Combined NLTK stop-word lists for English, Spanish, German and French.
stop_words = (
    stopwords.words('english')
    + stopwords.words('spanish')
    + stopwords.words('german')
    + stopwords.words('french')
)
# Set membership is O(1); scanning the combined list for every candidate word is not.
stop_word_set = set(stop_words)
kept_words = [
    [word for word in topic if word not in stop_word_set]
    for topic in possible
]
# Topics keep different numbers of words after filtering, so the rows are ragged.
# Fill a 1-D object array element by element: np.array() on ragged nested lists
# raises ValueError on current NumPy releases.
filtered = np.empty(len(kept_words), dtype=object)
for row_index, words in enumerate(kept_words):
    filtered[row_index] = words

In [147]:
# Show the first ten remaining words of every topic.
# Single-argument print(...) produces the same output on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
for topic in filtered:
    print(topic[:10])

['come', 'get', 'top', 'caus', 'bottom', 'cop', 'pick', 'block', 'knock', 'give']
['know', 'wanna', 'realli', 'alright', 'want', 'done', 'noth', 'gonna', 'nobodi', 'leav']
['time', 'littl', 'eye', 'end', 'life', 'one', 'fall', 'soul', 'lie', 'thought']
['head', 'like', 'one', 'see', 'music', 'world', 'fuck', 'rock', 'got', 'hit']
['know', 'would', 'need', 'whi', 'feel', 'could', 'never', 'see', 'way', 'said']
['like', 'niggaz', 'next', 'fuck', 'na', 'wish', 'want', 'drinkin', 'drink', 'ghetto']
['round', 'make', 'said', 'say', 'start', 'onli', 'eye', 'well', 'face', 'hold']
['love', 'babi', 'song', 'girl', 'heart', 'g', 'onli', 'show', 'first', 'give']
['quiero', 'si', 'amor', 'mas', 'ser', 'vida', 'solo', 'nunca', 'ahora', 'vez']
['let', 'tell', 'like', 'get', 'take', 'got', 'yeah', 'rain', 'feel', 'make']
['go', 'babi', 'let', 'rhythm', 'upset', 'hey', 'girl', 'tell', 'time', 'say']
['like', 'get', 'gonna', 'caus', 'never', 'tonight', 'fuck', 'nigga', 'yeah', 'bad']
['one', 'gonna', 

In [155]:
# Index of the largest W entry in each row, i.e. the dominant component per song.
W.argmax(axis=1)

array([2, 0, 2, ..., 0, 0, 0])

In [156]:
# Count how many songs have each NMF component as their largest entry in W.
c = Counter(np.argmax(W, axis=1))
c

Counter({0: 5359,
         1: 418,
         2: 791,
         3: 217,
         4: 248,
         5: 6,
         6: 2,
         7: 4,
         8: 187,
         9: 64,
         10: 1,
         11: 11,
         12: 51,
         13: 6,
         14: 30,
         15: 28,
         16: 2,
         17: 49,
         18: 1,
         19: 4,
         20: 4,
         21: 7,
         23: 9,
         25: 1,
         27: 3,
         28: 3})

### Try LDA

In [3]:
from sklearn.decomposition import LatentDirichletAllocation as LDA

In [75]:
# Number of LDA topics.
n = 30
# `n_components` is the current name of this parameter (the old `n_topics`
# keyword was deprecated and then removed from scikit-learn), and it matches
# the keyword used for NMF above. `random_state` makes the fit repeatable.
lda = LDA(n_components=n, random_state=0)

In [76]:
# Fit the LDA model on the word-count table; X holds the value returned by
# fit_transform (its shape is reported as (7506, 30) further down).
X = lda.fit_transform(df)



In [125]:
# Top words per LDA topic, with stop words removed.
components = lda.components_
# For each row of `components`: column indices of its 100 largest entries, largest first.
mask = np.argsort(components, axis=1)[:, ::-1][:, :100]
possible = np.array(df.columns)[mask]
# Combined NLTK stop-word lists for English, Spanish, German and French.
stop_words = (
    stopwords.words('english')
    + stopwords.words('spanish')
    + stopwords.words('german')
    + stopwords.words('french')
)
# Set membership is O(1); scanning the combined list for every candidate word is not.
stop_word_set = set(stop_words)
kept_words = [
    [word for word in topic if word not in stop_word_set]
    for topic in possible
]
# Rows are ragged after filtering; fill a 1-D object array element by element
# because np.array() on ragged nested lists raises ValueError on current NumPy.
filtered = np.empty(len(kept_words), dtype=object)
for row_index, words in enumerate(kept_words):
    filtered[row_index] = words

# Single-argument print(...) gives the same output on Python 2 and Python 3.
for topic in filtered:
    print(topic[:12])

['get', 'like', 'nigga', 'shit', 'fuck', 'em', 'yall', 'hot', 'parti', 'hit', 'beat', 'bitch']
['love', 'know', 'want', 'would', 'feel', 'say', 'time', 'never', 'ca', 'see', 'thing', 'one']
['victim', 'disconnect', 'p', 'circumst', 'robot', 'ow', 'june', 'leaf', 'balloon', 'aha', 'jim', 'santa']
['like', 'got', 'get', 'make', 'back', 'know', 'go', 'take', 'caus', 'see', 'well', 'gonna']
['nah', 'final', 'oi', 'favor', 'spoon', 'dat', 'oper', 'ei', 'possibl', 'metal', 'devil', 'worm']
['na', 'meu', 'cantar', 'quero', 'pra', 'sem', 'seu', 'com', 'vou', 'minha', 'tal', 'mim']
['volta', 'tenho', 'ficar', 'ond', 'sail', 'sempr', 'lua', 'ann', 'come', 'joe', 'say', 'see']
['song', 'old', 'said', 'stand', 'round', 'jesus', '&', 'goin', 'play', 'start', 'never', 'tonight']
['babi', 'let', 'go', 'ay', 'take', 'fantasi', 'got', 'ladi', 'blame', 'cant', 'wont', 'tonight']
['denk', 'immer', 'mehr', 'jag', 'ja', 'ab', 'rap', 'gibt', 'nein', 'seh', 'mal', 'och']
['rain', 'fire', 'chorus', 'heaven', 



In [101]:
# Per column of X (topic), the row index (song) holding the largest value.
ind_mask = X.argmax(axis=0)
# Only the final expression of a cell is displayed, so just X.shape is shown.
ind_mask.shape
X.shape

(7506, 30)

In [79]:
# Row labels (track ids) of the songs selected by ind_mask.
imp_track_ids = np.asarray(df.index)[ind_mask]

In [80]:
# For every topic, look up title and artist of its selected track and print
# them next to the topic's first five keywords.
conn = sqlite3.connect('../../data/MillionSongSubset/AdditionalFiles/subset_track_metadata.db')
# Bind the track id as a query parameter instead of formatting it into the SQL text.
q = "SELECT title, artist_name FROM songs WHERE track_id = ?;"
formatted = '"{}" by {}, Keywords: {}'
try:
    for track_id, topic in zip(imp_track_ids, filtered):
        row = conn.execute(q, (str(track_id),)).fetchone()
        if row is None:
            # No metadata row for this id: report it instead of failing on unpacking.
            print('No metadata found for track {}'.format(track_id))
            continue
        song, artist = row
        print(formatted.format(unidecode(song), unidecode(artist), topic[0:5]))
finally:
    # Release the database handle even when a lookup raises.
    conn.close()

"You Gotta Be Movin'" by Corona, Keywords: ['get', 'ya', 'like', 'nigga', 'shit']
"It's Not Me" by 3 Doors Down, Keywords: ['love', 'know', 'want', 'would', 'feel']
"Victim Of Circumstance" by Ten Years After, Keywords: ['victim', 'disconnect', 'p', 'circumst', 'robot']
"Give It To 'Em" by Chops, Keywords: ['like', 'got', 'get', 'make', 'back']
"Beni Aglatma" by Mustafa Sandal, Keywords: ['nah', 'final', 'oi', 'favor', 'spoon']
"Meu Bem Meu Mal (Mi Bien Mi Mal)" by Gal Costa, Keywords: ['na', 'e', 'eu', 'um', 'meu']
"Civilized Reggae" by Burning Spear, Keywords: ['volta', 'tenho', 'ficar', 'ond', 'sail']
"Whiskey on the Fire" by Aaron Watson, Keywords: ['song', 'man', 'old', 'said', 'stand']
"Don't Upset The Rhythm (Go Baby Go)" by Noisettes, Keywords: ['babi', 'let', 'go', 'ay', 'take']
"Mein Herz kann man nicht kaufen" by Gall_ France, Keywords: ['ich', 'und', 'du', 'die', 'das']
"Cursed Angel Of Doom (Previously Unreleased&Re-Recorded Track)" by Behemoth, Keywords: ['rain', 'fire', 

In [127]:
# For every topic (column of X): row indices of the five songs with the largest
# value in that column, largest first.
# Built with slicing rather than a list comprehension: on Python 2 a
# comprehension variable named `mask` leaks into the notebook namespace and
# overwrites the `mask` array of top-word indices defined in an earlier cell.
songs_mask = np.argsort(X, axis=0).T[:, ::-1][:, :5]


In [130]:
# Row labels (track ids) for every index stored in songs_mask.
imp_3track_ids = np.asarray(df.index)[songs_mask]


In [131]:
# For every topic, print its first ten keywords followed by title and artist of
# each of its selected tracks.
conn = sqlite3.connect('../../data/MillionSongSubset/AdditionalFiles/subset_track_metadata.db')
# Bind the track id as a query parameter instead of formatting it into the SQL text.
q = "SELECT title, artist_name FROM songs WHERE track_id = ?;"
formatted = '\t"{}" by {}'
try:
    for track_ids, topic in zip(imp_3track_ids, filtered):
        print(topic[:10])
        for track_id in track_ids:
            row = conn.execute(q, (str(track_id),)).fetchone()
            if row is None:
                # No metadata row for this id: report it instead of failing on unpacking.
                print('\tNo metadata found for track {}'.format(track_id))
                continue
            song, artist = row
            # The template has two placeholders, so only title and artist are passed.
            print(formatted.format(unidecode(song), unidecode(artist)))
finally:
    # Release the database handle even when a lookup raises.
    conn.close()

['get', 'like', 'nigga', 'shit', 'fuck', 'em', 'yall', 'hot', 'parti', 'hit']
	"You Gotta Be Movin'" by Corona
	"Da Da Da Da" by Tha Liks
	"Spelling Beatnuts with Lil' Donny" by Beatnuts with Lil' Donny
	"I'ma Bang" by DMX
	"We Got What You Want" by Busta Rhymes
['love', 'know', 'want', 'would', 'feel', 'say', 'time', 'never', 'ca', 'see']
	"It's Not Me" by 3 Doors Down
	"Never Alone" by Open Hand
	"What You Deserve (Album Version)" by Ill Nino
	"I Don't Know" by Teenage Fanclub
	"No Answer (Outro)" by Lollipop Lust Kill
['victim', 'disconnect', 'p', 'circumst', 'robot', 'ow', 'june', 'leaf', 'balloon', 'aha']
	"Victim Of Circumstance" by Ten Years After
	"Disconnected" by Cowboy Mouth
	"CNN War Theme" by Monster Magnet
	"Power Shower" by Caspa
	"Tocame" by Gisselle
['like', 'got', 'get', 'make', 'back', 'know', 'go', 'take', 'caus', 'see']
	"Give It To 'Em" by Chops
	"Early In The Morning" by B.B. King
	"Hypergasm" by Chaotica
	"Is It Clean (Ethereal - Progressive)" by The Gone Jackal

In [148]:
# Index of the largest X entry in each row, i.e. the dominant topic per song.
X.argmax(axis=1)

array([7, 1, 1, ..., 0, 0, 0])

In [134]:
# Number of rows in X.
X.shape[0]

7506

In [136]:
# Count how many songs have each LDA topic as their largest entry in X.
# NOTE: this rebinds `c`, which an earlier cell used for the NMF counts.
c = Counter(np.argmax(X, axis=1))

In [137]:
# Display the counter held in `c`.
c

Counter({0: 5196,
         1: 1032,
         3: 595,
         4: 1,
         5: 13,
         7: 3,
         8: 3,
         9: 34,
         13: 8,
         14: 12,
         15: 2,
         17: 2,
         19: 115,
         20: 3,
         21: 6,
         22: 86,
         23: 3,
         25: 189,
         26: 13,
         28: 190})

In [140]:
# Shape of the matrix taken from lda.components_.
components.shape

(30, 5000)

In [157]:
# Number of distinct NMF components that are the largest entry for at least one song.
# Computed from W directly: `c` is rebound to the LDA counts in a cell above, so
# `len(c)` only gave this figure when cells were run out of order (the recorded
# output 26 equals the number of keys in the NMF counter).
len(np.unique(np.argmax(W, axis=1)))

26