# NMF and LDA for Topic Modelling

In [62]:
import sqlite3
from contextlib import closing

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.decomposition import NMF
from unidecode import unidecode

In [2]:
# Load the bag-of-words lyric counts; the first CSV column becomes the row
# index (the track ids shown in the preview below).
path = '../../data/lyrics/data/subset_bof.csv'
df = pd.read_csv(path, index_col=0)

In [5]:
# Preview the first rows of the count matrix to sanity-check the load.
df.head()

Unnamed: 0,i,the,you,to,and,a,me,it,not,in,...,writer,motivo,bake,insist,wel,santo,pe,gee,colleg,kad
TRAYYAU128F92D58D0,0.0,3.0,4.0,2.0,4.0,5.0,0.0,0.0,7.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TRARURM128F931A91B,5.0,5.0,9.0,2.0,0.0,4.0,0.0,1.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TRAMSSK128F92FDC80,8.0,13.0,5.0,1.0,6.0,4.0,1.0,0.0,5.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TRAAMPA128F92E7D0D,17.0,0.0,4.0,4.0,5.0,20.0,8.0,18.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TRAWHZK12903CB5A01,18.0,1.0,3.0,6.0,0.0,2.0,0.0,1.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### How Sparse Is Our Data?

In [12]:
# Fraction of cells in the count matrix that are exactly zero.
# `df.size` is the total number of cells (rows * columns); it replaces
# `np.product(df.shape)`, a deprecated alias that NumPy 2.0 removed.
(df == 0).sum().sum() / float(df.size)

0.99460578204103389

### Try NMF

In [6]:
# NMF hyper-parameters.
n = 15          # number of topics (components) to extract
iters = 10000   # upper bound on solver iterations
# `iters` is now actually handed to the estimator; it used to be defined but
# never passed, so the solver ran with scikit-learn's default iteration cap.
# `init='random'` starts from randomly drawn factors, so `random_state` is
# pinned to make the extracted topics reproducible across re-runs.
nmf = NMF(n_components=n, init='random', max_iter=iters, random_state=0)

In [7]:
# Factorise the track-by-word count matrix:
#   W -- the transformed data returned by fit_transform (per-track weights)
#   H -- the fitted components (per-topic word weights)
W = nmf.fit_transform(df)
H = nmf.components_

In [8]:
# For every topic (row of H): column indices of its 100 heaviest words,
# heaviest first -- argsort ascending, flip each row, keep the first 100.
mask = np.argsort(H, axis=1)[:, ::-1][:, :100]

In [9]:
# Translate the index matrix into the vocabulary words (column labels) it
# points at; the result has the same shape as `mask`.
possible = np.asarray(df.columns)[mask]

In [10]:
# Drop English stop words from each topic's candidate word list.
# The stop-word list is fetched once and held in a set; previously
# `stopwords.words('english')` was re-evaluated and scanned linearly for every
# single candidate word.
english_stopwords = frozenset(stopwords.words('english'))
# Topics generally keep different numbers of words after filtering, so the
# result is ragged. An explicit object dtype keeps it an array of per-topic
# lists instead of relying on implicit ragged-array creation, which newer
# NumPy releases reject with a ValueError.
filtered = np.array(
    [[word for word in topic if word not in english_stopwords]
     for topic in possible],
    dtype=object)

In [11]:
# Show the ten highest-weighted non-stop-words of each NMF topic.
# print(...) with a single argument behaves the same on Python 2 and 3,
# whereas the bare print statement is a SyntaxError on Python 3.
for topic in filtered:
    print(topic[:10])

['want', 'tri', 'make', 'got', 'would', 'time', 'back', 'could', 'life', 'one']
['let', 'go', 'tell', 'get', 'like', 'yeah', 'take', 'got', 'ya', 'right']
['love', 'know', 'see', 'think', 'want', 'feel', 'never', 'would', 'heart', 'chang']
['get', 'like', 'come', 'littl', 'caus', 'top', 'got', 'man', 'ya', 'bitch']
['go', 'oh', 'us', 'never', 'thing', 'day', 'everi', 'togeth', 'know', 'live']
['come', 'get', 'said', 'start', 'could', 'eye', 'one', 'right', 'say', 'caus']
['que', 'de', 'la', 'te', 'tu', 'en', 'el', 'mi', 'un', 'yo']
['ca', 'go', 'know', 'babi', 'got', 'wanna', 'ai', 'wo', 'realli', 'thing']
['da', 'like', 'niggaz', 'next', 'fuck', 'na', 'wish', 'want', 'drink', 'rhyme']
['head', 'one', 'see', 'like', 'say', 'light', 'world', 'nigga', 'follow', 'fuck']
['love', 'babi', 'let', 'come', 'help', 'tell', 'call', 'girl', 'go', 'make']
['oh', 'hood', 'life', 'hous', 'love', 'babi', 'got', 'heart', 'nigga', 'would']
['ich', 'und', 'denk', 'du', 'die', 'nicht', 'das', 'es', 'der'

### Try LDA

In [3]:
from sklearn.decomposition import LatentDirichletAllocation as LDA

In [75]:
# LDA hyper-parameters.
n = 30  # number of topics
# The topic-count keyword is `n_components`; the older `n_topics` spelling was
# deprecated and then removed from scikit-learn. `random_state` is pinned so
# the fitted topics are reproducible across re-runs.
lda = LDA(n_components=n, random_state=0)

In [76]:
# Fit LDA on the word counts and keep the transformed matrix. The recorded
# `X.shape` output in this notebook is (7506, 30), i.e. one column per topic.
X = lda.fit_transform(df)



In [77]:
# Per-topic word weights learned by LDA.
components = lda.components_
# Column indices of each topic's 100 heaviest words, heaviest first.
mask = np.argsort(components, axis=1)[:, ::-1][:, :100]
possible = np.array(df.columns)[mask]

# Fetch the stop-word list once and hold it in a set; previously
# `stopwords.words('english')` was re-evaluated and scanned linearly for every
# candidate word.
english_stopwords = frozenset(stopwords.words('english'))
# The filtered topics are ragged, so request an object array explicitly; newer
# NumPy releases raise ValueError on implicit ragged-array creation.
filtered = np.array(
    [[word for word in topic if word not in english_stopwords]
     for topic in possible],
    dtype=object)

# Single-argument print(...) works on both Python 2 and Python 3.
for topic in filtered:
    print(topic[:12])



['get', 'ya', 'like', 'nigga', 'shit', 'fuck', 'da', 'yo', 'em', 'yall', 'hot', 'parti']
['love', 'know', 'want', 'would', 'feel', 'say', 'time', 'never', 'ca', 'see', 'thing', 'one']
['victim', 'disconnect', 'p', 'circumst', 'robot', 'ow', 'june', 'leaf', 'balloon', 'aha', 'jim', 'santa']
['like', 'got', 'get', 'make', 'back', 'know', 'go', 'take', 'caus', 'see', 'well', 'man']
['nah', 'final', 'oi', 'favor', 'spoon', 'dat', 'oper', 'ei', 'possibl', 'metal', 'devil', 'worm']
['na', 'e', 'eu', 'um', 'meu', 'cantar', 'quero', 'pra', 'sem', 'seu', 'com', 'vou']
['volta', 'tenho', 'ficar', 'ond', 'sail', 'sempr', 'lua', 'ann', 'come', 'joe', 'say', 'see']
['song', 'man', 'old', 'said', 'stand', 'round', 'jesus', '&', 'goin', 'play', 'start', 'son']
['babi', 'let', 'go', 'ay', 'take', 'fantasi', 'got', 'ladi', 'blame', 'cant', 'wont', 'tonight']
['ich', 'und', 'du', 'die', 'das', 'dem', 'der', 'nicht', 'ist', 'es', 'mich', 'auf']
['rain', 'fire', 'chorus', 'heaven', 'black', 'war', 'bottom

In [101]:
# Row position of the highest-scoring track for each topic (one per column of X).
ind_mask = X.argmax(axis=0)
ind_mask.shape  # evaluated but not shown: a cell only echoes its last expression
X.shape

(7506, 30)

In [79]:
# Map those row positions back to track ids via the frame's index.
imp_track_ids = np.asarray(df.index)[ind_mask]

In [80]:
# Look up title/artist for the top track of every LDA topic and print it next
# to that topic's five leading keywords.
db_path = '../../data/MillionSongSubset/AdditionalFiles/subset_track_metadata.db'
# `?` placeholder: the driver binds the id itself rather than the value being
# spliced into the SQL text with str.format.
q = "SELECT title, artist_name FROM songs WHERE track_id = ?;"
formatted = '"{}" by {}, Keywords: {}'
# closing(...) guarantees the connection is released even if a lookup raises.
with closing(sqlite3.connect(db_path)) as conn:
    for track_id, topic in zip(imp_track_ids, filtered):
        row = conn.execute(q, (str(track_id),)).fetchone()
        if row is None:
            # fetchone() yields None when no song matches this id; report it
            # instead of failing on the tuple unpack below.
            print('No metadata found for track {}'.format(track_id))
            continue
        song, artist = row
        print(formatted.format(unidecode(song), unidecode(artist), topic[0:5]))

"You Gotta Be Movin'" by Corona, Keywords: ['get', 'ya', 'like', 'nigga', 'shit']
"It's Not Me" by 3 Doors Down, Keywords: ['love', 'know', 'want', 'would', 'feel']
"Victim Of Circumstance" by Ten Years After, Keywords: ['victim', 'disconnect', 'p', 'circumst', 'robot']
"Give It To 'Em" by Chops, Keywords: ['like', 'got', 'get', 'make', 'back']
"Beni Aglatma" by Mustafa Sandal, Keywords: ['nah', 'final', 'oi', 'favor', 'spoon']
"Meu Bem Meu Mal (Mi Bien Mi Mal)" by Gal Costa, Keywords: ['na', 'e', 'eu', 'um', 'meu']
"Civilized Reggae" by Burning Spear, Keywords: ['volta', 'tenho', 'ficar', 'ond', 'sail']
"Whiskey on the Fire" by Aaron Watson, Keywords: ['song', 'man', 'old', 'said', 'stand']
"Don't Upset The Rhythm (Go Baby Go)" by Noisettes, Keywords: ['babi', 'let', 'go', 'ay', 'take']
"Mein Herz kann man nicht kaufen" by Gall_ France, Keywords: ['ich', 'und', 'du', 'die', 'das']
"Cursed Angel Of Doom (Previously Unreleased&Re-Recorded Track)" by Behemoth, Keywords: ['rain', 'fire', 

In [112]:
# For every topic (column of X): row positions of its three highest-scoring
# tracks, best first. The result has one row per topic and three columns.
# Done with slicing instead of a list comprehension: the old loop variable was
# called `mask`, and on Python 2 a list-comprehension variable leaks into the
# enclosing scope, overwriting the notebook-level `mask` of word indices.
songs_mask = np.argsort(X, axis=0).T[:, ::-1][:, :3]


In [113]:
# Display the per-topic top-3 track row positions.
songs_mask

array([[7376,  156, 4967],
       [4803, 6675,  548],
       [ 546, 5275, 7102],
       [3626, 1190, 2526],
       [5479, 3277, 1044],
       [5876,  922, 6806],
       [7505, 2946, 2931],
       [   0, 3099, 4864],
       [ 690, 1449, 3740],
       [2205, 2947, 2566],
       [2714,  232,  102],
       [5956, 2006, 2802],
       [ 975, 6086, 6392],
       [   3,  375, 7032],
       [ 119, 3176,  800],
       [ 604, 3702, 4912],
       [1377, 4578, 3821],
       [ 204,  126, 1065],
       [6668, 2676, 6430],
       [3831, 2016,  174],
       [   8, 1282, 4864],
       [ 590, 1330, 5860],
       [ 575, 1732, 4479],
       [1483, 4420, 3177],
       [5304, 7076, 4692],
       [3193, 1445,   52],
       [  19,  359, 7154],
       [3240, 1903, 6196],
       [1657,   85,  971],
       [6668, 4724, 5898]])

In [115]:
# Swap each row position for the track id stored in the frame's index; the
# result keeps the shape of `songs_mask`.
imp_3track_ids = np.asarray(df.index)[songs_mask]
imp_3track_ids

array([['TRAWVWS128F42ADCD1', 'TRBFZRA128F427030C', 'TRBAHED128F426A1C2'],
       ['TRACBWP128C7196948', 'TRARBGM12903CC5026', 'TRABUUH128F4241346'],
       ['TRACNXS128F93395BA', 'TRAGYXL128F92F9FCB', 'TRADPDI128F92EBD32'],
       ['TRAKOAK128F4248629', 'TRAJFRJ128F9328FF1', 'TRAZSZD128F424AA49'],
       ['TRBGICJ128F427BA9A', 'TRARFAZ128F93481AD', 'TRAKCNI128F427BA71'],
       ['TRBCKEI128F92FE542', 'TRBBWIP128F4287400', 'TRAPHDI128F92F5EA1'],
       ['TRBGZFI12903CA54B6', 'TRBFKMN128F4286D1B', 'TRBCYJG128F93283CB'],
       ['TRAYYAU128F92D58D0', 'TRBFDRD128F92F79D5', 'TRAUGYG128F428A174'],
       ['TRAMLGE128F92F81DD', 'TRAJKMI128F4281107', 'TRAVUAJ128E078EDA2'],
       ['TRBHSDN128F4274E9F', 'TRAQDBO128F92ED2AC', 'TRBGBIK128F4294A67'],
       ['TRBBUOJ128F92FC6F2', 'TRANHJV128F92D819D', 'TRBBFHG128F932B9D6'],
       ['TRBGGRK128F422CA7F', 'TRAZYZQ128F146653D', 'TRAXJIS12903CCAF92'],
       ['TRAUKJZ128F9335412', 'TRBBPPB128F14AB242', 'TRASGOE128F42328B4'],
       ['TRAAMPA128F92E7D

In [119]:
# For every LDA topic print its ten leading keywords, then the title/artist of
# its three highest-scoring tracks.
db_path = '../../data/MillionSongSubset/AdditionalFiles/subset_track_metadata.db'
# `?` placeholder: the driver binds the id itself rather than the value being
# spliced into the SQL text with str.format.
q = "SELECT title, artist_name FROM songs WHERE track_id = ?;"
formatted = '\t"{}" by {}'
# closing(...) guarantees the connection is released even if a lookup raises.
with closing(sqlite3.connect(db_path)) as conn:
    for track_ids, topic in zip(imp_3track_ids, filtered):
        print(topic[:10])
        for track_id in track_ids:
            row = conn.execute(q, (str(track_id),)).fetchone()
            if row is None:
                # fetchone() yields None when no song matches this id; report
                # it instead of failing on the tuple unpack below.
                print('\tNo metadata found for track {}'.format(track_id))
                continue
            song, artist = row
            # The template has two fields, so only title and artist are passed
            # (a stray third argument used to be silently ignored).
            print(formatted.format(unidecode(song), unidecode(artist)))

['get', 'ya', 'like', 'nigga', 'shit', 'fuck', 'da', 'yo', 'em', 'yall']
	"You Gotta Be Movin'" by Corona
	"Da Da Da Da" by Tha Liks
	"Spelling Beatnuts with Lil' Donny" by Beatnuts with Lil' Donny
['love', 'know', 'want', 'would', 'feel', 'say', 'time', 'never', 'ca', 'see']
	"It's Not Me" by 3 Doors Down
	"Never Alone" by Open Hand
	"What You Deserve (Album Version)" by Ill Nino
['victim', 'disconnect', 'p', 'circumst', 'robot', 'ow', 'june', 'leaf', 'balloon', 'aha']
	"Victim Of Circumstance" by Ten Years After
	"Disconnected" by Cowboy Mouth
	"CNN War Theme" by Monster Magnet
['like', 'got', 'get', 'make', 'back', 'know', 'go', 'take', 'caus', 'see']
	"Give It To 'Em" by Chops
	"Early In The Morning" by B.B. King
	"Hypergasm" by Chaotica
['nah', 'final', 'oi', 'favor', 'spoon', 'dat', 'oper', 'ei', 'possibl', 'metal']
	"Beni Aglatma" by Mustafa Sandal
	"Isku Pitkasta Ilosta" by Korpiklaani
	"Iki Tas Corba" by Mustafa Sandal
['na', 'e', 'eu', 'um', 'meu', 'cantar', 'quero', 'pra', '