# NMF For Topic Modelling 

In [1]:
import pandas as pd
import numpy as np
import sqlite3
from sklearn.decomposition import NMF
from nltk.corpus import stopwords
from unidecode import unidecode
from collections import Counter

In [2]:
path = '../../data/lyrics/data/subset_bow_2.csv'
df = pd.read_csv(path,index_col=0)

In [3]:
df.head()

Unnamed: 0,i,the,you,to,and,a,me,it,not,in,...,writer,motivo,bake,insist,wel,santo,pe,gee,colleg,kad
TRAYYAU128F92D58D0,0.0,3.0,4.0,2.0,4.0,5.0,0.0,0.0,7.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TRARURM128F931A91B,5.0,5.0,9.0,2.0,0.0,4.0,0.0,1.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TRAMSSK128F92FDC80,8.0,13.0,5.0,1.0,6.0,4.0,1.0,0.0,5.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TRAAMPA128F92E7D0D,17.0,0.0,4.0,4.0,5.0,20.0,8.0,18.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TRAWHZK12903CB5A01,18.0,1.0,3.0,6.0,0.0,2.0,0.0,1.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
to_drop = []
for ind in df.index:
    if df.loc[ind].sum()==0:
        to_drop.append(ind)

In [5]:
len(to_drop)

5156

In [6]:
df = df.drop(to_drop)

In [9]:
df.columns

Index([u'i', u'the', u'you', u'to', u'and', u'a', u'me', u'it', u'not', u'in',
       ...
       u'writer', u'motivo', u'bake', u'insist', u'wel', u'santo', u'pe',
       u'gee', u'colleg', u'kad'],
      dtype='object', length=5000)

### How Sparse Is Our Data?

In [8]:
(df==0).sum().sum()/np.product(df.shape).astype(float)

0.9827326808510638

### Let's try dropping stopwords

In [13]:
stop_words = stopwords.words('english') + stopwords.words('spanish') + stopwords.words('german') + stopwords.words('french')
col_keep = []
for word in df.columns:
    if word not in stop_words:
        col_keep.append(word)



In [14]:
len(col_keep)

4654

In [15]:
df = df[col_keep]

### Try NMF

In [67]:
n = 18
nmf = NMF(n_components=n,init='random')

In [68]:
# subset = df.values[:,:1500]
W = nmf.fit_transform(df)
H=nmf.components_

In [69]:
mask=np.array([row[::-1] for row in np.argsort(H,axis=1)])[:,:100]

In [70]:
possible = np.array(df.columns)[mask]

In [71]:
stop_words = stopwords.words('english') + stopwords.words('spanish') + stopwords.words('german') + stopwords.words('french')
filtered = np.array([[word for word in topic  if word not in stop_words] for topic in possible])

In [72]:
songs_mask = np.argsort(W,axis=0).T
songs_mask = np.array([mask[::-1] for mask in songs_mask])[:,:5]
imp_3track_ids = np.array(df.index)[songs_mask]

conn = sqlite3.connect('../../data/MillionSongSubset/AdditionalFiles/subset_track_metadata.db')
q = "SELECT title, artist_name FROM songs WHERE track_id = '{}';"
# q2 = "SELECT title, artist_name FROM songs WHERE track_id = 'TRAWVWS128F42ADCD1';"
# print q.format(imp_track_ids[0])
for track_ids, topic in zip(imp_3track_ids,filtered):
    print topic[:10]
    formatted = '\t"{}" by {}'
    for track_id in track_ids:
        song,artist= conn.execute(q.format(track_id)).fetchone()
        print formatted.format(unidecode(song), unidecode(artist), topic[0:5])
conn.close()

['got' 'nigga' 'make' 'way' 'wanna' 'gonna' 'shit' 'caus' 'well' 'keep']
	"Ain't Got Nothin' But The Blues (Album Version)" by Robben Ford
	"Coyotes (Album Version)" by Jason Mraz
	"La Photo" by Basement Jaxx
	"Warm Embrace (LP Version)" by Twista & The Speedknot Mobstaz
	"The Hotness (ft. Shontelle)" by Rihanna
['know' 'want' 'need' 'gonna' 'right' 'thing' 'feel' 'give' 'time' 'wanna']
	"What You Deserve (Album Version)" by Ill Nino
	"I Don't Know" by Teenage Fanclub
	"Tha Day" by K's Choice
	"Out In The Street (Live) (2008 Digital Remaster)" by UFO
	"Dying to Know (Album Version)" by Pennywise
['oh' 'bye' 'ooh' 'yeah' 'whoa' 'way' 'new' 'hey' 'take' 'wow']
	"Heaven's In New York" by Wyclef Jean
	"It Ain't No Use" by Jimmy McCracklin
	"Contagious" by Y&T
	"Slipping Away" by Vivian
	"The Same Old Innocence" by Architecture In Helsinki
['love' 'song' 'first' 'heart' 'show' 'onli' 'give' 'caus' 'way' 'put']
	"Don't Phunk With My Heart" by Java
	"(This Is Not A) Love Song (Live)" by Publi

In [66]:
# for topic in filtered:
#     print topic[:10]

In [73]:
np.argmax(W,axis=1)

array([ 5,  7,  1, ..., 17,  3, 17])

In [74]:
c = Counter(np.argmax(W,axis=1))
c

Counter({0: 268,
         1: 544,
         2: 258,
         3: 100,
         4: 47,
         5: 129,
         6: 96,
         7: 140,
         8: 55,
         9: 31,
         10: 58,
         11: 62,
         12: 33,
         13: 74,
         14: 53,
         15: 48,
         16: 94,
         17: 260})

### Try LDA

In [75]:
from sklearn.decomposition import LatentDirichletAllocation as LDA

In [84]:
n = 20
lda = LDA(n_topics=n,max_iter=25)

In [85]:
X = lda.fit_transform(df)



In [78]:
components = lda.components_
mask=np.array([row[::-1] for row in np.argsort(components,axis=1)])[:,:100]
possible = np.array(df.columns)[mask]
stop_words = stopwords.words('english') + stopwords.words('spanish') + stopwords.words('german') + stopwords.words('french')
filtered = np.array([[word for word in topic  if word not in stop_words] for topic in possible])

for topic in filtered:
    print topic[:10]

['lord' 'god' 'jesus' 'di' 'holi' 'heaven' 'savior' 'che' 'nuh' 'thi']
['ah' 'whoa' 'woman' 'well' 'wish' 'town' 'oh' 'pretti' 'sail' 'littl']
['dan' 'tout' 'cest' 'si' 'ell' 'mai' 'fair' 'plus' 'jai' 'ca']
['love' 'babi' 'oh' 'yeah' 'girl' 'go' 'want' 'ooh' 'hey' 'boy']
['quiero' 'si' 'amor' 'mas' 'solo' 'corazon' 'vida' 'nunca' 'ser' 'vez']
['kill' 'help' 'round' 'nah' 'aliv' 'ca' 'want' 'sick' 'scare' 'breath']
['would' 'never' 'one' 'life' 'want' 'onli' 'whi' 'look' 'could' 'see']
['alright' 'saturday' 'lead' 'u' 'sunday' 'lie' 'jame' 'around' 'tonight'
 'bore']
['parti' 'beauti' 'follow' 'say' 'road' 'king' 'g' '4' 'lyric' 'machin']
['jag' 'gimm' 'som' 'har' 'kan' 'det' 'ur' 'samba' 'och' 'whatcha']
['eye' 'us' 'burn' 'face' 'world' 'see' 'life' 'blood' 'rain' 'fall']
['know' 'love' 'feel' 'let' 'time' 'go' 'say' 'like' 'away' 'way']
['come' 'danc' 'light' 'see' 'pleas' 'roll' 'hous' 'yea' 'bye' 'around']
['believ' 'rock' 'gone' 'easi' 'true' 'morn' 'blue' 'mama' 'hard' 'daddi']
[

In [79]:
ind_mask = np.argmax(X,axis=0)

In [80]:
songs_mask = np.argsort(X,axis=0).T
songs_mask = np.array([mask[::-1] for mask in songs_mask])[:,:5]
imp_3track_ids = np.array(df.index)[songs_mask]

In [81]:
conn = sqlite3.connect('../../data/MillionSongSubset/AdditionalFiles/subset_track_metadata.db')
q = "SELECT title, artist_name FROM songs WHERE track_id = '{}';"
# q2 = "SELECT title, artist_name FROM songs WHERE track_id = 'TRAWVWS128F42ADCD1';"
# print q.format(imp_track_ids[0])
for track_ids, topic in zip(imp_3track_ids,filtered):
    print topic[:10]
    formatted = '\t"{}" by {}'
    for track_id in track_ids:
        song,artist= conn.execute(q.format(track_id)).fetchone()
        print formatted.format(unidecode(song), unidecode(artist), topic[0:5])
conn.close()

['lord' 'god' 'jesus' 'di' 'holi' 'heaven' 'savior' 'che' 'nuh' 'thi']
	"24.000 Baci" by Christos Dantis
	"Alla Fine Di Tutto Questo" by Fabri Fibra
	"Un nuovo giorno" by Cali
	"Balon Combo" by Mau Mau
	"Silent Night_ Holy Night! (Album Version)" by Faith Hill
['ah' 'whoa' 'woman' 'well' 'wish' 'town' 'oh' 'pretti' 'sail' 'littl']
	"We Wish You A Merry Christmas" by SNOWPATROL
	"See My Friends" by Ray Davies
	"River Is Waiting" by Irma Thomas
	"Slipping Away" by Vivian
	"Have Yourself A Merry Little Christmas (LP Version)" by George Huff
['dan' 'tout' 'cest' 'si' 'ell' 'mai' 'fair' 'plus' 'jai' 'ca']
	"Laquelle Des Deux Est La Plus Snob" by Jacques Dutronc
	"Chlore" by Pascal Obispo
	"Mon dieu" by Joselito
	"Si Tu Veux Le Savoir" by Marc Lavoine
	"Desole Pour La Poussiere" by Miossec
['love' 'babi' 'oh' 'yeah' 'girl' 'go' 'want' 'ooh' 'hey' 'boy']
	"It's You" by Toots & The Maytals
	"Pianolude" by Usher
	"Don't Phunk With My Heart" by Java
	"Don't Upset The Rhythm (Go Baby Go)" by Nois

In [82]:
len(X)

2350

In [83]:
c = Counter(np.argmax(X,axis=1))
c

Counter({0: 9,
         1: 22,
         2: 66,
         3: 82,
         4: 179,
         5: 12,
         6: 398,
         7: 8,
         8: 19,
         9: 16,
         10: 281,
         11: 744,
         12: 24,
         13: 12,
         14: 19,
         15: 51,
         16: 15,
         17: 393})

In [28]:
components.shape

(26, 5000)

In [157]:
len(c)

26

In [164]:
np.all(X>0)

True

In [186]:
sum(X[:,0]> X[:,1])

177

In [198]:
X

array([[  4.13564930e-04,   4.13564930e-04,   4.13564930e-04, ...,
          4.13564930e-04,   1.12188362e-01,   4.13564930e-04],
       [  7.25689405e-04,   7.25689405e-04,   7.25689405e-04, ...,
          7.25689405e-04,   1.03806499e-01,   7.25689405e-04],
       [  4.18060201e-04,   1.12876254e-02,   4.18060201e-04, ...,
          4.18060201e-04,   9.67809365e-01,   4.18060201e-04],
       ..., 
       [  3.84615385e-02,   3.84615385e-02,   3.84615385e-02, ...,
          3.84615385e-02,   3.84615385e-02,   3.84615385e-02],
       [  3.84615385e-02,   3.84615385e-02,   3.84615385e-02, ...,
          3.84615385e-02,   3.84615385e-02,   3.84615385e-02],
       [  3.84615385e-02,   3.84615385e-02,   3.84615385e-02, ...,
          3.84615385e-02,   3.84615385e-02,   3.84615385e-02]])

In [200]:
sum(X[:,0]>X[:,1])

177

In [201]:
sum(X[:,0]>=X[:,1])

6824

In [202]:
X.shape

(7506, 26)

In [206]:
for i in xrange(1,26):
    print sum(X[:,0]==X[:,i])

6647
5364
6347
6539
5611
5803
5311
5885
5428
5915
5525
5339
5358
5367
5261
6973
5801
7112
6468
5929
7020
5285
6961
5279
5722


### Does clustering work?

In [47]:
from sklearn.cluster import KMeans

In [52]:
kmean = KMeans(n_clusters=15,n_jobs=-1)

In [53]:
pred = kmean.fit_predict(df)

In [54]:
Counter(pred)

Counter({0: 265,
         1: 113,
         2: 17,
         3: 1,
         4: 2,
         5: 1,
         6: 93,
         7: 2,
         8: 1240,
         9: 104,
         10: 3,
         11: 34,
         12: 1,
         13: 1,
         14: 473})

### Too many ties! Try "drafting" points in case of tie

In [68]:
X.shape

(7506, 15)

In [43]:
np.where(X[:,0]==X[:,1])

(array([1870]),)

In [44]:
df.index[3]

'TRAAMPA128F92E7D0D'

In [45]:
np.where(X==np.max(X,axis=1)[:,np.newaxis])

(array([   0,    1,    2, ..., 2347, 2348, 2349]),
 array([ 7, 12,  4, ...,  4, 14, 12]))

In [46]:
ties = []
for i,m in enumerate(np.max(X,axis=1)):
    ties.append(len(np.where(X[i]==m)[0]))
Counter(ties)
# print ties

Counter({1: 2349, 15: 1})

In [55]:
components.shape

(26, 5000)

In [61]:
X[6]

array([ 0.03846154,  0.03846154,  0.03846154,  0.03846154,  0.03846154,
        0.03846154,  0.03846154,  0.03846154,  0.03846154,  0.03846154,
        0.03846154,  0.03846154,  0.03846154,  0.03846154,  0.03846154,
        0.03846154,  0.03846154,  0.03846154,  0.03846154,  0.03846154,
        0.03846154,  0.03846154,  0.03846154,  0.03846154,  0.03846154,
        0.03846154])

In [62]:
X[7]

array([ 0.03846154,  0.03846154,  0.03846154,  0.03846154,  0.03846154,
        0.03846154,  0.03846154,  0.03846154,  0.03846154,  0.03846154,
        0.03846154,  0.03846154,  0.03846154,  0.03846154,  0.03846154,
        0.03846154,  0.03846154,  0.03846154,  0.03846154,  0.03846154,
        0.03846154,  0.03846154,  0.03846154,  0.03846154,  0.03846154,
        0.03846154])

In [63]:
sum(X[7])

0.99999999999999956

In [84]:
X[6]

array([ 0.06666667,  0.06666667,  0.06666667,  0.06666667,  0.06666667,
        0.06666667,  0.06666667,  0.06666667,  0.06666667,  0.06666667,
        0.06666667,  0.06666667,  0.06666667,  0.06666667,  0.06666667])

In [88]:
ind = df.index[6]

In [92]:
df.loc[6].sum()

70.0

In [5]:
(df.iloc[6]>0).sum()

0