# NMF For Topic Modelling 

In [2]:
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF
from nltk.corpus import stopwords

In [4]:
path = '../../data/lyrics/data/subset_bof.csv'
df = pd.read_csv(path,index_col=0)

In [5]:
df.head()

Unnamed: 0,i,the,you,to,and,a,me,it,not,in,...,writer,motivo,bake,insist,wel,santo,pe,gee,colleg,kad
TRAYYAU128F92D58D0,0.0,3.0,4.0,2.0,4.0,5.0,0.0,0.0,7.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TRARURM128F931A91B,5.0,5.0,9.0,2.0,0.0,4.0,0.0,1.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TRAMSSK128F92FDC80,8.0,13.0,5.0,1.0,6.0,4.0,1.0,0.0,5.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TRAAMPA128F92E7D0D,17.0,0.0,4.0,4.0,5.0,20.0,8.0,18.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TRAWHZK12903CB5A01,18.0,1.0,3.0,6.0,0.0,2.0,0.0,1.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### How Sparse Is Our Data?

In [12]:
(df==0).sum().sum()/np.product(df.shape).astype(float)

0.99460578204103389

### Try NMF

In [6]:
n = 15
iters = 10000
nmf = NMF(n_components=n,init='random')

In [7]:
# subset = df.values[:,:1500]
W = nmf.fit_transform(df)
H=nmf.components_

In [8]:
mask=np.array([row[::-1] for row in np.argsort(H,axis=1)])[:,:100]

In [9]:
possible = np.array(df.columns)[mask]

In [10]:
filtered = np.array([[word for word in topic  if word not in stopwords.words('english')] for topic in possible])

In [11]:
for topic in filtered:
    print topic[:10]

['want', 'tri', 'make', 'got', 'would', 'time', 'back', 'could', 'life', 'one']
['let', 'go', 'tell', 'get', 'like', 'yeah', 'take', 'got', 'ya', 'right']
['love', 'know', 'see', 'think', 'want', 'feel', 'never', 'would', 'heart', 'chang']
['get', 'like', 'come', 'littl', 'caus', 'top', 'got', 'man', 'ya', 'bitch']
['go', 'oh', 'us', 'never', 'thing', 'day', 'everi', 'togeth', 'know', 'live']
['come', 'get', 'said', 'start', 'could', 'eye', 'one', 'right', 'say', 'caus']
['que', 'de', 'la', 'te', 'tu', 'en', 'el', 'mi', 'un', 'yo']
['ca', 'go', 'know', 'babi', 'got', 'wanna', 'ai', 'wo', 'realli', 'thing']
['da', 'like', 'niggaz', 'next', 'fuck', 'na', 'wish', 'want', 'drink', 'rhyme']
['head', 'one', 'see', 'like', 'say', 'light', 'world', 'nigga', 'follow', 'fuck']
['love', 'babi', 'let', 'come', 'help', 'tell', 'call', 'girl', 'go', 'make']
['oh', 'hood', 'life', 'hous', 'love', 'babi', 'got', 'heart', 'nigga', 'would']
['ich', 'und', 'denk', 'du', 'die', 'nicht', 'das', 'es', 'der'

### Try LDA

In [13]:
from sklearn.decomposition import LatentDirichletAllocation as LDA

In [40]:
n = 15
lda = LDA(n_topics=n)

In [36]:
lda.fit(df)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_jobs=1, n_topics=25, perp_tol=0.1,
             random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [39]:
components = lda.components_
mask=np.array([row[::-1] for row in np.argsort(components,axis=1)])[:,:100]
possible = np.array(df.columns)[mask]
filtered = np.array([[word for word in topic  if word not in stopwords.words('english')] for topic in possible])

for topic in filtered:
    print topic[:12]

['like', 'get', 'got', 'nigga', 'make', 'shit', 'caus', 'em', 'back', 'yo', 'ya', 'head']
['love', 'know', 'would', 'one', 'tell', 'time', 'thing', 'make', 'got', 'could', 'see', 'feel']
['hey', 'us', 'wish', 'wrong', 'year', 'christma', 'go', 'get', 'hope', 'right', 'wo', 'thing']
['know', 'ca', 'want', 'never', 'say', 'go', 'whi', 'away', 'feel', 'get', 'like', 'gonna']
['oh', 'ah', 'whoa', 'ooh', 'night', 'bye', 'ay', 'lone', 'take', 'slip', 'pure', 'thrill']
['fall', 'kill', 'miss', 'rainbow', 'jingl', 'jah', 'favor', 'aim', 'total', 'twenti', 'thru', 'wont']
['cri', 'goodby', 'anymor', 'lover', 'spend', 'aint', 'stranger', 'im', 'dont', 'cant', 'late', 'harder']
['saturday', 'sunday', 'special', 'friday', 'basta', 'tuesday', 'sugar', 'aye', 'afternoon', 'pon', 'bonni', 'gi']
['countri', 'dig', 'slap', 'jungl', 'oo', 'circumst', 'cotton', 'main', 'victim', 'ow', 'fun', 'rocket']
['la', 'de', 'je', 'et', 'les', 'le', 'pas', 'des', 'un', 'dan', 'qui', 'pour']
['que', 'de', 'la', 'te'