# NMF For Topic Modelling 

In [1]:
import pandas as pd
import numpy as np
import sqlite3
from sklearn.decomposition import NMF
from nltk.corpus import stopwords
from unidecode import unidecode
from collections import Counter

In [2]:
path = '../../data/lyrics/data/subset_bow_2.csv'
df = pd.read_csv(path,index_col=0)

In [3]:
df.head()

Unnamed: 0,i,the,you,to,and,a,me,it,not,in,...,writer,motivo,bake,insist,wel,santo,pe,gee,colleg,kad
TRAYYAU128F92D58D0,0.0,3.0,4.0,2.0,4.0,5.0,0.0,0.0,7.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TRARURM128F931A91B,5.0,5.0,9.0,2.0,0.0,4.0,0.0,1.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TRAMSSK128F92FDC80,8.0,13.0,5.0,1.0,6.0,4.0,1.0,0.0,5.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TRAAMPA128F92E7D0D,17.0,0.0,4.0,4.0,5.0,20.0,8.0,18.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TRAWHZK12903CB5A01,18.0,1.0,3.0,6.0,0.0,2.0,0.0,1.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
to_drop = []
for ind in df.index:
    if df.loc[ind].sum()==0:
        to_drop.append(ind)

In [5]:
len(to_drop)

5156

In [6]:
df = df.drop(to_drop)

In [9]:
df.columns

Index([u'i', u'the', u'you', u'to', u'and', u'a', u'me', u'it', u'not', u'in',
       ...
       u'writer', u'motivo', u'bake', u'insist', u'wel', u'santo', u'pe',
       u'gee', u'colleg', u'kad'],
      dtype='object', length=5000)

### How Sparse Is Our Data?

In [8]:
(df==0).sum().sum()/np.product(df.shape).astype(float)

0.9827326808510638

### Let's try dropping stopwords

In [13]:
stop_words = stopwords.words('english') + stopwords.words('spanish') + stopwords.words('german') + stopwords.words('french')
col_keep = []
for word in df.columns:
    if word not in stop_words:
        col_keep.append(word)



In [14]:
len(col_keep)

4654

In [15]:
df = df[col_keep]

### Try NMF

In [67]:
n = 18
nmf = NMF(n_components=n,init='random')

In [68]:
# subset = df.values[:,:1500]
W = nmf.fit_transform(df)
H=nmf.components_

In [69]:
mask=np.array([row[::-1] for row in np.argsort(H,axis=1)])[:,:100]

In [70]:
possible = np.array(df.columns)[mask]

In [71]:
stop_words = stopwords.words('english') + stopwords.words('spanish') + stopwords.words('german') + stopwords.words('french')
filtered = np.array([[word for word in topic  if word not in stop_words] for topic in possible])

In [72]:
songs_mask = np.argsort(W,axis=0).T
songs_mask = np.array([mask[::-1] for mask in songs_mask])[:,:5]
imp_3track_ids = np.array(df.index)[songs_mask]

conn = sqlite3.connect('../../data/MillionSongSubset/AdditionalFiles/subset_track_metadata.db')
q = "SELECT title, artist_name FROM songs WHERE track_id = '{}';"
# q2 = "SELECT title, artist_name FROM songs WHERE track_id = 'TRAWVWS128F42ADCD1';"
# print q.format(imp_track_ids[0])
for track_ids, topic in zip(imp_3track_ids,filtered):
    print topic[:10]
    formatted = '\t"{}" by {}'
    for track_id in track_ids:
        song,artist= conn.execute(q.format(track_id)).fetchone()
        print formatted.format(unidecode(song), unidecode(artist), topic[0:5])
conn.close()

['got' 'nigga' 'make' 'way' 'wanna' 'gonna' 'shit' 'caus' 'well' 'keep']
	"Ain't Got Nothin' But The Blues (Album Version)" by Robben Ford
	"Coyotes (Album Version)" by Jason Mraz
	"La Photo" by Basement Jaxx
	"Warm Embrace (LP Version)" by Twista & The Speedknot Mobstaz
	"The Hotness (ft. Shontelle)" by Rihanna
['know' 'want' 'need' 'gonna' 'right' 'thing' 'feel' 'give' 'time' 'wanna']
	"What You Deserve (Album Version)" by Ill Nino
	"I Don't Know" by Teenage Fanclub
	"Tha Day" by K's Choice
	"Out In The Street (Live) (2008 Digital Remaster)" by UFO
	"Dying to Know (Album Version)" by Pennywise
['oh' 'bye' 'ooh' 'yeah' 'whoa' 'way' 'new' 'hey' 'take' 'wow']
	"Heaven's In New York" by Wyclef Jean
	"It Ain't No Use" by Jimmy McCracklin
	"Contagious" by Y&T
	"Slipping Away" by Vivian
	"The Same Old Innocence" by Architecture In Helsinki
['love' 'song' 'first' 'heart' 'show' 'onli' 'give' 'caus' 'way' 'put']
	"Don't Phunk With My Heart" by Java
	"(This Is Not A) Love Song (Live)" by Publi

In [66]:
# for topic in filtered:
#     print topic[:10]

In [73]:
np.argmax(W,axis=1)

array([ 5,  7,  1, ..., 17,  3, 17])

In [74]:
c = Counter(np.argmax(W,axis=1))
c

Counter({0: 268,
         1: 544,
         2: 258,
         3: 100,
         4: 47,
         5: 129,
         6: 96,
         7: 140,
         8: 55,
         9: 31,
         10: 58,
         11: 62,
         12: 33,
         13: 74,
         14: 53,
         15: 48,
         16: 94,
         17: 260})

### Try LDA

In [75]:
from sklearn.decomposition import LatentDirichletAllocation as LDA

In [115]:
n = 18
lda = LDA(n_topics=n,max_iter=30,learning_method='online')

In [116]:
X = lda.fit_transform(df)

In [126]:
import cPickle as pickle
def pickle_object(something, name):
    with open('{}.pkl'.format(name), 'w') as f:
        pickle.dump(something, f)

    return None

# pickle_object(lda,'lda')

In [140]:
labels = np.argmax(lda.transform(df),axis=1)

array([2, 2, 2, ..., 2, 2, 2])

In [141]:
Counter(labels)

Counter({0: 75,
         1: 5,
         2: 633,
         3: 116,
         4: 173,
         5: 5,
         6: 31,
         7: 14,
         8: 21,
         9: 12,
         10: 82,
         11: 25,
         12: 285,
         13: 41,
         14: 505,
         15: 265,
         16: 5,
         17: 57})

In [142]:
labeled_data = np.hstack((np.array(df.index)[:,np.newaxis],labels[:,np.newaxis]))

In [150]:
labeled_df = pd.DataFrame(labeled_data,columns=(['track_id','label']))
labeled_df.to_csv('labeled_df.csv',index=False)

In [117]:
components = lda.components_
mask=np.array([row[::-1] for row in np.argsort(components,axis=1)])[:,:100]
possible = np.array(df.columns)[mask]
stop_words = stopwords.words('english') + stopwords.words('spanish') + stopwords.words('german') + stopwords.words('french')
filtered = np.array([[word for word in topic  if word not in stop_words] for topic in possible])

for topic in filtered:
    print topic[:10]

['burn' 'na' 'fli' 'sky' 'turn' 'everyth' 'hous' 'fire' 'water' 'watch']
['oh' 'whoa' 'save' 'left' 'keep' 'way' 'slip' 'away' 'superman' 'lazi']
['never' 'day' 'time' 'one' 'away' 'see' 'love' 'know' 'dream' 'would']
['world' 'one' 'life' 'us' 'blood' 'god' 'follow' 'dead' 'live' 'wrong']
['quiero' 'amor' 'mas' 'si' 'solo' 'corazon' 'vida' 'nunca' 'ser' 'vez']
['si' 'part' 'comm' 'encor' 'ici' 'san' 'ye' 'lamour' 'vent' 'chose']
['parti' 'rock' 'gone' 'song' 'round' 'rememb' 'kill' 'goin' 'back' 'turn']
['wish' 'christma' 'jag' 'merri' 'som' 'har' 'kan' 'det' 'dig' 'ur']
['come' 'danc' 'wait' 'light' 'see' 'want' 'around' 'get' 'everybodi'
 'take']
['music' 'lone' 'di' 'sail' 'stop' 'river' 'night' 'lie' 'che' 'nuh']
['hey' 'dan' 'tout' 'cest' 'ell' 'mai' 'ca' 'plus' 'bye' 'jai']
['free' 'road' 'white' 'woman' 'black' 'set' 'hand' 'ride' 'bleed'
 'gangsta']
['got' 'yeah' 'gonna' 'girl' 'know' 'tell' 'like' 'get' 'well' 'say']
['lord' 'god' 'ah' 'day' 'jesus' 'old' 'stand' 'nah' 'blue'

In [118]:
ind_mask = np.argmax(X,axis=0)

In [119]:
songs_mask = np.argsort(X,axis=0).T
songs_mask = np.array([mask[::-1] for mask in songs_mask])[:,:5]
imp_3track_ids = np.array(df.index)[songs_mask]

In [120]:
conn = sqlite3.connect('../../data/MillionSongSubset/AdditionalFiles/subset_track_metadata.db')
q = "SELECT title, artist_name FROM songs WHERE track_id = '{}';"
# q2 = "SELECT title, artist_name FROM songs WHERE track_id = 'TRAWVWS128F42ADCD1';"
# print q.format(imp_track_ids[0])
for track_ids, topic in zip(imp_3track_ids,filtered):
    print topic[:10]
    formatted = '\t"{}" by {}'
    for track_id in track_ids:
        song,artist= conn.execute(q.format(track_id)).fetchone()
        print formatted.format(unidecode(song), unidecode(artist), topic[0:5])
conn.close()

['burn' 'na' 'fli' 'sky' 'turn' 'everyth' 'hous' 'fire' 'water' 'watch']
	"Destroy Everything (Album Version)" by Hatebreed
	"The Modern Rome Burning" by Anti-Flag
	"One the road (to Damnation)" by It Dies Today
	"Vincent" by Madilu System
	"Moisture" by Dishwalla
['oh' 'whoa' 'save' 'left' 'keep' 'way' 'slip' 'away' 'superman' 'lazi']
	"Slipping Away" by Vivian
	"You Left Me" by The Maine
	"You Appearing" by M83
	"Ability To Create A War" by A Skylit Drive
	"The Same Old Innocence" by Architecture In Helsinki
['never' 'day' 'time' 'one' 'away' 'see' 'love' 'know' 'dream' 'would']
	"A Little Of You" by Eurythmics
	"Beautiful Maria of My Soul" by Virtuoso
	"All Things Must Pass (2009 Digital Remaster)" by George Harrison
	"I'll Never Find Another You" by Sonny James
	"Nothing's The Same (2002 Digital Remaster)" by Gary Moore
['world' 'one' 'life' 'us' 'blood' 'god' 'follow' 'dead' 'live' 'wrong']
	"Within My Blood" by Skeletonwitch
	"Towards Eternity" by Old Man's Child
	"Dead Wrong" by

In [121]:
c = Counter(np.argmax(X,axis=1))
c

Counter({0: 75,
         1: 5,
         2: 633,
         3: 116,
         4: 173,
         5: 5,
         6: 31,
         7: 14,
         8: 21,
         9: 12,
         10: 82,
         11: 25,
         12: 285,
         13: 41,
         14: 505,
         15: 265,
         16: 5,
         17: 57})

### Does clustering work?

In [47]:
from sklearn.cluster import KMeans

In [123]:
kmean = KMeans(n_clusters=15,n_jobs=-1)

In [124]:
pred = kmean.fit_predict(df)

In [125]:
Counter(pred)

Counter({0: 5,
         1: 1434,
         2: 1,
         3: 142,
         4: 1,
         5: 8,
         6: 582,
         7: 18,
         8: 5,
         9: 2,
         10: 1,
         11: 1,
         12: 147,
         13: 1,
         14: 2})

### How many ties?

In [122]:
ties = []
for i,m in enumerate(np.max(X,axis=1)):
    ties.append(len(np.where(X[i]==m)[0]))
Counter(ties)
# print ties

Counter({1: 2349, 18: 1})