In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [5]:
# Load the pickled lyrics table (presumably the n-grammed lyrics, going by the
# file name) and renumber its rows 0..n-1, discarding the stored index.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
df = pd.read_pickle('../Data/n-gramed_lyrics').reset_index(drop=True)

In [13]:
# Bag-of-words counts over the lyrics; max_df=0.95 drops terms that appear in
# more than 95% of the documents.
cv = CountVectorizer(max_df=0.95)
cv_vectorizer = cv.fit_transform(df.lyrics)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older installs.
if hasattr(cv, "get_feature_names_out"):
    cv_terms = cv.get_feature_names_out()
else:
    cv_terms = cv.get_feature_names()

# NOTE: .toarray() materialises the full document-term matrix as a dense array
# (15340 x 22298 in the recorded run), which is memory-hungry.
cv_df = pd.DataFrame(cv_vectorizer.toarray(), columns=cv_terms)
cv_df.shape

(15340, 22298)

In [15]:
# Factorise the document-term counts into 5 topics. random_state pins the seed
# used by the solver.
nmf_model = NMF(n_components=5, random_state=42)
# fit_transform returns the per-document topic weights (used as W below);
# the per-topic term weights stay on the model as components_.
nmf = nmf_model.fit_transform(cv_df)

In [16]:
# Conventional NMF names: W = document-topic weights, H = topic-term weights.
W, H = nmf, nmf_model.components_

The W factor contains the document membership weights relative to each of the k topics. Each row corresponds to a single document, and each column corresponds to a topic.

In [17]:
# (number of documents, number of topics)
W.shape

(15340, 5)

The H factor contains the term weights relative to each of the k topics. In this case, each row corresponds to a topic, and each column corresponds to a unique term in the corpus vocabulary.

In [18]:
# (number of topics, vocabulary size)
H.shape

(5, 22298)

In [19]:
def get_descriptor(terms, H, topic_index, top):
    """Return the highest-weighted terms of a single topic.

    Parameters
    ----------
    terms : sequence
        Vocabulary; ``terms[j]`` is the label of column ``j`` of ``H``.
    H : array indexed as ``H[topic, term]``
        Term weights per topic.
    topic_index : int
        Row of ``H`` to describe.
    top : int
        Number of terms to return.

    Returns
    -------
    list
        Terms ordered from the largest weight to the smallest.
    """
    # argsort is ascending, so flip it to rank terms by descending weight.
    ranked = np.argsort(H[topic_index, :])[::-1]
    return [terms[j] for j in ranked[:top]]

In [20]:
def list_top_wrods(model, feature_names, n_top_words):
    """Collect the top-weighted words of every topic in a fitted model.

    The function name is misspelled ("wrods"); it is kept as-is so existing
    callers keep working.

    Parameters
    ----------
    model : object with a ``components_`` attribute
        Iterating ``components_`` must yield one weight array per topic.
    feature_names : sequence
        ``feature_names[i]`` is the word for weight index ``i``.
    n_top_words : int
        Number of words to keep per topic.

    Returns
    -------
    list of list
        One list per topic, words ordered by descending weight.
    """
    per_topic = []
    for weights in model.components_:
        # Descending rank of the weight indices, truncated to the requested count.
        best = weights.argsort()[::-1][:n_top_words]
        per_topic.append([feature_names[idx] for idx in best])
    return per_topic

In [21]:
def print_top_words(model, feature_names, n_top_words):
    """Print the top-weighted words of every topic, one topic per pair of lines.

    For each row of ``model.components_`` a ``Topic #<index>:`` header is
    printed, followed by the ``n_top_words`` highest-weighted entries of
    ``feature_names`` joined by spaces. A single blank line is printed after
    the last topic.

    Parameters
    ----------
    model : object with a ``components_`` attribute
        Iterating ``components_`` must yield one weight array per topic.
    feature_names : sequence
        ``feature_names[i]`` is the word for weight index ``i``.
    n_top_words : int
        Number of words to show per topic.
    """
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        # Indices of the largest weights first.
        ranked = topic.argsort()[::-1][:n_top_words]
        words = [feature_names[i] for i in ranked]
        print(" ".join(words))

    # Trailing blank line once all topics have been listed.
    print()

In [23]:
# cv_df's columns already hold the vectorizer vocabulary in column order, so
# reuse them instead of calling cv.get_feature_names(), which was removed in
# scikit-learn 1.2.
print_top_words(nmf_model, list(cv_df.columns), 30)

Topic #0:
know go like get want never verse got time come one let see feel say back take way make could cause away day heart yeah thing gonna need wanna life
Topic #1:
oh ah yeah whoa verse body let move pretty night like heart got tonight life hey find na pre uh hand dream woah emotional come something song bridge rhythm god
Topic #2:
ooh ah want help woo verse way make suzanne ooooh back give need baby get tonight go come yeah pre little maybe long going could home dream girl gonna outro
Topic #3:
love baby verse say heart like one give way want fall gimmie never little still cause make time gonna hey need tell wanna much find keep got let think said
Topic #4:
la da life day everything like much hey high lie away dear feel lost riding verse could look miss fire need singing wanna ha little say lisa come fall wishing



In [24]:
# Per-document topic weights as a frame; its columns are the integer topic ids.
topic_weights = pd.DataFrame(nmf)

# Drop topic columns left over from an earlier run of this cell, so re-running
# it does not accumulate suffixed duplicates (0_x, 0_y, ...) in the saved file.
# On a first run there is nothing to drop and the merge is unchanged.
df = (
    df.drop(columns=list(topic_weights.columns), errors='ignore')
      .merge(topic_weights, left_index=True, right_index=True)
)

# NOTE(review): the file name says "PCA" but the features written here come
# from NMF; the path is kept because other code may read it.
df.to_pickle('../Data/PCA_features_CV')
df.head(2)

Unnamed: 0,spotify_album_uri,spotify_artist_id,artist_name,spotify_artist_uri,duration_ms,explicit,spotify_song_id,song_title,song_spotify_page,track_number,...,unaltered_artist_name,lyrics,genius_song_id,genius_song_url,genius_artist_id,0,1,2,3,4
0,spotify:album:2h6MSR1rWemHOtmurgRq3T,7kAKqNxPBkfjgdHzUIdBtI,suburban kids with biblical names,spotify:artist:7kAKqNxPBkfjgdHzUIdBtI,138626,False,4okEZakOVppAtP4Dawd52x,marry me,https://open.spotify.com/track/4okEZakOVppAtP4...,1,...,Suburban Kids With Biblical Names,old chance get gonna marry marry get act toget...,861607,https://genius.com/Suburban-kids-with-biblical...,353411,0.094388,0.0,0.003197,0.047663,0.0
1,spotify:album:2h6MSR1rWemHOtmurgRq3T,7kAKqNxPBkfjgdHzUIdBtI,suburban kids with biblical names,spotify:artist:7kAKqNxPBkfjgdHzUIdBtI,187106,False,2LV6sB5zTsu0R5r5kWohlD,loop duplicate my heart,https://open.spotify.com/track/2LV6sB5zTsu0R5r...,2,...,Suburban Kids With Biblical Names,bigger everything ever done x found reason sta...,980120,https://genius.com/Suburban-kids-with-biblical...,353411,0.116763,0.00631,0.003715,0.006283,0.004452
