In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [2]:
# Load the pickled lyrics frame and renumber its rows from 0, discarding the stored index.
df = pd.read_pickle('../Data/n-gramed_lyrics').reset_index(drop=True)

In [3]:
# max_df=0.95 drops terms that occur in more than 95% of the documents.
tf_idf = TfidfVectorizer(max_df=0.95)
# fit_transform() learns the vocabulary and builds the matrix in one pass;
# the separate fit() that used to precede it only repeated the same work.
tf_idf_array = tf_idf.fit_transform(df.lyrics).toarray()
# fit() returns the vectorizer itself, so this alias keeps the old name usable.
tf_idf_vectorizer = tf_idf
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# fall back to it only on versions that predate get_feature_names_out().
if hasattr(tf_idf, 'get_feature_names_out'):
    tf_idf_terms = tf_idf.get_feature_names_out()
else:
    tf_idf_terms = tf_idf.get_feature_names()
# One row per document, one column per vocabulary term.
tf_idf_df = pd.DataFrame(tf_idf_array, columns=tf_idf_terms)
tf_idf_df.shape

(15340, 22298)

In [4]:
# Factorise the TF-IDF matrix into 7 components; the fixed seed makes the run repeatable.
nmf_model = NMF(
    random_state=42,
    n_components=7,
)
# fit_transform() fits the model and returns the per-document component weights.
nmf = nmf_model.fit_transform(tf_idf_df)

In [5]:
# W: weights returned by fit_transform(); H: the components stored on the fitted model.
W, H = nmf, nmf_model.components_

The W factor contains the document membership weights relative to each of the k topics. Each row corresponds to a single document, and each column corresponds to a topic.

In [6]:
# (number of documents, number of topics)
W.shape

(15340, 7)

The H factor contains the term weights relative to each of the k topics. In this case, each row corresponds to a topic, and each column corresponds to a unique term in the corpus vocabulary.

In [7]:
# (number of topics, number of vocabulary terms)
H.shape

(7, 22298)

In [8]:
def get_descriptor(terms, H, topic_index, top):
    """Return the `top` highest-weighted terms of a single topic.

    Parameters
    ----------
    terms : sequence
        Vocabulary, indexed by the column positions of ``H``.
    H : 2-D array
        Topic-term weights; row ``topic_index`` is the topic to describe.
    topic_index : int
        Row of ``H`` to rank.
    top : int
        Number of leading terms to keep.

    Returns
    -------
    list
        Terms ordered from the largest weight to the smallest.
    """
    topic_weights = H[topic_index, :]
    # argsort is ascending, so flip it to walk from the heaviest term down
    ranked_positions = np.argsort(topic_weights)[::-1]
    return [terms[position] for position in ranked_positions[0:top]]

In [9]:
def list_top_wrods(model, feature_names, n_top_words):
    """Collect the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object
        Anything exposing ``components_``, an iterable of per-topic weight
        arrays (one entry per vocabulary term).
    feature_names : sequence
        Vocabulary, indexed by position in each weight array.
    n_top_words : int
        Number of terms to keep per topic.

    Returns
    -------
    list of list
        One inner list per topic, ordered from the largest weight down.
    """
    return [
        # argsort is ascending; the reversed slice reads the last
        # n_top_words positions from the end, i.e. heaviest term first
        [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        for topic in model.components_
    ]


# Correctly spelled alias; the original (misspelled) name stays so existing callers keep working.
list_top_words = list_top_wrods

In [10]:
def print_top_words(model, feature_names, n_top_words):
    """Print every topic's number followed by its strongest terms.

    For each row of ``model.components_`` a header line ``Topic #<i>:`` is
    written, then one line with the ``n_top_words`` highest-weighted entries
    of ``feature_names`` separated by spaces. A single empty line is printed
    after the last topic.

    Parameters
    ----------
    model : object
        Anything exposing ``components_``, an iterable of per-topic weight arrays.
    feature_names : sequence of str
        Vocabulary, indexed by position in each weight array.
    n_top_words : int
        Number of terms to show per topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        # ascending argsort read backwards from the end: heaviest term first
        strongest = weights.argsort()[:-n_top_words - 1:-1]
        words = [feature_names[position] for position in strongest]
        print(" ".join(words))

    print()

In [11]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when the installed version provides it.
if hasattr(tf_idf, 'get_feature_names_out'):
    vocabulary_terms = tf_idf.get_feature_names_out()
else:
    vocabulary_terms = tf_idf.get_feature_names()
# Show the 10 strongest terms of each NMF topic.
print_top_words(nmf_model, vocabulary_terms, 10)

Topic #0:
like time never one see got could get day verse
Topic #1:
oh yeah ah whoa heart verse uh god hey woah
Topic #2:
love baby heart verse give fall like sing enough cry
Topic #3:
ooh baby ah need verse yeah oooh pre wanna whoa
Topic #4:
know want wanna say tell need think cause really see
Topic #5:
la da wanna di yeah hey sha alright got ha
Topic #6:
go let come back away take never home wanna stay



In [12]:
# Attach the per-document topic weights as extra columns (labelled 0-6), matching rows by index.
# NOTE(review): the output file is called 'PCA_features' although the weights come from NMF;
# confirm what reads this file before renaming it.
topic_weights = pd.DataFrame(nmf)
df = df.merge(
    topic_weights,
    left_index=True,
    right_index=True,
)
df.to_pickle('../Data/PCA_features')
df.head(2)

Unnamed: 0,spotify_album_uri,spotify_artist_id,artist_name,spotify_artist_uri,duration_ms,explicit,spotify_song_id,song_title,song_spotify_page,track_number,...,genius_song_id,genius_song_url,genius_artist_id,0,1,2,3,4,5,6
0,spotify:album:2h6MSR1rWemHOtmurgRq3T,7kAKqNxPBkfjgdHzUIdBtI,suburban kids with biblical names,spotify:artist:7kAKqNxPBkfjgdHzUIdBtI,138626,False,4okEZakOVppAtP4Dawd52x,marry me,https://open.spotify.com/track/4okEZakOVppAtP4...,1,...,861607,https://genius.com/Suburban-kids-with-biblical...,353411,0.018357,0.0,0.0074,0.000625,0.005167,0.000298,0.004434
1,spotify:album:2h6MSR1rWemHOtmurgRq3T,7kAKqNxPBkfjgdHzUIdBtI,suburban kids with biblical names,spotify:artist:7kAKqNxPBkfjgdHzUIdBtI,187106,False,2LV6sB5zTsu0R5r5kWohlD,loop duplicate my heart,https://open.spotify.com/track/2LV6sB5zTsu0R5r...,2,...,980120,https://genius.com/Suburban-kids-with-biblical...,353411,0.037431,0.000505,0.0,0.002672,0.010756,0.002123,0.006457
