In [14]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [15]:
# Load the lyrics DataFrame from a pickle file; its `lyrics` column is what the
# TF-IDF vectorizer consumes later in this notebook.
# NOTE(review): unpickling can execute arbitrary code -- only load files that
# were produced by this project.
df = pd.read_pickle('../Data/n-gramed_lyrics')

In [16]:
# Build the TF-IDF document-term matrix for the lyrics corpus.
# max_df=0.95 ignores terms that appear in more than 95% of the documents.
tf_idf = TfidfVectorizer(max_df=0.95)

# fit_transform() learns the vocabulary and vectorises in a single pass; a
# separate fit() beforehand only repeats the same work.
# NOTE(review): .toarray() densifies the matrix (roughly 2.3 GB as float64 for
# the 18702 x 15576 shape shown below); NMF also accepts the sparse matrix.
tf_idf_array = tf_idf.fit_transform(df.lyrics).toarray()

# fit() returns the estimator itself, so this name is simply the fitted tf_idf.
tf_idf_vectorizer = tf_idf

# scikit-learn >= 1.0 provides get_feature_names_out(); get_feature_names() was
# removed in 1.2. Use whichever the installed version offers.
if hasattr(tf_idf, 'get_feature_names_out'):
    tf_idf_terms = tf_idf.get_feature_names_out()
else:
    tf_idf_terms = tf_idf.get_feature_names()

tf_idf_df = pd.DataFrame(tf_idf_array, columns=tf_idf_terms)
tf_idf_df.shape

(18702, 15576)

In [17]:
# Factorise the TF-IDF matrix into 7 topics; random_state is pinned so the
# factorisation is repeatable between runs.
nmf_model = NMF(n_components=7, random_state=42)
# fit_transform() returns the document-topic weights (one row per document).
nmf = nmf_model.fit_transform(tf_idf_df)

In [18]:
# Give the two NMF factors the W / H names used in the notes that follow.
W = nmf  # document-topic weights returned by fit_transform
H = nmf_model.components_  # topic-term weights stored on the fitted model

The W factor contains the document membership weights relative to each of the k topics. Each row corresponds to a single document, and each column corresponds to a topic.

In [19]:
# Sanity check: (number of documents, number of topics).
W.shape

(18702, 7)

The H factor contains the term weights relative to each of the k topics. In this case, each row corresponds to a topic, and each column corresponds to a unique term in the corpus vocabulary.

In [20]:
# Sanity check: (number of topics, vocabulary size).
H.shape

(7, 15576)

In [21]:
def get_descriptor(terms, H, topic_index, top):
    """Return the `top` highest-weighted terms of one topic.

    Parameters
    ----------
    terms : sequence
        Vocabulary terms, indexable by column position of `H`.
    H : numpy.ndarray
        2-D topic-term weight matrix (one row per topic).
    topic_index : int
        Row of `H` to describe.
    top : int
        Number of terms to return.

    Returns
    -------
    list
        Terms ordered from the largest weight to the smallest.
    """
    # Use the array's own argsort() rather than np.argsort: this notebook never
    # imports numpy, so the module-level call raised NameError.
    # Ascending sort reversed -> column indices from heaviest to lightest.
    ranked_indices = H[topic_index, :].argsort()[::-1]
    return [terms[term_index] for term_index in ranked_indices[:top]]

In [22]:
def list_top_wrods(model, feature_names, n_top_words):
    """Collect the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object
        Fitted decomposition model exposing `components_`, iterated one row
        (topic) at a time.
    feature_names : sequence
        Vocabulary terms, indexable by column position of `components_`.
    n_top_words : int
        Number of terms to keep per topic.

    Returns
    -------
    list of list
        One list per topic, terms ordered from largest to smallest weight.
    """
    top_words = []
    for topic in model.components_:
        # argsort() is ascending; the negative-step slice walks back from the
        # end to take the n_top_words largest weights, heaviest first.
        ranked_indices = topic.argsort()[:-n_top_words - 1:-1]
        top_words.append([feature_names[i] for i in ranked_indices])
    return top_words


# Correctly spelled alias; the original misspelled name stays for existing callers.
list_top_words = list_top_wrods

In [23]:
def print_top_words(model, feature_names, n_top_words):
    """Print each topic's header followed by its top terms on one line.

    For every row of `model.components_` a "Topic #<index>:" line is printed,
    then the `n_top_words` highest-weighted entries of `feature_names` joined
    by spaces. A single blank line is printed after the last topic.
    """
    topic_idx = 0
    for weights in model.components_:
        print("Topic #%d:" % topic_idx)
        # Ascending argsort, sliced backwards from the end: heaviest terms first.
        ranked = weights.argsort()[:-n_top_words - 1:-1]
        top_terms = []
        for term_idx in ranked:
            top_terms.append(feature_names[term_idx])
        print(" ".join(top_terms))
        topic_idx += 1

    print()

In [24]:
# Show the 10 heaviest terms per topic. The columns of tf_idf_df are the
# vectorizer's vocabulary in matrix order, so reuse them rather than calling
# tf_idf.get_feature_names(), which was removed in scikit-learn 1.2.
print_top_words(nmf_model, tf_idf_df.columns, 10)

Topic #0:
like come time chorus see feel one get never vers
Topic #1:
oh yeah ah chorus whoa heart hey uh vers like
Topic #2:
love babi chorus heart fall vers full like give sing
Topic #3:
know say think need tell feel care well chorus alway
Topic #4:
ooh babi chorus ah la vers need pre oooh yeah
Topic #5:
go let away home take back never come gonna wanna
Topic #6:
want wanna say need tell realli chorus think make caus



In [25]:
# Attach the per-document topic weights to the lyrics frame and save the result.
# Row i of `nmf` belongs to row i of `df`, so give the weights df's own index;
# a default 0..n-1 index would drop or mispair rows if df's index is anything
# other than a clean RangeIndex. A length mismatch raises here instead.
topic_weights = pd.DataFrame(nmf, index=df.index)

# Drop topic columns left behind by an earlier run of this cell, so re-running
# it does not pile up duplicates. Column labels stay the integers 0..6.
df = pd.concat(
    [df.drop(columns=topic_weights.columns, errors='ignore'), topic_weights],
    axis=1,
)
df.to_pickle('../Data/topic_model_features')
df.head(2)

Unnamed: 0,spotify_album_uri,spotify_artist_id,artist_name,spotify_artist_uri,duration_ms,explicit,spotify_song_id,song_title,song_spotify_page,track_number,...,geenius_song_url,genius_artist_id,lyrics_language,0,1,2,3,4,5,6
0,spotify:album:2h6MSR1rWemHOtmurgRq3T,7kAKqNxPBkfjgdHzUIdBtI,suburban kids with biblical names,spotify:artist:7kAKqNxPBkfjgdHzUIdBtI,138626,False,4okEZakOVppAtP4Dawd52x,marry me,https://open.spotify.com/track/4okEZakOVppAtP4...,1,...,https://genius.com/Suburban-kids-with-biblical...,353411,[en:0.9999975562687033],0.01745,0.0,0.007167,0.0,1.9e-05,0.004037,0.011575
1,spotify:album:2h6MSR1rWemHOtmurgRq3T,7kAKqNxPBkfjgdHzUIdBtI,suburban kids with biblical names,spotify:artist:7kAKqNxPBkfjgdHzUIdBtI,187106,False,2LV6sB5zTsu0R5r5kWohlD,loop duplicate my heart,https://open.spotify.com/track/2LV6sB5zTsu0R5r...,2,...,https://genius.com/Suburban-kids-with-biblical...,353411,[en:0.999998823659918],0.032805,0.000615,0.0,0.004371,0.002723,0.015593,0.013163


In [26]:
# List every column to confirm the integer-named topic columns were appended.
# NOTE(review): the recorded output shows 'song_title' twice -- confirm whether
# the duplicate column is intended.
df.columns.tolist()

['spotify_album_uri',
 'spotify_artist_id',
 'artist_name',
 'spotify_artist_uri',
 'duration_ms',
 'explicit',
 'spotify_song_id',
 'song_title',
 'song_spotify_page',
 'track_number',
 'spotify_song_uri',
 'song_title',
 'Unaltered_artist_name',
 'lyrics',
 'genius_song_id',
 'geenius_song_url',
 'genius_artist_id',
 'lyrics_language',
 0,
 1,
 2,
 3,
 4,
 5,
 6]