## Purpose 
The first model showed that a TF-IDF vectorizer combined with NMF decomposition produced the topics that made the most sense. 
This notebook re-runs only that pipeline, and expands the analysis, for the uk_pops data.


In [1]:
import pickle
import pandas as pd

import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

from sklearn.feature_extraction import text

In [2]:
# Load the pickled raw lyrics table produced by the web scraper.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file --
# only load pickles that were written by our own scraper.
# The `with` block guarantees the file handle is closed even if unpickling fails.
with open('../../Webscraper/uk_pops_raw', 'rb') as file:
    uk_pops_raw = pickle.load(file)

# Drop rows with any missing value and renumber the index from 0 so that
# later index-based joins against freshly built frames line up row-for-row.
uk_pops_raw = uk_pops_raw.dropna().reset_index(drop=True)
uk_pops = uk_pops_raw['Lyrics']

# Text normalisation helpers, applied to every lyric string in this order:
#   alphanumeric -> rm_newline -> punc_lower
# The patterns are compiled once up front instead of being rebuilt on every call,
# and the digit pattern is a raw string so "\w" / "\d" are not treated as
# (invalid) Python string escapes.
_DIGIT_TOKEN_PATTERN = re.compile(r'\w*\d\w*')
_PUNCTUATION_PATTERN = re.compile('[%s]' % re.escape(string.punctuation))


def alphanumeric(x):
    """Replace every token that contains a digit (e.g. "2nite", "100") with a space."""
    return _DIGIT_TOKEN_PATTERN.sub(' ', x)


def rm_newline(x):
    """Replace each newline character with a space."""
    return x.replace("\n", ' ')


def punc_lower(x):
    """Lowercase the text and replace each string.punctuation character with a space."""
    return _PUNCTUATION_PATTERN.sub(' ', x.lower())


uk_pops_nopunc = uk_pops.map(alphanumeric).map(rm_newline).map(punc_lower)

In [3]:
# Lemmatize: turn every lyric into a list of words here, then lemmatize each word
# and concatenate back into one string per song.
# str.split() with no argument splits on any run of whitespace, so the repeated
# spaces left behind by the digit/punctuation replacements produce no empty
# tokens, and stray tabs / carriage returns do not stay glued to a word (which
# would stop the lemmatizer from recognising it).
uk_pops_list = uk_pops_nopunc.apply(lambda x: x.split())

def lemmatize(list_of_words):
    """Lemmatize a list of words and rebuild them into one string.

    Each word is passed through a WordNetLemmatizer. Lemmas of two characters
    or fewer are discarded. Every kept lemma is followed by a single space, so
    a non-empty result ends with a trailing space; an input with no surviving
    lemmas yields the empty string.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = (lemmatizer.lemmatize(token) for token in list_of_words)
    return ''.join(lemma + ' ' for lemma in lemmas if len(lemma) > 2)

# One cleaned, lemmatized string per song (element-wise over the token lists).
uk_pops_clean = uk_pops_list.map(lemmatize)

In [10]:
# Extend scikit-learn's built-in English stop words with lyric ad-libs / filler
# sounds that would otherwise dominate topics without carrying meaning.
stop_words_added = text.ENGLISH_STOP_WORDS.union(
    ['huh', 'woo', 'whoa', 'hey', 'hold', 'ooh', 'aah', 'oooh', 'ayy', 'hoo']
)

tfidf = TfidfVectorizer(
    # Recent scikit-learn releases validate this parameter and accept only
    # 'english', a list, or None -- a frozenset is rejected, so pass a list.
    stop_words=sorted(stop_words_added),
    max_df=0.2,    # ignore terms that appear in more than 20% of songs
    min_df=0.001,  # ignore terms that appear in fewer than 0.1% of songs
)
Y = tfidf.fit_transform(uk_pops_clean)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older installations.
if hasattr(tfidf, 'get_feature_names_out'):
    term_names = tfidf.get_feature_names_out()
else:
    term_names = tfidf.get_feature_names()

# Dense songs-by-terms TF-IDF matrix with the vocabulary as column labels.
doc_term_matrix = pd.DataFrame(Y.toarray(), columns=term_names)

In [11]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``, iterated one row per topic.
    feature_names : sequence mapping a column index of ``components_`` to its term.
    no_top_words : number of top terms to print for each topic.
    topic_names : optional sequence of labels; a missing or falsy label falls
        back to printing the topic's positional index.
    """
    for ix, weights in enumerate(model.components_):
        label = topic_names[ix] if topic_names else None
        if label:
            print("\nTopic: '", label, "'")
        else:
            print("\nTopic ", ix)

        # argsort is ascending, so reverse it and keep the leading entries.
        ranked = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[i] for i in ranked]
        print(", ".join(top_terms))

In [18]:
# Factorise the TF-IDF matrix into 7 topics. random_state is pinned so the
# topics (and their numbering) are reproducible across kernel restarts;
# without it the decomposition can differ from run to run.
nmf_model = NMF(n_components=7, random_state=42)
doc_topic = nmf_model.fit_transform(doc_term_matrix)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older installations.
if hasattr(tfidf, 'get_feature_names_out'):
    topic_terms = tfidf.get_feature_names_out()
else:
    topic_terms = tfidf.get_feature_names()

display_topics(nmf_model, topic_terms, 15)


Topic  0
really, did, long, dream, thought, believe, gone, word, face, stay, live, better, change, fall, wrong

Topic  1
tonight, stay, tomorrow, alright, dancing, hand, going, fight, promise, feelin, kiss, fine, chance, party, tight

Topic  2
dance, everybody, body, party, stop, rock, music, beat, floor, shake, hand, alright, club, rhythm, gon

Topic  3
little, bit, bad, work, crazy, maybe, late, broken, new, fall, conversation, middle, hot, fun, looking

Topic  4
boy, gotta, bad, bring, better, kiss, walk, feeling, talk, crazy, head, superstar, sweet, nah, really

Topic  5
touch, kiss, high, fly, sky, body, doo, crazy, waiting, real, anybody, head, feeling, close, warmth

Topic  6
coming, turn, home, run, light, gone, shine, sun, til, babe, carry, leave, song, running, waiting


In [19]:
# Song-by-topic weight table. Column names are derived from the number of
# topics actually fitted, so this cell keeps working when the NMF topic count
# is changed. Columns are 1-based ('topic1' ...), whereas display_topics prints
# 0-based indices -- 'topicK' here is "Topic K-1" in the printed topic lists.
topic_columns = ['topic{}'.format(i) for i in range(1, doc_topic.shape[1] + 1)]
doc_topic_mat = pd.DataFrame(
    doc_topic.round(3),
    columns=topic_columns
)

# re-add Song/Artist (index-aligned join; uk_pops_raw has a fresh 0..n-1 index)
doc_topic_mat = doc_topic_mat.join(uk_pops_raw[['Song','Artist']])

In [17]:
# The 20 songs with the largest weight on the 'topic5' column, strongest first.
doc_topic_mat.sort_values(by='topic5', ascending=False).iloc[:20]

Unnamed: 0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9,topic10,Song,Artist
1460,0.0,0.0,0.0,0.0,0.482,0.0,0.0,0.0,0.0,0.0,Boys Will Be Boys,The Ordinary Boys
1309,0.0,0.0,0.0,0.024,0.413,0.0,0.0,0.0,0.049,0.0,See It In A Boy's Eyes,Jamelia
1138,0.0,0.0,0.0,0.0,0.408,0.16,0.0,0.0,0.0,0.0,Down Boy,Holly Valance
1152,0.0,0.001,0.01,0.006,0.397,0.0,0.0,0.0,0.0,0.0,Hey Baby,No Doubt
1781,0.0,0.086,0.0,0.017,0.379,0.0,0.0,0.0,0.0,0.0,Rude Boy,Rihanna
1188,0.012,0.0,0.0,0.0,0.368,0.007,0.055,0.004,0.009,0.0,Hole In The Head,Sugababes
715,0.0,0.0,0.0,0.0,0.357,0.0,0.0,0.0,0.0,0.0,The Boy Is Mine,Brandy & Monica
1718,0.0,0.0,0.0,0.011,0.355,0.0,0.0,0.0,0.0,0.0,BAD BOYS,ALEXANDRA BURKE FT FLO RIDA
1245,0.0,0.0,0.0,0.0,0.35,0.0,0.0,0.0,0.0,0.0,Call On Me,Eric Prydz
1653,0.0,0.008,0.0,0.0,0.314,0.0,0.0,0.002,0.0,0.082,Pumpkin Soup,Kate Nash


In [101]:
# topic_word = pd.DataFrame(
#     nmf_model.components_.round(3),
#     columns = tfidf.get_feature_names()
#     )
# topic_word