In [2]:
from pprint import pprint
import re
import numpy as np
import pandas as pd
import spacy

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# NLTK Stop words
import nltk
from nltk.corpus import stopwords

In [26]:
# Source CSV and how many leading rows of it to use for this experiment.
# NOTE(review): this is an absolute, machine-specific path; point it at a
# location relative to the project before sharing the notebook.
LYRICS_CSV = '/Users/kaimiddlebrook/Downloads/lyrics.csv'
N_ROWS = 10000

# Read only the first N_ROWS rows instead of parsing the whole file and
# discarding the rest, then drop every row that has a missing value.
songs = pd.read_csv(LYRICS_CSV, nrows=N_ROWS).dropna()
print(songs.shape)
songs.head()

(6988, 6)


Unnamed: 0,index,song,year,artist,genre,lyrics
0,0,ego-remix,2009,beyonce-knowles,Pop,"Oh baby, how you doing?\nYou know I'm gonna cu..."
1,1,then-tell-me,2009,beyonce-knowles,Pop,"playin' everything so easy,\nit's like you see..."
2,2,honesty,2009,beyonce-knowles,Pop,If you search\nFor tenderness\nIt isn't hard t...
3,3,you-are-my-rock,2009,beyonce-knowles,Pop,"Oh oh oh I, oh oh oh I\n[Verse 1:]\nIf I wrote..."
4,4,black-culture,2009,beyonce-knowles,Pop,"Party the people, the people the party it's po..."


In [27]:
# Distinct genre labels present in the sampled songs.
songs["genre"].unique()

array(['Pop', 'Hip-Hop', 'Not Available', 'Rock', 'Metal', 'Other',
       'Country', 'Jazz', 'Electronic', 'Folk', 'R&B', 'Indie'],
      dtype=object)

In [28]:
# English stop-word list from NLTK (needs the 'stopwords' corpus, e.g. via nltk.download('stopwords')).
stop_words = stopwords.words('english')

In [29]:
# Normalize every lyric in one pass: collapse each run of whitespace
# (including newlines) into a single space, then delete single quotes.
lyrics = [
    re.sub("\'", "", re.sub(r'\s+', ' ', text))
    for text in songs["lyrics"].values.tolist()
]
pprint(lyrics[:1])

['Oh baby, how you doing? You know Im gonna cut right to the chase Some women '
 'were made but me, myself I like to think that I was created for a special '
 'purpose You know, whats more special than you? You feel me Its on baby, lets '
 'get lost You dont need to call into work cause youre the boss For real, want '
 'you to show me how you feel I consider myself lucky, thats a big deal Why? '
 'Well, you got the key to my heart But you aint gonna need it, Id rather you '
 'open up my body And show me secrets, you didnt know was inside No need for '
 'me to lie Its too big, its too wide Its too strong, it wont fit Its too '
 'much, its too tough He talk like this cause he can back it up He got a big '
 'ego, such a huge ego I love his big ego, its too much He walk like this '
 'cause he can back it up Usually Im humble, right now I dont choose You can '
 'leave with me or you could have the blues Some call it arrogant, I call it '
 'confident You decide when you find on what Im worki

# Tokenize Words and Clean Up Text

In [30]:
def sent_to_words(sentences):
    """Lazily tokenize each document into a list of lowercase word tokens.

    Every item is coerced with ``str`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``; one token list is
    yielded per input item, in input order.
    """
    # deacc=True strips accent marks from tokens; punctuation is already
    # discarded by simple_preprocess's own tokenization.
    yield from (
        gensim.utils.simple_preprocess(str(doc), deacc=True)
        for doc in sentences
    )

# Materialize the token generator: one list of tokens per lyric.
data_words = [*sent_to_words(lyrics)]
print(data_words[:1])


[['oh', 'baby', 'how', 'you', 'doing', 'you', 'know', 'im', 'gonna', 'cut', 'right', 'to', 'the', 'chase', 'some', 'women', 'were', 'made', 'but', 'me', 'myself', 'like', 'to', 'think', 'that', 'was', 'created', 'for', 'special', 'purpose', 'you', 'know', 'whats', 'more', 'special', 'than', 'you', 'you', 'feel', 'me', 'its', 'on', 'baby', 'lets', 'get', 'lost', 'you', 'dont', 'need', 'to', 'call', 'into', 'work', 'cause', 'youre', 'the', 'boss', 'for', 'real', 'want', 'you', 'to', 'show', 'me', 'how', 'you', 'feel', 'consider', 'myself', 'lucky', 'thats', 'big', 'deal', 'why', 'well', 'you', 'got', 'the', 'key', 'to', 'my', 'heart', 'but', 'you', 'aint', 'gonna', 'need', 'it', 'id', 'rather', 'you', 'open', 'up', 'my', 'body', 'and', 'show', 'me', 'secrets', 'you', 'didnt', 'know', 'was', 'inside', 'no', 'need', 'for', 'me', 'to', 'lie', 'its', 'too', 'big', 'its', 'too', 'wide', 'its', 'too', 'strong', 'it', 'wont', 'fit', 'its', 'too', 'much', 'its', 'too', 'tough', 'he', 'talk', 'li

# Creating Bigram and Trigram Models

In [31]:
# Train the phrase detectors. The trigram detector is trained on the
# bigram-merged corpus. A higher threshold yields fewer phrases.
bigram = gensim.models.Phrases(sentences=data_words, min_count=5, threshold=100)
trigram = gensim.models.Phrases(sentences=bigram[data_words], threshold=100)

# Wrap each trained detector in a Phraser, a faster way to merge phrases in a document.
bigram_mod, trigram_mod = (
    gensim.models.phrases.Phraser(detector) for detector in (bigram, trigram)
)

# Show the first document after bigram and then trigram merging.
print(trigram_mod[bigram_mod[data_words[0]]])

['oh', 'baby', 'how', 'you', 'doing', 'you', 'know', 'im', 'gonna', 'cut', 'right', 'to', 'the', 'chase', 'some', 'women', 'were', 'made', 'but', 'me', 'myself', 'like', 'to', 'think', 'that', 'was', 'created', 'for', 'special', 'purpose', 'you', 'know', 'whats', 'more', 'special', 'than', 'you', 'you', 'feel', 'me', 'its', 'on', 'baby', 'lets', 'get', 'lost', 'you', 'dont', 'need', 'to', 'call', 'into', 'work', 'cause', 'youre', 'the', 'boss', 'for', 'real', 'want', 'you', 'to', 'show', 'me', 'how', 'you', 'feel', 'consider', 'myself', 'lucky', 'thats', 'big_deal', 'why', 'well', 'you', 'got', 'the', 'key', 'to', 'my', 'heart', 'but', 'you', 'aint', 'gonna', 'need', 'it', 'id_rather', 'you', 'open', 'up', 'my', 'body', 'and', 'show', 'me', 'secrets', 'you', 'didnt', 'know', 'was', 'inside', 'no', 'need', 'for', 'me', 'to', 'lie', 'its', 'too', 'big', 'its', 'too', 'wide', 'its', 'too', 'strong', 'it', 'wont', 'fit', 'its', 'too', 'much', 'its', 'too', 'tough', 'he', 'talk', 'like', 't

# Make Bigrams and Lemmatize

In [32]:
def make_bigrams(texts):
    """Return a new list with ``bigram_mod`` applied to every document in ``texts``."""
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

def make_trigrams(texts):
    """Return a new list with ``bigram_mod`` then ``trigram_mod`` applied to every document."""
    merged = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged.append(trigram_mod[with_bigrams])
    return merged

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only tokens with an allowed POS tag.

    Parameters
    ----------
    texts : iterable of list of str
        Tokenized documents. Each one is re-joined with single spaces before
        being handed to the spaCy pipeline.
    allowed_postags : collection of str
        ``token.pos_`` values to keep; tokens with any other tag are dropped.

    Returns
    -------
    list of list of str
        One list of ``token.lemma_`` strings per input document, in input order.

    Notes
    -----
    Uses the module-level spaCy pipeline ``nlp``, which must be loaded before
    this function is called.
    """
    # nlp.pipe processes the texts as a stream and yields the Docs in input
    # order, avoiding one separate pipeline call per document.
    docs = nlp.pipe(" ".join(sent) for sent in texts)
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in docs
    ]

In [33]:
# Form Bigrams
# NOTE(review): `stop_words` is built in an earlier cell but is not applied
# anywhere in the visible pipeline before this point — confirm whether
# stop-word removal was meant to happen here.
data_words_bigrams = make_bigrams(data_words)

# Load the small English spaCy pipeline with the parser and NER components
# disabled (for efficiency).
# The 'en' shortcut name was removed in spaCy v3, so the model is loaded by
# its full package name, which also works on spaCy v2.
# Install with: python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['baby', 'how', 'do', 'know', 'be', 'go', 'cut', 'right', 'chase', 'woman', 'be', 'make', 'like', 'think', 'be', 'create', 'special', 'purpose', 'know', 's', 'more', 'special', 'feel', 'baby', 'let', 'get', 'lose', 'do', 'not', 'need', 'call', 'work', 'be', 'boss', 'real', 'want', 'show', 'how', 'feel', 'consider', 'lucky', 's', 'big_deal', 'why', 'get', 'key', 'heart', 'be', 'not', 'go', 'need', 'open', 'body', 'show', 'secret', 'do', 'not', 'know', 'be', 'need', 'lie', 'too', 'big', 'too', 'wide', 'too', 'strong', 'not', 'fit', 'too', 'much', 'too', 'tough', 'talk', 'can', 'back', 'get', 'big_ego', 'such_huge', 'ego', 'love', 'too', 'much', 'walk', 'can', 'back', 'usually', 'be', 'humble', 'right', 'now', 'do', 'not', 'choose', 'can', 'leave', 'could', 'have', 'blue', 'call', 'arrogant', 'call', 'confident', 'decide', 'when', 'find', 'be', 'work', 'damn', 'know', 'be', 'kill', 'leg', 'better', 'yet', 'thigh', 'matter_fact', 'smile', 'maybe', 'eye', 'boy', 'site', 'see', 'kind', 'som

# Create the Dictionary and Corpus needed for Topic Modeling

In [34]:
# Map every distinct token in the lemmatized corpus to an integer id.
id2word = corpora.Dictionary(data_lemmatized)

# The lemmatized documents are the texts the corpus is built from.
texts = data_lemmatized

# Bag-of-words encoding: each document becomes a list of (token_id, count) pairs.
corpus = list(map(id2word.doc2bow, texts))

# First document as (token_id, count) pairs, then the token stored under id 0.
print(corpus[:1])
print(id2word[0])

[[(0, 1), (1, 1), (2, 2), (3, 11), (4, 11), (5, 1), (6, 1), (7, 4), (8, 1), (9, 3), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 3), (16, 13), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 5), (27, 6), (28, 1), (29, 3), (30, 1), (31, 3), (32, 6), (33, 2), (34, 1), (35, 1), (36, 2), (37, 1), (38, 1), (39, 1), (40, 1), (41, 5), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 3), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 6), (55, 1), (56, 4), (57, 9), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 2), (65, 2), (66, 1), (67, 1), (68, 2), (69, 1), (70, 1), (71, 1), (72, 2), (73, 1), (74, 2), (75, 4), (76, 4), (77, 4), (78, 1), (79, 1), (80, 18), (81, 3), (82, 1), (83, 5), (84, 1), (85, 1), (86, 1), (87, 3), (88, 1), (89, 2), (90, 1)]]
admit


In [35]:
# Readable view of the first document: swap each token id for its token.
[
    [(id2word[token_id], count) for token_id, count in doc_bow]
    for doc_bow in corpus[:1]
]

[[('admit', 1),
  ('arrogant', 1),
  ('baby', 2),
  ('back', 11),
  ('be', 11),
  ('beat', 1),
  ('better', 1),
  ('big', 4),
  ('big_deal', 1),
  ('big_ego', 3),
  ('bitch', 1),
  ('blue', 1),
  ('body', 1),
  ('boss', 1),
  ('boy', 1),
  ('call', 3),
  ('can', 13),
  ('chase', 1),
  ('choose', 1),
  ('confident', 1),
  ('consider', 1),
  ('could', 1),
  ('create', 1),
  ('cut', 1),
  ('damn', 1),
  ('decide', 1),
  ('do', 5),
  ('ego', 6),
  ('eye', 1),
  ('feel', 3),
  ('find', 1),
  ('fit', 3),
  ('get', 6),
  ('go', 2),
  ('have', 1),
  ('heart', 1),
  ('how', 2),
  ('humble', 1),
  ('key', 1),
  ('kill', 1),
  ('kind', 1),
  ('know', 5),
  ('leave', 1),
  ('leg', 1),
  ('let', 1),
  ('lie', 1),
  ('like', 1),
  ('lose', 1),
  ('love', 3),
  ('lucky', 1),
  ('make', 1),
  ('matter_fact', 1),
  ('maybe', 1),
  ('more', 1),
  ('much', 6),
  ('must', 1),
  ('need', 4),
  ('not', 9),
  ('now', 1),
  ('open', 1),
  ('piano', 1),
  ('purpose', 1),
  ('real', 1),
  ('reason', 1),
  ('rig

# Building the Topic Model

In [36]:
# Fit a 10-topic LDA model on the bag-of-words corpus.
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    random_state=100,       # fixed seed
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

# View the topics in the LDA model

In [37]:
# Print the top keywords and their weights for each topic.
# Each entry is a (topic_id, 'weight*"word" + ...') pair.
pprint(lda_model.print_topics())

[(0,
  '0.091*"win" + 0.033*"mile" + 0.020*"above" + 0.018*"thin" + 0.016*"course" '
  '+ 0.011*"press" + 0.011*"cheek" + 0.009*"boom_boom" + 0.009*"pas" + '
  '0.009*"believer"'),
 (1,
  '0.106*"be" + 0.056*"not" + 0.039*"do" + 0.021*"go" + 0.019*"know" + '
  '0.017*"have" + 0.014*"love" + 0.014*"get" + 0.014*"so" + 0.014*"s"'),
 (2,
  '0.045*"hot" + 0.038*"shadow" + 0.033*"heat" + 0.032*"warrior" + '
  '0.019*"crowd" + 0.016*"thunder" + 0.016*"daylight" + 0.016*"race" + '
  '0.015*"freak" + 0.013*"drown"'),
 (3,
  '0.047*"moon" + 0.041*"summer" + 0.027*"remain" + 0.020*"dem" + '
  '0.019*"beautiful" + 0.019*"prepare" + 0.016*"flower" + 0.015*"letter" + '
  '0.015*"wound" + 0.015*"wan"'),
 (4,
  '0.073*"shorty" + 0.059*"whip" + 0.055*"dance" + 0.028*"ghetto" + '
  '0.022*"wrist" + 0.015*"thinking" + 0.015*"body" + 0.014*"mood" + '
  '0.013*"party" + 0.012*"your"'),
 (5,
  '0.016*"hat" + 0.015*"quiero" + 0.013*"ich" + 0.012*"und" + 0.012*"son" + '
  '0.011*"sei" + 0.011*"doch" + 0.011*