In [1]:
## Basic Packages
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import re
from pprint import pprint

## Import NLTK packages
from nltk.corpus import stopwords
from nltk import RegexpTokenizer
from nltk import wordpunct_tokenize
from nltk.stem.porter import PorterStemmer

## Import Gensim packages
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

## Notebook display settings
pd.options.display.max_columns = 999



## Reading in data

In [2]:
lyrics_df = pd.read_csv('data//lyric_data.csv', encoding = 'utf8')

In [3]:
beatles_lyrics = lyrics_df[lyrics_df.artist == 'The Beatles']

In [4]:
beatles_lyrics.drop(columns = ['link'], inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [5]:
beatles_lyrics.reset_index(drop = True, inplace = True)

In [6]:
beatles_lyrics.rename(columns = {'text' : 'lyrics'}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [7]:
beatles_lyrics.to_csv('data//beatles_lyrics.csv')

## Text cleanup

In [106]:
## Creating a list of lyrics

stripped_lyrics = beatles_lyrics.lyrics.values.tolist()
stripped_lyrics = [x.lower() for x in stripped_lyrics]

In [107]:
## Cleaning up the text - removing new lines, extra spaces, random characters

stripped_lyrics = [re.sub('\n', '', sent) for sent in stripped_lyrics]
stripped_lyrics = [re.sub('\s{2,}', ' ', sent) for sent in stripped_lyrics]
stripped_lyrics = [re.sub("\\\'", '', sent) for sent in stripped_lyrics]

In [129]:
len(stripped_lyrics)

178

In [109]:
## Importing a stop word list and including a few extra words

stop_words = stopwords.words('english')
stop_words.extend(['hey', 'nah', '[chorus]', 'la'])

In [113]:
## Tokenizing the lyrics in each song using RegexpTokenizer and removing stop words

tokenizer = RegexpTokenizer(r'\w+')
porter = PorterStemmer()

token_list = []

for i in range(len(stripped_lyrics)):   
    tokenized_list = []
    tokenized_list = tokenizer.tokenize(stripped_lyrics[i])
    tokenized_list = [i for i in tokenized_list if i not in stop_words]
    tokenized_list = [porter.stem(tokens) for tokens in tokenized_list]
    token_list.append(tokenized_list)

In [115]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(token_list, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[token_list], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)



In [119]:
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

In [122]:
data_words_bigrams = make_bigrams(token_list)
data_words_trigrams = make_trigrams(token_list)

In [124]:
# Create Dictionary
id2word = corpora.Dictionary(data_words_trigrams)

# Create Corpus
texts = data_words_trigrams

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 2), (1, 1), (2, 3), (3, 4), (4, 3), (5, 2), (6, 3), (7, 1), (8, 3), (9, 1), (10, 3), (11, 3), (12, 6), (13, 1), (14, 1), (15, 3), (16, 1), (17, 14), (18, 3), (19, 2), (20, 1), (21, 1), (22, 1), (23, 5), (24, 3), (25, 1), (26, 2), (27, 3), (28, 2), (29, 1), (30, 1), (31, 3), (32, 1), (33, 1), (34, 1), (35, 7), (36, 2), (37, 3), (38, 3), (39, 3), (40, 4), (41, 5), (42, 3), (43, 1), (44, 5), (45, 1), (46, 8), (47, 1), (48, 1), (49, 3), (50, 3), (51, 1), (52, 9), (53, 3), (54, 3)]]


In [132]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=15, 
                                           random_state=45,
                                           update_every=1,
                                           chunksize=100,
                                           passes=100,
                                           alpha='auto',
                                           per_word_topics=True)

In [133]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.057*"im" + 0.048*"good" + 0.045*"day" + 0.045*"know" + 0.036*"lie" + '
  '0.029*"love" + 0.028*"way" + 0.027*"she" + 0.024*"thing" + 0.024*"said"'),
 (1,
  '0.067*"dont" + 0.036*"im" + 0.025*"got" + 0.024*"get" + 0.022*"know" + '
  '0.022*"girl" + 0.018*"come" + 0.018*"your" + 0.018*"ive" + 0.017*"cant"'),
 (2,
  '0.073*"go" + 0.069*"let" + 0.043*"come" + 0.024*"take" + 0.023*"johnni" + '
  '0.021*"chang" + 0.021*"world" + 0.018*"youll" + 0.015*"may" + '
  '0.015*"gonna"'),
 (3,
  '0.031*"home" + 0.023*"time" + 0.022*"troubl" + 0.020*"like" + 0.019*"knew" '
  '+ 0.019*"commonwealth" + 0.018*"your" + 0.018*"christma" + 0.017*"year" + '
  '0.017*"feel"'),
 (4,
  '0.155*"yeah" + 0.062*"babi" + 0.047*"oh" + 0.032*"got" + 0.029*"well" + '
  '0.020*"feel" + 0.018*"come" + 0.016*"ah" + 0.016*"long" + 0.011*"wo"'),
 (5,
  '0.064*"im" + 0.021*"sleep" + 0.020*"night" + 0.020*"mind" + 0.019*"leav" + '
  '0.018*"told" + 0.016*"mayb" + 0.016*"didnt" + 0.015*"gonna" + 0.014*"wait"'),
 (6,
