In [178]:
## Basic Packages
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import re
from pprint import pprint

## Import NLTK packages
from nltk.corpus import stopwords
from nltk import RegexpTokenizer
from nltk import wordpunct_tokenize
from nltk.stem.porter import PorterStemmer
from nltk import pos_tag, word_tokenize
from sklearn.feature_extraction.text import TfidfTransformer

## Import Spacy
import spacy

## Import Gensim packages
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel, TfidfModel

## Plotting Tools
import pyLDAvis
import pyLDAvis.gensim

## Notebook display settings
pd.options.display.max_columns = 999

## Reading in data

In [179]:
lyrics_df = pd.read_csv('data//lyric_data.csv', encoding = 'utf8')

FileNotFoundError: File b'data//lyric_data.csv' does not exist

In [180]:
beatles_lyrics = lyrics_df[lyrics_df.artist == 'The Beatles']

NameError: name 'lyrics_df' is not defined

In [None]:
beatles_lyrics.drop(columns = ['link'], inplace = True)

In [None]:
beatles_lyrics.reset_index(drop = True, inplace = True)

In [None]:
beatles_lyrics.rename(columns = {'text' : 'lyrics'}, inplace = True)

In [None]:
beatles_lyrics.to_csv('data//beatles_lyrics.csv')

## Text cleanup

In [2]:
beatles_lyrics = pd.read_csv('data//beatles_lyrics.csv')

In [3]:
## Creating a list of lyrics

stripped_lyrics = beatles_lyrics.lyrics.values.tolist()
stripped_lyrics = [x.lower() for x in stripped_lyrics]

In [4]:
## Cleaning up the text - removing new lines, extra spaces, random characters

stripped_lyrics = [re.sub('\n', '', sent) for sent in stripped_lyrics]
stripped_lyrics = [re.sub('\s{2,}', ' ', sent) for sent in stripped_lyrics]
stripped_lyrics = [re.sub("\\\'", '', sent) for sent in stripped_lyrics]

In [5]:
len(stripped_lyrics)

178

In [6]:
## Importing a stop word list and including a few extra words

stop_words = stopwords.words('english')
stop_words.extend(['hey', 'nah', '[chorus]', 'la', 'na', 'ta', 'll'])

In [126]:
## Tokenizing the lyrics in each song using RegexpTokenizer and removing stop words

tokenizer = RegexpTokenizer(r'\w+')
porter = PorterStemmer()

token_list = []

for i in range(len(stripped_lyrics)):   
    tokenized_list = []
    #tokenized_list = tokenizer.tokenize(stripped_lyrics[i])
    tokenized_list = word_tokenize(stripped_lyrics[i])
    tokenized_list = [i for i in tokenized_list if i not in stop_words]
    #tokenized_list = [pos_tag(word_tokenize(tokens), tagset='universal') for tokens in tokenized_list]
    #tokenized_list = [porter.stem(tokens) for tokens in tokenized_list]
    token_list.append(tokenized_list)

In [127]:
longer_token_list = []

for i in token_list:
    iterate_list = []
    for j in i:
        if ((len(j) > 3) == True):
            iterate_list.append(j)
    longer_token_list.append(iterate_list)

In [128]:
token_list = longer_token_list

In [104]:
nlp = spacy.load('en', disable=['parser', 'ner'])

  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr))[0]
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr))[0]
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr))[0]
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr))[0]
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr))[0]
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr))[0]
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr))[0]
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr)).reshape(obj[b'shape'])
  dtype=np.dtype(descr))[0]
  dtype=

In [116]:
def lemmatization(texts, allowed_postags=['NOUN', 'VERB', 'ADJ', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [117]:
token_list = lemmatization(token_list, allowed_postags=['NOUN', 'VERB', 'ADJ', 'ADV'])

In [118]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(token_list, min_count=5, threshold=70) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[token_list], threshold=70)  

# Faster way to get a sentence labeled as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)



In [129]:
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

In [130]:
data_words_bigrams = make_bigrams(token_list)
data_words_trigrams = make_trigrams(token_list)

In [131]:
# Create Dictionary
id2word = corpora.Dictionary(data_words_trigrams)

# Create Corpus
texts = data_words_trigrams

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 2), (4, 1), (5, 3), (6, 4), (7, 3), (8, 2), (9, 3), (10, 3), (11, 1), (12, 3), (13, 3), (14, 6), (15, 1), (16, 1), (17, 3), (18, 1), (19, 6), (20, 2), (21, 1), (22, 1), (23, 5), (24, 3), (25, 1), (26, 2), (27, 3), (28, 2), (29, 1), (30, 3), (31, 1), (32, 1), (33, 7), (34, 2), (35, 3), (36, 3), (37, 3), (38, 4), (39, 5), (40, 3), (41, 5), (42, 1), (43, 7), (44, 1), (45, 1), (46, 1), (47, 3), (48, 3), (49, 9), (50, 3), (51, 3)]]


In [132]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=5,
                                           random_state=42,
                                           update_every=10,
                                           chunksize=50,
                                           passes=4,
                                           alpha='auto',
                                           per_word_topics=True)

In [173]:
tfidf_model = gensim.models.TfidfModel(corpus, id2word = id2word)

lda_model = gensim.models.ldamodel.LdaModel(tfidf_model[corpus], id2word = id2word, num_topics = 13)

In [174]:
# Print the Keyword in the 5 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.008*"yeah" + 0.004*"sunshine" + 0.004*"hand" + 0.004*"wont" + '
  '0.004*"commonwealth" + 0.004*"youve" + 0.004*"loving" + 0.003*"coming" + '
  '0.003*"leave" + 0.003*"blackbird"'),
 (1,
  '0.005*"hello" + 0.004*"help" + 0.004*"lucille" + 0.004*"well" + '
  '0.003*"cross_town" + 0.003*"roll_mystery_tour" + 0.003*"never" + '
  '0.003*"waited" + 0.003*"bulldog" + 0.003*"dreamers"'),
 (2,
  '0.005*"christmas" + 0.005*"dear_prudence" + 0.005*"to-night" + '
  '0.004*"tonight" + 0.004*"lucy" + 0.004*"diamonds" + 0.003*"black" + '
  '0.003*"hands" + 0.003*"keep" + 0.003*"molly"'),
 (3,
  '0.007*"better" + 0.005*"robert" + 0.005*"girl" + 0.004*"back" + '
  '0.004*"together" + 0.003*"week" + 0.003*"thing" + 0.003*"cheat" + '
  '0.003*"jude" + 0.003*"belonged"'),
 (4,
  '0.008*"want" + 0.005*"money" + 0.004*"thats" + 0.004*"komm" + '
  '0.004*"mister_moonlight" + 0.004*"guilty" + 0.004*"misery" + '
  '0.004*"evrything" + 0.003*"love" + 0.003*"needed"'),
 (5,
  '0.005*"rich" + 0.004*"bo

In [175]:
print('\nPerplexity: ', lda_model.log_perplexity(corpus)) 

coherence_model_lda = CoherenceModel(model = lda_model, texts = texts, dictionary = id2word, coherence = 'c_v')
coherence_lda = coherence_model_lda.get_coherence()

print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.94584329035625

Coherence Score:  0.5649375726137557


In [176]:
## Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)

In [177]:
vis

## TF-IDF on lyrics

In [20]:
#print(token_list)

In [111]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer_1 = CountVectorizer()
matrix = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False).fit_transform(token_list)
matrix.toarray()
#matrix = vectorizer.fit_transform(token_list)
#print(stemmed_words)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1]], dtype=int64)

In [125]:
from sklearn.feature_extraction.text import TfidfTransformer

no_features = 500

transformer = TfidfTransformer()
tfidf_transform = transformer.fit_transform(matrix)

In [126]:
tfidf_transform

<178x1724 sparse matrix of type '<class 'numpy.float64'>'
	with 6409 stored elements in Compressed Sparse Row format>

In [124]:
tfidf_vectorizer = TfidfVectorizer(max_df=.85, min_df=2, max_features=no_features)
tfidf = tfidf_vectorizer.fit_transform(tfidf_transform)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

AttributeError: lower not found

In [116]:
no_topics = 5

# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, alpha=0.05, l1_ratio=.5, init='nndsvd').fit(tfidf)

In [131]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        for k in topic.argsort():
            print("Word: ",feature_names[k], "  Score: ", topic[k]/ sum(topic))

no_top_words = 10

display_topics(nmf, tfidf_feature_names, no_top_words)

Topic 0:
Word:  act   Score:  0.0


IndexError: list index out of range

In [100]:
no_features = 500

documents = beatles_lyrics.lyrics.values.tolist()

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=.85, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=.85, min_df=2, max_features=no_features, stop_words= stop_words)
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

In [101]:
from sklearn.decomposition import NMF, LatentDirichletAllocation

no_topics = 5

# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, alpha=0.05, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Run LDA
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=20,random_state=0).fit(tf)

In [102]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

no_top_words = 10
display_topics(nmf, tfidf_feature_names, no_top_words)
display_topics(lda, tf_feature_names, no_top_words)

Topic 0:
don ll ve want time got know come let just
Topic 1:
love know need ll girl true whoa loved make heart
Topic 2:
yeah oh got john good said brian feeling come end
Topic 3:
baby man want old glad know trying clarabella gonna sister
Topic 4:
hey talk boys better bye ah bulldog ya ha make
Topic 0:
love yeah want let baby oh know come well like
Topic 1:
back baby cry fun get got honey sky lucy diamonds
Topic 2:
go yeah long want oh hand day know good way
Topic 3:
know bye say please time never love hello help oh
Topic 4:
oh girl john mm love di falling yes come right
