In [1]:
## Basic Packages
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import re
from pprint import pprint

## Import NLTK packages
from nltk.corpus import stopwords
from nltk import RegexpTokenizer
from nltk import wordpunct_tokenize
from nltk.stem.porter import PorterStemmer
from nltk import pos_tag, word_tokenize
from sklearn.feature_extraction.text import TfidfTransformer

## Import Spacy
import spacy

## Import Gensim packages
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

## Notebook display settings
pd.options.display.max_columns = 999

## Reading in data

In [3]:
lyrics_df = pd.read_csv('data//lyric_data.csv', encoding = 'utf8')

FileNotFoundError: File b'data/lyric_data.csv' does not exist

In [None]:
beatles_lyrics = lyrics_df[lyrics_df.artist == 'The Beatles']

In [None]:
beatles_lyrics.drop(columns = ['link'], inplace = True)

In [None]:
beatles_lyrics.reset_index(drop = True, inplace = True)

In [None]:
beatles_lyrics.rename(columns = {'text' : 'lyrics'}, inplace = True)

In [None]:
beatles_lyrics.to_csv('data//beatles_lyrics.csv')

## Text cleanup

In [4]:
beatles_lyrics = pd.read_csv('data//beatles_lyrics.csv')

In [5]:
## Creating a list of lyrics

stripped_lyrics = beatles_lyrics.lyrics.values.tolist()
stripped_lyrics = [x.lower() for x in stripped_lyrics]

In [6]:
## Cleaning up the text - removing new lines, extra spaces, random characters

stripped_lyrics = [re.sub('\n', '', sent) for sent in stripped_lyrics]
stripped_lyrics = [re.sub('\s{2,}', ' ', sent) for sent in stripped_lyrics]
stripped_lyrics = [re.sub("\\\'", '', sent) for sent in stripped_lyrics]

In [7]:
len(stripped_lyrics)

178

In [67]:
## Importing a stop word list and including a few extra words

stop_words = stopwords.words('english')
stop_words.extend(['hey', 'nah', '[chorus]', 'la', 'na', 'ta', 'll'])

In [9]:
## Tokenizing the lyrics in each song using RegexpTokenizer and removing stop words

tokenizer = RegexpTokenizer(r'\w+')
porter = PorterStemmer()

token_list = []

for i in range(len(stripped_lyrics)):   
    tokenized_list = []
    #tokenized_list = tokenizer.tokenize(stripped_lyrics[i])
    tokenized_list = word_tokenize(stripped_lyrics[i])
    tokenized_list = [i for i in tokenized_list if i not in stop_words]
    #tokenized_list = [pos_tag(word_tokenize(tokens), tagset='universal') for tokens in tokenized_list]
    #tokenized_list = [porter.stem(tokens) for tokens in tokenized_list]
    token_list.append(tokenized_list)

In [10]:
nlp = spacy.load('en', disable=['parser', 'ner'])

In [11]:
def lemmatization(texts, allowed_postags=['NOUN', 'VERB', 'ADJ', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [12]:
token_list = lemmatization(token_list, allowed_postags=['NOUN', 'VERB', 'ADJ', 'ADV'])

In [13]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(token_list, min_count=5, threshold=70) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[token_list], threshold=70)  

# Faster way to get a sentence labeled as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)



In [14]:
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

In [15]:
data_words_bigrams = make_bigrams(token_list)
data_words_trigrams = make_trigrams(token_list)

In [16]:
# Create Dictionary
id2word = corpora.Dictionary(data_words_trigrams)

# Create Corpus
texts = data_words_trigrams

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 3), (3, 3), (4, 4), (5, 2), (6, 3), (7, 1), (8, 3), (9, 1), (10, 3), (11, 3), (12, 6), (13, 1), (14, 3), (15, 1), (16, 1), (17, 15), (18, 3), (19, 2), (20, 1), (21, 1), (22, 3), (23, 2), (24, 3), (25, 2), (26, 3), (27, 2), (28, 1), (29, 1), (30, 8), (31, 3), (32, 1), (33, 1), (34, 1), (35, 7), (36, 2), (37, 3), (38, 2), (39, 3), (40, 3), (41, 4), (42, 5), (43, 3), (44, 1), (45, 5), (46, 1), (47, 8), (48, 1), (49, 1), (50, 3), (51, 3), (52, 1), (53, 8), (54, 3)]]


In [17]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=13,
                                           random_state=42,
                                           update_every=10,
                                           chunksize=50,
                                           passes=4,
                                           alpha='auto',
                                           per_word_topics=True)

In [18]:
# Print the Keyword in the 5 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.067*"go" + 0.057*"not" + 0.052*"do" + 0.024*"long" + 0.020*"be" + '
  '0.017*"know" + 0.014*"never" + 0.014*"love" + 0.014*"see" + 0.013*"johnny"'),
 (1,
  '0.049*"not" + 0.029*"be" + 0.027*"do" + 0.018*"make" + 0.016*"say" + '
  '0.016*"know" + 0.016*"buy" + 0.015*"back" + 0.015*"help" + 0.014*"get"'),
 (2,
  '0.056*"be" + 0.034*"know" + 0.032*"not" + 0.020*"love" + 0.019*"little" + '
  '0.017*"come" + 0.017*"do" + 0.014*"say" + 0.013*"will" + 0.013*"would"'),
 (3,
  '0.039*"s" + 0.032*"get" + 0.024*"not" + 0.020*"be" + 0.017*"want" + '
  '0.017*"good" + 0.016*"love" + 0.016*"have" + 0.014*"girl" + 0.014*"go"'),
 (4,
  '0.054*"be" + 0.044*"baby" + 0.036*"get" + 0.032*"not" + 0.029*"love" + '
  '0.025*"ill" + 0.018*"want" + 0.013*"long" + 0.012*"know" + 0.012*"can"'),
 (5,
  '0.040*"together" + 0.028*"want" + 0.024*"cry" + 0.024*"man" + 0.022*"baby" '
  '+ 0.022*"be" + 0.020*"love" + 0.015*"back" + 0.015*"bring" + 0.014*"gon"'),
 (6,
  '0.039*"love" + 0.039*"hold" + 0.036*"da

In [19]:
print('\nPerplexity: ', lda_model.log_perplexity(corpus)) 


Perplexity:  -6.07424918674643


## TF-IDF on lyrics

In [20]:
#print(token_list)

In [111]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer_1 = CountVectorizer()
matrix = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False).fit_transform(token_list)
matrix.toarray()
#matrix = vectorizer.fit_transform(token_list)
#print(stemmed_words)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1]], dtype=int64)

In [125]:
from sklearn.feature_extraction.text import TfidfTransformer

no_features = 500

transformer = TfidfTransformer()
tfidf_transform = transformer.fit_transform(matrix)

In [126]:
tfidf_transform

<178x1724 sparse matrix of type '<class 'numpy.float64'>'
	with 6409 stored elements in Compressed Sparse Row format>

In [124]:
tfidf_vectorizer = TfidfVectorizer(max_df=.85, min_df=2, max_features=no_features)
tfidf = tfidf_vectorizer.fit_transform(tfidf_transform)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

AttributeError: lower not found

In [116]:
no_topics = 5

# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, alpha=0.05, l1_ratio=.5, init='nndsvd').fit(tfidf)

In [131]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        for k in topic.argsort():
            print("Word: ",feature_names[k], "  Score: ", topic[k]/ sum(topic))

no_top_words = 10

display_topics(nmf, tfidf_feature_names, no_top_words)

Topic 0:
Word:  act   Score:  0.0


IndexError: list index out of range

In [100]:
no_features = 500

documents = beatles_lyrics.lyrics.values.tolist()

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=.85, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=.85, min_df=2, max_features=no_features, stop_words= stop_words)
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

In [101]:
from sklearn.decomposition import NMF, LatentDirichletAllocation

no_topics = 5

# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, alpha=0.05, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Run LDA
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=20,random_state=0).fit(tf)

In [102]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

no_top_words = 10
display_topics(nmf, tfidf_feature_names, no_top_words)
display_topics(lda, tf_feature_names, no_top_words)

Topic 0:
don ll ve want time got know come let just
Topic 1:
love know need ll girl true whoa loved make heart
Topic 2:
yeah oh got john good said brian feeling come end
Topic 3:
baby man want old glad know trying clarabella gonna sister
Topic 4:
hey talk boys better bye ah bulldog ya ha make
Topic 0:
love yeah want let baby oh know come well like
Topic 1:
back baby cry fun get got honey sky lucy diamonds
Topic 2:
go yeah long want oh hand day know good way
Topic 3:
know bye say please time never love hello help oh
Topic 4:
oh girl john mm love di falling yes come right
