In [27]:
## Basic Packages
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import re
from pprint import pprint

## Import NLTK packages
from nltk.corpus import stopwords
from nltk import RegexpTokenizer
from nltk import wordpunct_tokenize
from nltk.stem.porter import PorterStemmer
from nltk import pos_tag, word_tokenize
from sklearn.feature_extraction.text import TfidfTransformer

## Import Spacy
import spacy

## Import Gensim packages
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

## Notebook display settings
pd.options.display.max_columns = 999

## Reading in data

In [3]:
lyrics_df = pd.read_csv('data//lyric_data.csv', encoding = 'utf8')

FileNotFoundError: File b'data//lyric_data.csv' does not exist

In [None]:
beatles_lyrics = lyrics_df[lyrics_df.artist == 'The Beatles']

In [None]:
beatles_lyrics.drop(columns = ['link'], inplace = True)

In [None]:
beatles_lyrics.reset_index(drop = True, inplace = True)

In [None]:
beatles_lyrics.rename(columns = {'text' : 'lyrics'}, inplace = True)

In [None]:
beatles_lyrics.to_csv('data//beatles_lyrics.csv')

## Text cleanup

In [4]:
beatles_lyrics = pd.read_csv('data//beatles_lyrics.csv')

In [5]:
## Creating a list of lyrics

stripped_lyrics = beatles_lyrics.lyrics.values.tolist()
stripped_lyrics = [x.lower() for x in stripped_lyrics]

In [6]:
## Cleaning up the text - removing new lines, extra spaces, random characters

stripped_lyrics = [re.sub('\n', '', sent) for sent in stripped_lyrics]
stripped_lyrics = [re.sub('\s{2,}', ' ', sent) for sent in stripped_lyrics]
stripped_lyrics = [re.sub("\\\'", '', sent) for sent in stripped_lyrics]

In [7]:
len(stripped_lyrics)

178

In [8]:
## Importing a stop word list and including a few extra words

stop_words = stopwords.words('english')
stop_words.extend(['hey', 'nah', '[chorus]', 'la', 'na', 'ta'])

In [9]:
## Tokenizing the lyrics in each song using RegexpTokenizer and removing stop words

tokenizer = RegexpTokenizer(r'\w+')
porter = PorterStemmer()

token_list = []

for i in range(len(stripped_lyrics)):   
    tokenized_list = []
    #tokenized_list = tokenizer.tokenize(stripped_lyrics[i])
    tokenized_list = word_tokenize(stripped_lyrics[i])
    tokenized_list = [i for i in tokenized_list if i not in stop_words]
    #tokenized_list = [pos_tag(word_tokenize(tokens), tagset='universal') for tokens in tokenized_list]
    #tokenized_list = [porter.stem(tokens) for tokens in tokenized_list]
    token_list.append(tokenized_list)

In [13]:
nlp = spacy.load('en', disable=['parser', 'ner'])

In [14]:
def lemmatization(texts, allowed_postags=['NOUN', 'VERB', 'ADJ', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [15]:
token_list = lemmatization(token_list, allowed_postags=['NOUN', 'VERB', 'ADJ', 'ADV'])

In [18]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(token_list, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[token_list], threshold=100)  

# Faster way to get a sentence labeled as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)



In [19]:
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

In [20]:
data_words_bigrams = make_bigrams(token_list)
data_words_trigrams = make_trigrams(token_list)

In [21]:
# Create Dictionary
id2word = corpora.Dictionary(data_words_trigrams)

# Create Corpus
texts = data_words_trigrams

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 3), (3, 3), (4, 4), (5, 2), (6, 3), (7, 1), (8, 3), (9, 1), (10, 3), (11, 3), (12, 6), (13, 1), (14, 3), (15, 1), (16, 1), (17, 15), (18, 3), (19, 2), (20, 1), (21, 1), (22, 3), (23, 2), (24, 3), (25, 2), (26, 3), (27, 2), (28, 1), (29, 1), (30, 8), (31, 3), (32, 1), (33, 1), (34, 1), (35, 7), (36, 2), (37, 3), (38, 2), (39, 3), (40, 3), (41, 4), (42, 5), (43, 3), (44, 1), (45, 5), (46, 1), (47, 8), (48, 1), (49, 1), (50, 3), (51, 3), (52, 1), (53, 8), (54, 3)]]


In [24]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=13,
                                           random_state=42,
                                           update_every=10,
                                           chunksize=50,
                                           passes=4,
                                           alpha='auto',
                                           per_word_topics=True)

In [25]:
# Print the Keyword in the 5 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.067*"go" + 0.057*"not" + 0.052*"do" + 0.024*"long" + 0.020*"be" + '
  '0.017*"know" + 0.014*"never" + 0.014*"love" + 0.014*"see" + 0.013*"johnny"'),
 (1,
  '0.049*"not" + 0.029*"be" + 0.027*"do" + 0.018*"make" + 0.016*"say" + '
  '0.016*"know" + 0.016*"buy" + 0.015*"back" + 0.015*"help" + 0.014*"get"'),
 (2,
  '0.056*"be" + 0.034*"know" + 0.032*"not" + 0.020*"love" + 0.019*"little" + '
  '0.017*"come" + 0.017*"do" + 0.014*"say" + 0.013*"will" + 0.013*"would"'),
 (3,
  '0.039*"s" + 0.032*"get" + 0.024*"not" + 0.020*"be" + 0.017*"want" + '
  '0.017*"good" + 0.016*"love" + 0.016*"have" + 0.014*"girl" + 0.014*"go"'),
 (4,
  '0.054*"be" + 0.044*"baby" + 0.036*"get" + 0.032*"not" + 0.029*"love" + '
  '0.025*"ill" + 0.018*"want" + 0.013*"long" + 0.012*"know" + 0.012*"can"'),
 (5,
  '0.040*"together" + 0.028*"want" + 0.024*"cry" + 0.024*"man" + 0.022*"baby" '
  '+ 0.022*"be" + 0.020*"love" + 0.015*"back" + 0.015*"bring" + 0.014*"gon"'),
 (6,
  '0.039*"love" + 0.039*"hold" + 0.036*"da

In [138]:
print('\nPerplexity: ', lda_model.log_perplexity(corpus)) 


Perplexity:  -5.901105077234152


## TF-IDF on lyrics

In [183]:
#print(token_list)

In [52]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer_1 = CountVectorizer()
matrix = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False).fit_transform(token_list)
matrix.toarray()
#matrix = vectorizer.fit_transform(token_list)
#print(stemmed_words)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1]], dtype=int64)

In [73]:
print(matrix)

  (0, 676)	3
  (0, 1400)	5
  (0, 677)	2
  (0, 96)	3
  (0, 517)	3
  (0, 286)	3
  (0, 246)	3
  (0, 1353)	3
  (0, 340)	3
  (0, 1528)	3
  (0, 1701)	3
  (0, 1305)	3
  (0, 1541)	3
  (0, 1253)	3
  (0, 596)	3
  (0, 93)	3
  (0, 807)	3
  (0, 397)	6
  (0, 888)	3
  (0, 1651)	8
  (0, 1330)	3
  (0, 342)	3
  (0, 1091)	3
  (0, 942)	2
  (0, 597)	2
  :	:
  (177, 1289)	1
  (177, 1512)	1
  (177, 1704)	1
  (177, 1303)	1
  (177, 1518)	1
  (177, 718)	3
  (177, 872)	1
  (177, 1273)	2
  (177, 1671)	4
  (177, 1633)	2
  (177, 886)	8
  (177, 1242)	1
  (177, 907)	2
  (177, 236)	1
  (177, 1400)	2
  (177, 286)	4
  (177, 1701)	1
  (177, 93)	16
  (177, 807)	3
  (177, 1330)	1
  (177, 1044)	5
  (177, 207)	1
  (177, 579)	2
  (177, 520)	1
  (177, 1435)	1


In [68]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

tfidf = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False).fit_transform(token_list)

In [72]:
print(tfidf)

  (0, 676)	3
  (0, 1400)	5
  (0, 677)	2
  (0, 96)	3
  (0, 517)	3
  (0, 286)	3
  (0, 246)	3
  (0, 1353)	3
  (0, 340)	3
  (0, 1528)	3
  (0, 1701)	3
  (0, 1305)	3
  (0, 1541)	3
  (0, 1253)	3
  (0, 596)	3
  (0, 93)	3
  (0, 807)	3
  (0, 397)	6
  (0, 888)	3
  (0, 1651)	8
  (0, 1330)	3
  (0, 342)	3
  (0, 1091)	3
  (0, 942)	2
  (0, 597)	2
  :	:
  (177, 1289)	1
  (177, 1512)	1
  (177, 1704)	1
  (177, 1303)	1
  (177, 1518)	1
  (177, 718)	3
  (177, 872)	1
  (177, 1273)	2
  (177, 1671)	4
  (177, 1633)	2
  (177, 886)	8
  (177, 1242)	1
  (177, 907)	2
  (177, 236)	1
  (177, 1400)	2
  (177, 286)	4
  (177, 1701)	1
  (177, 93)	16
  (177, 807)	3
  (177, 1330)	1
  (177, 1044)	5
  (177, 207)	1
  (177, 579)	2
  (177, 520)	1
  (177, 1435)	1


In [71]:
#tfidf._validate_vocabulary()

for i, feature in enumerate(vectorizer):
    print(i, feature)

TypeError: 'TfidfVectorizer' object is not iterable