## Import Libraries

In [1]:
import pandas as pd
import re
import spacy

import pandas as pd
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.manifold import TSNE

from nltk.tokenize import word_tokenize
import nltk
from nltk.stem import SnowballStemmer


In [4]:
df = pd.read_csv('lyrics.csv', index_col='index')


In [8]:
#let's look at genre value counts

df.genre.value_counts()

Rock             131377
Pop               49444
Hip-Hop           33965
Not Available     29814
Metal             28408
Other             23683
Country           17286
Jazz              17147
Electronic        16205
R&B                5935
Indie              5732
Folk               3241
Name: genre, dtype: int64

In [45]:
#filter DF to four genres
genre_list = ['Rock', 'Pop', 'Country', 'Hip-Hop']


df1 = df.loc[(df['genre'] == 'Rock') | (df['genre'] == 'Pop') | (df['genre'] == 'Country') | (df['genre'] == 'Hip-Hop')]

In [60]:
#check the shape of the new dataframe
df1.shape

(232072, 5)

In [61]:
df1.isna().sum()

song          0
year          0
artist        0
genre         0
lyrics    43134
dtype: int64

In [62]:
#drop missing values from df
df1.dropna(inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [63]:
df1.shape

(188938, 5)

## Cleaning Database

In [51]:
song = df1.lyrics[1]
song

"playin' everything so easy,\nit's like you seem so sure.\nstill your ways, you dont see\ni'm not sure if they're for me.\nthen things come right along our way, though we didn't truly ask.\nit seems as if they're gonna linger\nwith every delight they bring,\njust like what you have truly seemed.\ni'm trying to think of what you really want to say,\neven through my darkest day.\nyou might want to leave me,\nfeeling strange about you\nlike you're gonna let me know,\nwhen words then slipped out of you.\nwhen words dont come so easy to say\nyou just leave me feeling, come what may\nthough i want things coming from your way.\ni say to you, you bore me all the time\nwhen you seem to hold back all in you,\nall that you want to let me know.\nwhy dont you have the courage?\nspeak up and i'll listen,\nif you truly want me to know, then tell me.\nis there something wrong with you\nand you seem fastened there.\nit sounds as if there'll be a melody\nif things in you are let out\nand then i will fee

In [52]:
#create a function that clean and tokenizes lyrics

def clean_tokenize_lyrics(song):
    word_list = []
    tokenized_lyrics = word_tokenize(song)
     #remove all tokens that are not alphabetic
    words = [word for word in tokenized_lyrics if word.isalpha()]
    for word in words:
        lower_word = word.lower()
        word_list.append(lower_word)
    return word_list

In [54]:
# clean_tokenize_lyrics(song)

In [5]:
#clean lyrics for all songs
df1['lyrics'] = df1.lyrics.apply(lambda x: clean_tokenize_lyrics(x))


NameError: name 'df1' is not defined

### Stemm Clean Lyrics

In [7]:
stemmer = SnowballStemmer('english')


In [8]:
df1['cleaned_lyrics'] = df1['lyrics'].apply(lambda x: stemmer.stem(x))

In [9]:
df1['cleaned_lyrics']

0         ['oh', 'baby', 'how', 'you', 'doing', 'you', '...
1         ['playin', 'everything', 'so', 'easy', 'it', '...
2         ['if', 'you', 'search', 'for', 'tenderness', '...
3         ['oh', 'oh', 'oh', 'i', 'oh', 'oh', 'oh', 'i',...
4         ['party', 'the', 'people', 'the', 'people', 't...
                                ...                        
188933    ['i', 'got', 'ta', 'say', 'boy', 'after', 'onl...
188934    ['i', 'helped', 'you', 'find', 'her', 'diamond...
188935    ['look', 'at', 'the', 'couple', 'in', 'the', '...
188936    ['when', 'i', 'fly', 'off', 'this', 'mortal', ...
188937    ['i', 'heard', 'from', 'a', 'friend', 'of', 'a...
Name: cleaned_lyrics, Length: 188938, dtype: object

### Save Clean Database as csv file

In [2]:
# df1.to_csv('clean_lyric_df.csv')
df1 = pd.read_csv('clean_lyric_df.csv')

## NLP Preprocessing

### Vectorize Words

In [66]:
#create function to vectorize song

def count_vectorize(song, vocab=None):
    if vocab:
        unique_words = vocab
    else:
        unique_words = list(set(song))
    
    song_dict = {i:0 for i in unique_words}
    
    for word in song:
        song_dict[word] += 1
    
    return song_dict

In [70]:
BoW = count_vectorize(df1['lyrics'][0])

### Term Frequency

In [69]:
#create term frequency function
def term_frequency(BoW_dict):
    total_word_count = sum(BoW_dict.values())
    
    for ind, val in BoW_dict.items():
        BoW_dict[ind] = val/ total_word_count
    
    return BoW_dict

In [76]:
#create a list of dictionaries
def create_list_of_BoW(song_lyrics):

    list_of_dictionaries = []

    for song in song_lyrics:
        BoW = count_vectorize(song)

        list_of_dictionaries.append(BoW)
    return list_of_dictionaries

list_of_dictionaries = create_list_of_BoW(df1['lyrics'])

### Inverse Document Frequency

In [78]:
def inverse_document_frequency(list_of_dicts):
    vocab_set = set()
    # Iterate through list of dfs and add index to vocab_set
    for d in list_of_dicts:
        for word in d.keys():
            vocab_set.add(word)
    
    # Once vocab set is complete, create an empty dictionary with a key for each word and value of 0.
    full_vocab_dict = {i:0 for i in vocab_set}
    
    # Loop through each word in full_vocab_dict
    for word, val in full_vocab_dict.items():
        docs = 0
        
        # Loop through list of dicts.  Each time a dictionary contains the word, increment docs by 1
        for d in list_of_dicts:
            if word in d:
                docs += 1
        
        # Now that we know denominator for equation, compute and set IDF value for word
        
        full_vocab_dict[word] = np.log((len(list_of_dicts)/ float(docs)))
    
    return full_vocab_dict

In [None]:
inverse_document_frequency(list_of_dictionaries)