## Import Libraries

In [5]:
import pandas as pd
import re
import spacy

import pandas as pd
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.manifold import TSNE

from nltk.tokenize import word_tokenize
import nltk
from nltk.stem import SnowballStemmer
from sklearn.model_selection import train_test_split
import keras
from sklearn.metrics import f1_score


Using TensorFlow backend.


In [None]:
pip install tensorflow


In [None]:
# pip install keras


In [None]:
df = pd.read_csv('lyrics.csv', index_col='index')


In [None]:
#let's look at genre value counts

df.genre.value_counts()

In [None]:
#filter DF to the four genres of interest
genre_list = ['Rock', 'Pop', 'Country', 'Hip-Hop']

#isin() keeps the filter in sync with genre_list; .copy() makes df1 an
#independent frame so later column assignments do not act on a slice of df
df1 = df.loc[df['genre'].isin(genre_list)].copy()

In [None]:
#check the shape of the new dataframe
df1.shape

In [None]:
df1.isna().sum()

In [None]:
#drop rows with missing values; rebinding instead of inplace=True keeps the
#cell idempotent (df1 was derived by filtering df, and in-place edits on such
#a frame can trigger SettingWithCopyWarning)
df1 = df1.dropna()


In [None]:
df1.shape

## Cleaning Database

In [None]:
song = df1.lyrics[1]
song

In [None]:
#create a function that cleans and tokenizes lyrics

def clean_tokenize_lyrics(song):
    """Tokenize a lyric string and return its alphabetic tokens, lowercased.

    The text is split with word_tokenize; any token for which str.isalpha()
    is false is discarded, and the remaining tokens are lowercased in their
    original order.
    """
    return [token.lower() for token in word_tokenize(song) if token.isalpha()]

In [None]:
# clean_tokenize_lyrics(song)

In [None]:
#tokenize and lowercase the lyrics of every song
df1['lyrics'] = df1['lyrics'].apply(clean_tokenize_lyrics)


### Stem Clean Lyrics

In [None]:
stemmer = SnowballStemmer('english')


In [None]:
#each row of 'lyrics' is a list of tokens, while stem() handles one word at a
#time, so stem token by token instead of passing the whole list
df1['cleaned_lyrics'] = df1['lyrics'].apply(
    lambda tokens: [stemmer.stem(token) for token in tokens]
)

In [None]:
df1['cleaned_lyrics']

In [None]:
#remap genres to numerical values
genre_dict = {'Rock':1, 'Pop':2, 'Hip-Hop': 3, 'Country':4}

df1['genre'] = df1['genre'].replace(genre_dict)

In [None]:
df1['genre'].value_counts()

### Save Clean Database as csv file

In [6]:
# df1.to_csv('clean_lyric_df.csv')
# Reload the cleaned frame from disk, using the file's first column as the index.
# NOTE(review): if this CSV was produced by the to_csv call above, the
# list-valued columns ('lyrics', 'cleaned_lyrics') presumably come back as
# plain strings - confirm before passing rows to code that iterates over tokens.
df1 = pd.read_csv('clean_lyric_df.csv', index_col=0)

## NLP Preprocessing

### Vectorize Words

In [None]:
#create function to vectorize song

def count_vectorize(song, vocab=None):
    """Build a bag-of-words count dictionary for a single song.

    Parameters
    ----------
    song : sequence of hashable
        Tokens of the song. It is iterated twice when no vocab is given,
        so it must be re-iterable (e.g. a list).
    vocab : sequence of hashable, optional
        Fixed vocabulary to count against. When omitted (or empty), the
        vocabulary is the set of distinct tokens in ``song``.

    Returns
    -------
    dict
        Maps every vocabulary word to its number of occurrences in ``song``.
        Keys follow vocab order, or first-occurrence order when the
        vocabulary is derived from the song.
    """
    # An empty vocab falls back to the song's own words, as it always did.
    unique_words = vocab if vocab else song

    # dict.fromkeys de-duplicates while keeping a deterministic key order.
    song_dict = dict.fromkeys(unique_words, 0)

    for word in song:
        # Tokens outside a supplied vocab are ignored rather than raising KeyError.
        if word in song_dict:
            song_dict[word] += 1

    return song_dict

In [None]:
BoW = count_vectorize(df1['lyrics'][0])

### Term Frequency

In [None]:
#create term frequency function
def term_frequency(BoW_dict):
    """Convert raw word counts into relative term frequencies.

    Parameters
    ----------
    BoW_dict : dict
        Maps each word to its count in one document. It is not modified.

    Returns
    -------
    dict
        A new dictionary mapping each word to count / total count. When the
        total count is zero (empty input or all-zero counts) every word maps
        to 0.0 instead of raising ZeroDivisionError.
    """
    total_word_count = sum(BoW_dict.values())

    # No tokens counted at all: there is nothing to normalise by.
    if total_word_count == 0:
        return {word: 0.0 for word in BoW_dict}

    # Build a fresh dict so the caller's raw counts stay usable afterwards.
    return {word: count / total_word_count for word, count in BoW_dict.items()}

In [None]:
#create a list of dictionaries
def create_list_of_BoW(song_lyrics):
    """Return one bag-of-words dictionary per song, in input order.

    Every element of song_lyrics is passed to count_vectorize without a
    vocab, so each dictionary only contains the words of its own song.
    """
    return [count_vectorize(lyrics) for lyrics in song_lyrics]

list_of_dictionaries = create_list_of_BoW(df1['lyrics'])

### Inverse Document Frequency

In [None]:
def inverse_document_frequency(list_of_dicts):
    """Compute the inverse document frequency of every word in a corpus.

    Parameters
    ----------
    list_of_dicts : list of dict
        One bag-of-words dictionary per document. A word counts as present
        in a document when it is a key of that document's dictionary,
        whatever the stored count is.

    Returns
    -------
    dict
        Maps each word found in any document to
        log(number of documents / number of documents containing the word).
        An empty corpus yields an empty dictionary.
    """
    # Single pass over the corpus: tally how many documents contain each word,
    # instead of rescanning every document once per vocabulary word.
    doc_counts = {}
    for bow in list_of_dicts:
        for word in bow:
            doc_counts[word] = doc_counts.get(word, 0) + 1

    n_docs = len(list_of_dicts)

    # Every tallied word appears in at least one document, so docs is never 0.
    return {word: np.log(n_docs / float(docs)) for word, docs in doc_counts.items()}

In [None]:
inverse_document_frequency(list_of_dictionaries)

## Let's Start Modeling!!

### Importing sklearn multi-class models

In [7]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report


In [8]:
#define features and target variables

features = df1['cleaned_lyrics']
target = df1['genre']

In [6]:
#train test split dataset 
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=.25, random_state=0)

NameError: name 'features' is not defined

In [4]:
#instantiate pipeline model
pipeline_model = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])

In [5]:
#fit training set onto model
pipeline_model.fit(X_train, y_train)


NameError: name 'X_train' is not defined

In [None]:
y_pred = pipeline_model.predict(X_test)

In [None]:
#scikit-learn metrics take (y_true, y_pred); the genre target has more than
#two classes, so f1_score needs an explicit multiclass average
accuracy = accuracy_score(y_test, y_pred)
weighted_f1 = f1_score(y_test, y_pred, average='weighted')

#show both scores as the cell output
accuracy, weighted_f1

### Train Test Split (non-stemmed lyrics)

In [13]:
#let's use the non-stemmed lyrics as features

features_nonstemmed = df1['lyrics']
target = df1['genre']
#train test split dataset 
X_train, X_test, y_train, y_test = train_test_split(features_nonstemmed, target, test_size=.25, random_state=0)

In [14]:
#instantiate pipeline model
pipeline_model = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])

In [15]:
pipeline_model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [17]:
y_pred1 = pipeline_model.predict(X_test)
accuracy_score(y_pred1, y_test)


0.6282417698740341

### Support Vector Machine

In [None]:
from sklearn.linear_model import SGDClassifier

sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])
sgd.fit(X_train, y_train)

In [None]:
y_pred2 = sgd.predict(X_test)
accuracy_score(y_pred2, y_test)

### Logistic Regression

In [19]:
from sklearn.linear_model import LogisticRegression

logreg = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=-1, C=1e5)),
               ])
logreg.fit(X_train, y_train)

  " = {}.".format(effective_n_jobs(self.n_jobs)))


Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 LogisticRegression(C=100000.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=

In [20]:
y_pred3 = logreg.predict(X_test)
accuracy_score(y_pred3, y_test)


0.6923467767545253

In [23]:
f1_score(y_pred3, y_test, average='weighted')

0.6992354442433582

### Classification with Word Embeddings

In [None]:
# pip install gensim

In [None]:
from gensim.models import Word2Vec