### Reading the dataset

In [9]:
import pandas as pd

# Load the training split and the validation split, then stack them row-wise
# into a single frame that the rest of the notebook uses as `df`.
df = pd.read_csv('train_lyrics_1000.csv')
df2 = pd.read_csv('valid_lyrics_200.csv')

# The recorded output of this cell carries pandas' "will change to not sort by
# default" FutureWarning, which pandas only raises when the frames' columns
# are not aligned. sort=True keeps the alphabetically sorted column order shown
# in the output on every pandas version and silences the warning.
# Row labels are kept from each file, so labels repeat across the two splits
# (the tail below shows 195-199, i.e. labels from the validation file).
df = pd.concat([df, df2], sort=True)
df.tail()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """


Unnamed: 0,artist,file,genre,lyrics,mood,title,year
195,Prince,TRAKQEA128F1495E21.h5,Rock,{B-side of Glam Slam}\nSnare drum pounds on th...,happy,Escape ( LP Version),
196,Cavo,TRAKQLN128F932AC25.h5,Rock,Well I will rise\nThe morning comes\nNothing e...,sad,Over Again (Album Version),
197,AFI,TRAKQXJ128F147A028.h5,Rock,"Listen when I say, when I say it's real\nReal ...",happy,Summer Shudder,
198,Vitamin C,TRAKRQW128F427D6E3.h5,Pop,"Imagine a world where the girls, girls rule th...",happy,Girls Against Boys (LP Version),
199,Richard Burton,TRAKSRQ128F4269AE8.h5,Jazz,"Each evening, from December to December\nBefor...",happy,Camelot,


<br>
<br>

### Label Encoder

In [10]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Features are the raw lyric strings; targets are the mood strings.
X_train = df['lyrics'].values
mood_labels = df['mood'].values

print('before: %s ...' %mood_labels[:5])

# Fit the encoder on the string labels, then replace them with integer codes.
# `le` is kept so later cells can persist it and map codes back to labels.
le = LabelEncoder().fit(mood_labels)
y_train = le.transform(mood_labels)

print('after: %s ...' %y_train[:5])

before: ['sad' 'happy' 'sad' 'happy' 'sad'] ...
after: [1 0 1 0 1] ...


<br>
<br>

### Porter Stemmer

In [11]:
# Porter Stemmer

import nltk
import string
import re

porter_stemmer = nltk.stem.porter.PorterStemmer()

def porter_tokenizer(text, stemmer=porter_stemmer):
    """
    Split a text into lower-cased word tokens and stem each token.

    The text is lower-cased, tokenized with ``nltk.wordpunct_tokenize``, each
    token is passed through ``stemmer.stem``, and only stems that consist
    entirely of ASCII letters are kept. Punctuation-only tokens and tokens
    containing digits or other characters are therefore dropped.
    Single-letter stems are kept.

    Parameters
    ----------

    text : `str`.
      The text that is to be split into words.

    stemmer : object with a ``stem(token)`` method, optional.
      Stemmer applied to every token. Defaults to the module-level
      ``porter_stemmer``.

    Returns
    ----------

    no_punct : `list` of `str`.
      The stemmed tokens, in order of appearance, with non-alphabetic
      stems removed.

    """
    lower_txt = text.lower()
    tokens = nltk.wordpunct_tokenize(lower_txt)
    # Use the stemmer supplied by the caller; the global `porter_stemmer` is
    # only the default value of the parameter.
    stems = [stemmer.stem(t) for t in tokens]
    # Keep a stem only if every character is an ASCII letter.
    no_punct = [s for s in stems if re.match('^[a-zA-Z]+$', s) is not None]
    return no_punct


In [12]:
# Commented out to prevent overwriting files:
#
# stp = nltk.corpus.stopwords.words('english')
# with open('./stopwords_eng.txt', 'w') as outfile:
#    outfile.write('\n'.join(stp))
    
    
# Read the stop-word file in one go and split it into one entry per line.
with open('./stopwords_eng.txt') as infile:
    raw_stop_words = infile.read()
stop_words = raw_stop_words.splitlines()

# Show the first few entries as a quick sanity check.
print('stop words %s ...' %stop_words[:5])

stop words ['i', 'me', 'my', 'myself', 'we'] ...


## Vectorizer

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

import re

# Disabled alternative, kept for reference only. It replaces every non-letter
# character with a space in a custom preprocessor and tokenizes on whitespace:
#
# countv = CountVectorizer(
#     binary=False,
#     decode_error="replace",
#     stop_words=stop_words,
#     preprocessor=lambda text: re.sub('[^a-zA-Z]', ' ', text.lower()),
#     ngram_range=(1, 1),
#     tokenizer=lambda text: text.split(),
# )

# Active configuration: scikit-learn's default preprocessing and tokenization,
# with the custom stop-word list removed from the vocabulary.
countv = CountVectorizer(
    binary=False,             # keep term counts rather than 0/1 presence flags
    decode_error="replace",   # only relevant if byte input cannot be decoded
    stop_words=stop_words,    # list loaded from ./stopwords_eng.txt
    ngram_range=(1, 1),       # unigrams only
)


In [14]:
# Learn the vocabulary and build the document-term matrix in a single pass
# over the lyrics instead of tokenizing the corpus once for fit() and again
# for transform(). fit_transform() fits `countv` in place, so the fitted
# vectorizer stays available under the same name for the cells below.
X_train_countv = countv.fit_transform(X_train)

In [15]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes on the term-count matrix.
# alpha=1.0 is the additive smoothing strength; fit_prior=False tells the
# model not to learn class priors from the label frequencies.
clf_countv = MultinomialNB(alpha=1.0, fit_prior=False).fit(
    X_train_countv,
    y_train,
)

In [16]:
#%matplotlib inline
#import matplotlib.pyplot as plt
#import seaborn as sns
from sklearn import metrics
import numpy as np
#import matplotlib as mpl

# Confusion matrix computed on the same samples the classifier was fitted on,
# so it reflects training performance, not performance on held-out data.
train_predictions = clf_countv.predict(X_train_countv)
cm = metrics.confusion_matrix(y_train, train_predictions)

print(cm)


[[533  18]
 [ 37 612]]


In [17]:
import shelve
import dill

# Persist the fitted label encoder, vectorizer and classifier in one shelf.
# The context manager closes the shelf even if storing one of the objects
# raises, so the underlying database file is never left open.
with shelve.open('countv_clf') as d:
    d['label_encoder'] = le
    d['lyrics_countv'] = countv
    d['lyrics_clf'] = clf_countv

In [18]:
import dill


# Serialize each fitted object to its own file with dill.
# `with` opens the file before the guarded body and always closes it, so a
# failing open() surfaces its own error instead of running a cleanup that
# closes a stale or undefined handle.
# NOTE: the pickle cell that follows writes to these same three paths, so
# the dill output is overwritten when the notebook is run top to bottom.
for path, obj in (
    ('label_encoder.p', le),
    ('countv.p', countv),
    ('clf_countv.p', clf_countv),
):
    with open(path, 'wb') as outfile:
        dill.dump(obj, outfile)
    

In [19]:
import pickle


# Serialize each fitted object to its own file with pickle.
# `with` opens the file before the guarded body and always closes it, so a
# failing open() surfaces its own error instead of running a cleanup that
# closes a stale or undefined handle.
# NOTE: these are the same three paths the dill cell above writes to, so
# after a full run the files on disk are the pickle versions.
for path, obj in (
    ('label_encoder.p', le),
    ('countv.p', countv),
    ('clf_countv.p', clf_countv),
):
    with open(path, 'wb') as outfile:
        pickle.dump(obj, outfile)