In [106]:
from sklearn import naive_bayes
import nltk
import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [115]:
# load data file into a pandas data frame
# NOTE(review): the recorded preview shows an "Unnamed: 0" column, which hints
# that data.csv may carry its own header row; with names= given, pandas would
# read such a row as data -- confirm against the file.
df = pd.read_csv('data.csv', sep=',', names=['emotion', 'lyrics'])

# shuffle the rows and keep the result: DataFrame.sample returns a new frame,
# so without the assignment the shuffle is thrown away; the fixed seed makes
# the row order repeatable across kernel restarts
df = df.sample(frac=1, random_state=42)

# preview a handful of rows instead of rendering the whole frame
df.head(10)

Unnamed: 0,emotion,lyrics
399,sad,If you called and I didn't answer There's a c...
369,sad,Walking across the sitting-room I turn the te...
779,happy,I got these fresh eyes never seen you before ...
387,sad,You won't see me crying Crying over you And y...
1212,happy,I won't lie to you I know he's just not right...
180,sad,I met a girl last night had short blonde hair...
62,sad,I miss the old you The one that held me down ...
560,sad,I won't lie to you I know he's just not right...
1110,happy,You shout it out But I can't hear a word you ...
183,sad,Scott Storch Ooh so they think I wanna die y...


In [116]:
# process text data into TF-IDF vectors
stopset = set(stopwords.words('english'))

# stop_words is documented as 'english', a list, or None; recent scikit-learn
# releases validate the argument type and reject a set, so pass a list.
# Sorting keeps the stored parameter in a stable order between runs.
vectorizer = TfidfVectorizer(
    use_idf=True,
    lowercase=True,
    strip_accents='ascii',
    stop_words=sorted(stopset),
)

# labels: the emotion column; features: one TF-IDF row per lyrics entry
y = df.emotion
# NOTE(review): the vocabulary and IDF weights are fit on every row, including
# rows that a later cell holds out for testing, so the reported score may be
# somewhat optimistic -- consider fitting on the training rows only.
X = vectorizer.fit_transform(df.lyrics)
print(y.shape)
print(X.shape)

(1338,)
(1338, 11317)


In [117]:
# split into training and testing subsets and train the model

# hold out 20% of the rows for evaluation; the fixed random_state makes the
# partition (and therefore the score computed from it) reproducible on re-run
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# multinomial naive Bayes with library defaults, fit on the training rows only
classifier = naive_bayes.MultinomialNB()
classifier.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [118]:
# score the model on the held-out rows with ROC AUC (a ranking metric, not accuracy)
# predict_proba yields one column per class; column 1 is passed as the score,
# which roc_auc_score interprets as the probability of the greater label
positive_scores = classifier.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, positive_scores)

0.749371754062657

In [119]:
# persist both fitted objects: new text has to go through the same vectorizer
# before the classifier can score it
# NOTE(review): the '.pk1' suffix (digit one) looks like a typo for '.pkl';
# left unchanged because other code may load these exact file names -- confirm
joblib.dump(value=vectorizer, filename='vectorizer.pk1')
joblib.dump(value=classifier, filename='classifier.pk1')

['classifier.pk1']

In [120]:
# sanity test
# run one hand-picked lyric through the fitted vectorizer and classifier and
# print the predicted label; the text is wrapped in a one-element array because
# the vectorizer is given a collection of documents, not a bare string
st_array = np.array(["I heard that you're settled down That you found a girl and you're married now I heard that your dreams came true Guess she gave you things I didn't give to you  Old friend, why are you so shy? Ain't like you to hold back or hide from the light  I hate to turn up out of the blue, uninvited But I couldn't stay away, I couldn't fight it I had hoped you'd see my face and that you'd be reminded That for me, it isn't over  Never mind, I'll find someone like you I wish nothing but the best for you two Don't forget me, I beg, I'll remember you said Sometimes it lasts in love, but sometimes it hurts instead Sometimes it lasts in love, but sometimes it hurts instead, yeah  You know how the time flies Only yesterday was the time of our lives We were born and raised in a summer haze Bound by the surprise of our glory days  I hate to turn up out of the blue, uninvited But I couldn't stay away, I couldn't fight it I had hoped you'd see my face and that you'd be reminded That for me, it isn't over  Never mind, I'll find someone like you I wish nothing but the best for you two Don't forget me, I beg, I'll remember you said Sometimes it lasts in love, but sometimes it hurts instead Nothing compares, no worries or cares  Regrets and mistakes, they're memories made Who would have known how bittersweet this would taste? Never mind, I'll find someone like you  I wish nothing but the best for you Don't forget me, I beg, I'll remember you said Sometimes it lasts in love, but sometimes it hurts instead Never mind, I'll find someone like you I wish nothing but the best for you two Don't forget me, I beg, I'll remember you said Sometimes it lasts in love, but sometimes it hurts instead Sometimes it lasts in love, but sometimes it hurts instead, yeah"])
# transform (not fit_transform): reuse the vocabulary and weights learned earlier
st_vector = vectorizer.transform(st_array)
print(classifier.predict(st_vector))

['sad']
