## Model training/2

### We use the stored files in the 'dataset' directory as our corpus

In [1]:
import os
import nltk
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

# Root folder of the lyrics corpus.
corpusdir = 'dataset/'
# The '.*' pattern lets the reader pick up every file id found under the root.
song_lyrics = PlaintextCorpusReader(root=corpusdir, fileids='.*')

In [2]:
import random

# A document is assigned to a class when the class name occurs in its file id.
categories = ['happy', 'sad']
documents = []

for fileid in song_lyrics.fileids():
    for category in categories:
        if category not in fileid:
            continue
        # Binary target: 1 for 'happy', 0 for any other category.
        # NOTE(review): a file id containing both category names is appended
        # once per matching category, i.e. with both labels.
        label = 1 if category == 'happy' else 0
        documents.append((list(song_lyrics.words(fileid)), label))

# Fixed seed so the shuffled order is the same on every run.
random.seed(12345)
random.shuffle(documents)



In [3]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer
import re

def hasNumbers(inputString):
    """Return True when ``inputString`` contains at least one digit, else False."""
    digit_match = re.search(r'\d', inputString)
    return digit_match is not None

# Tokenizer and stemmer are built on first use and shared by every later call,
# so they are not reconstructed for each document.
_PREPROCESSING_TOOLS = {}


# This function does all the textual preprocessing steps
def preprocessing(string_to_process):
    """Normalize, tokenize, filter and stem a piece of text.

    Parameters
    ----------
    string_to_process : str
        Raw text of one document.

    Returns
    -------
    list of str
        The stemmed tokens, in the order they appear in the text.
    """
    if not _PREPROCESSING_TOOLS:
        # update() receives both objects already constructed, so a failure while
        # building either one leaves the cache empty and the next call retries.
        _PREPROCESSING_TOOLS.update(
            tokenizer=RegexpTokenizer(r'\w+'),
            stemmer=SnowballStemmer("english", ignore_stopwords=True),
        )
    tokenizer = _PREPROCESSING_TOOLS['tokenizer']
    stemmer = _PREPROCESSING_TOOLS['stemmer']

    # Lower-case and drop underscores up front: '\w' matches '_', so the
    # tokenizer alone would keep them inside tokens.
    normalized_string = string_to_process.lower().replace("_", "")

    # 1. Tokenize it! Only runs of word characters survive, which drops punctuation.
    tokens = tokenizer.tokenize(normalized_string)

    # 2. Stop words are not removed here; that step is deliberately disabled.

    # 3. Remove every token that contains a digit.
    filtered_tokens = [word for word in tokens if not hasNumbers(word)]

    # 4. Stem it!
    return [stemmer.stem(word) for word in filtered_tokens]
    



In [4]:
preprocessed_documents = list()

# enumerate() supplies the position directly. Looking it up with
# documents.index(document) rescans the list on every iteration and returns the
# FIRST equal entry, so duplicate documents report the wrong position and a
# progress line can be skipped or repeated.
for i, document in enumerate(documents):
    if (i % 200 == 0):
        print("Document %s " % i)
    # document is a (tokens, label) pair; the tokens are re-joined into a single
    # string because preprocessing() expects raw text.
    document_text = ' '.join(document[0])
    preprocessed_text = preprocessing(document_text)
    preprocessed_documents.append((preprocessed_text, document[1]))
print("done!")

Document 0 
Document 200 
Document 600 
Document 800 
Document 1000 
Document 1200 
Document 1400 
Document 1600 
Document 1800 
Document 2000 
Document 2200 
Document 2400 
Document 2600 
done!


### For feature extraction, we use TF-IDF

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TfidfVectorizer expects one string per document, so the stemmed tokens are
# re-joined with spaces; labels keep the same order as corpus.
corpus = [" ".join(doc[0]) for doc in preprocessed_documents]
labels = [doc[1] for doc in preprocessed_documents]
print(len(corpus))
print(len(labels))

# Unigrams and bigrams.
vectorizer = TfidfVectorizer(ngram_range=(1,2))
# toarray() yields a plain ndarray. todense() yields the deprecated np.matrix
# type, which recent scikit-learn releases refuse as estimator input.
# NOTE: this materializes the full documents x features matrix in memory.
tfidf_matrix = vectorizer.fit_transform(corpus).toarray()
# get_feature_names() no longer exists in recent scikit-learn releases; use its
# replacement when present and keep the result a list, as before.
if hasattr(vectorizer, "get_feature_names_out"):
    tfidf_names = list(vectorizer.get_feature_names_out())
else:
    tfidf_names = vectorizer.get_feature_names()
print("Number of TFIDF Features: %d"%len(tfidf_names)) #same info can be gathered by using tfidf_matrix.shape
variables = tfidf_matrix

2800
2800
Number of TFIDF Features: 149637


### We apply chi-square feature selection to reduce the number of features, keeping the 10000 best ones

In [None]:
import pickle

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

print("I'm selecting k best")
# Keep the 10000 features with the highest chi-square score against the labels.
selector = SelectKBest(chi2, k=10000)

print("fitting model")
# NOTE(review): the selector is fitted on all rows, including the ones that are
# held out as the test split in a later cell.
selector = selector.fit(variables, labels)

print("model fitted")

# Persist the fitted vectorizer and selector. The with-blocks close the files
# even if pickling fails.
with open("vectorizer10000features2.pickle", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)
with open("selector10000features2.pickle", "wb") as selector_file:
    pickle.dump(selector, selector_file)
# Reported only once fitting and saving have actually happened.
print("k best selected and saved")

# transform() takes only the feature matrix; the labels are needed by fit() alone.
X_new = selector.transform(variables)



I'm selecting k best
k best selected and saved
fitting model


In [7]:
try:
    # Original import location; kept first so environments where it exists
    # behave exactly as before.
    from sklearn.cross_validation import train_test_split
except ImportError:
    # The cross_validation module was removed in scikit-learn 0.20; the same
    # function lives in model_selection.
    from sklearn.model_selection import train_test_split

print("splitting")
# 80/20 split with a fixed seed so the partition is reproducible.
variables_train, variables_test, labels_train, labels_test = train_test_split(
    X_new, labels, test_size=0.2, random_state=42)

splitting


### We save the numpy arrays containing variables/labels train/test in order to load them later

In [11]:
import numpy as np

# Write the four split arrays to disk so a later session can load them.
for npy_path, split_array in (
        ('variables_train_10000.npy', variables_train),
        ('variables_test_10000.npy', variables_test),
        ('labels_train_10000.npy', labels_train),
        ('labels_test_10000.npy', labels_test),
):
    np.save(npy_path, split_array)

# To restore the arrays from disk, uncomment:
#variables_train = np.load('variables_train_10000.npy')
#variables_test = np.load('variables_test_10000.npy')
#labels_train = np.load('labels_train_10000.npy')
#labels_test = np.load('labels_test_10000.npy')

train_shape = variables_train.shape
test_shape = variables_test.shape
print('Shape of Training Data: ' + str(train_shape))
print('Shape of Test Data: ' + str(test_shape))

Shape of Training Data: (2240, 10000)
Shape of Test Data: (560, 10000)


In [12]:
import sklearn.metrics
from sklearn.metrics import classification_report
from sklearn.naive_bayes import BernoulliNB

try:
    # Original module; kept first so environments where it exists behave
    # exactly as before.
    from sklearn import cross_validation
except ImportError:
    # The module was removed in scikit-learn 0.20. model_selection offers
    # cross_val_score with the same call shape, so it is bound to the old name
    # and the cells that call cross_validation.cross_val_score keep working.
    from sklearn import model_selection as cross_validation


# Fit on the training split and score on the held-out split.
bnb_classifier=BernoulliNB()
bnb_classifier=bnb_classifier.fit(variables_train,labels_train)
bnb_predictions=bnb_classifier.predict(variables_test)
nb_ascore=sklearn.metrics.accuracy_score(labels_test, bnb_predictions)
print("Bernoulli Naive Bayes Accuracy Score: %f" %nb_ascore)
print("Classification Metrics: ")
print(sklearn.metrics.classification_report(labels_test,bnb_predictions))
# 10-fold cross-validation on the training split; the spread printed is two
# standard deviations of the fold scores.
cv_scores = cross_validation.cross_val_score(bnb_classifier, variables_train, labels_train, cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (cv_scores.mean(), cv_scores.std() * 2))



Bernoulli Naive Bayes Accuracy Score: 0.887500
Classification Metrics: 
             precision    recall  f1-score   support

          0       0.88      0.90      0.89       285
          1       0.90      0.87      0.88       275

avg / total       0.89      0.89      0.89       560

Accuracy: 0.87 (+/- 0.06)


In [11]:
# Persist the fitted Bernoulli Naive Bayes model. The with-block closes the
# file even if pickling fails.
filename = 'finalized_model_BernoulliNB_10000.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(bnb_classifier, model_file)

In [20]:
from sklearn.naive_bayes import MultinomialNB

# Train on the training split, then predict the held-out split.
mn_bayes = MultinomialNB()
mn_bayes_fit = mn_bayes.fit(variables_train, labels_train)
prediction_mn = mn_bayes_fit.predict(variables_test)

# Hold-out accuracy plus the per-class report.
mn_ascore = sklearn.metrics.accuracy_score(labels_test, prediction_mn)
print("Multinomial Naive Bayes Accuracy Score: %f" % mn_ascore)
print("Classification Metrics: ")
print(sklearn.metrics.classification_report(labels_test, prediction_mn))

# 10-fold cross-validation on the training split; the spread is two standard
# deviations of the fold scores.
cv_scores = cross_validation.cross_val_score(
    mn_bayes, variables_train, labels_train, cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (cv_scores.mean(), cv_scores.std() * 2))

# Saving this model is currently disabled:
#filename2 = 'finalized_model_MultinomialNB_10000.sav'
#pickle.dump(mn_bayes, open(filename2, 'wb'))

Multinomial Naive Bayes Accuracy Score: 0.844643
Classification Metrics: 
             precision    recall  f1-score   support

          0       0.80      0.92      0.86       285
          1       0.91      0.76      0.83       275

avg / total       0.85      0.84      0.84       560

Accuracy: 0.81 (+/- 0.07)


In [22]:
from sklearn.ensemble import RandomForestClassifier

# random_state pins the forest's randomness so the scores below are
# reproducible from run to run (same seed value as the train/test split).
rf_classifier=RandomForestClassifier(n_estimators=50, random_state=42)
rf_classifier=rf_classifier.fit(variables_train,labels_train)
rf_predictions=rf_classifier.predict(variables_test)
acc=sklearn.metrics.accuracy_score(labels_test, rf_predictions)
print ("Accuracy Score of Random Forests Classifier: %f" %acc)
print("Classification Metrics: ")
print(sklearn.metrics.classification_report(labels_test,rf_predictions))
# 10-fold cross-validation on the training split; the spread printed is two
# standard deviations of the fold scores.
cv_scores = cross_validation.cross_val_score(rf_classifier, variables_train, labels_train, cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (cv_scores.mean(), cv_scores.std() * 2))
# Saving this model is currently disabled:
#filename2 = 'finalized_model_RandomForest_10000.sav'
#pickle.dump(rf_classifier, open(filename2, 'wb'))

Accuracy Score of Random Forests Classifier: 0.732143
Classification Metrics: 
             precision    recall  f1-score   support

          0       0.73      0.74      0.74       285
          1       0.73      0.72      0.73       275

avg / total       0.73      0.73      0.73       560

Accuracy: 0.71 (+/- 0.05)


In [14]:
from sklearn import linear_model

# Linear SVM (hinge loss) trained with stochastic gradient descent.
# random_state pins the optimizer's randomness so the scores are reproducible.
svm_classifier=linear_model.SGDClassifier(loss='hinge',alpha=0.0001,random_state=42)
svm_classifier=svm_classifier.fit(variables_train, labels_train)
svm_predictions=svm_classifier.predict(variables_test)
acc=sklearn.metrics.accuracy_score(labels_test, svm_predictions)
# The label now names the model actually evaluated here; it previously read
# "Random Forests Classifier".
print ("Accuracy Score of Linear SVM (SGD) Classifier: %f" %acc)
print("Classification Metrics: ")
print(sklearn.metrics.classification_report(labels_test,svm_predictions))
# 10-fold cross-validation on the training split; the spread printed is two
# standard deviations of the fold scores.
cv_scores = cross_validation.cross_val_score(svm_classifier, variables_train, labels_train, cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (cv_scores.mean(), cv_scores.std() * 2))
# Persist the fitted model. The with-block closes the file even if pickling fails.
filename2 = 'finalized_model_SVM_10000.sav'
with open(filename2, 'wb') as model_file:
    pickle.dump(svm_classifier, model_file)

Accuracy Score of Random Forests Classifier: 0.798214
Classification Metrics: 
             precision    recall  f1-score   support

          0       0.73      0.96      0.83       285
          1       0.95      0.63      0.75       275

avg / total       0.83      0.80      0.79       560

Accuracy: 0.73 (+/- 0.16)


In [15]:
# Same SGD-trained linear SVM, but with an elastic-net penalty.
# random_state pins the optimizer's randomness so the scores are reproducible.
svm_classifier_enet=linear_model.SGDClassifier(loss='hinge',alpha=0.0001,penalty='elasticnet',random_state=42)
svm_classifier_enet=svm_classifier_enet.fit(variables_train, labels_train)
svm_enet_predictions=svm_classifier_enet.predict(variables_test)
print ("Accuracy Score of Linear SVM Classifier: %f"%sklearn.metrics.accuracy_score(labels_test,svm_enet_predictions))
print("Classification Metrics: ")
print(sklearn.metrics.classification_report(labels_test,svm_enet_predictions))
# 10-fold cross-validation on the training split; the spread printed is two
# standard deviations of the fold scores.
cv_scores = cross_validation.cross_val_score(svm_classifier_enet, variables_train, labels_train, cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (cv_scores.mean(), cv_scores.std() * 2))
# Persist the fitted model. The with-block closes the file even if pickling fails.
filename2 = 'finalized_model_SVMenet_10000.sav'
with open(filename2, 'wb') as model_file:
    pickle.dump(svm_classifier_enet, model_file)


Accuracy Score of Linear SVM Classifier: 0.812500
Classification Metrics: 
             precision    recall  f1-score   support

          0       0.85      0.77      0.81       285
          1       0.78      0.85      0.82       275

avg / total       0.82      0.81      0.81       560

Accuracy: 0.71 (+/- 0.15)


In [16]:
from sklearn.svm import SVC

# RBF-kernel SVM with fixed C and gamma.
cl = SVC(C=100.0, gamma=0.01, kernel='rbf')
cl.fit(variables_train, labels_train)
pred=cl.predict(variables_test)
print ("Accuracy Score of SVM Classifier: %f"%sklearn.metrics.accuracy_score(labels_test,pred))
print("Classification Metrics: ")
print(sklearn.metrics.classification_report(labels_test,pred))
# 10-fold cross-validation on the training split; the spread printed is two
# standard deviations of the fold scores.
cv_scores = cross_validation.cross_val_score(cl, variables_train, labels_train, cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (cv_scores.mean(), cv_scores.std() * 2))
# Persist the fitted model. The with-block closes the file even if pickling fails.
filename2 = 'finalized_model_SVC_10000.sav'
with open(filename2, 'wb') as model_file:
    pickle.dump(cl, model_file)

Accuracy Score of SVM Classifier: 0.839286
Classification Metrics: 
             precision    recall  f1-score   support

          0       0.85      0.83      0.84       285
          1       0.83      0.85      0.84       275

avg / total       0.84      0.84      0.84       560

Accuracy: 0.81 (+/- 0.05)


In [56]:
try:
    # Original module; kept first so environments where it exists behave
    # exactly as before.
    from sklearn import grid_search
except ImportError:
    # The module was removed in scikit-learn 0.20; GridSearchCV lives in
    # model_selection, bound here to the old name.
    from sklearn import model_selection as grid_search

# Exhaustive search over C and gamma for an RBF-kernel SVM.
parameters = {'kernel': ['rbf'], 'C': [0.1, 1.0, 10.0, 100.0], 'gamma': [10, 1, 0.1, 0.01, 0.001]}
# NOTE: this rebinds `cl`, which an earlier cell uses for the fitted SVC.
cl = SVC()
clf = grid_search.GridSearchCV(cl, parameters)
clf.fit(variables_train, labels_train)
print(clf.best_params_)


{'kernel': 'rbf', 'gamma': 0.01, 'C': 100.0}
