In [1]:
import numpy as np
import pandas as pd
import pickle

from sklearn.model_selection import cross_validate

## CODE

In [2]:
def _load_pickle(path):
    """Open `path` in binary mode and return the unpickled object."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by a trusted source.

# Feature matrices, one per text representation (judging by the file names).
M_bow = _load_pickle('data/M_bow.pickle')
M_tfidf = _load_pickle('data/M_tfidf.pickle')
M_svd = _load_pickle('data/M_svd.pickle')
M_nmf = _load_pickle('data/M_nmf.pickle')
M_word2vec = _load_pickle('data/M_word2vec.pickle')

# Target passed as `y` to every cross-validation run in this notebook.
y = _load_pickle('data/sentiment.pickle')

### Models

In [3]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler

In [5]:
# Min-max scale the SVD and Word2Vec matrices (default feature range is [0, 1]);
# the scaled copies are the ones fed to MultinomialNB further down.
# NOTE(review): the scaler is fit on the full matrix before cross_validate
# splits it, so every test fold influences the scaling (mild data leakage).
scaler = MinMaxScaler()
M_svd_positive = scaler.fit(M_svd).transform(M_svd)
# The same scaler object is re-fit here, so afterwards it holds the Word2Vec fit.
M_word2vec_positive = scaler.fit(M_word2vec).transform(M_word2vec)

In [6]:
# Accuracy obtained by always predicting the majority class.
# Labels are assumed to be binary 0/1 (TODO confirm), so sum(y) / len(y) is the
# share of the positive class. Taking the max with its complement keeps the
# number a valid baseline even when the positive class is the minority.
positive_rate = sum(y) / len(y)
baseline = max(positive_rate, 1 - positive_rate)
print('Baseline accuracy: ', baseline)

Baseline accuracy:  0.914674872933972


### Multinomial Naive Bayes

In [7]:
# Display labels; index-aligned with the `embeddings` list below.
embeddings_names = [
    'BOW',
    'TFIDF',
    'SVD',
    'NMF',
    'Word2Vec',
]

# Inputs for MultinomialNB: SVD and Word2Vec use their min-max-scaled copies.
embeddings = [
    M_bow,
    M_tfidf,
    M_svd_positive,
    M_nmf,
    M_word2vec_positive,
]

In [8]:
naive = MultinomialNB()

# 3-fold cross-validation of the same estimator on each representation.
# Precision and recall are scored as well, but only mean accuracy is printed.
for label, features in zip(embeddings_names, embeddings):
    cv_results = cross_validate(
        naive,
        features,
        y,
        cv=3,
        scoring=('accuracy', 'precision', 'recall'),
    )
    print(label, '\t', cv_results['test_accuracy'].mean())

BOW 	 0.9452296163233482
TFIDF 	 0.9213210800244281
SVD 	 0.914674872933972
NMF 	 0.5592889490943588
Word2Vec 	 0.914674872933972


In [9]:
# Switch to the unscaled SVD / Word2Vec matrices for the models below.
# NOTE(review): this rebinds `embeddings`, so re-running the MultinomialNB cell
# after this one would use these matrices instead of the scaled ones.
embeddings = [
    M_bow,
    M_tfidf,
    M_svd,
    M_nmf,
    M_word2vec,
]

In [10]:
# Linear classifier trained with SGD using the estimator's default loss;
# the seed is fixed so repeated runs give the same result.
sgd = SGDClassifier(random_state=9)

# 3-fold cross-validation per representation; mean test accuracy is reported.
for label, features in zip(embeddings_names, embeddings):
    cv_results = cross_validate(
        sgd,
        features,
        y,
        cv=3,
        scoring=('accuracy', 'precision', 'recall'),
    )
    print(label, '\t', cv_results['test_accuracy'].mean())

BOW 	 0.9569507518789084
TFIDF 	 0.9467272741295741
SVD 	 0.8808934342148901
NMF 	 0.914674872933972
Word2Vec 	 0.9516185011221916


In [None]:
# Logistic-regression objective trained with SGD and an elastic-net penalty.
# 'log_loss' is the current name of this loss: the former alias 'log' was
# deprecated in scikit-learn 1.1 and removed in 1.3, where it raises an error.
sgd = SGDClassifier(loss='log_loss', penalty='elasticnet', random_state=9)

# 3-fold cross-validation per representation; mean test accuracy is reported.
for label, features in zip(embeddings_names, embeddings):
    cv_results = cross_validate(
        sgd,
        features,
        y,
        cv=3,
        scoring=('accuracy', 'precision', 'recall'),
    )
    print(label, '\t', np.mean(cv_results['test_accuracy']))

BOW 	 0.9533033661541955
TFIDF 	 0.9381332705102343


In [38]:
# Logistic regression with the iteration cap set to 500 and a fixed seed.
logreg = LogisticRegression(max_iter=500, random_state=9)

# 3-fold cross-validation per representation; mean test accuracy is reported.
for label, features in zip(embeddings_names, embeddings):
    cv_results = cross_validate(
        logreg,
        features,
        y,
        cv=3,
        scoring=('accuracy', 'precision', 'recall'),
    )
    print(label, '\t', cv_results['test_accuracy'].mean())

BOW 	 0.9469999789421054
TFIDF 	 0.940000028797121
SVD 	 0.9351000187001303
NMF 	 0.9354999987101289
Word2Vec 	 0.9435001388561172


In [46]:
from xgboost import XGBClassifier

# Gradient-boosted trees with library defaults and a fixed seed.
model = XGBClassifier(random_state=9)

# 3-fold cross-validation per representation; mean test accuracy is reported.
for label, features in zip(embeddings_names, embeddings):
    cv_results = cross_validate(
        model,
        features,
        y,
        cv=3,
        scoring=('accuracy', 'precision', 'recall'),
    )
    print(label, '\t', cv_results['test_accuracy'].mean())

BOW 	 0.9399997888211136
TFIDF 	 0.9482002189421102
SVD 	 0.9297001085831438
NMF 	 0.9303004385621526
Word2Vec 	 0.9432001688471187
