In [1]:
import numpy as np
import pandas as pd
import pickle

from sklearn.model_selection import cross_validate

import functions as f



In [9]:
#####
def load_pickle(path):
    with open(path, 'rb') as handle:
        return pickle.load(handle)

def model_cv(model, embeddings, y):
    results = []
    for i in range(len(embeddings)):
        cv_results = cross_validate(model,
                                    embeddings[i],
                                    y,
                                    cv=5,
                                    scoring=('accuracy', 'precision', 'recall', 'f1'))
        results.append([np.mean(cv_results['test_accuracy']),
                        np.mean(cv_results['test_precision']),
                        np.mean(cv_results['test_recall']),
                        np.mean(cv_results['test_f1'])])
    return np.stack(results)

def df_model_cv(model_cv, embeddings_names, results_names):
    return pd.DataFrame(model_cv, index=embeddings_names, columns=results_names)

## CODE

In [3]:
M_bow = f.load_pickle('data/M_bow_100k.pickle')
M_tfidf = f.load_pickle('data/M_tfidf_100k.pickle')
M_svd = f.load_pickle('data/M_svd_100k.pickle')
M_nmf = f.load_pickle('data/M_nmf_100k.pickle')
M_word2vec = f.load_pickle('data/M_word2vec_100k.pickle')
y = f.load_pickle('data/sentiment_100k.pickle')

## Models

class imbalanceThere is visible class imbalance in the dataset: positive reviews 

In [4]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler

In [5]:
scaler = MinMaxScaler()
M_svd_positive = scaler.fit_transform(M_svd)
M_word2vec_positive = scaler.fit_transform(M_word2vec)

In [6]:
baseline = sum(y)/len(y)
print('Baseline accuracy: ', baseline)

Baseline accuracy:  0.9148


In [10]:
embeddings_names = ['BOW', 'TFIDF', 'SVD', 'NMF', 'Word2Vec']
embeddings = [M_bow, M_tfidf, M_svd, M_nmf, M_word2vec]
embeddings_positive = [M_bow, M_tfidf, M_svd_positive, M_nmf, M_word2vec_positive]
results_names = ['test_acc', 'precision', 'recall', 'f1']

### Multinomial Naive Bayes

In [11]:
naive = MultinomialNB()

naive_cv = f.model_cv(naive, embeddings_positive, y)
f.df_model_cv(naive_cv, embeddings_names, results_names)

Unnamed: 0,test_acc,precision,recall,f1
BOW,0.94622,0.96817,0.973207,0.970682
TFIDF,0.93397,0.934816,0.997366,0.965078
SVD,0.9148,0.9148,1.0,0.955504
NMF,0.80285,0.906519,0.874683,0.890305
Word2Vec,0.9148,0.9148,1.0,0.955504


In [12]:
sgd = SGDClassifier(random_state=9, n_jobs=-1)

sgd_cv = f.model_cv(sgd, embeddings, y)
f.df_model_cv(sgd_cv, embeddings_names, results_names)

Unnamed: 0,test_acc,precision,recall,f1
BOW,0.95623,0.964657,0.988369,0.976368
TFIDF,0.94956,0.950884,0.996327,0.973075
SVD,0.86941,0.922213,0.936445,0.928941
NMF,0.9148,0.9148,1.0,0.955504
Word2Vec,0.94661,0.954643,0.98862,0.97133


In [13]:
sgd_log = SGDClassifier(loss='log', penalty='elasticnet', random_state=9, n_jobs=-1)

sgd_log_cv = f.model_cv(sgd_log, embeddings, y)
f.df_model_cv(sgd_log_cv, embeddings_names, results_names)

Unnamed: 0,test_acc,precision,recall,f1
BOW,0.95286,0.959167,0.990643,0.974651
TFIDF,0.94023,0.940497,0.997792,0.968297
SVD,0.85836,0.923638,0.921371,0.92245
NMF,0.9148,0.9148,1.0,0.955504
Word2Vec,0.94896,0.958128,0.987363,0.972524


In [14]:
logreg = LogisticRegression(max_iter=500, random_state=9, n_jobs=-1)

logreg_cv = f.model_cv(logreg, embeddings, y)
f.df_model_cv(logreg_cv, embeddings_names, results_names)

Unnamed: 0,test_acc,precision,recall,f1
BOW,0.95596,0.966847,0.985658,0.976161
TFIDF,0.95721,0.961425,0.99307,0.976991
SVD,0.9148,0.9148,1.0,0.955504
NMF,0.9148,0.9148,1.0,0.955504
Word2Vec,0.94914,0.95884,0.986762,0.9726


In [None]:
svc = SVC(class_weight='balanced', probability=True, n_jobs=-1)

svc_cv = f.model_cv(svc, embeddings, y)
f.df_model_cv(svc_cv, embeddings_names, results_names)

### Verification

In [17]:
vectorizer = load_pickle('data/tfidf_vectorizer_100k.pickle')

In [18]:
sgd.fit(M_tfidf, y)

SGDClassifier(n_jobs=-1, random_state=9)

In [19]:
review_test_pos = 'This game is amazing ^^, my son plays with it all the time!'
review_test_neg = 'I\'m really disappointed with this game. My son doesn\'t like playing with it.'

In [20]:
review_tokens_test_pos = f.normalize_single_text(review_test_pos)
tfidf_vector_test_pos = vectorizer.transform([' '.join(review_tokens_test_pos)])
sgd.predict(tfidf_vector_test_pos)

array([1])

In [21]:
review_tokens_test_neg = f.normalize_single_text(review_test_neg)
tfidf_vector_test_neg = vectorizer.transform([' '.join(review_tokens_test_neg)])
sgd.predict(tfidf_vector_test_neg)

array([0])

### Train test split & check reviews with incorrect labels

In [31]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

df = pd.read_csv('data/reviews_toys_games_100k.csv')

In [23]:
X_train, X_test, y_train, y_test = train_test_split(M_tfidf, np.array(y), test_size=0.33, random_state=9)

In [24]:
sgd.fit(X_train, y_train)

SGDClassifier(n_jobs=-1, random_state=9)

In [25]:
y_pred = sgd.predict(X_test)

In [32]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

In [21]:
y_diff = y_test - y_pred
np.where(y_diff != 0)

(array([   35,    66,    69, ..., 32961, 32978, 32983], dtype=int64),)

In [24]:
i = 69
print(df['review'][i])
print('\nreal label:', y_test[i], '\npredicted label:', y_pred[i])

Son really loves it.

real label: 0 
predicted label: 1


In [25]:
i = 32978
print(df['review'][i])
print('\nreal label:', y_test[i], '\npredicted label:', y_pred[i])

Arrived quickly,  just as described.

real label: 0 
predicted label: 1


In [26]:
i = 32983
print(df['review'][i])
print('\nreal label:', y_test[i], '\npredicted label:', y_pred[i])

Great product and quality. Fast shipping

real label: 0 
predicted label: 1


## Over and under sampling

In [27]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.pipeline import Pipeline, make_pipeline

In [28]:
imba_pipeline = make_pipeline(RandomUnderSampler(random_state=9),
                              MultinomialNB())

pip_cv = f.model_cv(imba_pipeline, embeddings_positive, y)
f.df_model_cv(pip_cv, embeddings_names, results_names)

Unnamed: 0,train_acc,test_acc,precision,recall,f1,roc_auc
BOW,0.89673,0.89106,0.988827,0.890982,0.937357,0.943244
TFIDF,0.81419,0.80703,0.994465,0.793474,0.882665,0.956293
SVD,0.596615,0.59652,0.979323,0.570999,0.721385,0.769999
NMF,0.46547,0.46552,0.963136,0.432291,0.596725,0.580237
Word2Vec,0.77846,0.77864,0.987666,0.76761,0.86384,0.909421


In [29]:
models = [
    SGDClassifier(random_state=9),
    SGDClassifier(loss='log', penalty='elasticnet', random_state=9),
    LogisticRegression(max_iter=500, random_state=9)]

for model in models:
    imba_pipeline = make_pipeline(RandomUnderSampler(random_state=9), model)

    pip_cv = f.model_cv(imba_pipeline, embeddings, y)
    display(f.df_model_cv(pip_cv, embeddings_names, results_names))

Unnamed: 0,train_acc,test_acc,precision,recall,f1,roc_auc
BOW,0.91312,0.89885,0.98727,0.901049,0.942189,0.94167
TFIDF,0.901,0.89107,0.990971,0.889025,0.937228,0.964396
SVD,0.61717,0.61506,0.958143,0.616976,0.6979,0.692987
NMF,0.694305,0.69338,0.95897,0.695311,0.803651,0.755872
Word2Vec,0.90158,0.89957,0.986297,0.902766,0.94266,0.950376


Unnamed: 0,train_acc,test_acc,precision,recall,f1,roc_auc
BOW,0.91386,0.90243,0.988056,0.904274,0.944301,0.948884
TFIDF,0.889525,0.88435,0.991102,0.881493,0.933086,0.963672
SVD,0.795895,0.79636,0.936304,0.835089,0.880819,0.755115
NMF,0.459135,0.459049,0.988154,0.414593,0.568838,0.761177
Word2Vec,0.8938,0.89148,0.987604,0.892578,0.937684,0.951775


Unnamed: 0,train_acc,test_acc,precision,recall,f1,roc_auc
BOW,0.916465,0.90455,0.988446,0.906253,0.945565,0.95
TFIDF,0.893065,0.88636,0.991388,0.88345,0.934308,0.964434
SVD,0.635855,0.63572,0.975242,0.617468,0.756167,0.770566
NMF,0.63366,0.63309,0.976321,0.613806,0.753734,0.773027
Word2Vec,0.88271,0.88153,0.989644,0.8797,0.931431,0.953461


In [30]:
imba_pipeline = make_pipeline(RandomOverSampler(random_state=9),
                              MultinomialNB())

pip_cv = f.model_cv(imba_pipeline, embeddings_positive, y)
f.df_model_cv(pip_cv, embeddings_names, results_names)

Unnamed: 0,train_acc,test_acc,precision,recall,f1,roc_auc
BOW,0.91737,0.9101,0.985773,0.914932,0.94903,0.935315
TFIDF,0.90393,0.88528,0.989082,0.884357,0.933791,0.956739
SVD,0.597005,0.59682,0.979388,0.571294,0.721638,0.77003
NMF,0.46465,0.46462,0.963584,0.431056,0.595635,0.579891
Word2Vec,0.777965,0.77788,0.987693,0.766747,0.863307,0.909251


In [31]:
models = [
    SGDClassifier(random_state=9),
    SGDClassifier(loss='log', penalty='elasticnet', random_state=9),
    LogisticRegression(max_iter=500, random_state=9)]

for model in models:
    imba_pipeline = make_pipeline(RandomOverSampler(random_state=9), model)

    pip_cv = f.model_cv(imba_pipeline, embeddings, y)
    display(f.df_model_cv(pip_cv, embeddings_names, results_names))

Unnamed: 0,train_acc,test_acc,precision,recall,f1,roc_auc
BOW,0.950275,0.93103,0.9848,0.939101,0.961407,0.947087
TFIDF,0.92624,0.91465,0.989785,0.916156,0.951544,0.966883
SVD,0.632175,0.63208,0.971946,0.615883,0.752542,0.765195
NMF,0.526596,0.527889,0.98301,0.493942,0.64728,0.762038
Word2Vec,0.88328,0.88112,0.989632,0.879274,0.931121,0.953436


Unnamed: 0,train_acc,test_acc,precision,recall,f1,roc_auc
BOW,0.940035,0.92809,0.986651,0.934029,0.959619,0.954345
TFIDF,0.89825,0.89426,0.991101,0.892425,0.939177,0.964402
SVD,0.581165,0.58004,0.978776,0.553071,0.705643,0.764284
NMF,0.62324,0.62351,0.975986,0.603389,0.745358,0.770711
Word2Vec,0.870885,0.87015,0.990675,0.866211,0.924236,0.95411


Unnamed: 0,train_acc,test_acc,precision,recall,f1,roc_auc
BOW,0.96064,0.93491,0.983071,0.945125,0.963723,0.941157
TFIDF,0.94857,0.93062,0.987196,0.936303,0.961073,0.968217
SVD,0.6363,0.63643,0.97506,0.618387,0.756802,0.770721
NMF,0.634505,0.63411,0.976133,0.615074,0.754634,0.773089
Word2Vec,0.8855,0.88452,0.989802,0.88286,0.933274,0.954442


In [32]:
imba_pipeline = make_pipeline(SMOTE(random_state=9),
                              MultinomialNB())

pip_cv = f.model_cv(imba_pipeline, embeddings_positive, y)
f.df_model_cv(pip_cv, embeddings_names, results_names)

Unnamed: 0,train_acc,test_acc,precision,recall,f1,roc_auc
BOW,0.921895,0.9152,0.977209,0.928968,0.952478,0.900934
TFIDF,0.92335,0.90324,0.987509,0.905684,0.944825,0.959009
SVD,0.59761,0.59743,0.979358,0.571994,0.722188,0.770017
NMF,0.466335,0.46619,0.962883,0.433177,0.597522,0.580255
Word2Vec,0.772965,0.77274,0.98813,0.760713,0.859634,0.909639


In [33]:
models = [
    SGDClassifier(random_state=9),
    SGDClassifier(loss='log', penalty='elasticnet', random_state=9),
    LogisticRegression(max_iter=500, random_state=9)]

for model in models:
    imba_pipeline = make_pipeline(SMOTE(random_state=9), model)

    pip_cv = f.model_cv(imba_pipeline, embeddings, y)
    display(f.df_model_cv(pip_cv, embeddings_names, results_names))

Unnamed: 0,train_acc,test_acc,precision,recall,f1,roc_auc
BOW,0.913685,0.90168,0.978447,0.912626,0.944386,0.909564
TFIDF,0.9075,0.8977,0.988611,0.898524,0.941416,0.958589
SVD,0.57157,0.572,0.979722,0.544283,0.694818,0.767898
NMF,0.560065,0.55849,0.979346,0.530312,0.674656,0.764904
Word2Vec,0.89734,0.89653,0.988336,0.897486,0.940702,0.953833


Unnamed: 0,train_acc,test_acc,precision,recall,f1,roc_auc
BOW,0.895435,0.8892,0.980137,0.897059,0.936754,0.91286
TFIDF,0.884675,0.88134,0.987636,0.881318,0.931447,0.951793
SVD,0.65408,0.65451,0.96949,0.642961,0.771798,0.767337
NMF,0.62138,0.62107,0.975272,0.601246,0.742849,0.769732
Word2Vec,0.893495,0.89265,0.98894,0.892643,0.93827,0.954362


Unnamed: 0,train_acc,test_acc,precision,recall,f1,roc_auc
BOW,0.92204,0.90316,0.979876,0.912888,0.945192,0.912699
TFIDF,0.934305,0.91638,0.986592,0.921108,0.952723,0.961833
SVD,0.635135,0.63536,0.975297,0.617031,0.755856,0.770452
NMF,0.63389,0.63355,0.976359,0.614298,0.754118,0.772938
Word2Vec,0.89473,0.89368,0.988542,0.894141,0.938973,0.954442
