In [1]:
import numpy as np
import pandas as pd
import pickle

from sklearn.model_selection import cross_validate

import functions as f



In [2]:
def model_cv(model, embeddings, y):
    results = []
    for i in range(len(embeddings)):
        cv_results = cross_validate(model,
                                    embeddings[i],
                                    y,
                                    cv=3,
                                    scoring=('accuracy', 'precision', 'recall', 'f1', 'roc_auc'),
                                    return_train_score=True)
        results.append([np.mean(cv_results['train_accuracy']),
                        np.mean(cv_results['test_accuracy']),
                        np.mean(cv_results['test_precision']),
                        np.mean(cv_results['test_recall']),
                        np.mean(cv_results['test_f1']),
                        np.mean(cv_results['test_roc_auc'])])
    return np.stack(results)

def df_model_cv(model_cv, embeddings_names, results_names):
    return pd.DataFrame(model_cv, index=embeddings_names, columns=results_names)

## CODE

In [3]:
with open('data/M_bow_10k.pickle', 'rb') as handle:
    M_bow = pickle.load(handle)
    
with open('data/M_tfidf_10k.pickle', 'rb') as handle:
    M_tfidf = pickle.load(handle)
    
with open('data/M_svd_10k.pickle', 'rb') as handle:
    M_svd = pickle.load(handle)
    
with open('data/M_nmf_10k.pickle', 'rb') as handle:
    M_nmf = pickle.load(handle)
    
with open('data/M_word2vec_10k.pickle', 'rb') as handle:
    M_word2vec = pickle.load(handle)
    
with open('data/sentiment_10k.pickle', 'rb') as handle:
    y = pickle.load(handle)

## Models

In [4]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler

In [5]:
scaler = MinMaxScaler()
M_svd_positive = scaler.fit_transform(M_svd)
M_word2vec_positive = scaler.fit_transform(M_word2vec)

In [6]:
baseline = sum(y)/len(y)
print('Baseline accuracy: ', baseline)

Baseline accuracy:  0.914674872933972


In [7]:
embeddings_names = ['BOW', 'TFIDF', 'SVD', 'NMF', 'Word2Vec']
embeddings = [M_bow, M_tfidf, M_svd, M_nmf, M_word2vec]
embeddings_positive = [M_bow, M_tfidf, M_svd_positive, M_nmf, M_word2vec_positive]
results_names = ['train_acc', 'test_acc', 'precision', 'recall', 'f1', 'roc_auc']

### Multinomial Naive Bayes

In [90]:
naive = MultinomialNB()

naive_cv = model_cv(naive, embeddings_positive, y)
df_model_cv(naive_cv, embeddings_names, results_names)

Unnamed: 0,train_acc,test_acc,precision,recall,f1,roc_auc
BOW,0.96635,0.9031,0.960989,0.934479,0.947453,0.801898
TFIDF,0.9356,0.9356,0.9356,1.0,0.966729,0.758994
SVD,0.9356,0.9356,0.9356,1.0,0.966729,0.742737
NMF,0.93225,0.9297,0.935481,0.993373,0.963554,0.743876
Word2Vec,0.9356,0.9356,0.9356,1.0,0.966729,0.859119


In [91]:
sgd = SGDClassifier(random_state=9)

sgd_cv = model_cv(sgd, embeddings, y)
df_model_cv(sgd_cv, embeddings_names, results_names)

Unnamed: 0,train_acc,test_acc,precision,recall,f1,roc_auc
BOW,0.9956,0.939,0.963256,0.971889,0.967547,0.855695
TFIDF,0.98955,0.9514,0.956574,0.993159,0.974517,0.921458
SVD,0.917451,0.9192,0.946017,0.969004,0.957329,0.783898
NMF,0.9355,0.9355,0.935594,0.999893,0.966675,0.754858
Word2Vec,0.95135,0.9427,0.951852,0.988777,0.969963,0.899941


In [92]:
sgd_log = SGDClassifier(loss='log', penalty='elasticnet', random_state=9)

sgd_log_cv = model_cv(sgd_log, embeddings, y)
df_model_cv(sgd_log_cv, embeddings_names, results_names)

Unnamed: 0,train_acc,test_acc,precision,recall,f1,roc_auc
BOW,0.9905,0.9448,0.961173,0.980653,0.970791,0.87154
TFIDF,0.95625,0.9436,0.944222,0.998717,0.970705,0.928908
SVD,0.92525,0.925701,0.940665,0.982579,0.961145,0.776103
NMF,0.93555,0.9354,0.935674,0.999679,0.966618,0.777218
Word2Vec,0.9527,0.9437,0.95557,0.985677,0.97038,0.905494


In [11]:
logreg = LogisticRegression(max_iter=500, random_state=9)

logreg_cv = model_cv(logreg, embeddings, y)
df_model_cv(logreg_cv, embeddings_names, results_names)

Unnamed: 0,train_acc,test_acc,precision,recall,f1,roc_auc
BOW,0.963791,0.960207,0.968273,0.988905,0.978476,0.959265
TFIDF,0.967481,0.964057,0.971313,0.989941,0.980538,0.974572
SVD,0.914656,0.914652,0.914673,0.999974,0.955424,0.7724
NMF,0.914675,0.914675,0.914675,1.0,0.955436,0.774198
Word2Vec,0.954231,0.953983,0.962621,0.988059,0.975173,0.960675


In [94]:
svc = SVC(class_weight='balanced', probability=True)

svc_cv = model_cv(svc, embeddings, y)
df_model_cv(svc_cv, embeddings_names, results_names)

Unnamed: 0,train_acc,test_acc,precision,recall,f1,roc_auc
BOW,0.982,0.9364,0.965186,0.966971,0.966014,0.906253
TFIDF,0.9987,0.9478,0.951643,0.994762,0.972722,0.92831
SVD,0.623051,0.611102,0.984858,0.593416,0.740405,0.803818
NMF,0.654351,0.647902,0.984224,0.633818,0.770986,0.808532
Word2Vec,0.913251,0.875998,0.980944,0.884778,0.930178,0.911183


### Verification

In [14]:
with open('data/tfidf_vectorizer.pickle', 'rb') as handle:
    vectorizer = pickle.load(handle)

In [15]:
sgd.fit(M_tfidf, y)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=9, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [16]:
review_test_pos = 'This game is amazing ^^, my son plays with it all the time!'
review_test_neg = 'I\'m really disappointed with this game. My son doesn\'t like playing with it.'

In [17]:
review_tokens_test_pos = f.normalize_single_text(review_test_pos)
tfidf_vector_test_pos = vectorizer.transform([' '.join(review_tokens_test_pos)])
sgd.predict(tfidf_vector_test_pos)

array([1])

In [18]:
review_tokens_test_neg = f.normalize_single_text(review_test_neg)
tfidf_vector_test_neg = vectorizer.transform([' '.join(review_tokens_test_neg)])
sgd.predict(tfidf_vector_test_neg)

array([0])

### Train test split & check reviews with incorrect labels

In [31]:
from sklearn.model_selection import train_test_split
df = pd.read_csv('data/reviews_toys_games.csv')

In [None]:
X_train, X_test, y_train, y_test = train_test_split(M_tfidf, np.array(y), test_size=0.33, random_state=9)

In [32]:
sgd.fit(X_train, y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=9, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [29]:
y_pred = sgd.predict(X_test)

In [35]:
y_diff = y_test - y_pred
np.where(y_diff != 0)

(array([  10,   55,   75,  152,  162,  201,  242,  245,  262,  285,  305,
         310,  316,  333,  351,  353,  373,  390,  406,  452,  483,  500,
         520,  540,  547,  550,  586,  595,  598,  615,  624,  645,  675,
         691,  701,  718,  743,  761,  791,  799,  821,  825,  840,  843,
         861,  889,  893,  973,  974, 1013, 1015, 1038, 1141, 1182, 1257,
        1260, 1326, 1364, 1380, 1399, 1412, 1445, 1457, 1508, 1536, 1575,
        1591, 1638, 1661, 1668, 1671, 1681, 1686, 1688, 1712, 1725, 1769,
        1771, 1802, 1874, 1881, 1886, 1906, 1908, 1919, 1931, 1944, 1955,
        1972, 2001, 2011, 2035, 2043, 2079, 2111, 2119, 2155, 2233, 2252,
        2272, 2316, 2317, 2332, 2335, 2364, 2416, 2433, 2483, 2515, 2557,
        2563, 2571, 2585, 2603, 2624, 2628, 2631, 2644, 2652, 2666, 2709,
        2749, 2757, 2805, 2904, 2916, 2955, 2983, 2988, 3002, 3042, 3045,
        3050, 3083, 3084, 3126, 3148, 3158, 3177, 3196, 3197, 3218, 3248,
        3263, 3281], dtype=int64),)

In [52]:
i = 10
print(df['review'][i])
print('\nreal label:', y_test[i], '\npredicted label:', y_pred[i])

This is indeed a small book. This particular ROBOT sticker book was at best, OK.
Why? Well, there's really no space to create multiple robots. Stickers are not easy to remove.
Not recommended unless you just gotta have it to entertain a child. But this may only last ten minutes.
Plus side, small enough to stick in purse for travel.

real label: 0 
predicted label: 1


In [53]:
i = 2011
print(df['review'][i])
print('\nreal label:', y_test[i], '\npredicted label:', y_pred[i])

Okay, so we are new to the "Elf on the Shelf" tradition. My granddaughter is only two so I let her open the box. I had her hold the elf while I read the story, which, you more experienced shelfers know that is a no-no. We were able to fix it with, "Christopher will have to go back to the North Pole tonight to get his magic back", whew! That being said, the book is big and the illustrations are beautifully colorful. Christopher looks EXACTLY like the illustrations. On one particular page he is pictured actual size, face on, my granddaughter noticed and placed Christopher on top of his picture. I think it just helps with the fantasy. The doll is super cute and seems to be well made. I think the bendable arms will be helpful in coming up with creative ways to place him. The holidays are all about traditions and I think this is a really fun one. I'm very happy with the purchase.

real label: 1 
predicted label: 0


In [59]:
i = 2416
print(df['review'][i])
print('\nreal label:', y_test[i], '\npredicted label:', y_pred[i])

adorable

real label: 0 
predicted label: 1


## Over and under sampling

In [8]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.pipeline import Pipeline, make_pipeline

In [9]:
imba_pipeline = make_pipeline(RandomUnderSampler(random_state=9),
                              MultinomialNB())

pip_cv = model_cv(imba_pipeline, embeddings_positive, y)
df_model_cv(pip_cv, embeddings_names, results_names)

Unnamed: 0,train_acc,test_acc,precision,recall,f1,roc_auc
BOW,0.902938,0.90043,0.988952,0.901211,0.943038,0.948348
TFIDF,0.834933,0.826068,0.993839,0.814906,0.895283,0.958482
SVD,0.599032,0.597889,0.979688,0.572254,0.722366,0.771071
NMF,0.46411,0.463309,0.966213,0.428233,0.593206,0.583979
Word2Vec,0.787901,0.785602,0.988557,0.774574,0.868521,0.915482


In [10]:
models = [
    SGDClassifier(random_state=9),
    SGDClassifier(loss='log', penalty='elasticnet', random_state=9),
    LogisticRegression(max_iter=500, random_state=9)]

for model in models:
    imba_pipeline = make_pipeline(RandomUnderSampler(random_state=9), model)

    pip_cv = model_cv(imba_pipeline, embeddings, y)
    display(df_model_cv(pip_cv, embeddings_names, results_names))

Unnamed: 0,train_acc,test_acc,precision,recall,f1,roc_auc
BOW,0.922943,0.919295,0.990052,0.921024,0.954273,0.962724
TFIDF,0.903495,0.899841,0.992508,0.897271,0.942471,0.96988
SVD,0.659022,0.657437,0.944364,0.669729,0.761295,0.674372
NMF,0.69968,0.704101,0.963222,0.704405,0.810524,0.771589
Word2Vec,0.890871,0.893663,0.990229,0.892567,0.938821,0.958927


Unnamed: 0,train_acc,test_acc,precision,recall,f1,roc_auc
BOW,0.923714,0.921445,0.989466,0.923954,0.95558,0.962516
TFIDF,0.889593,0.886756,0.991684,0.883602,0.934504,0.964957
SVD,0.526995,0.520776,0.960139,0.509812,0.600637,0.626163
NMF,0.615288,0.610342,0.977135,0.588323,0.732213,0.761878
Word2Vec,0.890676,0.886931,0.990905,0.884517,0.934606,0.959305


Unnamed: 0,train_acc,test_acc,precision,recall,f1,roc_auc
BOW,0.925581,0.920748,0.989925,0.922747,0.955146,0.96229
TFIDF,0.917536,0.912637,0.992192,0.911663,0.950208,0.97331
SVD,0.765936,0.768009,0.93548,0.806661,0.858701,0.726286
NMF,0.632657,0.631295,0.977415,0.611027,0.751898,0.774648
Word2Vec,0.894947,0.892918,0.99076,0.891246,0.938319,0.960582


In [19]:
imba_pipeline = make_pipeline(RandomOverSampler(random_state=9),
                              MultinomialNB())

pip_cv = model_cv(imba_pipeline, embeddings_positive, y)
df_model_cv(pip_cv, embeddings_names, results_names)

Unnamed: 0,train_acc,test_acc,precision,recall,f1,roc_auc
BOW,0.9433,0.894399,0.9678,0.917805,0.942002,0.775204
TFIDF,0.9434,0.873098,0.977905,0.884563,0.928589,0.8958
SVD,0.550601,0.550303,0.97303,0.534089,0.687116,0.743286
NMF,0.592601,0.578301,0.982258,0.559325,0.712149,0.739283
Word2Vec,0.7455,0.729602,0.98367,0.722957,0.833203,0.860506


In [20]:
models = [
    SGDClassifier(random_state=9),
    SGDClassifier(loss='log', penalty='elasticnet', random_state=9),
    LogisticRegression(max_iter=500, random_state=9)]

for model in models:
    imba_pipeline = make_pipeline(RandomOverSampler(random_state=9), model)

    pip_cv = model_cv(imba_pipeline, embeddings, y)
    display(df_model_cv(pip_cv, embeddings_names, results_names))

Unnamed: 0,train_acc,test_acc,precision,recall,f1,roc_auc
BOW,0.9972,0.933499,0.966657,0.962162,0.964352,0.844497
TFIDF,0.99185,0.942,0.96796,0.970179,0.96903,0.920024
SVD,0.460961,0.437782,0.988582,0.403993,0.547987,0.695939
NMF,0.647301,0.628301,0.982345,0.614486,0.750709,0.770811
Word2Vec,0.86435,0.845498,0.983695,0.849082,0.911154,0.899181


Unnamed: 0,train_acc,test_acc,precision,recall,f1,roc_auc
BOW,0.9921,0.934299,0.97126,0.958207,0.964617,0.865307
TFIDF,0.96765,0.9229,0.976486,0.940358,0.958001,0.928051
SVD,0.845503,0.848498,0.955686,0.879437,0.91523,0.768985
NMF,0.61069,0.605723,0.983567,0.589475,0.71856,0.799981
Word2Vec,0.8698,0.851698,0.981868,0.857416,0.9153,0.898403


Unnamed: 0,train_acc,test_acc,precision,recall,f1,roc_auc
BOW,0.9931,0.934099,0.972282,0.956925,0.964471,0.878621
TFIDF,0.9741,0.925499,0.975501,0.944206,0.959518,0.928236
SVD,0.665201,0.656403,0.984732,0.642691,0.777586,0.815574
NMF,0.649951,0.642203,0.983574,0.628047,0.766365,0.80017
Word2Vec,0.8593,0.841898,0.983516,0.845339,0.908971,0.901525


In [23]:
imba_pipeline = make_pipeline(SMOTE(random_state=9),
                              MultinomialNB())

pip_cv = model_cv(imba_pipeline, embeddings_positive, y)
df_model_cv(pip_cv, embeddings_names, results_names)

Unnamed: 0,train_acc,test_acc,precision,recall,f1,roc_auc
BOW,0.9407,0.887298,0.963958,0.91385,0.938084,0.752847
TFIDF,0.94255,0.874297,0.977781,0.88606,0.929302,0.894202
SVD,0.559351,0.557703,0.972476,0.542533,0.69408,0.741908
NMF,0.599851,0.588301,0.981895,0.570442,0.720912,0.741072
Word2Vec,0.7454,0.729702,0.984083,0.722744,0.833283,0.859227


In [24]:
models = [
    SGDClassifier(random_state=9),
    SGDClassifier(loss='log', penalty='elasticnet', random_state=9),
    LogisticRegression(max_iter=500, random_state=9)]

for model in models:
    imba_pipeline = make_pipeline(SMOTE(random_state=9), model)

    pip_cv = model_cv(imba_pipeline, embeddings, y)
    display(df_model_cv(pip_cv, embeddings_names, results_names))

Unnamed: 0,train_acc,test_acc,precision,recall,f1,roc_auc
BOW,0.97065,0.902499,0.969865,0.92454,0.946614,0.810358
TFIDF,0.99145,0.941199,0.968044,0.969217,0.96858,0.91873
SVD,0.835147,0.827112,0.962145,0.849515,0.898572,0.803122
NMF,0.702848,0.71341,0.97673,0.711307,0.818701,0.803668
Word2Vec,0.879551,0.854897,0.980435,0.862335,0.917135,0.896827


Unnamed: 0,train_acc,test_acc,precision,recall,f1,roc_auc
BOW,0.9424,0.876299,0.969943,0.895575,0.93118,0.814535
TFIDF,0.9698,0.928299,0.97421,0.948588,0.961153,0.926944
SVD,0.548438,0.540126,0.982638,0.521198,0.643003,0.755011
NMF,0.702402,0.700101,0.976407,0.696348,0.812674,0.795524
Word2Vec,0.865001,0.843497,0.983106,0.847474,0.909862,0.898639


Unnamed: 0,train_acc,test_acc,precision,recall,f1,roc_auc
BOW,0.9472,0.882599,0.97027,0.902202,0.934923,0.813839
TFIDF,0.97565,0.930699,0.973984,0.951474,0.962509,0.92757
SVD,0.682151,0.673803,0.983303,0.662572,0.791514,0.815945
NMF,0.662751,0.656403,0.982183,0.644401,0.777966,0.800802
Word2Vec,0.872701,0.856698,0.982722,0.86212,0.918292,0.902619
