In [1]:
import numpy as np
import pandas as pd
import pickle

from sklearn.model_selection import cross_validate

import functions as f



In [4]:
def model_cv(model, embeddings, y):
    results = []
    for i in range(len(embeddings)):
        cv_results = cross_validate(model,
                                    embeddings[i],
                                    y,
                                    cv=3,
                                    scoring=('accuracy', 'precision', 'recall'),
                                    return_train_score=True)
        results.append([np.mean(cv_results['train_accuracy']),
                        np.mean(cv_results['test_accuracy']),
                        np.mean(cv_results['test_precision']),
                        np.mean(cv_results['test_recall'])])
    return np.stack(results)

def df_model_cv(model_cv, embeddings_names, results_names):
    return pd.DataFrame(model_cv, index=embeddings_names, columns=results_names)

## CODE

In [5]:
with open('data/M_bow_10k.pickle', 'rb') as handle:
    M_bow = pickle.load(handle)
    
with open('data/M_tfidf_10k.pickle', 'rb') as handle:
    M_tfidf = pickle.load(handle)
    
with open('data/M_svd_10k.pickle', 'rb') as handle:
    M_svd = pickle.load(handle)
    
with open('data/M_nmf_10k.pickle', 'rb') as handle:
    M_nmf = pickle.load(handle)
    
with open('data/M_word2vec_10k.pickle', 'rb') as handle:
    M_word2vec = pickle.load(handle)
    
with open('data/sentiment_10k.pickle', 'rb') as handle:
    y = pickle.load(handle)

## Models

In [6]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler

In [7]:
scaler = MinMaxScaler()
M_svd_positive = scaler.fit_transform(M_svd)
M_word2vec_positive = scaler.fit_transform(M_word2vec)

In [8]:
baseline = sum(y)/len(y)
print('Baseline accuracy: ', baseline)

Baseline accuracy:  0.9356


In [9]:
embeddings_names = ['BOW', 'TFIDF', 'SVD', 'NMF', 'Word2Vec']
embeddings = [M_bow, M_tfidf, M_svd, M_nmf, M_word2vec]
embeddings_positive = [M_bow, M_tfidf, M_svd_positive, M_nmf, M_word2vec_positive]
results_names = ['train_acc', 'test_acc', 'precision', 'recall']

### Multinomial Naive Bayes

In [10]:
naive = MultinomialNB()

naive_cv = model_cv(naive, embeddings_positive, y)
df_model_cv(naive_cv, embeddings_names, results_names)

Unnamed: 0,train_acc,test_acc,precision,recall
BOW,0.96635,0.9031,0.960989,0.934479
TFIDF,0.9356,0.9356,0.9356,1.0
SVD,0.9356,0.9356,0.9356,1.0
NMF,0.93225,0.9297,0.935481,0.993373
Word2Vec,0.9356,0.9356,0.9356,1.0


In [11]:
sgd = SGDClassifier(random_state=9)

sgd_cv = model_cv(sgd, embeddings, y)
df_model_cv(sgd_cv, embeddings_names, results_names)

Unnamed: 0,train_acc,test_acc,precision,recall
BOW,0.9956,0.939,0.963256,0.971889
TFIDF,0.98955,0.9514,0.956574,0.993159
SVD,0.917451,0.9192,0.946017,0.969004
NMF,0.9355,0.9355,0.935594,0.999893
Word2Vec,0.95135,0.9427,0.951852,0.988777


In [12]:
sgd_log = SGDClassifier(loss='log', penalty='elasticnet', random_state=9)

sgd_log_cv = model_cv(sgd_log, embeddings, y)
df_model_cv(sgd_log_cv, embeddings_names, results_names)

Unnamed: 0,train_acc,test_acc,precision,recall
BOW,0.9905,0.9448,0.961173,0.980653
TFIDF,0.95625,0.9436,0.944222,0.998717
SVD,0.92525,0.925701,0.940665,0.982579
NMF,0.93555,0.9354,0.935674,0.999679
Word2Vec,0.9527,0.9437,0.95557,0.985677


In [13]:
logreg = LogisticRegression(max_iter=500, random_state=9)

logreg_cv = model_cv(logreg, embeddings, y)
df_model_cv(logreg_cv, embeddings_names, results_names)

Unnamed: 0,train_acc,test_acc,precision,recall
BOW,0.98725,0.947,0.959236,0.98525
TFIDF,0.94765,0.94,0.940177,0.999466
SVD,0.9354,0.9351,0.935568,0.999466
NMF,0.93565,0.9355,0.935594,0.999893
Word2Vec,0.95235,0.9435,0.954985,0.986105


### Verification

In [14]:
with open('data/tfidf_vectorizer.pickle', 'rb') as handle:
    vectorizer = pickle.load(handle)

In [15]:
sgd.fit(M_tfidf, y)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=9, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [16]:
review_test_pos = 'This game is amazing ^^, my son plays with it all the time!'
review_test_neg = 'I\'m really disappointed with this game. My son doesn\'t like playing with it.'

In [17]:
review_tokens_test_pos = f.normalize_single_text(review_test_pos)
tfidf_vector_test_pos = vectorizer.transform([' '.join(review_tokens_test_pos)])
sgd.predict(tfidf_vector_test_pos)

array([1])

In [18]:
review_tokens_test_neg = f.normalize_single_text(review_test_neg)
tfidf_vector_test_neg = vectorizer.transform([' '.join(review_tokens_test_neg)])
sgd.predict(tfidf_vector_test_neg)

array([0])

### Train test split & check reviews with incorrect labels

In [31]:
from sklearn.model_selection import train_test_split
df = pd.read_csv('data/reviews_toys_games.csv')

In [None]:
X_train, X_test, y_train, y_test = train_test_split(M_tfidf, np.array(y), test_size=0.33, random_state=9)

In [32]:
sgd.fit(X_train, y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=9, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [29]:
y_pred = sgd.predict(X_test)

In [35]:
y_diff = y_test - y_pred
np.where(y_diff != 0)

(array([  10,   55,   75,  152,  162,  201,  242,  245,  262,  285,  305,
         310,  316,  333,  351,  353,  373,  390,  406,  452,  483,  500,
         520,  540,  547,  550,  586,  595,  598,  615,  624,  645,  675,
         691,  701,  718,  743,  761,  791,  799,  821,  825,  840,  843,
         861,  889,  893,  973,  974, 1013, 1015, 1038, 1141, 1182, 1257,
        1260, 1326, 1364, 1380, 1399, 1412, 1445, 1457, 1508, 1536, 1575,
        1591, 1638, 1661, 1668, 1671, 1681, 1686, 1688, 1712, 1725, 1769,
        1771, 1802, 1874, 1881, 1886, 1906, 1908, 1919, 1931, 1944, 1955,
        1972, 2001, 2011, 2035, 2043, 2079, 2111, 2119, 2155, 2233, 2252,
        2272, 2316, 2317, 2332, 2335, 2364, 2416, 2433, 2483, 2515, 2557,
        2563, 2571, 2585, 2603, 2624, 2628, 2631, 2644, 2652, 2666, 2709,
        2749, 2757, 2805, 2904, 2916, 2955, 2983, 2988, 3002, 3042, 3045,
        3050, 3083, 3084, 3126, 3148, 3158, 3177, 3196, 3197, 3218, 3248,
        3263, 3281], dtype=int64),)

In [52]:
i = 10
print(df['review'][i])
print('\nreal label:', y_test[i], '\npredicted label:', y_pred[i])

This is indeed a small book. This particular ROBOT sticker book was at best, OK.
Why? Well, there's really no space to create multiple robots. Stickers are not easy to remove.
Not recommended unless you just gotta have it to entertain a child. But this may only last ten minutes.
Plus side, small enough to stick in purse for travel.

real label: 0 
predicted label: 1


In [53]:
i = 2011
print(df['review'][i])
print('\nreal label:', y_test[i], '\npredicted label:', y_pred[i])

Okay, so we are new to the "Elf on the Shelf" tradition. My granddaughter is only two so I let her open the box. I had her hold the elf while I read the story, which, you more experienced shelfers know that is a no-no. We were able to fix it with, "Christopher will have to go back to the North Pole tonight to get his magic back", whew! That being said, the book is big and the illustrations are beautifully colorful. Christopher looks EXACTLY like the illustrations. On one particular page he is pictured actual size, face on, my granddaughter noticed and placed Christopher on top of his picture. I think it just helps with the fantasy. The doll is super cute and seems to be well made. I think the bendable arms will be helpful in coming up with creative ways to place him. The holidays are all about traditions and I think this is a really fun one. I'm very happy with the purchase.

real label: 1 
predicted label: 0


In [59]:
i = 2416
print(df['review'][i])
print('\nreal label:', y_test[i], '\npredicted label:', y_pred[i])

adorable

real label: 0 
predicted label: 1
