In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_validate

import functions as f



In [3]:
# Read the toys & games review sample: `review` is parsed as text and
# `sentiment` as an integer label.
df = pd.read_csv(
    'data/reviews_toys_games_100k.csv',
    dtype={'review': np.str_, 'sentiment': int},
)

# Labels as a plain Python list; documents as a unicode-cast value array
# (the 'U' cast turns any missing review into a string so the vectorizers
# always receive text).
y = df['sentiment'].tolist()
reviews = df['review'].astype('U').values

Three n-gram configurations are compared, each as bag-of-words and as TF-IDF:

1. Unigrams and bigrams, stop words excluded
2. Unigrams and bigrams, stop words included
3. Unigrams, bigrams and trigrams, stop words included

In [4]:
# Keyword arguments for the three n-gram setups; every setup keeps the
# 6000 highest-ranked features.
shared_kwargs = {'max_features': 6000}
ngram_setups = [
    dict(stop_words='english', ngram_range=(1, 2), **shared_kwargs),  # 1: uni+bi, no stop words
    dict(ngram_range=(1, 2), **shared_kwargs),                        # 2: uni+bi, stop words kept
    dict(ngram_range=(1, 3), **shared_kwargs),                        # 3: uni+bi+tri, stop words kept
]

# Bag-of-words count matrices, one per setup.
# NOTE(review): the vocabularies are fitted on the full corpus before any
# train/test split happens downstream — confirm this is acceptable.
M_bow_1, M_bow_2, M_bow_3 = [
    CountVectorizer(**setup).fit_transform(reviews) for setup in ngram_setups
]

# TF-IDF matrices for the same three setups.
M_tfidf_1, M_tfidf_2, M_tfidf_3 = [
    TfidfVectorizer(**setup).fit_transform(reviews) for setup in ngram_setups
]

In [5]:
# One mapping keeps every label next to the matrix it describes; the two
# parallel lists used by the evaluation helpers are derived from it, in
# insertion order.
named_embeddings = {
    'M_bow_1': M_bow_1,
    'M_bow_2': M_bow_2,
    'M_bow_3': M_bow_3,
    'M_tfidf_1': M_tfidf_1,
    'M_tfidf_2': M_tfidf_2,
    'M_tfidf_3': M_tfidf_3,
}
embeddings_names = list(named_embeddings.keys())
embeddings = list(named_embeddings.values())

# Column labels for the score tables.
results_names = ['test_acc', 'precision', 'recall', 'f1']

In [6]:
# Baseline: SGDClassifier with its default hyper-parameters and a fixed seed.
sgd = SGDClassifier(random_state=9, n_jobs=-1)

# Score the classifier on every feature matrix through the project helper
# (presumably cross-validation, going by the helper's name — confirm in functions.py).
sgd_cv = f.model_cv(sgd, embeddings, y)
# Tabulate the scores: one row per embedding name, one column per metric name.
f.df_model_cv(sgd_cv, embeddings_names, results_names)

Unnamed: 0,test_acc,precision,recall,f1
M_bow_1,0.95922,0.966973,0.989211,0.977965
M_bow_2,0.96663,0.976483,0.987309,0.98186
M_bow_3,0.96599,0.976743,0.986314,0.981502
M_tfidf_1,0.95067,0.951147,0.9973,0.973677
M_tfidf_2,0.95853,0.959045,0.997256,0.977777
M_tfidf_3,0.95816,0.958817,0.997092,0.97758


### SMOTE

In [54]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline, make_pipeline

In [55]:
# SMOTE oversampling step followed by the same seeded SGD classifier used for
# the baseline, chained with imblearn's pipeline factory.
oversampler = SMOTE(random_state=9)
smote_sgd = SGDClassifier(n_jobs=-1, random_state=9)
imba_pipeline = make_pipeline(oversampler, smote_sgd)

# Score the pipeline on every feature matrix and tabulate the results in the
# same layout as the baseline table.
pip_cv = f.model_cv(imba_pipeline, embeddings, y)
f.df_model_cv(pip_cv, embeddings_names, results_names)

Unnamed: 0,test_acc,precision,recall,f1
M_bow_1,0.89982,0.979435,0.909587,0.943218
M_bow_2,0.93632,0.981705,0.948098,0.964578
M_bow_3,0.93654,0.981555,0.948459,0.964718
M_tfidf_1,0.91021,0.988017,0.912921,0.948983
M_tfidf_2,0.92335,0.991151,0.924464,0.956647
M_tfidf_3,0.92167,0.99103,0.922726,0.955659


### Grid Search

In [7]:
from sklearn.model_selection import GridSearchCV

In [11]:
# Hyper-parameter grid for SGDClassifier:
# 2 losses x 2 penalties x 2 alphas x 3 max_iter values = 24 combinations.
# NOTE(review): newer scikit-learn releases renamed loss 'log' to 'log_loss' —
# confirm against the installed version before re-running.
parameters = {
    'loss': ('hinge', 'log'),
    'penalty': ('l1', 'l2'),
    'alpha':(0.0001, 0.0005), 
    'max_iter':[800, 1000, 1200]}

# Exhaustive search over the grid with GridSearchCV's default cv and scoring,
# fitted on M_bow_2 (uni+bi-gram bag-of-words, stop words kept).
sgd = SGDClassifier(random_state=9, n_jobs=-1)
clf = GridSearchCV(sgd, parameters)
clf.fit(M_bow_2, y)

GridSearchCV(estimator=SGDClassifier(n_jobs=-1, random_state=9),
             param_grid={'alpha': (0.0001, 0.0005), 'loss': ('hinge', 'log'),
                         'max_iter': [800, 1000, 1200],
                         'penalty': ('l1', 'l2')})

In [12]:
# Hyper-parameter combination that scored best in the fitted grid search.
clf.best_params_

{'alpha': 0.0001, 'loss': 'hinge', 'max_iter': 800, 'penalty': 'l2'}

In [18]:
# Re-evaluate on M_bow_2 with the configuration selected by the grid search.
#
# * The estimator is built from `clf.best_params_`, so every tuned value
#   (alpha, loss, max_iter, penalty) is applied rather than a hand-copied subset.
# * The tuned estimator itself is passed to the helper; passing the untuned
#   `sgd` would only reproduce the baseline M_bow_2 row.
# * The helpers are reached through the `f` alias (`import functions as f`);
#   the bare names `model_cv` / `df_model_cv` are not imported in this notebook.
#
# NOTE(review): any score table stored under this cell from an earlier run
# should be regenerated after this change.
sgd_best_params = SGDClassifier(**clf.best_params_, random_state=9, n_jobs=-1)
sgd_best_cv = f.model_cv(sgd_best_params, [M_bow_2], y)
f.df_model_cv(sgd_best_cv, ['M_bow_2'], results_names)

Unnamed: 0,test_acc,precision,recall,f1
M_bow_2,0.96663,0.976483,0.987309,0.98186
