In [1]:
import pandas as pd
import nltk
import re
import random
import numpy as np
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from nltk.corpus import stopwords
from scipy.sparse import hstack

nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package punkt to /home/ibrahim/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ibrahim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ibrahim/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
baseline = "baseline"
stem = "stem"
lem = "lem"
bow = "bow"
tfidf = "tfidf"
random_state = 42
testing_n = 5000
large_n = 50000
random.seed(random_state)
data_path = "./data/combined.csv"

# EDA and simple preprocessing

In [3]:
df = pd.read_csv(data_path, low_memory=False)
print(df.isna().sum() / len(df) * 100)

overall            0.000000
verified           0.000000
reviewTime         0.000000
reviewerID         0.000000
asin               0.000000
style             50.710965
reviewerName       0.016280
reviewText         0.049506
summary            0.025620
unixReviewTime     0.000000
vote              84.972458
image             97.894193
dtype: float64


In [4]:
print(df.isna().sum())

overall                0
verified               0
reviewTime             0
reviewerID             0
asin                   0
style             380030
reviewerName         122
reviewText           371
summary              192
unixReviewTime         0
vote              636787
image             733623
dtype: int64


In [5]:
shpae = df.shape
print(df.isna().sum().sum() / (shpae[0] * shpae[1]) * 100)

19.47241852636673


In [6]:
df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,5.0,True,"09 1, 2016",A3CIUOJXQ5VDQ2,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",Shelly F,As advertised. Reasonably priced,Five Stars,1472688000,,
1,5.0,True,"11 14, 2013",A3H7T87S984REU,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",houserules18,Like the oder and the feel when I put it on my...,Good for the face,1384387200,,
2,1.0,True,"08 18, 2013",A3J034YH7UG4KT,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",Adam,I bought this to smell nice after I shave. Wh...,Smells awful,1376784000,,
3,5.0,False,"05 3, 2011",A2UEO5XR3598GI,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",Rich K,HEY!! I am an Aqua Velva Man and absolutely lo...,Truth is There IS Nothing Like an AQUA VELVA MAN.,1304380800,25.0,
4,5.0,True,"05 6, 2011",A3SFRT223XXWF7,B00006L9LC,{'Size:': ' 200ml/6.7oz'},C. C. Christian,If you ever want to feel pampered by a shampoo...,Bvlgari Shampoo,1304640000,3.0,


In [7]:
df = df[["overall", "reviewText", "summary", "verified"]]

In [8]:
print(df.isna().sum() / len(df) * 100)

overall       0.000000
reviewText    0.049506
summary       0.025620
verified      0.000000
dtype: float64


In [9]:
df = df.dropna()

In [10]:
print(df.isna().sum() / len(df) * 100)

overall       0.0
reviewText    0.0
summary       0.0
verified      0.0
dtype: float64


In [11]:
df["sentiment"] = df["overall"].apply(lambda x: 1 if x > 3 else -1 if x < 3 else 0)
df["reviewTextWithSummary"] = df["summary"] + " " + df["reviewText"]
df.drop(["overall", "summary"], axis=1, inplace=True)
df.head()

Unnamed: 0,reviewText,verified,sentiment,reviewTextWithSummary
0,As advertised. Reasonably priced,True,1,Five Stars As advertised. Reasonably priced
1,Like the oder and the feel when I put it on my...,True,1,Good for the face Like the oder and the feel w...
2,I bought this to smell nice after I shave. Wh...,True,-1,Smells awful I bought this to smell nice after...
3,HEY!! I am an Aqua Velva Man and absolutely lo...,False,1,Truth is There IS Nothing Like an AQUA VELVA M...
4,If you ever want to feel pampered by a shampoo...,True,1,Bvlgari Shampoo If you ever want to feel pampe...


In [12]:
df["sentiment"].value_counts()

sentiment
 1    657241
 0     47109
-1     44501
Name: count, dtype: int64

In [13]:
df.head()

Unnamed: 0,reviewText,verified,sentiment,reviewTextWithSummary
0,As advertised. Reasonably priced,True,1,Five Stars As advertised. Reasonably priced
1,Like the oder and the feel when I put it on my...,True,1,Good for the face Like the oder and the feel w...
2,I bought this to smell nice after I shave. Wh...,True,-1,Smells awful I bought this to smell nice after...
3,HEY!! I am an Aqua Velva Man and absolutely lo...,False,1,Truth is There IS Nothing Like an AQUA VELVA M...
4,If you ever want to feel pampered by a shampoo...,True,1,Bvlgari Shampoo If you ever want to feel pampe...


# Model experimentation

In [14]:
df_testing = (
    df.groupby("sentiment")
    .apply(lambda x: x.sample(n=testing_n, random_state=random_state, replace=True))
    .reset_index(drop=True)
)
df_testing["sentiment"].value_counts()

sentiment
-1    5000
 0    5000
 1    5000
Name: count, dtype: int64

In [15]:
df_testing.head()

Unnamed: 0,reviewText,verified,sentiment,reviewTextWithSummary
0,Intuit is a despicable company now. This is th...,False,-1,Intuit is One Unethical Company. Any alternati...
1,Very disappointed guitar came damaged with den...,True,-1,Very disappointed Very disappointed guitar cam...
2,The rings did not perform as I had hoped. They...,True,-1,Not what I was hoping for The rings did not pe...
3,My ProFX8 purchased from Amazon in 2015 was po...,False,-1,Mine broke. Very light use. Padded case. Maybe...
4,"Not the greatest, really flimsy.",True,-1,"Two Stars Not the greatest, really flimsy."


In [16]:
print("Dataset size:", len(df_testing))

Dataset size: 15000


In [17]:
STOP_WORDS = set(stopwords.words("english"))

In [18]:
def preprocess_text(sentence, stop, type_proc=None):
    words = []
    for word in sentence.lower().strip().split():

        word = re.sub("\d", "", word)
        word = re.sub("[^\w\s]", "", word)

        if word not in stop and word != "":
            words.append(preprocess_type(word, type_proc))

    return " ".join(words)

In [19]:
def preprocess_type(word, type_proc):
    if type_proc == baseline:
        return word
    elif type_proc == stem:
        return PorterStemmer().stem(word)
    elif type_proc == lem:
        return WordNetLemmatizer().lemmatize(word)
    else:
        raise ValueError("Invalid Preprocessing Type")

In [20]:
def my_train_test_split(cols, test_size, df=df, random_state=random_state):
    x_train, x_test, y_train, y_test = train_test_split(
        df[cols], df["sentiment"], test_size=test_size, random_state=random_state
    )
    return x_train, x_test, y_train, y_test

In [21]:
def apply_preprocessing(proc, x_train, x_test):
    if proc is None:
        return x_train, x_test
    cols = x_train.columns

    textcol = "reviewText"
    if "reviewText" not in cols and "reviewTextWithSummary" in cols:
        textcol = "reviewTextWithSummary"
    x_train[textcol] = x_train[textcol].apply(
        lambda x: preprocess_text(x, STOP_WORDS, proc)
    )
    x_test[textcol] = x_test[textcol].apply(
        lambda x: preprocess_text(x, STOP_WORDS, proc)
    )
    return x_train, x_test

In [22]:
def add_col(x, col):
    col = np.array([col]).T
    return hstack([x, col])

In [23]:
def pipeline(cols, test_size, proc, vectorizer, df=df, random_state=random_state):
    if "reviewText" not in cols and "reviewTextWithSummary" not in cols:
        raise ValueError("Must contain reviewText or reviewTextWithSummary")

    textcol = "reviewText"
    if "reviewText" not in cols and "reviewTextWithSummary" in cols:
        textcol = "reviewTextWithSummary"
    x_train, x_test, y_train, y_test = my_train_test_split(
        cols, test_size, df, random_state
    )
    x_train, x_test = apply_preprocessing(proc, x_train, x_test)
    if vectorizer == bow:
        vectorizer = CountVectorizer()
    elif vectorizer == tfidf:
        vectorizer = TfidfVectorizer()
    else:
        raise ValueError("Invalid Vectorizer")
    x_train_ = vectorizer.fit_transform(x_train[textcol])
    x_test_ = vectorizer.transform(x_test[textcol])

    if "verified" in cols:
        x_train = add_col(x_train_, x_train["verified"])
        x_test = add_col(x_test_, x_test["verified"])
    else:
        x_train = x_train_
        x_test = x_test_
    return x_train, x_test, y_train, y_test

## Testing different configs

In [24]:
param_grid = {
    "C": [0.1, 1, 10, 100, 1000],
    "gamma": [1, 0.1, 0.01, 0.001, 0.0001],
    "kernel": ["rbf", "linear"],
}
n_jobs = -1
verbose = 0
cv = 3

In [25]:
compare_list = pd.DataFrame(
    columns=["Grid Params", "Data config and preprocessing", "Grid Score"]
)

In [26]:
code_gen = False
col_comb = [
    ["reviewText"],
    ["reviewText", "verified"],
    ["reviewTextWithSummary"],
    ["reviewTextWithSummary", "verified"],
]
proc_comb = [None, baseline, stem, lem]
vectorizer_comb = [bow, tfidf]
if code_gen:
    for col in col_comb:
        for proc in proc_comb:
            for vectorizer in vectorizer_comb:
                params = {
                    "col": col,
                    "proc": proc,
                    "vectorizer": vectorizer,
                }
                print(
                    f"""
# %%
x_train, x_test, y_train, y_test = pipeline({col}, 0.25, {proc}, {vectorizer}, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
grid_score = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {params}, grid_score]
"""
                )

In [27]:
# below is code genderated by above cell, to make changes to the code, edit the
# above cell and run it, pasting its contents between the markers

In [28]:
####### START OF GENERATED CODE #######
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, None, bow, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
grid_score = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText'], 'proc': None, 'vectorizer': 'bow'}, grid_score]

              precision    recall  f1-score   support

          -1       0.72      0.69      0.71      1284
           0       0.60      0.56      0.58      1225
           1       0.75      0.84      0.79      1241

    accuracy                           0.70      3750
   macro avg       0.69      0.70      0.69      3750
weighted avg       0.69      0.70      0.69      3750



In [29]:
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, None, tfidf, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
grid_score = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText'], 'proc': None, 'vectorizer': 'tfidf'}, grid_score]

              precision    recall  f1-score   support

          -1       0.74      0.74      0.74      1284
           0       0.62      0.64      0.63      1225
           1       0.82      0.79      0.80      1241

    accuracy                           0.72      3750
   macro avg       0.73      0.72      0.73      3750
weighted avg       0.73      0.72      0.73      3750



In [30]:
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, baseline, bow, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
grid_score = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText'], 'proc': 'baseline', 'vectorizer': 'bow'}, grid_score]

              precision    recall  f1-score   support

          -1       0.70      0.68      0.69      1284
           0       0.59      0.54      0.56      1225
           1       0.71      0.80      0.75      1241

    accuracy                           0.67      3750
   macro avg       0.67      0.67      0.67      3750
weighted avg       0.67      0.67      0.67      3750



In [31]:
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, baseline, tfidf, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
grid_score = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText'], 'proc': 'baseline', 'vectorizer': 'tfidf'}, grid_score]

              precision    recall  f1-score   support

          -1       0.71      0.71      0.71      1284
           0       0.60      0.63      0.61      1225
           1       0.80      0.76      0.78      1241

    accuracy                           0.70      3750
   macro avg       0.70      0.70      0.70      3750
weighted avg       0.70      0.70      0.70      3750



In [32]:
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, stem, bow, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
grid_score = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText'], 'proc': 'stem', 'vectorizer': 'bow'}, grid_score]

              precision    recall  f1-score   support

          -1       0.70      0.68      0.69      1284
           0       0.59      0.56      0.57      1225
           1       0.72      0.78      0.75      1241

    accuracy                           0.67      3750
   macro avg       0.67      0.67      0.67      3750
weighted avg       0.67      0.67      0.67      3750



In [33]:
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, stem, tfidf, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
grid_score = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText'], 'proc': 'stem', 'vectorizer': 'tfidf'}, grid_score]

              precision    recall  f1-score   support

          -1       0.70      0.69      0.70      1284
           0       0.59      0.63      0.61      1225
           1       0.79      0.75      0.77      1241

    accuracy                           0.69      3750
   macro avg       0.69      0.69      0.69      3750
weighted avg       0.70      0.69      0.69      3750



In [34]:
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, lem, bow, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
grid_score = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText'], 'proc': 'lem', 'vectorizer': 'bow'}, grid_score]

              precision    recall  f1-score   support

          -1       0.69      0.64      0.66      1284
           0       0.58      0.54      0.56      1225
           1       0.71      0.81      0.76      1241

    accuracy                           0.66      3750
   macro avg       0.66      0.66      0.66      3750
weighted avg       0.66      0.66      0.66      3750



In [35]:
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, lem, tfidf, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
grid_score = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText'], 'proc': 'lem', 'vectorizer': 'tfidf'}, grid_score]

              precision    recall  f1-score   support

          -1       0.70      0.70      0.70      1284
           0       0.60      0.63      0.61      1225
           1       0.79      0.76      0.77      1241

    accuracy                           0.69      3750
   macro avg       0.70      0.69      0.69      3750
weighted avg       0.70      0.69      0.69      3750



In [36]:
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, None, bow, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
grid_score = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText', 'verified'], 'proc': None, 'vectorizer': 'bow'}, grid_score]

              precision    recall  f1-score   support

          -1       0.72      0.70      0.71      1284
           0       0.61      0.56      0.58      1225
           1       0.76      0.83      0.79      1241

    accuracy                           0.70      3750
   macro avg       0.69      0.70      0.70      3750
weighted avg       0.69      0.70      0.70      3750



In [37]:
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, None, tfidf, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
grid_score = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText', 'verified'], 'proc': None, 'vectorizer': 'tfidf'}, grid_score]

              precision    recall  f1-score   support

          -1       0.73      0.75      0.74      1284
           0       0.62      0.64      0.63      1225
           1       0.82      0.78      0.80      1241

    accuracy                           0.72      3750
   macro avg       0.73      0.72      0.72      3750
weighted avg       0.73      0.72      0.72      3750



In [38]:
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, baseline, bow, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
grid_score = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText', 'verified'], 'proc': 'baseline', 'vectorizer': 'bow'}, grid_score]

              precision    recall  f1-score   support

          -1       0.70      0.69      0.69      1284
           0       0.60      0.56      0.58      1225
           1       0.73      0.79      0.76      1241

    accuracy                           0.68      3750
   macro avg       0.67      0.68      0.67      3750
weighted avg       0.67      0.68      0.68      3750



In [39]:
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, baseline, tfidf, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
grid_score = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText', 'verified'], 'proc': 'baseline', 'vectorizer': 'tfidf'}, grid_score]

              precision    recall  f1-score   support

          -1       0.70      0.72      0.71      1284
           0       0.59      0.61      0.60      1225
           1       0.80      0.75      0.77      1241

    accuracy                           0.69      3750
   macro avg       0.70      0.69      0.70      3750
weighted avg       0.70      0.69      0.70      3750



In [40]:
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, stem, bow, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
grid_score = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText', 'verified'], 'proc': 'stem', 'vectorizer': 'bow'}, grid_score]

              precision    recall  f1-score   support

          -1       0.70      0.68      0.69      1284
           0       0.59      0.56      0.57      1225
           1       0.73      0.78      0.76      1241

    accuracy                           0.67      3750
   macro avg       0.67      0.67      0.67      3750
weighted avg       0.67      0.67      0.67      3750



In [41]:
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, stem, tfidf, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
grid_score = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText', 'verified'], 'proc': 'stem', 'vectorizer': 'tfidf'}, grid_score]

              precision    recall  f1-score   support

          -1       0.70      0.70      0.70      1284
           0       0.59      0.63      0.61      1225
           1       0.80      0.74      0.77      1241

    accuracy                           0.69      3750
   macro avg       0.69      0.69      0.69      3750
weighted avg       0.70      0.69      0.69      3750



In [42]:
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, lem, bow, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
grid_score = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText', 'verified'], 'proc': 'lem', 'vectorizer': 'bow'}, grid_score]

              precision    recall  f1-score   support

          -1       0.68      0.67      0.67      1284
           0       0.59      0.54      0.56      1225
           1       0.72      0.79      0.75      1241

    accuracy                           0.67      3750
   macro avg       0.66      0.67      0.66      3750
weighted avg       0.66      0.67      0.66      3750



In [43]:
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, lem, tfidf, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
grid_score = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText', 'verified'], 'proc': 'lem', 'vectorizer': 'tfidf'}, grid_score]

              precision    recall  f1-score   support

          -1       0.69      0.71      0.70      1284
           0       0.59      0.62      0.60      1225
           1       0.79      0.75      0.77      1241

    accuracy                           0.69      3750
   macro avg       0.69      0.69      0.69      3750
weighted avg       0.69      0.69      0.69      3750



In [44]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, None, bow, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
grid_score = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary'], 'proc': None, 'vectorizer': 'bow'}, grid_score]

              precision    recall  f1-score   support

          -1       0.77      0.78      0.78      1284
           0       0.71      0.69      0.70      1225
           1       0.85      0.88      0.86      1241

    accuracy                           0.78      3750
   macro avg       0.78      0.78      0.78      3750
weighted avg       0.78      0.78      0.78      3750



In [45]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, None, tfidf, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
grid_score = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary'], 'proc': None, 'vectorizer': 'tfidf'}, grid_score]

              precision    recall  f1-score   support

          -1       0.79      0.79      0.79      1284
           0       0.70      0.74      0.72      1225
           1       0.88      0.84      0.86      1241

    accuracy                           0.79      3750
   macro avg       0.79      0.79      0.79      3750
weighted avg       0.79      0.79      0.79      3750



In [46]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, baseline, bow, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
grid_score = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary'], 'proc': 'baseline', 'vectorizer': 'bow'}, grid_score]

              precision    recall  f1-score   support

          -1       0.74      0.74      0.74      1284
           0       0.68      0.66      0.67      1225
           1       0.83      0.85      0.84      1241

    accuracy                           0.75      3750
   macro avg       0.75      0.75      0.75      3750
weighted avg       0.75      0.75      0.75      3750



In [47]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, baseline, tfidf, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
grid_score = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary'], 'proc': 'baseline', 'vectorizer': 'tfidf'}, grid_score]

              precision    recall  f1-score   support

          -1       0.76      0.77      0.77      1284
           0       0.69      0.72      0.71      1225
           1       0.88      0.82      0.85      1241

    accuracy                           0.77      3750
   macro avg       0.78      0.77      0.77      3750
weighted avg       0.78      0.77      0.77      3750



In [48]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, stem, bow, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
grid_score = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary'], 'proc': 'stem', 'vectorizer': 'bow'}, grid_score]

              precision    recall  f1-score   support

          -1       0.73      0.72      0.73      1284
           0       0.67      0.65      0.66      1225
           1       0.81      0.85      0.83      1241

    accuracy                           0.74      3750
   macro avg       0.74      0.74      0.74      3750
weighted avg       0.74      0.74      0.74      3750



In [49]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, stem, tfidf, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
grid_score = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary'], 'proc': 'stem', 'vectorizer': 'tfidf'}, grid_score]

              precision    recall  f1-score   support

          -1       0.76      0.75      0.76      1284
           0       0.68      0.72      0.70      1225
           1       0.86      0.82      0.84      1241

    accuracy                           0.76      3750
   macro avg       0.77      0.76      0.77      3750
weighted avg       0.77      0.76      0.77      3750



In [50]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, lem, bow, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
grid_score = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary'], 'proc': 'lem', 'vectorizer': 'bow'}, grid_score]

              precision    recall  f1-score   support

          -1       0.75      0.73      0.74      1284
           0       0.67      0.68      0.68      1225
           1       0.83      0.83      0.83      1241

    accuracy                           0.75      3750
   macro avg       0.75      0.75      0.75      3750
weighted avg       0.75      0.75      0.75      3750



In [51]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, lem, tfidf, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
grid_score = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary'], 'proc': 'lem', 'vectorizer': 'tfidf'}, grid_score]

              precision    recall  f1-score   support

          -1       0.76      0.77      0.76      1284
           0       0.69      0.71      0.70      1225
           1       0.87      0.82      0.84      1241

    accuracy                           0.77      3750
   macro avg       0.77      0.77      0.77      3750
weighted avg       0.77      0.77      0.77      3750



In [52]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary', 'verified'], 0.25, None, bow, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
grid_score = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary', 'verified'], 'proc': None, 'vectorizer': 'bow'}, grid_score]

              precision    recall  f1-score   support

          -1       0.78      0.79      0.79      1284
           0       0.72      0.70      0.71      1225
           1       0.86      0.87      0.86      1241

    accuracy                           0.79      3750
   macro avg       0.79      0.79      0.79      3750
weighted avg       0.79      0.79      0.79      3750



In [53]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary', 'verified'], 0.25, None, tfidf, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
grid_score = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary', 'verified'], 'proc': None, 'vectorizer': 'tfidf'}, grid_score]

              precision    recall  f1-score   support

          -1       0.78      0.79      0.79      1284
           0       0.71      0.72      0.71      1225
           1       0.88      0.85      0.86      1241

    accuracy                           0.79      3750
   macro avg       0.79      0.79      0.79      3750
weighted avg       0.79      0.79      0.79      3750



In [54]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary', 'verified'], 0.25, baseline, bow, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
grid_score = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary', 'verified'], 'proc': 'baseline', 'vectorizer': 'bow'}, grid_score]

              precision    recall  f1-score   support

          -1       0.74      0.75      0.74      1284
           0       0.69      0.66      0.67      1225
           1       0.83      0.85      0.84      1241

    accuracy                           0.75      3750
   macro avg       0.75      0.75      0.75      3750
weighted avg       0.75      0.75      0.75      3750



In [55]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary', 'verified'], 0.25, baseline, tfidf, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
grid_score = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary', 'verified'], 'proc': 'baseline', 'vectorizer': 'tfidf'}, grid_score]

              precision    recall  f1-score   support

          -1       0.76      0.77      0.76      1284
           0       0.69      0.71      0.70      1225
           1       0.86      0.84      0.85      1241

    accuracy                           0.77      3750
   macro avg       0.77      0.77      0.77      3750
weighted avg       0.77      0.77      0.77      3750



In [56]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary', 'verified'], 0.25, stem, bow, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
grid_score = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary', 'verified'], 'proc': 'stem', 'vectorizer': 'bow'}, grid_score]

              precision    recall  f1-score   support

          -1       0.73      0.73      0.73      1284
           0       0.67      0.65      0.66      1225
           1       0.81      0.85      0.83      1241

    accuracy                           0.74      3750
   macro avg       0.74      0.74      0.74      3750
weighted avg       0.74      0.74      0.74      3750



In [57]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary', 'verified'], 0.25, stem, tfidf, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
grid_score = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary', 'verified'], 'proc': 'stem', 'vectorizer': 'tfidf'}, grid_score]

              precision    recall  f1-score   support

          -1       0.75      0.75      0.75      1284
           0       0.68      0.69      0.69      1225
           1       0.85      0.83      0.84      1241

    accuracy                           0.76      3750
   macro avg       0.76      0.76      0.76      3750
weighted avg       0.76      0.76      0.76      3750



In [58]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary', 'verified'], 0.25, lem, bow, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
grid_score = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary', 'verified'], 'proc': 'lem', 'vectorizer': 'bow'}, grid_score]

              precision    recall  f1-score   support

          -1       0.74      0.73      0.73      1284
           0       0.67      0.68      0.67      1225
           1       0.83      0.83      0.83      1241

    accuracy                           0.75      3750
   macro avg       0.75      0.75      0.75      3750
weighted avg       0.75      0.75      0.75      3750



In [59]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary', 'verified'], 0.25, lem, tfidf, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
grid_score = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary', 'verified'], 'proc': 'lem', 'vectorizer': 'tfidf'}, grid_score]
#######  END OF GENERATED CODE  #######

              precision    recall  f1-score   support

          -1       0.76      0.76      0.76      1284
           0       0.69      0.70      0.69      1225
           1       0.86      0.83      0.84      1241

    accuracy                           0.77      3750
   macro avg       0.77      0.76      0.77      3750
weighted avg       0.77      0.77      0.77      3750



In [60]:
pd.set_option("display.max_colwidth", None)
compare_list = compare_list.sort_values(by="Grid Score", ascending=False).reset_index(drop=True)
display(compare_list)

Unnamed: 0,Grid Params,Data config and preprocessing,Grid Score
0,"{'C': 1, 'gamma': 1, 'kernel': 'rbf'}","{'col': ['reviewTextWithSummary'], 'proc': None, 'vectorizer': 'tfidf'}",0.7896
1,"{'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}","{'col': ['reviewTextWithSummary', 'verified'], 'proc': None, 'vectorizer': 'bow'}",0.787467
2,"{'C': 10, 'gamma': 1, 'kernel': 'rbf'}","{'col': ['reviewTextWithSummary', 'verified'], 'proc': None, 'vectorizer': 'tfidf'}",0.7872
3,"{'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}","{'col': ['reviewTextWithSummary'], 'proc': None, 'vectorizer': 'bow'}",0.7816
4,"{'C': 1, 'gamma': 1, 'kernel': 'rbf'}","{'col': ['reviewTextWithSummary'], 'proc': 'baseline', 'vectorizer': 'tfidf'}",0.7728
5,"{'C': 10, 'gamma': 1, 'kernel': 'rbf'}","{'col': ['reviewTextWithSummary', 'verified'], 'proc': 'baseline', 'vectorizer': 'tfidf'}",0.770133
6,"{'C': 1, 'gamma': 1, 'kernel': 'rbf'}","{'col': ['reviewTextWithSummary'], 'proc': 'lem', 'vectorizer': 'tfidf'}",0.767733
7,"{'C': 10, 'gamma': 1, 'kernel': 'rbf'}","{'col': ['reviewTextWithSummary', 'verified'], 'proc': 'lem', 'vectorizer': 'tfidf'}",0.765067
8,"{'C': 1, 'gamma': 1, 'kernel': 'rbf'}","{'col': ['reviewTextWithSummary'], 'proc': 'stem', 'vectorizer': 'tfidf'}",0.764267
9,"{'C': 10, 'gamma': 1, 'kernel': 'rbf'}","{'col': ['reviewTextWithSummary', 'verified'], 'proc': 'stem', 'vectorizer': 'tfidf'}",0.7608


In [61]:
print(f"Best Configuration on testing dataset (size={len(df_testing)}):")
print("Score :: ", compare_list.loc[0]["Grid Score"])
print("SVC   :: ", compare_list.loc[0]["Grid Params"])
print("data  :: ", compare_list.loc[0]["Data config and preprocessing"])

Best Configuration on testing dataset (size=15000):
Score ::  0.7896
SVC   ::  {'C': 1, 'gamma': 1, 'kernel': 'rbf'}
data  ::  {'col': ['reviewTextWithSummary'], 'proc': None, 'vectorizer': 'tfidf'}


- Across all tests, reviewText with summary performed better than reviewText without summary.
- The RBF kernel performed better than the linear kernel in almost all cases.
- The top configuration was as follows:
  - Data::
    - Columns used: reviewTextWithSummary
    - Text preprocessing step: None
    - Text vectorizer: tfidf
  - SVC::
    - C=1
    - gamma=1
    - kernel=rbf

Using these parameters, lets build a model on a larger dataset.

## Building larger model

In [63]:
df_large = (
    df.groupby("sentiment")
    .apply(lambda x: x.sample(n=large_n, random_state=random_state, replace=True))
    .reset_index(drop=True)
)

In [64]:
sample = [
    "I loved this product, it was amazing",
    "I hated this product, it was terrible",
    "This product was okay, it was fine",
    "I am not sure how I feel about this product",
    "Apple really outdid themselves with this product",
    "The engine was really loud, but otherwise the car was fine",
]

### Top performing model

In [65]:
x_train, x_test, y_train, y_test = train_test_split(
    df_large[["reviewTextWithSummary"]],
    df_large["sentiment"],
    test_size=0.25,
    random_state=random_state,
)

In [66]:
vec_testing_top_config = TfidfVectorizer()
x_train = vec_testing_top_config.fit_transform(x_train["reviewTextWithSummary"])
x_test = vec_testing_top_config.transform(x_test["reviewTextWithSummary"])

In [67]:
svc_testing_top_config = SVC(C=1, gamma=1, kernel="rbf")

In [68]:
svc_testing_top_config.fit(x_train, y_train)

In [69]:
y_pred = svc_testing_top_config.predict(x_test)

In [70]:
sample_ = [vec_testing_top_config.transform([x]) for x in sample]

In [71]:
for s, p in zip(sample, sample_):
    print(s)
    print(svc_testing_top_config.predict(p))
    print()

I loved this product, it was amazing
[1]

I hated this product, it was terrible
[-1]

This product was okay, it was fine
[0]

I am not sure how I feel about this product
[-1]

Apple really outdid themselves with this product
[-1]

The engine was really loud, but otherwise the car was fine
[0]



In [72]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.89      0.90      0.90     12448
           0       0.84      0.87      0.86     12571
           1       0.93      0.89      0.91     12481

    accuracy                           0.89     37500
   macro avg       0.89      0.89      0.89     37500
weighted avg       0.89      0.89      0.89     37500



- Results seem very good with an F1 Score of 0.87
- Interestingly, the top perfomring model did not use any text preprocessing
- The next best performing models that used different text preprocessing were as follows:
  - baseline text preprocessing with tfidf vectorizer: score of 0.7728
  - lemmatized text preprocessing with tfidf vectorizer: score of 0.7677
  - stemmed text preprocessing with tfidf vectorizer: score of 0.7642
- These results are within 2% than the selected model, so it may be worth exploring these models further

### Baseline with tfidf

In [111]:
print("Best configuration using baseline text preprocessing")
print("Score :: ", compare_list.loc[4]["Grid Score"])
print("SVC   :: ", compare_list.loc[4]["Grid Params"])
print("data  :: ", compare_list.loc[4]["Data config and preprocessing"])

Best configuration using baseline text preprocessing
Score ::  0.7728
SVC   ::  {'C': 1, 'gamma': 1, 'kernel': 'rbf'}
data  ::  {'col': ['reviewTextWithSummary'], 'proc': 'baseline', 'vectorizer': 'tfidf'}


In [73]:
x_train, x_test, y_train, y_test = train_test_split(
    df_large[["reviewTextWithSummary"]],
    df_large["sentiment"],
    test_size=0.25,
    random_state=random_state,
)

In [74]:
x_train["reviewTextWithSummary"] = x_train["reviewTextWithSummary"].apply(
    lambda x: preprocess_text(x, STOP_WORDS, baseline)
)
x_test["reviewTextWithSummary"] = x_test["reviewTextWithSummary"].apply(
    lambda x: preprocess_text(x, STOP_WORDS, baseline)
)

In [75]:
vec_baseline_tfidf = TfidfVectorizer()
x_train = vec_baseline_tfidf.fit_transform(x_train["reviewTextWithSummary"])
x_test = vec_baseline_tfidf.transform(x_test["reviewTextWithSummary"])

In [76]:
svc_baseline_tfidf = SVC(C=1, gamma=1, kernel="rbf")

In [77]:
svc_baseline_tfidf.fit(x_train, y_train)

In [78]:
y_pred = svc_baseline_tfidf.predict(x_test)

In [79]:
sample_ = [preprocess_text(x, STOP_WORDS, baseline) for x in sample]
sample_ = [vec_baseline_tfidf.transform([x]) for x in sample_]

In [80]:
for s, p in zip(sample, sample_):
    print(s)
    print(svc_baseline_tfidf.predict(p))
    print()

I loved this product, it was amazing
[1]

I hated this product, it was terrible
[-1]

This product was okay, it was fine
[0]

I am not sure how I feel about this product
[-1]

Apple really outdid themselves with this product
[-1]

The engine was really loud, but otherwise the car was fine
[0]



In [81]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.88      0.90      0.89     12448
           0       0.84      0.85      0.85     12571
           1       0.92      0.89      0.90     12481

    accuracy                           0.88     37500
   macro avg       0.88      0.88      0.88     37500
weighted avg       0.88      0.88      0.88     37500



### Lemmatized with tfidf

In [112]:
print("Best configuration using baseline text preprocessing")
print("Score :: ", compare_list.loc[6]["Grid Score"])
print("SVC   :: ", compare_list.loc[6]["Grid Params"])
print("data  :: ", compare_list.loc[6]["Data config and preprocessing"])

Best configuration using baseline text preprocessing
Score ::  0.7677333333333334
SVC   ::  {'C': 1, 'gamma': 1, 'kernel': 'rbf'}
data  ::  {'col': ['reviewTextWithSummary'], 'proc': 'lem', 'vectorizer': 'tfidf'}


In [82]:
x_train, x_test, y_train, y_test = train_test_split(
    df_large[["reviewTextWithSummary"]],
    df_large["sentiment"],
    test_size=0.25,
    random_state=random_state,
)

In [83]:
x_train["reviewTextWithSummary"] = x_train["reviewTextWithSummary"].apply(
    lambda x: preprocess_text(x, STOP_WORDS, lem)
)
x_test["reviewTextWithSummary"] = x_test["reviewTextWithSummary"].apply(
    lambda x: preprocess_text(x, STOP_WORDS, lem)
)

In [84]:
vec_lematized_tfidf = TfidfVectorizer()
x_train = vec_lematized_tfidf.fit_transform(x_train["reviewTextWithSummary"])
x_test = vec_lematized_tfidf.transform(x_test["reviewTextWithSummary"])

In [85]:
svc_lematized_tfidf = SVC(C=1, gamma=1, kernel="rbf")

In [86]:
svc_lematized_tfidf.fit(x_train, y_train)

In [87]:
y_pred = svc_lematized_tfidf.predict(x_test)

In [88]:
sample_ = [preprocess_text(x, STOP_WORDS, lem) for x in sample]
sample_ = [vec_lematized_tfidf.transform([x]) for x in sample_]

In [89]:
for s, p in zip(sample, sample_):
    print(s)
    print(svc_lematized_tfidf.predict(p))
    print()

I loved this product, it was amazing
[1]

I hated this product, it was terrible
[-1]

This product was okay, it was fine
[0]

I am not sure how I feel about this product
[-1]

Apple really outdid themselves with this product
[-1]

The engine was really loud, but otherwise the car was fine
[0]



In [90]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.88      0.90      0.89     12448
           0       0.84      0.85      0.84     12571
           1       0.92      0.88      0.90     12481

    accuracy                           0.88     37500
   macro avg       0.88      0.88      0.88     37500
weighted avg       0.88      0.88      0.88     37500



### Stemmed with tfidf

In [113]:
print("Best configuration using baseline text preprocessing")
print("Score :: ", compare_list.loc[8]["Grid Score"])
print("SVC   :: ", compare_list.loc[8]["Grid Params"])
print("data  :: ", compare_list.loc[8]["Data config and preprocessing"])

Best configuration using baseline text preprocessing
Score ::  0.7642666666666666
SVC   ::  {'C': 1, 'gamma': 1, 'kernel': 'rbf'}
data  ::  {'col': ['reviewTextWithSummary'], 'proc': 'stem', 'vectorizer': 'tfidf'}


In [91]:
x_train, x_test, y_train, y_test = train_test_split(
    df_large[["reviewTextWithSummary"]],
    df_large["sentiment"],
    test_size=0.25,
    random_state=random_state,
)

In [92]:
x_train["reviewTextWithSummary"] = x_train["reviewTextWithSummary"].apply(
    lambda x: preprocess_text(x, STOP_WORDS, stem)
)
x_test["reviewTextWithSummary"] = x_test["reviewTextWithSummary"].apply(
    lambda x: preprocess_text(x, STOP_WORDS, stem)
)

In [93]:
vec_stemmed_tfidf = TfidfVectorizer()
x_train = vec_stemmed_tfidf.fit_transform(x_train["reviewTextWithSummary"])
x_test = vec_stemmed_tfidf.transform(x_test["reviewTextWithSummary"])

In [94]:
svc_stemmed_tfidf = SVC(C=1, gamma=1, kernel="rbf")

In [95]:
svc_stemmed_tfidf.fit(x_train, y_train)

In [96]:
y_pred = svc_stemmed_tfidf.predict(x_test)

In [97]:
sample_ = [preprocess_text(x, STOP_WORDS, stem) for x in sample]
sample_ = [vec_stemmed_tfidf.transform([x]) for x in sample_]

In [98]:
for s, p in zip(sample, sample_):
    print(s)
    print(svc_stemmed_tfidf.predict(p))
    print()

I loved this product, it was amazing
[1]

I hated this product, it was terrible
[-1]

This product was okay, it was fine
[0]

I am not sure how I feel about this product
[0]

Apple really outdid themselves with this product
[-1]

The engine was really loud, but otherwise the car was fine
[0]



In [99]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.88      0.89      0.89     12448
           0       0.84      0.85      0.84     12571
           1       0.91      0.88      0.90     12481

    accuracy                           0.88     37500
   macro avg       0.88      0.88      0.88     37500
weighted avg       0.88      0.88      0.88     37500



- All models perform within a percent of each other based on their own test sets
- lets compare each of these models on the full dataset

## Testing on full dataset

In [100]:
x = df[["reviewTextWithSummary"]]
y = df["sentiment"]

In [101]:
x_top_config = vec_testing_top_config.transform(x["reviewTextWithSummary"])
x_baseline_tfidf = vec_baseline_tfidf.transform(
    x["reviewTextWithSummary"].apply(lambda x: preprocess_text(x, STOP_WORDS, baseline))
)
x_lematized_tfidf = vec_lematized_tfidf.transform(
    x["reviewTextWithSummary"].apply(lambda x: preprocess_text(x, STOP_WORDS, lem))
)
x_stemmed_tfidf = vec_stemmed_tfidf.transform(
    x["reviewTextWithSummary"].apply(lambda x: preprocess_text(x, STOP_WORDS, stem))
)

In [102]:
y_pred_top_config = svc_testing_top_config.predict(x_top_config)
y_pred_baseline_tfidf = svc_baseline_tfidf.predict(x_baseline_tfidf)
y_pred_lematized_tfidf = svc_lematized_tfidf.predict(x_lematized_tfidf)
y_pred_stemmed_tfidf = svc_stemmed_tfidf.predict(x_stemmed_tfidf)

In [103]:
print("Top Config")
print(classification_report(y, y_pred_top_config))
score_top_config = f1_score(y, y_pred_top_config, average="weighted")

Top Config
              precision    recall  f1-score   support

          -1       0.67      0.90      0.77     44501
           0       0.41      0.86      0.56     47109
           1       0.99      0.89      0.94    657241

    accuracy                           0.89    748851
   macro avg       0.69      0.89      0.76    748851
weighted avg       0.94      0.89      0.91    748851



In [104]:
print("Baseline with tfidf")
print(classification_report(y, y_pred_baseline_tfidf))
score_baseline_tfidf = f1_score(y, y_pred_baseline_tfidf, average="weighted")

Baseline with tfidf
              precision    recall  f1-score   support

          -1       0.62      0.90      0.73     44501
           0       0.41      0.85      0.55     47109
           1       0.99      0.89      0.94    657241

    accuracy                           0.88    748851
   macro avg       0.67      0.88      0.74    748851
weighted avg       0.93      0.88      0.90    748851



In [105]:
print("Lematized with tfidf")
print(classification_report(y, y_pred_lematized_tfidf))
score_lematized_tfidf = f1_score(y, y_pred_lematized_tfidf, average="weighted")

Lematized with tfidf
              precision    recall  f1-score   support

          -1       0.61      0.89      0.72     44501
           0       0.41      0.85      0.55     47109
           1       0.99      0.88      0.94    657241

    accuracy                           0.88    748851
   macro avg       0.67      0.88      0.74    748851
weighted avg       0.93      0.88      0.90    748851



In [106]:
print("Stemmed with tfidf")
print(classification_report(y, y_pred_stemmed_tfidf))
score_stemmed_tfidf = f1_score(y, y_pred_stemmed_tfidf, average="weighted")

Stemmed with tfidf
              precision    recall  f1-score   support

          -1       0.60      0.89      0.72     44501
           0       0.40      0.85      0.54     47109
           1       0.99      0.88      0.93    657241

    accuracy                           0.88    748851
   macro avg       0.66      0.87      0.73    748851
weighted avg       0.93      0.88      0.90    748851



In [114]:
print("Scores")
print("Overall top config (No preprocessing) :: ", score_top_config)
print("Baseline preprocessing top config     :: ", score_baseline_tfidf)
print("Lemmatized preprocessing top config   :: ", score_lematized_tfidf)
print("Stemmed preprocessing top config      :: ", score_stemmed_tfidf)

Scores
Overall top config (No preprocessing) ::  0.9067389973023244
Baseline preprocessing top config     ::  0.8998075765013855
Lemmatized preprocessing top config   ::  0.8983562736247437
Stemmed preprocessing top config      ::  0.8963572243555369


## Conclusion
- Performance ranking is the same as the experimental results from before
- All selected models perform at a very high level, all around 88% accurate with F1 scores around 0.9
- The overall top performing model was the one that did not use any text preprocessing with the following:
  - Text was not preprocessesed
  - Text was vectorized using the tfidf vectorizer
  - SVC with C=1, gamma=1, kernel=rbf
  - Weighted F1 score = 0.9067