In [1]:
import pandas as pd
import nltk
import re
import random
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import SVC
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import GridSearchCV
from nltk.corpus import stopwords

nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package punkt to /home/ibrahim/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ibrahim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ibrahim/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
baseline = "baseline"
stem = "stem"
lem = "lem"
bow = "bow"
tfidf = "tfidf"
random_state = 42
small_n = 2500
large_n = 40000
random.seed(random_state)
data_path = "./data/combined.csv"

# EDA and simple preprocessing

In [3]:
df = pd.read_csv(data_path, low_memory=False)
print(df.isna().sum() / len(df) * 100)

overall            0.000000
verified           0.000000
reviewTime         0.000000
reviewerID         0.000000
asin               0.000000
style             50.710965
reviewerName       0.016280
reviewText         0.049506
summary            0.025620
unixReviewTime     0.000000
vote              84.972458
image             97.894193
dtype: float64


In [4]:
print(df.isna().sum())

overall                0
verified               0
reviewTime             0
reviewerID             0
asin                   0
style             380030
reviewerName         122
reviewText           371
summary              192
unixReviewTime         0
vote              636787
image             733623
dtype: int64


In [5]:
shpae = df.shape
print(df.isna().sum().sum() / (shpae[0] * shpae[1]) * 100)

19.47241852636673


In [6]:
df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,5.0,True,"09 1, 2016",A3CIUOJXQ5VDQ2,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",Shelly F,As advertised. Reasonably priced,Five Stars,1472688000,,
1,5.0,True,"11 14, 2013",A3H7T87S984REU,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",houserules18,Like the oder and the feel when I put it on my...,Good for the face,1384387200,,
2,1.0,True,"08 18, 2013",A3J034YH7UG4KT,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",Adam,I bought this to smell nice after I shave. Wh...,Smells awful,1376784000,,
3,5.0,False,"05 3, 2011",A2UEO5XR3598GI,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",Rich K,HEY!! I am an Aqua Velva Man and absolutely lo...,Truth is There IS Nothing Like an AQUA VELVA MAN.,1304380800,25.0,
4,5.0,True,"05 6, 2011",A3SFRT223XXWF7,B00006L9LC,{'Size:': ' 200ml/6.7oz'},C. C. Christian,If you ever want to feel pampered by a shampoo...,Bvlgari Shampoo,1304640000,3.0,


In [7]:
df = df[["overall", "reviewText", "summary", "verified"]]

In [8]:
print(df.isna().sum() / len(df) * 100)

overall       0.000000
reviewText    0.049506
summary       0.025620
verified      0.000000
dtype: float64


In [9]:
df = df.dropna()

In [10]:
print(df.isna().sum() / len(df) * 100)

overall       0.0
reviewText    0.0
summary       0.0
verified      0.0
dtype: float64


In [11]:
df["sentiment"] = df["overall"].apply(lambda x: 1 if x > 3 else -1 if x < 3 else 0)
df["reviewTextWithSummary"] = df["summary"] + " " + df["reviewText"]
df.drop(["overall", "summary"], axis=1, inplace=True)
df.head()

Unnamed: 0,reviewText,verified,sentiment,reviewTextWithSummary
0,As advertised. Reasonably priced,True,1,Five Stars As advertised. Reasonably priced
1,Like the oder and the feel when I put it on my...,True,1,Good for the face Like the oder and the feel w...
2,I bought this to smell nice after I shave. Wh...,True,-1,Smells awful I bought this to smell nice after...
3,HEY!! I am an Aqua Velva Man and absolutely lo...,False,1,Truth is There IS Nothing Like an AQUA VELVA M...
4,If you ever want to feel pampered by a shampoo...,True,1,Bvlgari Shampoo If you ever want to feel pampe...


In [12]:
df["sentiment"].value_counts()

sentiment
 1    657241
 0     47109
-1     44501
Name: count, dtype: int64

In [13]:
df.head()

Unnamed: 0,reviewText,verified,sentiment,reviewTextWithSummary
0,As advertised. Reasonably priced,True,1,Five Stars As advertised. Reasonably priced
1,Like the oder and the feel when I put it on my...,True,1,Good for the face Like the oder and the feel w...
2,I bought this to smell nice after I shave. Wh...,True,-1,Smells awful I bought this to smell nice after...
3,HEY!! I am an Aqua Velva Man and absolutely lo...,False,1,Truth is There IS Nothing Like an AQUA VELVA M...
4,If you ever want to feel pampered by a shampoo...,True,1,Bvlgari Shampoo If you ever want to feel pampe...


# Model experimentation

In [14]:
df_balanced_small = (
    df.groupby("sentiment")
    .apply(lambda x: x.sample(n=small_n, random_state=random_state, replace=True))
    .reset_index(drop=True)
)
df_balanced_small["sentiment"].value_counts()

sentiment
-1    2500
 0    2500
 1    2500
Name: count, dtype: int64

In [15]:
df_balanced_small.head()

Unnamed: 0,reviewText,verified,sentiment,reviewTextWithSummary
0,Intuit is a despicable company now. This is th...,False,-1,Intuit is One Unethical Company. Any alternati...
1,Very disappointed guitar came damaged with den...,True,-1,Very disappointed Very disappointed guitar cam...
2,The rings did not perform as I had hoped. They...,True,-1,Not what I was hoping for The rings did not pe...
3,My ProFX8 purchased from Amazon in 2015 was po...,False,-1,Mine broke. Very light use. Padded case. Maybe...
4,"Not the greatest, really flimsy.",True,-1,"Two Stars Not the greatest, really flimsy."


In [16]:
STOP_WORDS = set(stopwords.words("english"))

In [17]:
def preprocess_text(sentence, stop, type_proc=None):
    words = []
    for word in sentence.lower().strip().split():

        word = re.sub("\d", "", word)
        word = re.sub("[^\w\s]", "", word)

        if word not in stop and word != "":
            words.append(preprocess_type(word, type_proc))

    return " ".join(words)

In [18]:
def preprocess_type(word, type_proc):
    if type_proc == baseline:
        return word
    elif type_proc == stem:
        return PorterStemmer().stem(word)
    elif type_proc == lem:
        return WordNetLemmatizer().lemmatize(word)
    else:
        raise ValueError("Invalid Preprocessing Type")

In [19]:
def my_train_test_split(cols, test_size, df=df, random_state=random_state):
    x_train, x_test, y_train, y_test = train_test_split(
        df[cols], df["sentiment"], test_size=test_size, random_state=random_state
    )
    return x_train, x_test, y_train, y_test

In [20]:
def apply_preprocessing(proc, x_train, x_test):
    if proc is None:
        return x_train, x_test
    cols = x_train.columns

    textcol = "reviewText"
    if "reviewText" not in cols and "reviewTextWithSummary" in cols:
        textcol = "reviewTextWithSummary"
    x_train[textcol] = x_train[textcol].apply(
        lambda x: preprocess_text(x, STOP_WORDS, proc)
    )
    x_test[textcol] = x_test[textcol].apply(
        lambda x: preprocess_text(x, STOP_WORDS, proc)
    )
    return x_train, x_test

In [21]:
def pipeline(cols, test_size, proc, vectorizer, df=df, random_state=random_state):
    if "reviewText" not in cols and "reviewTextWithSummary" not in cols:
        raise ValueError("Must contain reviewText or reviewTextWithSummary")

    textcol = "reviewText"
    if "reviewText" not in cols and "reviewTextWithSummary" in cols:
        textcol = "reviewTextWithSummary"
    x_train, x_test, y_train, y_test = my_train_test_split(
        cols, test_size, df, random_state
    )
    x_train, x_test = apply_preprocessing(proc, x_train, x_test)
    if vectorizer == bow:
        vectorizer = CountVectorizer()
    elif vectorizer == tfidf:
        vectorizer = TfidfVectorizer()
    else:
        raise ValueError("Invalid Vectorizer")
    x_train = vectorizer.fit_transform(x_train[textcol])
    x_test = vectorizer.transform(x_test[textcol])
    return x_train, x_test, y_train, y_test

## Small Balanced Dataset

In [22]:
param_grid = {
    "C": [0.1, 1, 10, 100, 1000],
    "gamma": [1, 0.1, 0.01, 0.001, 0.0001],
    "kernel": ["rbf", "linear"],
}
n_jobs = -1
verbose = 0
cv = 3

In [23]:
compare_list = pd.DataFrame(columns=["Params", "Config", "Accuracy Score"])

In [24]:
code_gen = False
col_comb = [
    ["reviewText"],
    ["reviewText", "verified"],
    ["reviewTextWithSummary"],
    ["reviewTextWithSummary", "verified"],
]
proc_comb = [None, baseline, stem, lem]
vectorizer_comb = [bow, tfidf]
if code_gen:
    for col in col_comb:
        for proc in proc_comb:
            for vectorizer in vectorizer_comb:
                params = {
                    "col": col,
                    "test_size": 0.25,
                    "proc": proc,
                    "vectorizer": vectorizer,
                }
                print(
                    f"""
# %%
x_train, x_test, y_train, y_test = pipeline({col}, 0.25, {proc}, {vectorizer}, df_balanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {params}, accuracy]
"""
                )

In [25]:
# below is code genderated by above cell, to make changes to the code, edit the
# above cell and run it, pasting its contents between the markers

In [26]:
####### START OF GENERATED CODE #######
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, None, bow, df_balanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText'], 'test_size': 0.25, 'proc': None, 'vectorizer': 'bow'}, accuracy]

              precision    recall  f1-score   support

          -1       0.73      0.67      0.70       661
           0       0.59      0.54      0.56       606
           1       0.71      0.84      0.77       608

    accuracy                           0.68      1875
   macro avg       0.68      0.68      0.68      1875
weighted avg       0.68      0.68      0.68      1875



In [27]:
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, None, tfidf, df_balanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText'], 'test_size': 0.25, 'proc': None, 'vectorizer': 'tfidf'}, accuracy]

              precision    recall  f1-score   support

          -1       0.75      0.71      0.73       661
           0       0.59      0.63      0.61       606
           1       0.79      0.79      0.79       608

    accuracy                           0.71      1875
   macro avg       0.71      0.71      0.71      1875
weighted avg       0.71      0.71      0.71      1875



In [28]:
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, baseline, bow, df_balanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText'], 'test_size': 0.25, 'proc': 'baseline', 'vectorizer': 'bow'}, accuracy]

              precision    recall  f1-score   support

          -1       0.68      0.66      0.67       661
           0       0.58      0.52      0.55       606
           1       0.70      0.79      0.74       608

    accuracy                           0.66      1875
   macro avg       0.65      0.66      0.65      1875
weighted avg       0.65      0.66      0.65      1875



In [29]:
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, baseline, tfidf, df_balanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText'], 'test_size': 0.25, 'proc': 'baseline', 'vectorizer': 'tfidf'}, accuracy]

              precision    recall  f1-score   support

          -1       0.72      0.69      0.70       661
           0       0.58      0.61      0.59       606
           1       0.76      0.75      0.76       608

    accuracy                           0.68      1875
   macro avg       0.68      0.68      0.68      1875
weighted avg       0.69      0.68      0.68      1875



In [30]:
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, stem, bow, df_balanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText'], 'test_size': 0.25, 'proc': 'stem', 'vectorizer': 'bow'}, accuracy]

              precision    recall  f1-score   support

          -1       0.67      0.64      0.65       661
           0       0.57      0.48      0.52       606
           1       0.68      0.81      0.74       608

    accuracy                           0.64      1875
   macro avg       0.64      0.64      0.64      1875
weighted avg       0.64      0.64      0.64      1875



In [31]:
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, stem, tfidf, df_balanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText'], 'test_size': 0.25, 'proc': 'stem', 'vectorizer': 'tfidf'}, accuracy]

              precision    recall  f1-score   support

          -1       0.70      0.69      0.69       661
           0       0.57      0.60      0.58       606
           1       0.78      0.75      0.76       608

    accuracy                           0.68      1875
   macro avg       0.68      0.68      0.68      1875
weighted avg       0.68      0.68      0.68      1875



In [32]:
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, lem, bow, df_balanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText'], 'test_size': 0.25, 'proc': 'lem', 'vectorizer': 'bow'}, accuracy]

              precision    recall  f1-score   support

          -1       0.68      0.66      0.67       661
           0       0.57      0.51      0.54       606
           1       0.69      0.79      0.74       608

    accuracy                           0.65      1875
   macro avg       0.65      0.65      0.65      1875
weighted avg       0.65      0.65      0.65      1875



In [33]:
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, lem, tfidf, df_balanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText'], 'test_size': 0.25, 'proc': 'lem', 'vectorizer': 'tfidf'}, accuracy]

              precision    recall  f1-score   support

          -1       0.70      0.69      0.69       661
           0       0.58      0.61      0.59       606
           1       0.78      0.76      0.77       608

    accuracy                           0.68      1875
   macro avg       0.69      0.68      0.69      1875
weighted avg       0.69      0.68      0.69      1875



In [34]:
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, None, bow, df_balanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText', 'verified'], 'test_size': 0.25, 'proc': None, 'vectorizer': 'bow'}, accuracy]

              precision    recall  f1-score   support

          -1       0.73      0.67      0.70       661
           0       0.59      0.54      0.56       606
           1       0.71      0.84      0.77       608

    accuracy                           0.68      1875
   macro avg       0.68      0.68      0.68      1875
weighted avg       0.68      0.68      0.68      1875



In [35]:
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, None, tfidf, df_balanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText', 'verified'], 'test_size': 0.25, 'proc': None, 'vectorizer': 'tfidf'}, accuracy]

              precision    recall  f1-score   support

          -1       0.75      0.71      0.73       661
           0       0.59      0.63      0.61       606
           1       0.79      0.79      0.79       608

    accuracy                           0.71      1875
   macro avg       0.71      0.71      0.71      1875
weighted avg       0.71      0.71      0.71      1875



In [36]:
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, baseline, bow, df_balanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText', 'verified'], 'test_size': 0.25, 'proc': 'baseline', 'vectorizer': 'bow'}, accuracy]

              precision    recall  f1-score   support

          -1       0.68      0.66      0.67       661
           0       0.58      0.52      0.55       606
           1       0.70      0.79      0.74       608

    accuracy                           0.66      1875
   macro avg       0.65      0.66      0.65      1875
weighted avg       0.65      0.66      0.65      1875



In [37]:
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, baseline, tfidf, df_balanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText', 'verified'], 'test_size': 0.25, 'proc': 'baseline', 'vectorizer': 'tfidf'}, accuracy]

              precision    recall  f1-score   support

          -1       0.72      0.69      0.70       661
           0       0.58      0.61      0.59       606
           1       0.76      0.75      0.76       608

    accuracy                           0.68      1875
   macro avg       0.68      0.68      0.68      1875
weighted avg       0.69      0.68      0.68      1875



In [38]:
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, stem, bow, df_balanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText', 'verified'], 'test_size': 0.25, 'proc': 'stem', 'vectorizer': 'bow'}, accuracy]

              precision    recall  f1-score   support

          -1       0.67      0.64      0.65       661
           0       0.57      0.48      0.52       606
           1       0.68      0.81      0.74       608

    accuracy                           0.64      1875
   macro avg       0.64      0.64      0.64      1875
weighted avg       0.64      0.64      0.64      1875



In [39]:
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, stem, tfidf, df_balanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText', 'verified'], 'test_size': 0.25, 'proc': 'stem', 'vectorizer': 'tfidf'}, accuracy]

              precision    recall  f1-score   support

          -1       0.70      0.69      0.69       661
           0       0.57      0.60      0.58       606
           1       0.78      0.75      0.76       608

    accuracy                           0.68      1875
   macro avg       0.68      0.68      0.68      1875
weighted avg       0.68      0.68      0.68      1875



In [40]:
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, lem, bow, df_balanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText', 'verified'], 'test_size': 0.25, 'proc': 'lem', 'vectorizer': 'bow'}, accuracy]

              precision    recall  f1-score   support

          -1       0.68      0.66      0.67       661
           0       0.57      0.51      0.54       606
           1       0.69      0.79      0.74       608

    accuracy                           0.65      1875
   macro avg       0.65      0.65      0.65      1875
weighted avg       0.65      0.65      0.65      1875



In [41]:
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, lem, tfidf, df_balanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText', 'verified'], 'test_size': 0.25, 'proc': 'lem', 'vectorizer': 'tfidf'}, accuracy]

              precision    recall  f1-score   support

          -1       0.70      0.69      0.69       661
           0       0.58      0.61      0.59       606
           1       0.78      0.76      0.77       608

    accuracy                           0.68      1875
   macro avg       0.69      0.68      0.69      1875
weighted avg       0.69      0.68      0.69      1875



In [42]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, None, bow, df_balanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary'], 'test_size': 0.25, 'proc': None, 'vectorizer': 'bow'}, accuracy]

              precision    recall  f1-score   support

          -1       0.78      0.78      0.78       661
           0       0.68      0.65      0.67       606
           1       0.82      0.86      0.84       608

    accuracy                           0.76      1875
   macro avg       0.76      0.76      0.76      1875
weighted avg       0.76      0.76      0.76      1875



In [43]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, None, tfidf, df_balanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary'], 'test_size': 0.25, 'proc': None, 'vectorizer': 'tfidf'}, accuracy]

              precision    recall  f1-score   support

          -1       0.80      0.79      0.79       661
           0       0.67      0.71      0.69       606
           1       0.87      0.83      0.85       608

    accuracy                           0.78      1875
   macro avg       0.78      0.78      0.78      1875
weighted avg       0.78      0.78      0.78      1875



In [44]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, baseline, bow, df_balanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary'], 'test_size': 0.25, 'proc': 'baseline', 'vectorizer': 'bow'}, accuracy]

              precision    recall  f1-score   support

          -1       0.74      0.76      0.75       661
           0       0.69      0.60      0.64       606
           1       0.77      0.84      0.80       608

    accuracy                           0.74      1875
   macro avg       0.73      0.74      0.73      1875
weighted avg       0.73      0.74      0.73      1875



In [45]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, baseline, tfidf, df_balanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary'], 'test_size': 0.25, 'proc': 'baseline', 'vectorizer': 'tfidf'}, accuracy]

              precision    recall  f1-score   support

          -1       0.76      0.77      0.76       661
           0       0.67      0.68      0.68       606
           1       0.83      0.80      0.82       608

    accuracy                           0.75      1875
   macro avg       0.75      0.75      0.75      1875
weighted avg       0.75      0.75      0.75      1875



In [46]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, stem, bow, df_balanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary'], 'test_size': 0.25, 'proc': 'stem', 'vectorizer': 'bow'}, accuracy]

              precision    recall  f1-score   support

          -1       0.73      0.75      0.74       661
           0       0.69      0.60      0.64       606
           1       0.77      0.85      0.80       608

    accuracy                           0.73      1875
   macro avg       0.73      0.73      0.73      1875
weighted avg       0.73      0.73      0.73      1875



In [47]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, stem, tfidf, df_balanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary'], 'test_size': 0.25, 'proc': 'stem', 'vectorizer': 'tfidf'}, accuracy]

              precision    recall  f1-score   support

          -1       0.76      0.76      0.76       661
           0       0.66      0.68      0.67       606
           1       0.84      0.82      0.83       608

    accuracy                           0.75      1875
   macro avg       0.76      0.75      0.75      1875
weighted avg       0.76      0.75      0.75      1875



In [48]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, lem, bow, df_balanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary'], 'test_size': 0.25, 'proc': 'lem', 'vectorizer': 'bow'}, accuracy]

              precision    recall  f1-score   support

          -1       0.73      0.74      0.74       661
           0       0.67      0.59      0.63       606
           1       0.76      0.83      0.79       608

    accuracy                           0.72      1875
   macro avg       0.72      0.72      0.72      1875
weighted avg       0.72      0.72      0.72      1875



In [49]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, lem, tfidf, df_balanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary'], 'test_size': 0.25, 'proc': 'lem', 'vectorizer': 'tfidf'}, accuracy]

              precision    recall  f1-score   support

          -1       0.74      0.75      0.75       661
           0       0.66      0.67      0.66       606
           1       0.84      0.81      0.82       608

    accuracy                           0.75      1875
   macro avg       0.75      0.75      0.75      1875
weighted avg       0.75      0.75      0.75      1875



In [50]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary', 'verified'], 0.25, None, bow, df_balanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary', 'verified'], 'test_size': 0.25, 'proc': None, 'vectorizer': 'bow'}, accuracy]

              precision    recall  f1-score   support

          -1       0.78      0.78      0.78       661
           0       0.68      0.65      0.67       606
           1       0.82      0.86      0.84       608

    accuracy                           0.76      1875
   macro avg       0.76      0.76      0.76      1875
weighted avg       0.76      0.76      0.76      1875



In [51]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary', 'verified'], 0.25, None, tfidf, df_balanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary', 'verified'], 'test_size': 0.25, 'proc': None, 'vectorizer': 'tfidf'}, accuracy]

              precision    recall  f1-score   support

          -1       0.80      0.79      0.79       661
           0       0.67      0.71      0.69       606
           1       0.87      0.83      0.85       608

    accuracy                           0.78      1875
   macro avg       0.78      0.78      0.78      1875
weighted avg       0.78      0.78      0.78      1875



In [52]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary', 'verified'], 0.25, baseline, bow, df_balanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary', 'verified'], 'test_size': 0.25, 'proc': 'baseline', 'vectorizer': 'bow'}, accuracy]

              precision    recall  f1-score   support

          -1       0.74      0.76      0.75       661
           0       0.69      0.60      0.64       606
           1       0.77      0.84      0.80       608

    accuracy                           0.74      1875
   macro avg       0.73      0.74      0.73      1875
weighted avg       0.73      0.74      0.73      1875



In [53]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary', 'verified'], 0.25, baseline, tfidf, df_balanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary', 'verified'], 'test_size': 0.25, 'proc': 'baseline', 'vectorizer': 'tfidf'}, accuracy]

              precision    recall  f1-score   support

          -1       0.76      0.77      0.76       661
           0       0.67      0.68      0.68       606
           1       0.83      0.80      0.82       608

    accuracy                           0.75      1875
   macro avg       0.75      0.75      0.75      1875
weighted avg       0.75      0.75      0.75      1875



In [54]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary', 'verified'], 0.25, stem, bow, df_balanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary', 'verified'], 'test_size': 0.25, 'proc': 'stem', 'vectorizer': 'bow'}, accuracy]

              precision    recall  f1-score   support

          -1       0.73      0.75      0.74       661
           0       0.69      0.60      0.64       606
           1       0.77      0.85      0.80       608

    accuracy                           0.73      1875
   macro avg       0.73      0.73      0.73      1875
weighted avg       0.73      0.73      0.73      1875



In [55]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary', 'verified'], 0.25, stem, tfidf, df_balanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary', 'verified'], 'test_size': 0.25, 'proc': 'stem', 'vectorizer': 'tfidf'}, accuracy]

              precision    recall  f1-score   support

          -1       0.76      0.76      0.76       661
           0       0.66      0.68      0.67       606
           1       0.84      0.82      0.83       608

    accuracy                           0.75      1875
   macro avg       0.76      0.75      0.75      1875
weighted avg       0.76      0.75      0.75      1875



In [56]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary', 'verified'], 0.25, lem, bow, df_balanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary', 'verified'], 'test_size': 0.25, 'proc': 'lem', 'vectorizer': 'bow'}, accuracy]

              precision    recall  f1-score   support

          -1       0.73      0.74      0.74       661
           0       0.67      0.59      0.63       606
           1       0.76      0.83      0.79       608

    accuracy                           0.72      1875
   macro avg       0.72      0.72      0.72      1875
weighted avg       0.72      0.72      0.72      1875



In [57]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary', 'verified'], 0.25, lem, tfidf, df_balanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary', 'verified'], 'test_size': 0.25, 'proc': 'lem', 'vectorizer': 'tfidf'}, accuracy]
#######  END OF GENERATED CODE  #######

              precision    recall  f1-score   support

          -1       0.74      0.75      0.75       661
           0       0.66      0.67      0.66       606
           1       0.84      0.81      0.82       608

    accuracy                           0.75      1875
   macro avg       0.75      0.75      0.75      1875
weighted avg       0.75      0.75      0.75      1875



In [58]:
compare_list = compare_list.sort_values(
    by="Accuracy Score", ascending=False
).reset_index(drop=True)
display(compare_list)

Unnamed: 0,Params,Config,Accuracy Score
0,"{'C': 1, 'gamma': 1, 'kernel': 'rbf'}","{'col': ['reviewTextWithSummary'], 'test_size'...",0.776533
1,"{'C': 1, 'gamma': 1, 'kernel': 'rbf'}","{'col': ['reviewTextWithSummary', 'verified'],...",0.776533
2,"{'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}","{'col': ['reviewTextWithSummary'], 'test_size'...",0.762667
3,"{'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}","{'col': ['reviewTextWithSummary', 'verified'],...",0.762667
4,"{'C': 1, 'gamma': 1, 'kernel': 'rbf'}","{'col': ['reviewTextWithSummary', 'verified'],...",0.754133
5,"{'C': 1, 'gamma': 1, 'kernel': 'rbf'}","{'col': ['reviewTextWithSummary'], 'test_size'...",0.754133
6,"{'C': 1, 'gamma': 1, 'kernel': 'rbf'}","{'col': ['reviewTextWithSummary', 'verified'],...",0.752
7,"{'C': 1, 'gamma': 1, 'kernel': 'rbf'}","{'col': ['reviewTextWithSummary'], 'test_size'...",0.752
8,"{'C': 1, 'gamma': 1, 'kernel': 'rbf'}","{'col': ['reviewTextWithSummary'], 'test_size'...",0.746133
9,"{'C': 1, 'gamma': 1, 'kernel': 'rbf'}","{'col': ['reviewTextWithSummary', 'verified'],...",0.746133


In [59]:
print("Best Configuration on small balnaced dataset")
print("Score :: ", compare_list.loc[0]["Accuracy Score"])
print("SVC   :: ", compare_list.loc[0]["Params"])
print("data  :: ", compare_list.loc[0]["Config"])

Best Configuration on small balnaced dataset
Score ::  0.7765333333333333
SVC   ::  {'C': 1, 'gamma': 1, 'kernel': 'rbf'}
data  ::  {'col': ['reviewTextWithSummary'], 'test_size': 0.25, 'proc': None, 'vectorizer': 'tfidf'}


In [60]:
compare_list.to_csv("./results/svm_compare_list_small_balanced.csv", index=False)

- Across all tests, reviewText with summary performed better than without summary.
- Whether the verified column was included or not did not have any significant impact on the accuracy.
- The RBF kernel performed better than the linear kernel in almost all cases.
- Models performed better when the C value was = 1

In [61]:
# 3 times the small_n to create an unbalanced dataset of the same size as the balanced dataset
df_unbalanced_small = df.sample(n=3 * small_n, random_state=random_state, replace=True)
df_unbalanced_small["sentiment"].value_counts()

sentiment
 1    6607
 0     456
-1     437
Name: count, dtype: int64

## Small Unbalanced Dataset

In [62]:
param_grid = {
    "C": [0.1, 1, 10, 100, 1000],
    "gamma": [1, 0.1, 0.01, 0.001, 0.0001],
    "kernel": ["rbf", "linear"],
}
n_jobs = -1
verbose = 0
cv = 3

In [63]:
compare_list = pd.DataFrame(columns=["Params", "Config", "Accuracy Score"])

In [64]:
code_gen = False
col_comb = [
    ["reviewText"],
    ["reviewText", "verified"],
    ["reviewTextWithSummary"],
    ["reviewTextWithSummary", "verified"],
]
proc_comb = [None, baseline, stem, lem]
vectorizer_comb = [bow, tfidf]
if code_gen:
    for col in col_comb:
        for proc in proc_comb:
            for vectorizer in vectorizer_comb:
                params = {
                    "col": col,
                    "test_size": 0.25,
                    "proc": proc,
                    "vectorizer": vectorizer,
                }
                print(
                    f"""
# %%
x_train, x_test, y_train, y_test = pipeline({col}, 0.25, {proc}, {vectorizer}, df_unbalanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {params}, accuracy]
"""
                )

In [65]:
# below is code genderated by above cell, to make changes to the code, edit the
# above cell and run it, pasting its contents between the markers

In [66]:

####### START OF GENERATED CODE #######
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, None, bow, df_unbalanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText'], 'test_size': 0.25, 'proc': None, 'vectorizer': 'bow'}, accuracy]

              precision    recall  f1-score   support

          -1       0.59      0.20      0.30       110
           0       0.41      0.06      0.10       117
           1       0.90      0.99      0.94      1648

    accuracy                           0.89      1875
   macro avg       0.63      0.42      0.45      1875
weighted avg       0.85      0.89      0.85      1875



In [67]:
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, None, tfidf, df_unbalanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText'], 'test_size': 0.25, 'proc': None, 'vectorizer': 'tfidf'}, accuracy]

              precision    recall  f1-score   support

          -1       0.70      0.28      0.40       110
           0       0.27      0.03      0.05       117
           1       0.90      0.99      0.94      1648

    accuracy                           0.89      1875
   macro avg       0.63      0.43      0.46      1875
weighted avg       0.85      0.89      0.86      1875



In [68]:
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, baseline, bow, df_unbalanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText'], 'test_size': 0.25, 'proc': 'baseline', 'vectorizer': 'bow'}, accuracy]

              precision    recall  f1-score   support

          -1       0.67      0.18      0.29       110
           0       0.47      0.07      0.12       117
           1       0.89      0.99      0.94      1648

    accuracy                           0.89      1875
   macro avg       0.68      0.41      0.45      1875
weighted avg       0.85      0.89      0.85      1875



In [69]:
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, baseline, tfidf, df_unbalanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText'], 'test_size': 0.25, 'proc': 'baseline', 'vectorizer': 'tfidf'}, accuracy]

              precision    recall  f1-score   support

          -1       0.69      0.25      0.36       110
           0       0.30      0.06      0.10       117
           1       0.90      0.99      0.94      1648

    accuracy                           0.89      1875
   macro avg       0.63      0.43      0.47      1875
weighted avg       0.85      0.89      0.85      1875



In [70]:
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, stem, bow, df_unbalanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText'], 'test_size': 0.25, 'proc': 'stem', 'vectorizer': 'bow'}, accuracy]

              precision    recall  f1-score   support

          -1       0.62      0.19      0.29       110
           0       0.29      0.03      0.06       117
           1       0.89      0.99      0.94      1648

    accuracy                           0.88      1875
   macro avg       0.60      0.41      0.43      1875
weighted avg       0.84      0.88      0.85      1875



In [71]:
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, stem, tfidf, df_unbalanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText'], 'test_size': 0.25, 'proc': 'stem', 'vectorizer': 'tfidf'}, accuracy]

              precision    recall  f1-score   support

          -1       0.66      0.25      0.36       110
           0       0.29      0.03      0.06       117
           1       0.90      0.99      0.94      1648

    accuracy                           0.89      1875
   macro avg       0.61      0.42      0.45      1875
weighted avg       0.84      0.89      0.85      1875



In [72]:
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, lem, bow, df_unbalanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText'], 'test_size': 0.25, 'proc': 'lem', 'vectorizer': 'bow'}, accuracy]

              precision    recall  f1-score   support

          -1       0.53      0.17      0.26       110
           0       0.33      0.05      0.09       117
           1       0.89      0.99      0.94      1648

    accuracy                           0.88      1875
   macro avg       0.59      0.40      0.43      1875
weighted avg       0.84      0.88      0.85      1875



In [73]:
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, lem, tfidf, df_unbalanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText'], 'test_size': 0.25, 'proc': 'lem', 'vectorizer': 'tfidf'}, accuracy]

              precision    recall  f1-score   support

          -1       0.69      0.25      0.36       110
           0       0.24      0.03      0.06       117
           1       0.90      0.99      0.94      1648

    accuracy                           0.89      1875
   macro avg       0.61      0.42      0.45      1875
weighted avg       0.84      0.89      0.85      1875



In [74]:
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, None, bow, df_unbalanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText', 'verified'], 'test_size': 0.25, 'proc': None, 'vectorizer': 'bow'}, accuracy]

              precision    recall  f1-score   support

          -1       0.59      0.20      0.30       110
           0       0.41      0.06      0.10       117
           1       0.90      0.99      0.94      1648

    accuracy                           0.89      1875
   macro avg       0.63      0.42      0.45      1875
weighted avg       0.85      0.89      0.85      1875



In [75]:
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, None, tfidf, df_unbalanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText', 'verified'], 'test_size': 0.25, 'proc': None, 'vectorizer': 'tfidf'}, accuracy]

              precision    recall  f1-score   support

          -1       0.70      0.28      0.40       110
           0       0.27      0.03      0.05       117
           1       0.90      0.99      0.94      1648

    accuracy                           0.89      1875
   macro avg       0.63      0.43      0.46      1875
weighted avg       0.85      0.89      0.86      1875



In [76]:
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, baseline, bow, df_unbalanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText', 'verified'], 'test_size': 0.25, 'proc': 'baseline', 'vectorizer': 'bow'}, accuracy]

              precision    recall  f1-score   support

          -1       0.67      0.18      0.29       110
           0       0.47      0.07      0.12       117
           1       0.89      0.99      0.94      1648

    accuracy                           0.89      1875
   macro avg       0.68      0.41      0.45      1875
weighted avg       0.85      0.89      0.85      1875



In [77]:
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, baseline, tfidf, df_unbalanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText', 'verified'], 'test_size': 0.25, 'proc': 'baseline', 'vectorizer': 'tfidf'}, accuracy]

              precision    recall  f1-score   support

          -1       0.69      0.25      0.36       110
           0       0.30      0.06      0.10       117
           1       0.90      0.99      0.94      1648

    accuracy                           0.89      1875
   macro avg       0.63      0.43      0.47      1875
weighted avg       0.85      0.89      0.85      1875



In [78]:
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, stem, bow, df_unbalanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText', 'verified'], 'test_size': 0.25, 'proc': 'stem', 'vectorizer': 'bow'}, accuracy]

              precision    recall  f1-score   support

          -1       0.62      0.19      0.29       110
           0       0.29      0.03      0.06       117
           1       0.89      0.99      0.94      1648

    accuracy                           0.88      1875
   macro avg       0.60      0.41      0.43      1875
weighted avg       0.84      0.88      0.85      1875



In [79]:
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, stem, tfidf, df_unbalanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText', 'verified'], 'test_size': 0.25, 'proc': 'stem', 'vectorizer': 'tfidf'}, accuracy]

              precision    recall  f1-score   support

          -1       0.66      0.25      0.36       110
           0       0.29      0.03      0.06       117
           1       0.90      0.99      0.94      1648

    accuracy                           0.89      1875
   macro avg       0.61      0.42      0.45      1875
weighted avg       0.84      0.89      0.85      1875



In [80]:
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, lem, bow, df_unbalanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText', 'verified'], 'test_size': 0.25, 'proc': 'lem', 'vectorizer': 'bow'}, accuracy]

              precision    recall  f1-score   support

          -1       0.53      0.17      0.26       110
           0       0.33      0.05      0.09       117
           1       0.89      0.99      0.94      1648

    accuracy                           0.88      1875
   macro avg       0.59      0.40      0.43      1875
weighted avg       0.84      0.88      0.85      1875



In [81]:
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, lem, tfidf, df_unbalanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText', 'verified'], 'test_size': 0.25, 'proc': 'lem', 'vectorizer': 'tfidf'}, accuracy]

              precision    recall  f1-score   support

          -1       0.69      0.25      0.36       110
           0       0.24      0.03      0.06       117
           1       0.90      0.99      0.94      1648

    accuracy                           0.89      1875
   macro avg       0.61      0.42      0.45      1875
weighted avg       0.84      0.89      0.85      1875



In [82]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, None, bow, df_unbalanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary'], 'test_size': 0.25, 'proc': None, 'vectorizer': 'bow'}, accuracy]

              precision    recall  f1-score   support

          -1       0.84      0.42      0.56       110
           0       0.78      0.34      0.48       117
           1       0.92      0.99      0.96      1648

    accuracy                           0.92      1875
   macro avg       0.85      0.58      0.66      1875
weighted avg       0.91      0.92      0.90      1875



In [83]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, None, tfidf, df_unbalanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary'], 'test_size': 0.25, 'proc': None, 'vectorizer': 'tfidf'}, accuracy]

              precision    recall  f1-score   support

          -1       0.72      0.42      0.53       110
           0       0.81      0.33      0.47       117
           1       0.93      0.99      0.96      1648

    accuracy                           0.92      1875
   macro avg       0.82      0.58      0.65      1875
weighted avg       0.91      0.92      0.90      1875



In [84]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, baseline, bow, df_unbalanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary'], 'test_size': 0.25, 'proc': 'baseline', 'vectorizer': 'bow'}, accuracy]

              precision    recall  f1-score   support

          -1       0.75      0.37      0.50       110
           0       0.82      0.36      0.50       117
           1       0.92      0.99      0.96      1648

    accuracy                           0.92      1875
   macro avg       0.83      0.57      0.65      1875
weighted avg       0.91      0.92      0.90      1875



In [85]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, baseline, tfidf, df_unbalanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary'], 'test_size': 0.25, 'proc': 'baseline', 'vectorizer': 'tfidf'}, accuracy]

              precision    recall  f1-score   support

          -1       0.83      0.35      0.50       110
           0       0.79      0.32      0.46       117
           1       0.92      0.99      0.95      1648

    accuracy                           0.91      1875
   macro avg       0.85      0.56      0.64      1875
weighted avg       0.90      0.91      0.90      1875



In [86]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, stem, bow, df_unbalanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary'], 'test_size': 0.25, 'proc': 'stem', 'vectorizer': 'bow'}, accuracy]

              precision    recall  f1-score   support

          -1       0.73      0.41      0.52       110
           0       0.69      0.36      0.47       117
           1       0.93      0.99      0.96      1648

    accuracy                           0.91      1875
   macro avg       0.78      0.58      0.65      1875
weighted avg       0.90      0.91      0.90      1875



In [87]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, stem, tfidf, df_unbalanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary'], 'test_size': 0.25, 'proc': 'stem', 'vectorizer': 'tfidf'}, accuracy]

              precision    recall  f1-score   support

          -1       0.72      0.38      0.50       110
           0       0.70      0.32      0.44       117
           1       0.92      0.99      0.95      1648

    accuracy                           0.91      1875
   macro avg       0.78      0.56      0.63      1875
weighted avg       0.90      0.91      0.90      1875



In [88]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, lem, bow, df_unbalanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary'], 'test_size': 0.25, 'proc': 'lem', 'vectorizer': 'bow'}, accuracy]

              precision    recall  f1-score   support

          -1       0.76      0.38      0.51       110
           0       0.75      0.34      0.47       117
           1       0.92      0.99      0.96      1648

    accuracy                           0.91      1875
   macro avg       0.81      0.57      0.65      1875
weighted avg       0.90      0.91      0.90      1875



In [89]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, lem, tfidf, df_unbalanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary'], 'test_size': 0.25, 'proc': 'lem', 'vectorizer': 'tfidf'}, accuracy]

              precision    recall  f1-score   support

          -1       0.77      0.37      0.50       110
           0       0.71      0.34      0.46       117
           1       0.92      0.99      0.95      1648

    accuracy                           0.91      1875
   macro avg       0.80      0.57      0.64      1875
weighted avg       0.90      0.91      0.90      1875



In [90]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary', 'verified'], 0.25, None, bow, df_unbalanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary', 'verified'], 'test_size': 0.25, 'proc': None, 'vectorizer': 'bow'}, accuracy]

              precision    recall  f1-score   support

          -1       0.84      0.42      0.56       110
           0       0.78      0.34      0.48       117
           1       0.92      0.99      0.96      1648

    accuracy                           0.92      1875
   macro avg       0.85      0.58      0.66      1875
weighted avg       0.91      0.92      0.90      1875



In [91]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary', 'verified'], 0.25, None, tfidf, df_unbalanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary', 'verified'], 'test_size': 0.25, 'proc': None, 'vectorizer': 'tfidf'}, accuracy]

              precision    recall  f1-score   support

          -1       0.72      0.42      0.53       110
           0       0.81      0.33      0.47       117
           1       0.93      0.99      0.96      1648

    accuracy                           0.92      1875
   macro avg       0.82      0.58      0.65      1875
weighted avg       0.91      0.92      0.90      1875



In [92]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary', 'verified'], 0.25, baseline, bow, df_unbalanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary', 'verified'], 'test_size': 0.25, 'proc': 'baseline', 'vectorizer': 'bow'}, accuracy]

              precision    recall  f1-score   support

          -1       0.75      0.37      0.50       110
           0       0.82      0.36      0.50       117
           1       0.92      0.99      0.96      1648

    accuracy                           0.92      1875
   macro avg       0.83      0.57      0.65      1875
weighted avg       0.91      0.92      0.90      1875



In [93]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary', 'verified'], 0.25, baseline, tfidf, df_unbalanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary', 'verified'], 'test_size': 0.25, 'proc': 'baseline', 'vectorizer': 'tfidf'}, accuracy]

              precision    recall  f1-score   support

          -1       0.83      0.35      0.50       110
           0       0.79      0.32      0.46       117
           1       0.92      0.99      0.95      1648

    accuracy                           0.91      1875
   macro avg       0.85      0.56      0.64      1875
weighted avg       0.90      0.91      0.90      1875



In [94]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary', 'verified'], 0.25, stem, bow, df_unbalanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary', 'verified'], 'test_size': 0.25, 'proc': 'stem', 'vectorizer': 'bow'}, accuracy]

              precision    recall  f1-score   support

          -1       0.73      0.41      0.52       110
           0       0.69      0.36      0.47       117
           1       0.93      0.99      0.96      1648

    accuracy                           0.91      1875
   macro avg       0.78      0.58      0.65      1875
weighted avg       0.90      0.91      0.90      1875



In [95]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary', 'verified'], 0.25, stem, tfidf, df_unbalanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary', 'verified'], 'test_size': 0.25, 'proc': 'stem', 'vectorizer': 'tfidf'}, accuracy]

              precision    recall  f1-score   support

          -1       0.72      0.38      0.50       110
           0       0.70      0.32      0.44       117
           1       0.92      0.99      0.95      1648

    accuracy                           0.91      1875
   macro avg       0.78      0.56      0.63      1875
weighted avg       0.90      0.91      0.90      1875



In [96]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary', 'verified'], 0.25, lem, bow, df_unbalanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary', 'verified'], 'test_size': 0.25, 'proc': 'lem', 'vectorizer': 'bow'}, accuracy]

              precision    recall  f1-score   support

          -1       0.76      0.38      0.51       110
           0       0.75      0.34      0.47       117
           1       0.92      0.99      0.96      1648

    accuracy                           0.91      1875
   macro avg       0.81      0.57      0.65      1875
weighted avg       0.90      0.91      0.90      1875



In [97]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary', 'verified'], 0.25, lem, tfidf, df_unbalanced_small)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary', 'verified'], 'test_size': 0.25, 'proc': 'lem', 'vectorizer': 'tfidf'}, accuracy]
#######  END OF GENERATED CODE  #######

              precision    recall  f1-score   support

          -1       0.77      0.37      0.50       110
           0       0.71      0.34      0.46       117
           1       0.92      0.99      0.95      1648

    accuracy                           0.91      1875
   macro avg       0.80      0.57      0.64      1875
weighted avg       0.90      0.91      0.90      1875



In [98]:
compare_list = compare_list.sort_values(
    by="Accuracy Score", ascending=False
).reset_index(drop=True)
display(compare_list)

Unnamed: 0,Params,Config,Accuracy Score
0,"{'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}","{'col': ['reviewTextWithSummary'], 'test_size'...",0.917867
1,"{'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}","{'col': ['reviewTextWithSummary', 'verified'],...",0.917867
2,"{'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}","{'col': ['reviewTextWithSummary'], 'test_size'...",0.916267
3,"{'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}","{'col': ['reviewTextWithSummary', 'verified'],...",0.916267
4,"{'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}","{'col': ['reviewTextWithSummary', 'verified'],...",0.916267
5,"{'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}","{'col': ['reviewTextWithSummary'], 'test_size'...",0.916267
6,"{'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}","{'col': ['reviewTextWithSummary', 'verified'],...",0.914133
7,"{'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}","{'col': ['reviewTextWithSummary'], 'test_size'...",0.914133
8,"{'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}","{'col': ['reviewTextWithSummary', 'verified'],...",0.9136
9,"{'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}","{'col': ['reviewTextWithSummary'], 'test_size'...",0.9136


In [99]:
compare_list.to_csv("./results/svm_compare_list_small_unbalanced.csv", index=False)

In [100]:
print("Best Configuration on small unbalanced dataset")
print("Score :: ", compare_list.loc[0]["Accuracy Score"])
print("SVC   :: ", compare_list.loc[0]["Params"])
print("data  :: ", compare_list.loc[0]["Config"])

Best Configuration on small unbalanced dataset
Score ::  0.9178666666666667
SVC   ::  {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
data  ::  {'col': ['reviewTextWithSummary'], 'test_size': 0.25, 'proc': None, 'vectorizer': 'bow'}


...

## Comparing the best models from each experiment on the full dataset

In [101]:
df_balanced_large = (
    df.groupby("sentiment")
    .apply(lambda x: x.sample(n=large_n, random_state=random_state, replace=True))
    .reset_index(drop=True)
)

In [102]:
svc_balanced_small = SVC(C=1, gamma=1, kernel="rbf")
x_train, x_test, y_train, y_test = pipeline(
    ["reviewTextWithSummary"], 0.25, None, tfidf, df_balanced_large
)
svc_balanced_small.fit(x_train, y_train)
y_pred = svc_balanced_small.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average="weighted")
recall = recall_score(y_test, y_pred, average="weighted")
f1 = f1_score(y_test, y_pred, average="weighted")

              precision    recall  f1-score   support

          -1       0.88      0.89      0.88     10050
           0       0.82      0.85      0.83     10071
           1       0.92      0.88      0.90      9879

    accuracy                           0.87     30000
   macro avg       0.87      0.87      0.87     30000
weighted avg       0.87      0.87      0.87     30000



In [103]:
df_unbalanced_large = df.sample(n=3 * large_n, random_state=random_state, replace=True)

In [104]:
svc_unbalanced_small = SVC(C=10, gamma=0.01, kernel="rbf")
x_train, x_test, y_train, y_test = pipeline(
    ["reviewTextWithSummary"], 0.25, None, bow, df_unbalanced_large
)
svc_unbalanced_small.fit(x_train, y_train)
y_pred = svc_unbalanced_small.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average="weighted")
recall = recall_score(y_test, y_pred, average="weighted")
f1 = f1_score(y_test, y_pred, average="weighted")

              precision    recall  f1-score   support

          -1       0.78      0.61      0.69      1757
           0       0.73      0.46      0.56      1810
           1       0.95      0.99      0.97     26433

    accuracy                           0.93     30000
   macro avg       0.82      0.69      0.74     30000
weighted avg       0.93      0.93      0.93     30000

