In [1]:
import pandas as pd
import nltk
import re
import random
import numpy as np
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score, accuracy_score, precision_score, roc_auc_score
from nltk.corpus import stopwords
from scipy.sparse import hstack
import warnings
warnings.filterwarnings("ignore")

nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package punkt to /home/ibrahim/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ibrahim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ibrahim/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Identifiers for the text-preprocessing variants accepted by preprocess_type.
baseline = "baseline"
stem = "stem"
lem = "lem"
# Identifiers for the vectorizers accepted by pipeline.
bow = "bow"
tfidf = "tfidf"
# Single seed shared by sampling, the train/test split and the global RNGs.
random_state = 42
# Fraction of the full dataset sampled for the experiments.
testing_frac = 0.025
random.seed(random_state)
# scikit-learn estimators created without random_state (the SVC(probability=True)
# models below) fall back to NumPy's global RNG, so seed it as well; otherwise
# the predicted probabilities and ROC AUC vary between runs.
np.random.seed(random_state)
data_path = "../data/combined.csv"

In [3]:
def _rating_to_sentiment(rating):
    """Map a star rating to a label: 1 above 3, -1 below 3, 0 otherwise."""
    if rating > 3:
        return 1
    if rating < 3:
        return -1
    return 0


# Build the working frame in a single chain rather than mutating a column
# subset with inplace=True: in-place edits on a slice can raise pandas'
# chained-assignment warnings, which this notebook silences globally.
df = (
    pd.read_csv(data_path, low_memory=False)
    .drop_duplicates()  # de-duplicate on the full raw rows, before narrowing columns
    .loc[:, ["overall", "reviewText", "summary", "verified"]]
    .dropna()
    .assign(
        sentiment=lambda frame: frame["overall"].apply(_rating_to_sentiment),
        reviewTextWithSummary=lambda frame: frame["summary"] + " " + frame["reviewText"],
    )
    .drop(columns=["overall", "summary"])
)
df.head()

Unnamed: 0,reviewText,verified,sentiment,reviewTextWithSummary
0,As advertised. Reasonably priced,True,1,Five Stars As advertised. Reasonably priced
1,Like the oder and the feel when I put it on my...,True,1,Good for the face Like the oder and the feel w...
2,I bought this to smell nice after I shave. Wh...,True,-1,Smells awful I bought this to smell nice after...
3,HEY!! I am an Aqua Velva Man and absolutely lo...,False,1,Truth is There IS Nothing Like an AQUA VELVA M...
4,If you ever want to feel pampered by a shampoo...,True,1,Bvlgari Shampoo If you ever want to feel pampe...


# Model experimentation

In [4]:
# Draw a reproducible subsample (testing_frac of all rows) to keep the
# experiments fast. Despite its name this is the whole working set for the
# experiments: pipeline() splits it again into train and test parts.
df_testing = df.sample(frac=testing_frac, random_state=random_state)
# Class balance of the subsample.
df_testing["sentiment"].value_counts()

sentiment
 1    15160
 0     1094
-1     1012
Name: count, dtype: int64

In [5]:
# Preview the first rows of the subsample.
df_testing.head()

Unnamed: 0,reviewText,verified,sentiment,reviewTextWithSummary
300850,I LOVE this stuff! It's challenging but fun to...,True,1,Beautiful I LOVE this stuff! It's challenging ...
13053,"Another ""boss"" BOSS pedal! This chorus has be...",False,1,A workingman's pedal that will work for you! A...
324582,ok,True,1,Five Stars ok
462346,This is a great product if you want to make yo...,True,1,Jelly roll strips This is a great product if y...
458698,good,True,0,good good


In [6]:
# Number of rows in the subsample used by every experiment below.
print("Dataset size:", len(df_testing))

Dataset size: 17266


In [7]:
# NLTK's English stop-word list, stored as a set for fast membership checks.
STOP_WORDS = set(stopwords.words("english"))

In [8]:
# Characters stripped from every token: digits, plus anything that is neither
# a word character nor whitespace (i.e. punctuation). Written as a raw string
# so the backslash escapes reach the regex engine instead of being treated as
# (invalid) Python string escapes.
_STRIP_CHARS = re.compile(r"\d|[^\w\s]")


def preprocess_text(sentence, stop, type_proc=None):
    """Lower-case, clean and normalise a sentence.

    The sentence is lower-cased and split on whitespace. Digits and
    punctuation are removed from each token; tokens that end up empty or
    that appear in ``stop`` are dropped. Surviving tokens are normalised
    according to ``type_proc`` and re-joined with single spaces.

    Args:
        sentence: The text to clean.
        stop: Container of stop words; tokens are compared after cleaning.
        type_proc: Preprocessing type handed to ``preprocess_type``. ``None``
            (the default) keeps the cleaned tokens as they are.

    Returns:
        The cleaned sentence as a single space-separated string.

    Raises:
        ValueError: Propagated from ``preprocess_type`` when ``type_proc`` is
            neither ``None`` nor a recognised preprocessing type.
    """
    words = []
    for token in sentence.lower().split():
        word = _STRIP_CHARS.sub("", token)
        if not word or word in stop:
            continue
        # The default type_proc=None used to fall through to preprocess_type,
        # which rejects it; treat it as "no further normalisation" instead.
        words.append(word if type_proc is None else preprocess_type(word, type_proc))
    return " ".join(words)

In [9]:
# Shared normaliser instances, created on first use and keyed by preprocessing
# type, so a new PorterStemmer / WordNetLemmatizer is not built for every word.
_NORMALIZERS = {}


def preprocess_type(word, type_proc):
    """Normalise a single word according to the preprocessing type.

    Args:
        word: The (already cleaned) token.
        type_proc: ``baseline`` returns the word unchanged, ``stem`` applies
            NLTK's PorterStemmer, ``lem`` applies NLTK's WordNetLemmatizer.

    Returns:
        The normalised word.

    Raises:
        ValueError: If ``type_proc`` is not one of the three known types.
    """
    if type_proc == baseline:
        return word
    if type_proc == stem:
        stemmer = _NORMALIZERS.get(stem)
        if stemmer is None:
            stemmer = _NORMALIZERS[stem] = PorterStemmer()
        return stemmer.stem(word)
    if type_proc == lem:
        lemmatizer = _NORMALIZERS.get(lem)
        if lemmatizer is None:
            lemmatizer = _NORMALIZERS[lem] = WordNetLemmatizer()
        return lemmatizer.lemmatize(word)
    raise ValueError("Invalid Preprocessing Type")

In [10]:
def my_train_test_split(cols, test_size, df=df, random_state=random_state):
    """Split the chosen feature columns and the "sentiment" labels into train/test parts.

    Args:
        cols: Column names of ``df`` to use as features.
        test_size: Forwarded to scikit-learn's ``train_test_split``.
        df: Source frame; must contain ``cols`` and a "sentiment" column.
            The default is the module-level ``df`` as it existed when this
            function was defined (defaults are bound at definition time).
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        ``(x_train, x_test, y_train, y_test)``.
    """
    features = df[cols]
    labels = df["sentiment"]
    parts = train_test_split(
        features, labels, test_size=test_size, random_state=random_state
    )
    return tuple(parts)

In [11]:
def apply_preprocessing(proc, x_train, x_test):
    """Clean the text column of the train and test feature frames.

    The text column is "reviewText" when present, otherwise
    "reviewTextWithSummary". Each value is passed through ``preprocess_text``
    with ``STOP_WORDS`` and the given preprocessing type.

    The inputs are not modified: new frames are returned. Writing into the
    frames produced by the split would mutate caller-visible data and is a
    chained assignment as far as pandas is concerned.

    Args:
        proc: Preprocessing type, or ``None`` to skip preprocessing.
        x_train: Training feature frame.
        x_test: Test feature frame.

    Returns:
        ``(x_train, x_test)`` — the inputs themselves when ``proc`` is
        ``None``, otherwise copies with the text column replaced.
    """
    if proc is None:
        return x_train, x_test

    cols = x_train.columns
    textcol = "reviewText"
    if "reviewText" not in cols and "reviewTextWithSummary" in cols:
        textcol = "reviewTextWithSummary"

    def clean(text):
        return preprocess_text(text, STOP_WORDS, proc)

    # assign() returns a new frame and keeps the original column order.
    x_train = x_train.assign(**{textcol: x_train[textcol].apply(clean)})
    x_test = x_test.assign(**{textcol: x_test[textcol].apply(clean)})
    return x_train, x_test

In [12]:
def add_col(x, col):
    """Append ``col`` to the right-hand side of the matrix ``x`` as one extra column.

    Args:
        x: Feature matrix (the vectorizer output in this notebook).
        col: One value per row of ``x``.

    Returns:
        The horizontally stacked sparse matrix produced by ``scipy.sparse.hstack``.
    """
    # Wrapping the values in a list gives a single row; transposing that row
    # turns it into a single column with one entry per sample.
    as_row = np.array([col])
    as_column = np.transpose(as_row)
    return hstack((x, as_column))

In [13]:
def pipeline(cols, test_size, proc, vectorizer, df=df, random_state=random_state):
    """Turn review rows into vectorized train/test features and sentiment labels.

    Steps: split ``df`` into train and test parts, optionally clean the text
    column, fit the chosen vectorizer on the training text only, transform
    both parts, and optionally append the "verified" flag as an extra feature
    column.

    Args:
        cols: Column names to use as features. Must include "reviewText" or
            "reviewTextWithSummary" ("reviewText" is used if both are given);
            may also include "verified".
        test_size: Forwarded to the train/test split.
        proc: Preprocessing type (``baseline``, ``stem`` or ``lem``), or
            ``None`` to vectorize the text as-is.
        vectorizer: ``bow`` for ``CountVectorizer`` or ``tfidf`` for
            ``TfidfVectorizer``.
        df: Source frame; must contain ``cols`` and "sentiment". The default
            is the module-level ``df`` as it existed when this function was
            defined.
        random_state: Seed for the split.

    Returns:
        ``(x_train, x_test, y_train, y_test)``: the vectorized feature
        matrices and the matching "sentiment" labels.

    Raises:
        ValueError: If ``cols`` contains no text column, if ``vectorizer`` is
            not recognised, or (from ``preprocess_type``) if ``proc`` is not a
            recognised preprocessing type.
    """
    if "reviewText" not in cols and "reviewTextWithSummary" not in cols:
        raise ValueError("Must contain reviewText or reviewTextWithSummary")

    # Resolve the vectorizer first so that a bad name fails immediately,
    # before the split and the (potentially slow) text preprocessing run.
    if vectorizer == bow:
        text_vectorizer = CountVectorizer()
    elif vectorizer == tfidf:
        text_vectorizer = TfidfVectorizer()
    else:
        raise ValueError("Invalid Vectorizer")

    # The guard above guarantees at least one of the two text columns exists.
    textcol = "reviewText" if "reviewText" in cols else "reviewTextWithSummary"

    x_train, x_test, y_train, y_test = my_train_test_split(
        cols, test_size, df, random_state
    )
    x_train, x_test = apply_preprocessing(proc, x_train, x_test)

    # Fit the vocabulary on the training text only; the test text is merely
    # transformed so nothing from the test part influences the features.
    train_matrix = text_vectorizer.fit_transform(x_train[textcol])
    test_matrix = text_vectorizer.transform(x_test[textcol])

    if "verified" in cols:
        train_matrix = add_col(train_matrix, x_train["verified"])
        test_matrix = add_col(test_matrix, x_test["verified"])
    return train_matrix, test_matrix, y_train, y_test

## Testing different configs

In [14]:
# Results table: every experiment cell below appends one row holding its
# configuration and the four metrics.
_metric_names = ["Accuracy", "F1 Score", "Precision", "ROC AUC"]
compare_list = pd.DataFrame(columns=["Data config and preprocessing", *_metric_names])

In [15]:
# Set to True to print the source of every experiment cell: one cell per
# combination of feature columns, preprocessing type and vectorizer.
code_gen = False
col_comb = [
    ["reviewText"],
    ["reviewText", "verified"],
    ["reviewTextWithSummary"],
    ["reviewTextWithSummary", "verified"],
]
proc_comb = [None, baseline, stem, lem]
vectorizer_comb = [bow, tfidf]
if code_gen:
    for col in col_comb:
        for proc in proc_comb:
            for vectorizer in vectorizer_comb:
                params = {
                    "col": col,
                    "proc": proc,
                    "vectorizer": vectorizer,
                }
                # !r writes the arguments as literals ('stem', 'bow', None).
                # Interpolating them bare only produced valid code because
                # each constant's name happens to equal its string value.
                print(
                    f"""
# %%
x_train, x_test, y_train, y_test = pipeline({col!r}, 0.25, {proc!r}, {vectorizer!r}, df_testing)
model = SVC(probability=True)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
precision = precision_score(y_test, y_pred, average="weighted")
y_pred_prob = model.predict_proba(x_test)
roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class="ovr")
compare_list.loc[len(compare_list)] = [{params}, accuracy, f1, precision, roc_auc]
"""
                )

In [16]:
# The cells below were generated by the cell above. To change them, edit that
# cell, run it with code_gen = True, and paste its output between the markers.

In [17]:
####### START OF GENERATED CODE #######
# Experiment: reviewText | no preprocessing (raw text) | bag-of-words counts.
# Trains an SVC on a 75/25 split of df_testing, prints the per-class report and
# appends accuracy, weighted F1/precision and one-vs-rest ROC AUC to compare_list.
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, None, bow, df_testing)
model = SVC(probability=True)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
precision = precision_score(y_test, y_pred, average="weighted")
y_pred_prob = model.predict_proba(x_test)
roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class="ovr")
compare_list.loc[len(compare_list)] = [{'col': ['reviewText'], 'proc': None, 'vectorizer': 'bow'}, accuracy, f1, precision, roc_auc]

              precision    recall  f1-score   support

          -1       1.00      0.01      0.02       260
           0       1.00      0.00      0.01       259
           1       0.88      1.00      0.94      3798

    accuracy                           0.88      4317
   macro avg       0.96      0.34      0.32      4317
weighted avg       0.89      0.88      0.83      4317



In [18]:
# Experiment: reviewText | no preprocessing (raw text) | TF-IDF.
# Trains an SVC on a 75/25 split of df_testing, prints the per-class report and
# appends accuracy, weighted F1/precision and one-vs-rest ROC AUC to compare_list.
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, None, tfidf, df_testing)
model = SVC(probability=True)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
precision = precision_score(y_test, y_pred, average="weighted")
y_pred_prob = model.predict_proba(x_test)
roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class="ovr")
compare_list.loc[len(compare_list)] = [{'col': ['reviewText'], 'proc': None, 'vectorizer': 'tfidf'}, accuracy, f1, precision, roc_auc]

              precision    recall  f1-score   support

          -1       0.76      0.07      0.13       260
           0       1.00      0.00      0.01       259
           1       0.88      1.00      0.94      3798

    accuracy                           0.88      4317
   macro avg       0.88      0.36      0.36      4317
weighted avg       0.88      0.88      0.83      4317



In [19]:
# Experiment: reviewText | baseline cleaning | bag-of-words counts.
# Trains an SVC on a 75/25 split of df_testing, prints the per-class report and
# appends accuracy, weighted F1/precision and one-vs-rest ROC AUC to compare_list.
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, baseline, bow, df_testing)
model = SVC(probability=True)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
precision = precision_score(y_test, y_pred, average="weighted")
y_pred_prob = model.predict_proba(x_test)
roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class="ovr")
compare_list.loc[len(compare_list)] = [{'col': ['reviewText'], 'proc': 'baseline', 'vectorizer': 'bow'}, accuracy, f1, precision, roc_auc]

              precision    recall  f1-score   support

          -1       1.00      0.03      0.05       260
           0       1.00      0.00      0.01       259
           1       0.88      1.00      0.94      3798

    accuracy                           0.88      4317
   macro avg       0.96      0.34      0.33      4317
weighted avg       0.90      0.88      0.83      4317



In [20]:
# Experiment: reviewText | baseline cleaning | TF-IDF.
# Trains an SVC on a 75/25 split of df_testing, prints the per-class report and
# appends accuracy, weighted F1/precision and one-vs-rest ROC AUC to compare_list.
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, baseline, tfidf, df_testing)
model = SVC(probability=True)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
precision = precision_score(y_test, y_pred, average="weighted")
y_pred_prob = model.predict_proba(x_test)
roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class="ovr")
compare_list.loc[len(compare_list)] = [{'col': ['reviewText'], 'proc': 'baseline', 'vectorizer': 'tfidf'}, accuracy, f1, precision, roc_auc]

              precision    recall  f1-score   support

          -1       0.88      0.06      0.11       260
           0       1.00      0.00      0.01       259
           1       0.88      1.00      0.94      3798

    accuracy                           0.88      4317
   macro avg       0.92      0.35      0.35      4317
weighted avg       0.89      0.88      0.83      4317



In [21]:
# Experiment: reviewText | Porter stemming | bag-of-words counts.
# Trains an SVC on a 75/25 split of df_testing, prints the per-class report and
# appends accuracy, weighted F1/precision and one-vs-rest ROC AUC to compare_list.
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, stem, bow, df_testing)
model = SVC(probability=True)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
precision = precision_score(y_test, y_pred, average="weighted")
y_pred_prob = model.predict_proba(x_test)
roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class="ovr")
compare_list.loc[len(compare_list)] = [{'col': ['reviewText'], 'proc': 'stem', 'vectorizer': 'bow'}, accuracy, f1, precision, roc_auc]

              precision    recall  f1-score   support

          -1       0.73      0.04      0.08       260
           0       1.00      0.00      0.01       259
           1       0.88      1.00      0.94      3798

    accuracy                           0.88      4317
   macro avg       0.87      0.35      0.34      4317
weighted avg       0.88      0.88      0.83      4317



In [22]:
# Experiment: reviewText | Porter stemming | TF-IDF.
# Trains an SVC on a 75/25 split of df_testing, prints the per-class report and
# appends accuracy, weighted F1/precision and one-vs-rest ROC AUC to compare_list.
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, stem, tfidf, df_testing)
model = SVC(probability=True)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
precision = precision_score(y_test, y_pred, average="weighted")
y_pred_prob = model.predict_proba(x_test)
roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class="ovr")
compare_list.loc[len(compare_list)] = [{'col': ['reviewText'], 'proc': 'stem', 'vectorizer': 'tfidf'}, accuracy, f1, precision, roc_auc]

              precision    recall  f1-score   support

          -1       0.81      0.10      0.17       260
           0       0.00      0.00      0.00       259
           1       0.89      1.00      0.94      3798

    accuracy                           0.88      4317
   macro avg       0.56      0.37      0.37      4317
weighted avg       0.83      0.88      0.84      4317



In [23]:
# Experiment: reviewText | WordNet lemmatization | bag-of-words counts.
# Trains an SVC on a 75/25 split of df_testing, prints the per-class report and
# appends accuracy, weighted F1/precision and one-vs-rest ROC AUC to compare_list.
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, lem, bow, df_testing)
model = SVC(probability=True)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
precision = precision_score(y_test, y_pred, average="weighted")
y_pred_prob = model.predict_proba(x_test)
roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class="ovr")
compare_list.loc[len(compare_list)] = [{'col': ['reviewText'], 'proc': 'lem', 'vectorizer': 'bow'}, accuracy, f1, precision, roc_auc]

              precision    recall  f1-score   support

          -1       1.00      0.02      0.05       260
           0       1.00      0.00      0.01       259
           1       0.88      1.00      0.94      3798

    accuracy                           0.88      4317
   macro avg       0.96      0.34      0.33      4317
weighted avg       0.90      0.88      0.83      4317



In [24]:
# Experiment: reviewText | WordNet lemmatization | TF-IDF.
# Trains an SVC on a 75/25 split of df_testing, prints the per-class report and
# appends accuracy, weighted F1/precision and one-vs-rest ROC AUC to compare_list.
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, lem, tfidf, df_testing)
model = SVC(probability=True)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
precision = precision_score(y_test, y_pred, average="weighted")
y_pred_prob = model.predict_proba(x_test)
roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class="ovr")
compare_list.loc[len(compare_list)] = [{'col': ['reviewText'], 'proc': 'lem', 'vectorizer': 'tfidf'}, accuracy, f1, precision, roc_auc]

              precision    recall  f1-score   support

          -1       0.89      0.06      0.12       260
           0       1.00      0.00      0.01       259
           1       0.88      1.00      0.94      3798

    accuracy                           0.88      4317
   macro avg       0.92      0.35      0.35      4317
weighted avg       0.89      0.88      0.83      4317



In [25]:
# Experiment: reviewText + verified | no preprocessing (raw text) | bag-of-words counts.
# Trains an SVC on a 75/25 split of df_testing, prints the per-class report and
# appends accuracy, weighted F1/precision and one-vs-rest ROC AUC to compare_list.
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, None, bow, df_testing)
model = SVC(probability=True)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
precision = precision_score(y_test, y_pred, average="weighted")
y_pred_prob = model.predict_proba(x_test)
roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class="ovr")
compare_list.loc[len(compare_list)] = [{'col': ['reviewText', 'verified'], 'proc': None, 'vectorizer': 'bow'}, accuracy, f1, precision, roc_auc]

              precision    recall  f1-score   support

          -1       1.00      0.01      0.02       260
           0       1.00      0.00      0.01       259
           1       0.88      1.00      0.94      3798

    accuracy                           0.88      4317
   macro avg       0.96      0.34      0.32      4317
weighted avg       0.89      0.88      0.83      4317



In [26]:
# Experiment: reviewText + verified | no preprocessing (raw text) | TF-IDF.
# Trains an SVC on a 75/25 split of df_testing, prints the per-class report and
# appends accuracy, weighted F1/precision and one-vs-rest ROC AUC to compare_list.
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, None, tfidf, df_testing)
model = SVC(probability=True)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
precision = precision_score(y_test, y_pred, average="weighted")
y_pred_prob = model.predict_proba(x_test)
roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class="ovr")
compare_list.loc[len(compare_list)] = [{'col': ['reviewText', 'verified'], 'proc': None, 'vectorizer': 'tfidf'}, accuracy, f1, precision, roc_auc]

              precision    recall  f1-score   support

          -1       0.82      0.07      0.13       260
           0       0.00      0.00      0.00       259
           1       0.88      1.00      0.94      3798

    accuracy                           0.88      4317
   macro avg       0.57      0.36      0.36      4317
weighted avg       0.83      0.88      0.83      4317



In [27]:
# Experiment: reviewText + verified | baseline cleaning | bag-of-words counts.
# Trains an SVC on a 75/25 split of df_testing, prints the per-class report and
# appends accuracy, weighted F1/precision and one-vs-rest ROC AUC to compare_list.
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, baseline, bow, df_testing)
model = SVC(probability=True)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
precision = precision_score(y_test, y_pred, average="weighted")
y_pred_prob = model.predict_proba(x_test)
roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class="ovr")
compare_list.loc[len(compare_list)] = [{'col': ['reviewText', 'verified'], 'proc': 'baseline', 'vectorizer': 'bow'}, accuracy, f1, precision, roc_auc]

              precision    recall  f1-score   support

          -1       1.00      0.03      0.05       260
           0       1.00      0.00      0.01       259
           1       0.88      1.00      0.94      3798

    accuracy                           0.88      4317
   macro avg       0.96      0.34      0.33      4317
weighted avg       0.90      0.88      0.83      4317



In [28]:
# Experiment: reviewText + verified | baseline cleaning | TF-IDF.
# Trains an SVC on a 75/25 split of df_testing, prints the per-class report and
# appends accuracy, weighted F1/precision and one-vs-rest ROC AUC to compare_list.
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, baseline, tfidf, df_testing)
model = SVC(probability=True)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
precision = precision_score(y_test, y_pred, average="weighted")
y_pred_prob = model.predict_proba(x_test)
roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class="ovr")
compare_list.loc[len(compare_list)] = [{'col': ['reviewText', 'verified'], 'proc': 'baseline', 'vectorizer': 'tfidf'}, accuracy, f1, precision, roc_auc]

              precision    recall  f1-score   support

          -1       0.87      0.05      0.09       260
           0       0.60      0.01      0.02       259
           1       0.88      1.00      0.94      3798

    accuracy                           0.88      4317
   macro avg       0.78      0.35      0.35      4317
weighted avg       0.87      0.88      0.83      4317



In [29]:
# Experiment: reviewText + verified | Porter stemming | bag-of-words counts.
# Trains an SVC on a 75/25 split of df_testing, prints the per-class report and
# appends accuracy, weighted F1/precision and one-vs-rest ROC AUC to compare_list.
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, stem, bow, df_testing)
model = SVC(probability=True)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
precision = precision_score(y_test, y_pred, average="weighted")
y_pred_prob = model.predict_proba(x_test)
roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class="ovr")
compare_list.loc[len(compare_list)] = [{'col': ['reviewText', 'verified'], 'proc': 'stem', 'vectorizer': 'bow'}, accuracy, f1, precision, roc_auc]

              precision    recall  f1-score   support

          -1       0.73      0.04      0.08       260
           0       1.00      0.00      0.01       259
           1       0.88      1.00      0.94      3798

    accuracy                           0.88      4317
   macro avg       0.87      0.35      0.34      4317
weighted avg       0.88      0.88      0.83      4317



In [30]:
# Experiment: reviewText + verified | Porter stemming | TF-IDF.
# Trains an SVC on a 75/25 split of df_testing, prints the per-class report and
# appends accuracy, weighted F1/precision and one-vs-rest ROC AUC to compare_list.
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, stem, tfidf, df_testing)
model = SVC(probability=True)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
precision = precision_score(y_test, y_pred, average="weighted")
y_pred_prob = model.predict_proba(x_test)
roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class="ovr")
compare_list.loc[len(compare_list)] = [{'col': ['reviewText', 'verified'], 'proc': 'stem', 'vectorizer': 'tfidf'}, accuracy, f1, precision, roc_auc]

              precision    recall  f1-score   support

          -1       0.75      0.07      0.13       260
           0       0.50      0.01      0.02       259
           1       0.88      1.00      0.94      3798

    accuracy                           0.88      4317
   macro avg       0.71      0.36      0.36      4317
weighted avg       0.85      0.88      0.83      4317



In [31]:
# Experiment: reviewText + verified | WordNet lemmatization | bag-of-words counts.
# Trains an SVC on a 75/25 split of df_testing, prints the per-class report and
# appends accuracy, weighted F1/precision and one-vs-rest ROC AUC to compare_list.
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, lem, bow, df_testing)
model = SVC(probability=True)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
precision = precision_score(y_test, y_pred, average="weighted")
y_pred_prob = model.predict_proba(x_test)
roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class="ovr")
compare_list.loc[len(compare_list)] = [{'col': ['reviewText', 'verified'], 'proc': 'lem', 'vectorizer': 'bow'}, accuracy, f1, precision, roc_auc]

              precision    recall  f1-score   support

          -1       1.00      0.02      0.05       260
           0       1.00      0.00      0.01       259
           1       0.88      1.00      0.94      3798

    accuracy                           0.88      4317
   macro avg       0.96      0.34      0.33      4317
weighted avg       0.90      0.88      0.83      4317



In [32]:
# Experiment: reviewText + verified | WordNet lemmatization | TF-IDF.
# Trains an SVC on a 75/25 split of df_testing, prints the per-class report and
# appends accuracy, weighted F1/precision and one-vs-rest ROC AUC to compare_list.
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, lem, tfidf, df_testing)
model = SVC(probability=True)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
precision = precision_score(y_test, y_pred, average="weighted")
y_pred_prob = model.predict_proba(x_test)
roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class="ovr")
compare_list.loc[len(compare_list)] = [{'col': ['reviewText', 'verified'], 'proc': 'lem', 'vectorizer': 'tfidf'}, accuracy, f1, precision, roc_auc]

              precision    recall  f1-score   support

          -1       0.75      0.05      0.09       260
           0       0.60      0.01      0.02       259
           1       0.88      1.00      0.94      3798

    accuracy                           0.88      4317
   macro avg       0.74      0.35      0.35      4317
weighted avg       0.86      0.88      0.83      4317



In [33]:
# Experiment: reviewTextWithSummary | no preprocessing (raw text) | bag-of-words counts.
# Trains an SVC on a 75/25 split of df_testing, prints the per-class report and
# appends accuracy, weighted F1/precision and one-vs-rest ROC AUC to compare_list.
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, None, bow, df_testing)
model = SVC(probability=True)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
precision = precision_score(y_test, y_pred, average="weighted")
y_pred_prob = model.predict_proba(x_test)
roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class="ovr")
compare_list.loc[len(compare_list)] = [{'col': ['reviewTextWithSummary'], 'proc': None, 'vectorizer': 'bow'}, accuracy, f1, precision, roc_auc]

              precision    recall  f1-score   support

          -1       0.85      0.18      0.30       260
           0       0.92      0.25      0.40       259
           1       0.91      1.00      0.95      3798

    accuracy                           0.90      4317
   macro avg       0.89      0.48      0.55      4317
weighted avg       0.90      0.90      0.88      4317



In [34]:
# Experiment: reviewTextWithSummary | no preprocessing (raw text) | TF-IDF.
# Trains an SVC on a 75/25 split of df_testing, prints the per-class report and
# appends accuracy, weighted F1/precision and one-vs-rest ROC AUC to compare_list.
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, None, tfidf, df_testing)
model = SVC(probability=True)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
precision = precision_score(y_test, y_pred, average="weighted")
y_pred_prob = model.predict_proba(x_test)
roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class="ovr")
compare_list.loc[len(compare_list)] = [{'col': ['reviewTextWithSummary'], 'proc': None, 'vectorizer': 'tfidf'}, accuracy, f1, precision, roc_auc]

              precision    recall  f1-score   support

          -1       0.82      0.26      0.40       260
           0       0.97      0.22      0.36       259
           1       0.91      1.00      0.95      3798

    accuracy                           0.91      4317
   macro avg       0.90      0.49      0.57      4317
weighted avg       0.91      0.91      0.88      4317



In [35]:
# Experiment: reviewTextWithSummary | baseline cleaning | bag-of-words counts.
# Trains an SVC on a 75/25 split of df_testing, prints the per-class report and
# appends accuracy, weighted F1/precision and one-vs-rest ROC AUC to compare_list.
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, baseline, bow, df_testing)
model = SVC(probability=True)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
precision = precision_score(y_test, y_pred, average="weighted")
y_pred_prob = model.predict_proba(x_test)
roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class="ovr")
compare_list.loc[len(compare_list)] = [{'col': ['reviewTextWithSummary'], 'proc': 'baseline', 'vectorizer': 'bow'}, accuracy, f1, precision, roc_auc]

              precision    recall  f1-score   support

          -1       0.92      0.22      0.35       260
           0       0.92      0.25      0.39       259
           1       0.91      1.00      0.95      3798

    accuracy                           0.91      4317
   macro avg       0.91      0.49      0.56      4317
weighted avg       0.91      0.91      0.88      4317



In [36]:
# Experiment: reviewTextWithSummary | baseline cleaning | TF-IDF.
# Trains an SVC on a 75/25 split of df_testing, prints the per-class report and
# appends accuracy, weighted F1/precision and one-vs-rest ROC AUC to compare_list.
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, baseline, tfidf, df_testing)
model = SVC(probability=True)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
precision = precision_score(y_test, y_pred, average="weighted")
y_pred_prob = model.predict_proba(x_test)
roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class="ovr")
compare_list.loc[len(compare_list)] = [{'col': ['reviewTextWithSummary'], 'proc': 'baseline', 'vectorizer': 'tfidf'}, accuracy, f1, precision, roc_auc]

              precision    recall  f1-score   support

          -1       0.91      0.22      0.36       260
           0       0.92      0.23      0.37       259
           1       0.91      1.00      0.95      3798

    accuracy                           0.91      4317
   macro avg       0.91      0.48      0.56      4317
weighted avg       0.91      0.91      0.88      4317



In [37]:
# Experiment: reviewTextWithSummary | Porter stemming | bag-of-words counts.
# Trains an SVC on a 75/25 split of df_testing, prints the per-class report and
# appends accuracy, weighted F1/precision and one-vs-rest ROC AUC to compare_list.
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, stem, bow, df_testing)
model = SVC(probability=True)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
precision = precision_score(y_test, y_pred, average="weighted")
y_pred_prob = model.predict_proba(x_test)
roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class="ovr")
compare_list.loc[len(compare_list)] = [{'col': ['reviewTextWithSummary'], 'proc': 'stem', 'vectorizer': 'bow'}, accuracy, f1, precision, roc_auc]

              precision    recall  f1-score   support

          -1       0.87      0.23      0.36       260
           0       0.92      0.25      0.39       259
           1       0.91      1.00      0.95      3798

    accuracy                           0.91      4317
   macro avg       0.90      0.49      0.57      4317
weighted avg       0.91      0.91      0.88      4317



In [38]:
# Experiment: reviewTextWithSummary | Porter stemming | TF-IDF.
# Trains an SVC on a 75/25 split of df_testing, prints the per-class report and
# appends accuracy, weighted F1/precision and one-vs-rest ROC AUC to compare_list.
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, stem, tfidf, df_testing)
model = SVC(probability=True)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
precision = precision_score(y_test, y_pred, average="weighted")
y_pred_prob = model.predict_proba(x_test)
roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class="ovr")
compare_list.loc[len(compare_list)] = [{'col': ['reviewTextWithSummary'], 'proc': 'stem', 'vectorizer': 'tfidf'}, accuracy, f1, precision, roc_auc]

              precision    recall  f1-score   support

          -1       0.85      0.21      0.34       260
           0       0.90      0.23      0.37       259
           1       0.91      1.00      0.95      3798

    accuracy                           0.90      4317
   macro avg       0.88      0.48      0.55      4317
weighted avg       0.90      0.90      0.88      4317



In [39]:
# Experiment: reviewTextWithSummary | WordNet lemmatization | bag-of-words counts.
# Trains an SVC on a 75/25 split of df_testing, prints the per-class report and
# appends accuracy, weighted F1/precision and one-vs-rest ROC AUC to compare_list.
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, lem, bow, df_testing)
model = SVC(probability=True)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
precision = precision_score(y_test, y_pred, average="weighted")
y_pred_prob = model.predict_proba(x_test)
roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class="ovr")
compare_list.loc[len(compare_list)] = [{'col': ['reviewTextWithSummary'], 'proc': 'lem', 'vectorizer': 'bow'}, accuracy, f1, precision, roc_auc]

              precision    recall  f1-score   support

          -1       0.92      0.21      0.34       260
           0       0.92      0.25      0.40       259
           1       0.91      1.00      0.95      3798

    accuracy                           0.91      4317
   macro avg       0.91      0.49      0.56      4317
weighted avg       0.91      0.91      0.88      4317



In [40]:
# Experiment: reviewTextWithSummary | WordNet lemmatization | TF-IDF.
# Trains an SVC on a 75/25 split of df_testing, prints the per-class report and
# appends accuracy, weighted F1/precision and one-vs-rest ROC AUC to compare_list.
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, lem, tfidf, df_testing)
model = SVC(probability=True)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
precision = precision_score(y_test, y_pred, average="weighted")
y_pred_prob = model.predict_proba(x_test)
roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class="ovr")
compare_list.loc[len(compare_list)] = [{'col': ['reviewTextWithSummary'], 'proc': 'lem', 'vectorizer': 'tfidf'}, accuracy, f1, precision, roc_auc]

              precision    recall  f1-score   support

          -1       0.86      0.19      0.31       260
           0       0.90      0.23      0.37       259
           1       0.90      1.00      0.95      3798

    accuracy                           0.90      4317
   macro avg       0.89      0.47      0.54      4317
weighted avg       0.90      0.90      0.88      4317



In [41]:
# Experiment: reviewTextWithSummary + verified | no preprocessing (raw text) | bag-of-words counts.
# Trains an SVC on a 75/25 split of df_testing, prints the per-class report and
# appends accuracy, weighted F1/precision and one-vs-rest ROC AUC to compare_list.
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary', 'verified'], 0.25, None, bow, df_testing)
model = SVC(probability=True)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
precision = precision_score(y_test, y_pred, average="weighted")
y_pred_prob = model.predict_proba(x_test)
roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class="ovr")
compare_list.loc[len(compare_list)] = [{'col': ['reviewTextWithSummary', 'verified'], 'proc': None, 'vectorizer': 'bow'}, accuracy, f1, precision, roc_auc]

              precision    recall  f1-score   support

          -1       0.84      0.18      0.29       260
           0       0.92      0.25      0.40       259
           1       0.91      1.00      0.95      3798

    accuracy                           0.90      4317
   macro avg       0.89      0.48      0.55      4317
weighted avg       0.90      0.90      0.88      4317



In [42]:
# Experiment: columns=['reviewTextWithSummary', 'verified'], preprocessing=None, vectorizer=tfidf.
# NOTE(review): 0.25 looks like the test-set fraction — confirm against pipeline().
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary', 'verified'], 0.25, None, tfidf, df_testing)

# probability=True makes SVC calibrate its probability estimates with an internal,
# randomly shuffled cross-validation. Passing the notebook-wide seed keeps
# predict_proba (and therefore the ROC AUC recorded below) reproducible on re-run.
model = SVC(probability=True, random_state=random_state)
model.fit(x_train, y_train)

# Hard-label metrics; the weighted average accounts for the class imbalance in support.
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
precision = precision_score(y_test, y_pred, average="weighted")

# One-vs-rest ROC AUC is computed from the per-class probability estimates.
y_pred_prob = model.predict_proba(x_test)
roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class="ovr")

# Append this run's configuration and scores as the next row of the comparison table.
compare_list.loc[len(compare_list)] = [
    {'col': ['reviewTextWithSummary', 'verified'], 'proc': None, 'vectorizer': 'tfidf'},
    accuracy,
    f1,
    precision,
    roc_auc,
]

              precision    recall  f1-score   support

          -1       0.83      0.26      0.39       260
           0       0.97      0.22      0.36       259
           1       0.91      1.00      0.95      3798

    accuracy                           0.91      4317
   macro avg       0.90      0.49      0.57      4317
weighted avg       0.91      0.91      0.88      4317



In [43]:
# Experiment: columns=['reviewTextWithSummary', 'verified'], preprocessing=baseline, vectorizer=bow.
# NOTE(review): 0.25 looks like the test-set fraction — confirm against pipeline().
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary', 'verified'], 0.25, baseline, bow, df_testing)

# probability=True makes SVC calibrate its probability estimates with an internal,
# randomly shuffled cross-validation. Passing the notebook-wide seed keeps
# predict_proba (and therefore the ROC AUC recorded below) reproducible on re-run.
model = SVC(probability=True, random_state=random_state)
model.fit(x_train, y_train)

# Hard-label metrics; the weighted average accounts for the class imbalance in support.
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
precision = precision_score(y_test, y_pred, average="weighted")

# One-vs-rest ROC AUC is computed from the per-class probability estimates.
y_pred_prob = model.predict_proba(x_test)
roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class="ovr")

# Append this run's configuration and scores as the next row of the comparison table.
compare_list.loc[len(compare_list)] = [
    {'col': ['reviewTextWithSummary', 'verified'], 'proc': 'baseline', 'vectorizer': 'bow'},
    accuracy,
    f1,
    precision,
    roc_auc,
]

              precision    recall  f1-score   support

          -1       0.92      0.22      0.35       260
           0       0.92      0.25      0.40       259
           1       0.91      1.00      0.95      3798

    accuracy                           0.91      4317
   macro avg       0.91      0.49      0.57      4317
weighted avg       0.91      0.91      0.88      4317



In [44]:
# Experiment: columns=['reviewTextWithSummary', 'verified'], preprocessing=baseline, vectorizer=tfidf.
# NOTE(review): 0.25 looks like the test-set fraction — confirm against pipeline().
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary', 'verified'], 0.25, baseline, tfidf, df_testing)

# probability=True makes SVC calibrate its probability estimates with an internal,
# randomly shuffled cross-validation. Passing the notebook-wide seed keeps
# predict_proba (and therefore the ROC AUC recorded below) reproducible on re-run.
model = SVC(probability=True, random_state=random_state)
model.fit(x_train, y_train)

# Hard-label metrics; the weighted average accounts for the class imbalance in support.
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
precision = precision_score(y_test, y_pred, average="weighted")

# One-vs-rest ROC AUC is computed from the per-class probability estimates.
y_pred_prob = model.predict_proba(x_test)
roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class="ovr")

# Append this run's configuration and scores as the next row of the comparison table.
compare_list.loc[len(compare_list)] = [
    {'col': ['reviewTextWithSummary', 'verified'], 'proc': 'baseline', 'vectorizer': 'tfidf'},
    accuracy,
    f1,
    precision,
    roc_auc,
]

              precision    recall  f1-score   support

          -1       0.91      0.22      0.36       260
           0       0.94      0.23      0.37       259
           1       0.90      1.00      0.95      3798

    accuracy                           0.91      4317
   macro avg       0.92      0.48      0.56      4317
weighted avg       0.91      0.91      0.88      4317



In [45]:
# Experiment: columns=['reviewTextWithSummary', 'verified'], preprocessing=stem, vectorizer=bow.
# NOTE(review): 0.25 looks like the test-set fraction — confirm against pipeline().
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary', 'verified'], 0.25, stem, bow, df_testing)

# probability=True makes SVC calibrate its probability estimates with an internal,
# randomly shuffled cross-validation. Passing the notebook-wide seed keeps
# predict_proba (and therefore the ROC AUC recorded below) reproducible on re-run.
model = SVC(probability=True, random_state=random_state)
model.fit(x_train, y_train)

# Hard-label metrics; the weighted average accounts for the class imbalance in support.
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
precision = precision_score(y_test, y_pred, average="weighted")

# One-vs-rest ROC AUC is computed from the per-class probability estimates.
y_pred_prob = model.predict_proba(x_test)
roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class="ovr")

# Append this run's configuration and scores as the next row of the comparison table.
compare_list.loc[len(compare_list)] = [
    {'col': ['reviewTextWithSummary', 'verified'], 'proc': 'stem', 'vectorizer': 'bow'},
    accuracy,
    f1,
    precision,
    roc_auc,
]

              precision    recall  f1-score   support

          -1       0.87      0.23      0.36       260
           0       0.92      0.25      0.39       259
           1       0.91      1.00      0.95      3798

    accuracy                           0.91      4317
   macro avg       0.90      0.49      0.57      4317
weighted avg       0.91      0.91      0.88      4317



In [46]:
# Experiment: columns=['reviewTextWithSummary', 'verified'], preprocessing=stem, vectorizer=tfidf.
# NOTE(review): 0.25 looks like the test-set fraction — confirm against pipeline().
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary', 'verified'], 0.25, stem, tfidf, df_testing)

# probability=True makes SVC calibrate its probability estimates with an internal,
# randomly shuffled cross-validation. Passing the notebook-wide seed keeps
# predict_proba (and therefore the ROC AUC recorded below) reproducible on re-run.
model = SVC(probability=True, random_state=random_state)
model.fit(x_train, y_train)

# Hard-label metrics; the weighted average accounts for the class imbalance in support.
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
precision = precision_score(y_test, y_pred, average="weighted")

# One-vs-rest ROC AUC is computed from the per-class probability estimates.
y_pred_prob = model.predict_proba(x_test)
roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class="ovr")

# Append this run's configuration and scores as the next row of the comparison table.
compare_list.loc[len(compare_list)] = [
    {'col': ['reviewTextWithSummary', 'verified'], 'proc': 'stem', 'vectorizer': 'tfidf'},
    accuracy,
    f1,
    precision,
    roc_auc,
]

              precision    recall  f1-score   support

          -1       0.87      0.22      0.35       260
           0       0.92      0.23      0.37       259
           1       0.91      1.00      0.95      3798

    accuracy                           0.91      4317
   macro avg       0.90      0.48      0.56      4317
weighted avg       0.90      0.91      0.88      4317



In [47]:
# Experiment: columns=['reviewTextWithSummary', 'verified'], preprocessing=lem, vectorizer=bow.
# NOTE(review): 0.25 looks like the test-set fraction — confirm against pipeline().
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary', 'verified'], 0.25, lem, bow, df_testing)

# probability=True makes SVC calibrate its probability estimates with an internal,
# randomly shuffled cross-validation. Passing the notebook-wide seed keeps
# predict_proba (and therefore the ROC AUC recorded below) reproducible on re-run.
model = SVC(probability=True, random_state=random_state)
model.fit(x_train, y_train)

# Hard-label metrics; the weighted average accounts for the class imbalance in support.
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
precision = precision_score(y_test, y_pred, average="weighted")

# One-vs-rest ROC AUC is computed from the per-class probability estimates.
y_pred_prob = model.predict_proba(x_test)
roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class="ovr")

# Append this run's configuration and scores as the next row of the comparison table.
compare_list.loc[len(compare_list)] = [
    {'col': ['reviewTextWithSummary', 'verified'], 'proc': 'lem', 'vectorizer': 'bow'},
    accuracy,
    f1,
    precision,
    roc_auc,
]

              precision    recall  f1-score   support

          -1       0.91      0.20      0.33       260
           0       0.92      0.25      0.40       259
           1       0.91      1.00      0.95      3798

    accuracy                           0.91      4317
   macro avg       0.91      0.49      0.56      4317
weighted avg       0.91      0.91      0.88      4317



In [48]:
# Experiment: columns=['reviewTextWithSummary', 'verified'], preprocessing=lem, vectorizer=tfidf.
# NOTE(review): 0.25 looks like the test-set fraction — confirm against pipeline().
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary', 'verified'], 0.25, lem, tfidf, df_testing)

# probability=True makes SVC calibrate its probability estimates with an internal,
# randomly shuffled cross-validation. Passing the notebook-wide seed keeps
# predict_proba (and therefore the ROC AUC recorded below) reproducible on re-run.
model = SVC(probability=True, random_state=random_state)
model.fit(x_train, y_train)

# Hard-label metrics; the weighted average accounts for the class imbalance in support.
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
precision = precision_score(y_test, y_pred, average="weighted")

# One-vs-rest ROC AUC is computed from the per-class probability estimates.
y_pred_prob = model.predict_proba(x_test)
roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class="ovr")

# Append this run's configuration and scores as the next row of the comparison table.
compare_list.loc[len(compare_list)] = [
    {'col': ['reviewTextWithSummary', 'verified'], 'proc': 'lem', 'vectorizer': 'tfidf'},
    accuracy,
    f1,
    precision,
    roc_auc,
]
#######  END OF GENERATED CODE  #######

              precision    recall  f1-score   support

          -1       0.90      0.20      0.33       260
           0       0.92      0.23      0.37       259
           1       0.90      1.00      0.95      3798

    accuracy                           0.90      4317
   macro avg       0.91      0.48      0.55      4317
weighted avg       0.90      0.90      0.88      4317



In [49]:
# Do not truncate column contents, so the full configuration dict of each run is readable.
pd.set_option("display.max_colwidth", None)

# Rank all recorded runs from highest to lowest ROC AUC and renumber the rows.
compare_list = compare_list.sort_values(by="ROC AUC", ascending=False).reset_index(drop=True)
display(compare_list)

# Persist the ranked table: one copy next to the notebook, one under ../results.
# Each write must be its own statement; placing both calls on a single line
# without a separator is a SyntaxError and prevents the cell from running.
compare_list.to_csv("./svm_testing_results.csv", index=False)
# NOTE(review): to_csv does not create missing directories — assumes ../results exists.
compare_list.to_csv("../results/svm_testing_results.csv", index=False)

Unnamed: 0,Data config and preprocessing,Accuracy,F1 Score,Precision,ROC AUC
0,"{'col': ['reviewTextWithSummary'], 'proc': None, 'vectorizer': 'tfidf'}",0.908038,0.883097,0.907021,0.941108
1,"{'col': ['reviewTextWithSummary', 'verified'], 'proc': None, 'vectorizer': 'tfidf'}",0.907111,0.881818,0.906466,0.939225
2,"{'col': ['reviewTextWithSummary'], 'proc': 'stem', 'vectorizer': 'tfidf'}",0.904563,0.877833,0.901429,0.925283
3,"{'col': ['reviewTextWithSummary', 'verified'], 'proc': None, 'vectorizer': 'bow'}",0.904332,0.876798,0.901577,0.924912
4,"{'col': ['reviewTextWithSummary'], 'proc': None, 'vectorizer': 'bow'}",0.904563,0.87718,0.902672,0.924875
5,"{'col': ['reviewTextWithSummary', 'verified'], 'proc': 'stem', 'vectorizer': 'tfidf'}",0.905027,0.878543,0.903995,0.924169
6,"{'col': ['reviewTextWithSummary'], 'proc': 'baseline', 'vectorizer': 'tfidf'}",0.905722,0.879275,0.906551,0.92393
7,"{'col': ['reviewTextWithSummary'], 'proc': 'lem', 'vectorizer': 'tfidf'}",0.903405,0.875656,0.901057,0.923389
8,"{'col': ['reviewTextWithSummary', 'verified'], 'proc': 'baseline', 'vectorizer': 'tfidf'}",0.90549,0.879018,0.907016,0.922264
9,"{'col': ['reviewTextWithSummary', 'verified'], 'proc': 'lem', 'vectorizer': 'tfidf'}",0.9041,0.876574,0.904651,0.921265


- Further testing can be done using the "reviewTextWithSummary" column rather than "reviewText", as it is universally better
- Of the top 10 configurations, 8 use the TF-IDF vectorizer
- Including the "verified" column does not seem to have a significant impact on the model (at most a slight negative one), so it will not be used in further testing
- Further testing will be done on all of the preprocessing methods (None, Baseline, Stemming, Lemmatization)
- Models for further analysis are as follows:
    - reviewTextWithSummary, None, TF-IDF
    - reviewTextWithSummary, Baseline, TF-IDF
    - reviewTextWithSummary, Stem, TF-IDF
    - reviewTextWithSummary, Lem, TF-IDF