In [70]:
import pandas as pd
import nltk
import re
import random
import numpy as np
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score, accuracy_score
from nltk.corpus import stopwords
from scipy.sparse import hstack
import warnings
warnings.filterwarnings("ignore")

nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package punkt to /home/ibrahim/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ibrahim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ibrahim/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [71]:
baseline = "baseline"
stem = "stem"
lem = "lem"
bow = "bow"
tfidf = "tfidf"
random_state = 42
testing_n = 5000
random.seed(random_state)
data_path = "./data/combined.csv"

In [72]:
df = pd.read_csv(data_path, low_memory=False)
df = df[["overall", "reviewText", "summary", "verified"]]
df.dropna(inplace=True)
df["sentiment"] = df["overall"].apply(lambda x: 1 if x > 3 else -1 if x < 3 else 0)
df["reviewTextWithSummary"] = df["summary"] + " " + df["reviewText"]
df.drop(["overall", "summary"], axis=1, inplace=True)
df.head()

Unnamed: 0,reviewText,verified,sentiment,reviewTextWithSummary
0,As advertised. Reasonably priced,True,1,Five Stars As advertised. Reasonably priced
1,Like the oder and the feel when I put it on my...,True,1,Good for the face Like the oder and the feel w...
2,I bought this to smell nice after I shave. Wh...,True,-1,Smells awful I bought this to smell nice after...
3,HEY!! I am an Aqua Velva Man and absolutely lo...,False,1,Truth is There IS Nothing Like an AQUA VELVA M...
4,If you ever want to feel pampered by a shampoo...,True,1,Bvlgari Shampoo If you ever want to feel pampe...


# Model experimentation

In [73]:
df_testing = (
    df.groupby("sentiment")
    .apply(lambda x: x.sample(n=testing_n, random_state=random_state, replace=True))
    .reset_index(drop=True)
)
df_testing["sentiment"].value_counts()

sentiment
-1    5000
 0    5000
 1    5000
Name: count, dtype: int64

In [74]:
df_testing.head()

Unnamed: 0,reviewText,verified,sentiment,reviewTextWithSummary
0,Intuit is a despicable company now. This is th...,False,-1,Intuit is One Unethical Company. Any alternati...
1,Very disappointed guitar came damaged with den...,True,-1,Very disappointed Very disappointed guitar cam...
2,The rings did not perform as I had hoped. They...,True,-1,Not what I was hoping for The rings did not pe...
3,My ProFX8 purchased from Amazon in 2015 was po...,False,-1,Mine broke. Very light use. Padded case. Maybe...
4,"Not the greatest, really flimsy.",True,-1,"Two Stars Not the greatest, really flimsy."


In [75]:
print("Dataset size:", len(df_testing))

Dataset size: 15000


In [76]:
STOP_WORDS = set(stopwords.words("english"))

In [77]:
def preprocess_text(sentence, stop, type_proc=None):
    words = []
    for word in sentence.lower().strip().split():

        word = re.sub("\d", "", word)
        word = re.sub("[^\w\s]", "", word)

        if word not in stop and word != "":
            words.append(preprocess_type(word, type_proc))

    return " ".join(words)

In [78]:
def preprocess_type(word, type_proc):
    if type_proc == baseline:
        return word
    elif type_proc == stem:
        return PorterStemmer().stem(word)
    elif type_proc == lem:
        return WordNetLemmatizer().lemmatize(word)
    else:
        raise ValueError("Invalid Preprocessing Type")

In [79]:
def my_train_test_split(cols, test_size, df=df, random_state=random_state):
    x_train, x_test, y_train, y_test = train_test_split(
        df[cols], df["sentiment"], test_size=test_size, random_state=random_state
    )
    return x_train, x_test, y_train, y_test

In [80]:
def apply_preprocessing(proc, x_train, x_test):
    if proc is None:
        return x_train, x_test
    cols = x_train.columns

    textcol = "reviewText"
    if "reviewText" not in cols and "reviewTextWithSummary" in cols:
        textcol = "reviewTextWithSummary"
    x_train[textcol] = x_train[textcol].apply(
        lambda x: preprocess_text(x, STOP_WORDS, proc)
    )
    x_test[textcol] = x_test[textcol].apply(
        lambda x: preprocess_text(x, STOP_WORDS, proc)
    )
    return x_train, x_test

In [81]:
def add_col(x, col):
    col = np.array([col]).T
    return hstack([x, col])

In [82]:
def pipeline(cols, test_size, proc, vectorizer, df=df, random_state=random_state):
    if "reviewText" not in cols and "reviewTextWithSummary" not in cols:
        raise ValueError("Must contain reviewText or reviewTextWithSummary")

    textcol = "reviewText"
    if "reviewText" not in cols and "reviewTextWithSummary" in cols:
        textcol = "reviewTextWithSummary"
    x_train, x_test, y_train, y_test = my_train_test_split(
        cols, test_size, df, random_state
    )
    x_train, x_test = apply_preprocessing(proc, x_train, x_test)
    if vectorizer == bow:
        vectorizer = CountVectorizer()
    elif vectorizer == tfidf:
        vectorizer = TfidfVectorizer()
    else:
        raise ValueError("Invalid Vectorizer")
    x_train_ = vectorizer.fit_transform(x_train[textcol])
    x_test_ = vectorizer.transform(x_test[textcol])

    if "verified" in cols:
        x_train = add_col(x_train_, x_train["verified"])
        x_test = add_col(x_test_, x_test["verified"])
    else:
        x_train = x_train_
        x_test = x_test_
    return x_train, x_test, y_train, y_test

## Testing different configs

In [84]:
compare_list = pd.DataFrame(
    columns=["Data config and preprocessing", "Accuracy", "F1 Score"]
)

In [85]:
code_gen = False
col_comb = [
    ["reviewText"],
    ["reviewText", "verified"],
    ["reviewTextWithSummary"],
    ["reviewTextWithSummary", "verified"],
]
proc_comb = [None, baseline, stem, lem]
vectorizer_comb = [bow, tfidf]
if code_gen:
    for col in col_comb:
        for proc in proc_comb:
            for vectorizer in vectorizer_comb:
                params = {
                    "col": col,
                    "proc": proc,
                    "vectorizer": vectorizer,
                }
                print(
                    f"""
# %%
x_train, x_test, y_train, y_test = pipeline({col}, 0.25, {proc}, {vectorizer}, df_testing)
model = SVC()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
compare_list.loc[len(compare_list)] = [{params}, accuracy, f1]
"""
                )

In [86]:
# below is code genderated by above cell, to make changes to the code, edit the
# above cell and run it, pasting its contents between the markers

In [87]:
####### START OF GENERATED CODE #######
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, None, bow, df_testing)
model = SVC()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
compare_list.loc[len(compare_list)] = [{'col': ['reviewText'], 'proc': None, 'vectorizer': 'bow'}, accuracy, f1]

              precision    recall  f1-score   support

          -1       0.70      0.67      0.68      1284
           0       0.60      0.55      0.58      1225
           1       0.71      0.79      0.75      1241

    accuracy                           0.67      3750
   macro avg       0.67      0.67      0.67      3750
weighted avg       0.67      0.67      0.67      3750



In [88]:
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, None, tfidf, df_testing)
model = SVC()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
compare_list.loc[len(compare_list)] = [{'col': ['reviewText'], 'proc': None, 'vectorizer': 'tfidf'}, accuracy, f1]

              precision    recall  f1-score   support

          -1       0.74      0.74      0.74      1284
           0       0.62      0.64      0.63      1225
           1       0.82      0.79      0.80      1241

    accuracy                           0.72      3750
   macro avg       0.73      0.72      0.72      3750
weighted avg       0.73      0.72      0.73      3750



In [89]:
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, baseline, bow, df_testing)
model = SVC()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
compare_list.loc[len(compare_list)] = [{'col': ['reviewText'], 'proc': 'baseline', 'vectorizer': 'bow'}, accuracy, f1]

              precision    recall  f1-score   support

          -1       0.66      0.69      0.68      1284
           0       0.59      0.52      0.55      1225
           1       0.72      0.77      0.75      1241

    accuracy                           0.66      3750
   macro avg       0.66      0.66      0.66      3750
weighted avg       0.66      0.66      0.66      3750



In [90]:
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, baseline, tfidf, df_testing)
model = SVC()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
compare_list.loc[len(compare_list)] = [{'col': ['reviewText'], 'proc': 'baseline', 'vectorizer': 'tfidf'}, accuracy, f1]

              precision    recall  f1-score   support

          -1       0.71      0.71      0.71      1284
           0       0.60      0.63      0.61      1225
           1       0.80      0.76      0.78      1241

    accuracy                           0.70      3750
   macro avg       0.70      0.70      0.70      3750
weighted avg       0.70      0.70      0.70      3750



In [91]:
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, stem, bow, df_testing)
model = SVC()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
compare_list.loc[len(compare_list)] = [{'col': ['reviewText'], 'proc': 'stem', 'vectorizer': 'bow'}, accuracy, f1]

              precision    recall  f1-score   support

          -1       0.66      0.65      0.66      1284
           0       0.58      0.56      0.57      1225
           1       0.73      0.77      0.75      1241

    accuracy                           0.66      3750
   macro avg       0.66      0.66      0.66      3750
weighted avg       0.66      0.66      0.66      3750



In [92]:
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, stem, tfidf, df_testing)
model = SVC()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
compare_list.loc[len(compare_list)] = [{'col': ['reviewText'], 'proc': 'stem', 'vectorizer': 'tfidf'}, accuracy, f1]

              precision    recall  f1-score   support

          -1       0.70      0.69      0.70      1284
           0       0.59      0.63      0.61      1225
           1       0.79      0.75      0.77      1241

    accuracy                           0.69      3750
   macro avg       0.70      0.69      0.69      3750
weighted avg       0.70      0.69      0.69      3750



In [93]:
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, lem, bow, df_testing)
model = SVC()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
compare_list.loc[len(compare_list)] = [{'col': ['reviewText'], 'proc': 'lem', 'vectorizer': 'bow'}, accuracy, f1]

              precision    recall  f1-score   support

          -1       0.66      0.66      0.66      1284
           0       0.58      0.53      0.55      1225
           1       0.71      0.77      0.74      1241

    accuracy                           0.65      3750
   macro avg       0.65      0.65      0.65      3750
weighted avg       0.65      0.65      0.65      3750



In [94]:
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, lem, tfidf, df_testing)
model = SVC()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
compare_list.loc[len(compare_list)] = [{'col': ['reviewText'], 'proc': 'lem', 'vectorizer': 'tfidf'}, accuracy, f1]

              precision    recall  f1-score   support

          -1       0.70      0.70      0.70      1284
           0       0.60      0.63      0.61      1225
           1       0.79      0.76      0.77      1241

    accuracy                           0.69      3750
   macro avg       0.70      0.69      0.69      3750
weighted avg       0.70      0.69      0.69      3750



In [95]:
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, None, bow, df_testing)
model = SVC()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
compare_list.loc[len(compare_list)] = [{'col': ['reviewText', 'verified'], 'proc': None, 'vectorizer': 'bow'}, accuracy, f1]

              precision    recall  f1-score   support

          -1       0.70      0.67      0.68      1284
           0       0.60      0.55      0.57      1225
           1       0.70      0.79      0.75      1241

    accuracy                           0.67      3750
   macro avg       0.67      0.67      0.67      3750
weighted avg       0.67      0.67      0.67      3750



In [96]:
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, None, tfidf, df_testing)
model = SVC()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
compare_list.loc[len(compare_list)] = [{'col': ['reviewText', 'verified'], 'proc': None, 'vectorizer': 'tfidf'}, accuracy, f1]

              precision    recall  f1-score   support

          -1       0.73      0.74      0.73      1284
           0       0.61      0.63      0.62      1225
           1       0.82      0.79      0.80      1241

    accuracy                           0.72      3750
   macro avg       0.72      0.72      0.72      3750
weighted avg       0.72      0.72      0.72      3750



In [97]:
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, baseline, bow, df_testing)
model = SVC()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
compare_list.loc[len(compare_list)] = [{'col': ['reviewText', 'verified'], 'proc': 'baseline', 'vectorizer': 'bow'}, accuracy, f1]

              precision    recall  f1-score   support

          -1       0.66      0.69      0.68      1284
           0       0.59      0.52      0.55      1225
           1       0.72      0.77      0.74      1241

    accuracy                           0.66      3750
   macro avg       0.66      0.66      0.66      3750
weighted avg       0.66      0.66      0.66      3750



In [98]:
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, baseline, tfidf, df_testing)
model = SVC()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
compare_list.loc[len(compare_list)] = [{'col': ['reviewText', 'verified'], 'proc': 'baseline', 'vectorizer': 'tfidf'}, accuracy, f1]

              precision    recall  f1-score   support

          -1       0.70      0.71      0.71      1284
           0       0.58      0.61      0.60      1225
           1       0.80      0.75      0.77      1241

    accuracy                           0.69      3750
   macro avg       0.69      0.69      0.69      3750
weighted avg       0.70      0.69      0.69      3750



In [99]:
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, stem, bow, df_testing)
model = SVC()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
compare_list.loc[len(compare_list)] = [{'col': ['reviewText', 'verified'], 'proc': 'stem', 'vectorizer': 'bow'}, accuracy, f1]

              precision    recall  f1-score   support

          -1       0.66      0.65      0.66      1284
           0       0.58      0.55      0.56      1225
           1       0.72      0.77      0.74      1241

    accuracy                           0.66      3750
   macro avg       0.65      0.66      0.66      3750
weighted avg       0.65      0.66      0.66      3750



In [100]:
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, stem, tfidf, df_testing)
model = SVC()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
compare_list.loc[len(compare_list)] = [{'col': ['reviewText', 'verified'], 'proc': 'stem', 'vectorizer': 'tfidf'}, accuracy, f1]

              precision    recall  f1-score   support

          -1       0.70      0.69      0.69      1284
           0       0.59      0.63      0.61      1225
           1       0.79      0.75      0.77      1241

    accuracy                           0.69      3750
   macro avg       0.69      0.69      0.69      3750
weighted avg       0.69      0.69      0.69      3750



In [101]:
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, lem, bow, df_testing)
model = SVC()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
compare_list.loc[len(compare_list)] = [{'col': ['reviewText', 'verified'], 'proc': 'lem', 'vectorizer': 'bow'}, accuracy, f1]

              precision    recall  f1-score   support

          -1       0.66      0.66      0.66      1284
           0       0.58      0.52      0.55      1225
           1       0.71      0.77      0.74      1241

    accuracy                           0.65      3750
   macro avg       0.65      0.65      0.65      3750
weighted avg       0.65      0.65      0.65      3750



In [102]:
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, lem, tfidf, df_testing)
model = SVC()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
compare_list.loc[len(compare_list)] = [{'col': ['reviewText', 'verified'], 'proc': 'lem', 'vectorizer': 'tfidf'}, accuracy, f1]

              precision    recall  f1-score   support

          -1       0.70      0.70      0.70      1284
           0       0.59      0.61      0.60      1225
           1       0.79      0.75      0.77      1241

    accuracy                           0.69      3750
   macro avg       0.69      0.69      0.69      3750
weighted avg       0.69      0.69      0.69      3750



In [103]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, None, bow, df_testing)
model = SVC()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
compare_list.loc[len(compare_list)] = [{'col': ['reviewTextWithSummary'], 'proc': None, 'vectorizer': 'bow'}, accuracy, f1]

              precision    recall  f1-score   support

          -1       0.74      0.78      0.76      1284
           0       0.69      0.69      0.69      1225
           1       0.86      0.82      0.84      1241

    accuracy                           0.76      3750
   macro avg       0.76      0.76      0.76      3750
weighted avg       0.76      0.76      0.76      3750



In [104]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, None, tfidf, df_testing)
model = SVC()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
compare_list.loc[len(compare_list)] = [{'col': ['reviewTextWithSummary'], 'proc': None, 'vectorizer': 'tfidf'}, accuracy, f1]

              precision    recall  f1-score   support

          -1       0.79      0.79      0.79      1284
           0       0.70      0.74      0.72      1225
           1       0.88      0.84      0.86      1241

    accuracy                           0.79      3750
   macro avg       0.79      0.79      0.79      3750
weighted avg       0.79      0.79      0.79      3750



In [105]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, baseline, bow, df_testing)
model = SVC()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
compare_list.loc[len(compare_list)] = [{'col': ['reviewTextWithSummary'], 'proc': 'baseline', 'vectorizer': 'bow'}, accuracy, f1]

              precision    recall  f1-score   support

          -1       0.72      0.78      0.75      1284
           0       0.69      0.65      0.67      1225
           1       0.84      0.80      0.82      1241

    accuracy                           0.75      3750
   macro avg       0.75      0.74      0.75      3750
weighted avg       0.75      0.75      0.75      3750



In [106]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, baseline, tfidf, df_testing)
model = SVC()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
compare_list.loc[len(compare_list)] = [{'col': ['reviewTextWithSummary'], 'proc': 'baseline', 'vectorizer': 'tfidf'}, accuracy, f1]

              precision    recall  f1-score   support

          -1       0.76      0.77      0.77      1284
           0       0.69      0.72      0.71      1225
           1       0.88      0.82      0.85      1241

    accuracy                           0.77      3750
   macro avg       0.78      0.77      0.77      3750
weighted avg       0.78      0.77      0.77      3750



In [107]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, stem, bow, df_testing)
model = SVC()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
compare_list.loc[len(compare_list)] = [{'col': ['reviewTextWithSummary'], 'proc': 'stem', 'vectorizer': 'bow'}, accuracy, f1]

              precision    recall  f1-score   support

          -1       0.72      0.76      0.74      1284
           0       0.69      0.68      0.68      1225
           1       0.84      0.81      0.82      1241

    accuracy                           0.75      3750
   macro avg       0.75      0.75      0.75      3750
weighted avg       0.75      0.75      0.75      3750



In [108]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, stem, tfidf, df_testing)
model = SVC()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
compare_list.loc[len(compare_list)] = [{'col': ['reviewTextWithSummary'], 'proc': 'stem', 'vectorizer': 'tfidf'}, accuracy, f1]

              precision    recall  f1-score   support

          -1       0.76      0.75      0.76      1284
           0       0.68      0.72      0.70      1225
           1       0.86      0.82      0.84      1241

    accuracy                           0.76      3750
   macro avg       0.77      0.76      0.77      3750
weighted avg       0.77      0.76      0.77      3750



In [109]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, lem, bow, df_testing)
model = SVC()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
compare_list.loc[len(compare_list)] = [{'col': ['reviewTextWithSummary'], 'proc': 'lem', 'vectorizer': 'bow'}, accuracy, f1]

              precision    recall  f1-score   support

          -1       0.71      0.77      0.74      1284
           0       0.68      0.66      0.67      1225
           1       0.85      0.80      0.82      1241

    accuracy                           0.75      3750
   macro avg       0.75      0.74      0.75      3750
weighted avg       0.75      0.75      0.75      3750



In [110]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, lem, tfidf, df_testing)
model = SVC()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
compare_list.loc[len(compare_list)] = [{'col': ['reviewTextWithSummary'], 'proc': 'lem', 'vectorizer': 'tfidf'}, accuracy, f1]

              precision    recall  f1-score   support

          -1       0.76      0.77      0.76      1284
           0       0.69      0.71      0.70      1225
           1       0.87      0.82      0.84      1241

    accuracy                           0.77      3750
   macro avg       0.77      0.77      0.77      3750
weighted avg       0.77      0.77      0.77      3750



In [111]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary', 'verified'], 0.25, None, bow, df_testing)
model = SVC()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
compare_list.loc[len(compare_list)] = [{'col': ['reviewTextWithSummary', 'verified'], 'proc': None, 'vectorizer': 'bow'}, accuracy, f1]

              precision    recall  f1-score   support

          -1       0.75      0.78      0.76      1284
           0       0.69      0.68      0.69      1225
           1       0.86      0.83      0.84      1241

    accuracy                           0.76      3750
   macro avg       0.76      0.76      0.76      3750
weighted avg       0.76      0.76      0.76      3750



In [112]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary', 'verified'], 0.25, None, tfidf, df_testing)
model = SVC()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
compare_list.loc[len(compare_list)] = [{'col': ['reviewTextWithSummary', 'verified'], 'proc': None, 'vectorizer': 'tfidf'}, accuracy, f1]

              precision    recall  f1-score   support

          -1       0.77      0.78      0.77      1284
           0       0.69      0.72      0.70      1225
           1       0.89      0.83      0.86      1241

    accuracy                           0.78      3750
   macro avg       0.78      0.78      0.78      3750
weighted avg       0.78      0.78      0.78      3750



In [113]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary', 'verified'], 0.25, baseline, bow, df_testing)
model = SVC()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
compare_list.loc[len(compare_list)] = [{'col': ['reviewTextWithSummary', 'verified'], 'proc': 'baseline', 'vectorizer': 'bow'}, accuracy, f1]

              precision    recall  f1-score   support

          -1       0.72      0.78      0.75      1284
           0       0.69      0.65      0.67      1225
           1       0.84      0.80      0.82      1241

    accuracy                           0.75      3750
   macro avg       0.75      0.75      0.75      3750
weighted avg       0.75      0.75      0.75      3750



In [114]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary', 'verified'], 0.25, baseline, tfidf, df_testing)
model = SVC()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
compare_list.loc[len(compare_list)] = [{'col': ['reviewTextWithSummary', 'verified'], 'proc': 'baseline', 'vectorizer': 'tfidf'}, accuracy, f1]

              precision    recall  f1-score   support

          -1       0.76      0.78      0.77      1284
           0       0.69      0.72      0.70      1225
           1       0.88      0.82      0.85      1241

    accuracy                           0.77      3750
   macro avg       0.77      0.77      0.77      3750
weighted avg       0.77      0.77      0.77      3750



In [115]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary', 'verified'], 0.25, stem, bow, df_testing)
model = SVC()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
compare_list.loc[len(compare_list)] = [{'col': ['reviewTextWithSummary', 'verified'], 'proc': 'stem', 'vectorizer': 'bow'}, accuracy, f1]

              precision    recall  f1-score   support

          -1       0.72      0.76      0.74      1284
           0       0.68      0.67      0.68      1225
           1       0.84      0.81      0.82      1241

    accuracy                           0.75      3750
   macro avg       0.75      0.75      0.75      3750
weighted avg       0.75      0.75      0.75      3750



In [116]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary', 'verified'], 0.25, stem, tfidf, df_testing)
model = SVC()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
compare_list.loc[len(compare_list)] = [{'col': ['reviewTextWithSummary', 'verified'], 'proc': 'stem', 'vectorizer': 'tfidf'}, accuracy, f1]

              precision    recall  f1-score   support

          -1       0.75      0.75      0.75      1284
           0       0.67      0.71      0.69      1225
           1       0.86      0.81      0.84      1241

    accuracy                           0.76      3750
   macro avg       0.76      0.76      0.76      3750
weighted avg       0.76      0.76      0.76      3750



In [117]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary', 'verified'], 0.25, lem, bow, df_testing)
model = SVC()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
compare_list.loc[len(compare_list)] = [{'col': ['reviewTextWithSummary', 'verified'], 'proc': 'lem', 'vectorizer': 'bow'}, accuracy, f1]

              precision    recall  f1-score   support

          -1       0.72      0.77      0.74      1284
           0       0.68      0.66      0.67      1225
           1       0.84      0.80      0.82      1241

    accuracy                           0.75      3750
   macro avg       0.75      0.74      0.75      3750
weighted avg       0.75      0.75      0.75      3750



In [118]:
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary', 'verified'], 0.25, lem, tfidf, df_testing)
model = SVC()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
compare_list.loc[len(compare_list)] = [{'col': ['reviewTextWithSummary', 'verified'], 'proc': 'lem', 'vectorizer': 'tfidf'}, accuracy, f1]
#######  END OF GENERATED CODE  #######

              precision    recall  f1-score   support

          -1       0.75      0.77      0.76      1284
           0       0.68      0.71      0.70      1225
           1       0.87      0.82      0.84      1241

    accuracy                           0.76      3750
   macro avg       0.77      0.76      0.77      3750
weighted avg       0.77      0.76      0.77      3750



In [119]:
pd.set_option("display.max_colwidth", None)
compare_list = compare_list.sort_values(by="F1 Score", ascending=False).reset_index(drop=True)
display(compare_list)

Unnamed: 0,Data config and preprocessing,Accuracy,F1 Score
0,"{'col': ['reviewTextWithSummary'], 'proc': None, 'vectorizer': 'tfidf'}",0.7896,0.790689
1,"{'col': ['reviewTextWithSummary', 'verified'], 'proc': None, 'vectorizer': 'tfidf'}",0.778133,0.779604
2,"{'col': ['reviewTextWithSummary'], 'proc': 'baseline', 'vectorizer': 'tfidf'}",0.7728,0.77409
3,"{'col': ['reviewTextWithSummary', 'verified'], 'proc': 'baseline', 'vectorizer': 'tfidf'}",0.7704,0.77172
4,"{'col': ['reviewTextWithSummary'], 'proc': 'lem', 'vectorizer': 'tfidf'}",0.767733,0.76885
5,"{'col': ['reviewTextWithSummary', 'verified'], 'proc': 'lem', 'vectorizer': 'tfidf'}",0.764533,0.765854
6,"{'col': ['reviewTextWithSummary'], 'proc': 'stem', 'vectorizer': 'tfidf'}",0.764267,0.765399
7,"{'col': ['reviewTextWithSummary'], 'proc': None, 'vectorizer': 'bow'}",0.762933,0.763543
8,"{'col': ['reviewTextWithSummary', 'verified'], 'proc': None, 'vectorizer': 'bow'}",0.762933,0.763422
9,"{'col': ['reviewTextWithSummary', 'verified'], 'proc': 'stem', 'vectorizer': 'tfidf'}",0.758133,0.759533


- Further testing can be done on using the "reviewTextWithSummary" column over the "reviewText" as it is universally better
- Of the top 10 configurations, 8 of them use TF-IDF vectorizer
- Including the "verified" column does not seem to have a significant, if only a slight negative, impact on the model. Will not be used in further tesing
- Further testing will be done on all of the preprocessing methods (None, Baseline, Stemming, Lemmatization)
- Models for further analysis are as follows:
    - reviewTextWithSummary, None, TF-IDF
    - reviewTextWithSummary, Baseline, TF-IDF
    - reviewTextWithSummary, Stem, TF-IDF
    - reviewTextWithSummary, Lem, TF-IDF