In [1]:
import pandas as pd
import nltk
import re
import random
import numpy as np
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from nltk.corpus import stopwords
from scipy.sparse import hstack

# Fetch the NLTK resources used below; the recorded output shows each call reporting
# "already up-to-date" when the package is present locally.
# NOTE(review): no tokenizer call is visible in this part of the notebook — confirm "punkt" is still needed.
nltk.download("punkt")
# Backs stopwords.words("english"), used to build STOP_WORDS.
nltk.download("stopwords")
# Backs WordNetLemmatizer, used by the "lem" preprocessing mode.
nltk.download("wordnet")

[nltk_data] Downloading package punkt to /home/ibrahim/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ibrahim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ibrahim/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Preprocessing mode selectors understood by preprocess_type().
baseline, stem, lem = "baseline", "stem", "lem"

# Vectorizer selectors understood by pipeline().
bow, tfidf = "bow", "tfidf"

# One seed shared by Python's RNG, the per-class sampling and the train/test split.
random_state = 42
random.seed(random_state)

# Rows sampled per sentiment class for the small experimentation set.
testing_n = 50
# NOTE(review): not referenced in the visible part of the notebook — presumably a larger
# per-class sample size; confirm against later cells.
large_n = 100

# Location of the combined reviews CSV.
data_path = "./data/combined.csv"

# EDA and simple preprocessing

In [3]:
# Load the combined reviews file. low_memory=False makes pandas parse the file in one pass
# rather than in chunks, which avoids mixed-dtype inference on columns.
df = pd.read_csv(data_path, low_memory=False)
# Percentage of missing values per column.
print(df.isna().sum() / len(df) * 100)

overall            0.000000
verified           0.000000
reviewTime         0.000000
reviewerID         0.000000
asin               0.000000
style             50.710965
reviewerName       0.016280
reviewText         0.049506
summary            0.025620
unixReviewTime     0.000000
vote              84.972458
image             97.894193
dtype: float64


In [4]:
# Absolute number of missing values per column (same information as above, as counts).
print(df.isna().sum())

overall                0
verified               0
reviewTime             0
reviewerID             0
asin                   0
style             380030
reviewerName         122
reviewText           371
summary              192
unixReviewTime         0
vote              636787
image             733623
dtype: int64


In [5]:
# Share of missing cells over the whole frame (all rows x all columns), as a percentage.
n_rows, n_cols = df.shape
missing_cells = df.isna().sum().sum()
print(missing_cells / (n_rows * n_cols) * 100)

19.47241852636673


In [6]:
# Preview the raw columns before narrowing the frame down to the ones used for modelling.
df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,5.0,True,"09 1, 2016",A3CIUOJXQ5VDQ2,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",Shelly F,As advertised. Reasonably priced,Five Stars,1472688000,,
1,5.0,True,"11 14, 2013",A3H7T87S984REU,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",houserules18,Like the oder and the feel when I put it on my...,Good for the face,1384387200,,
2,1.0,True,"08 18, 2013",A3J034YH7UG4KT,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",Adam,I bought this to smell nice after I shave. Wh...,Smells awful,1376784000,,
3,5.0,False,"05 3, 2011",A2UEO5XR3598GI,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",Rich K,HEY!! I am an Aqua Velva Man and absolutely lo...,Truth is There IS Nothing Like an AQUA VELVA MAN.,1304380800,25.0,
4,5.0,True,"05 6, 2011",A3SFRT223XXWF7,B00006L9LC,{'Size:': ' 200ml/6.7oz'},C. C. Christian,If you ever want to feel pampered by a shampoo...,Bvlgari Shampoo,1304640000,3.0,


In [7]:
# Keep only the star rating, the two text fields and the `verified` flag; every other
# column is dropped from here on.
df = df[["overall", "reviewText", "summary", "verified"]]

In [8]:
# Re-check the missing-value percentages on the reduced column set.
print(df.isna().sum() / len(df) * 100)

overall       0.000000
reviewText    0.049506
summary       0.025620
verified      0.000000
dtype: float64


In [9]:
# Drop every row that has a missing value in any of the remaining columns.
df = df.dropna()

In [10]:
# Sanity check: every column should now report 0% missing.
print(df.isna().sum() / len(df) * 100)

overall       0.0
reviewText    0.0
summary       0.0
verified      0.0
dtype: float64


In [11]:
# Map the star rating to a 3-way sentiment label: > 3 -> 1, < 3 -> -1, exactly 3 -> 0.
# np.sign(rating - 3) produces that mapping in one vectorized pass instead of calling a
# Python lambda once per row; ratings are NaN-free here because of the earlier dropna().
# assign()/drop() build a new frame instead of writing into (and dropping from) a frame that
# was itself derived from a column selection, which avoids SettingWithCopyWarning.
df = df.assign(
    sentiment=np.sign(df["overall"] - 3).astype("int64"),
    # Summary and body joined with a single space, summary first.
    reviewTextWithSummary=df["summary"] + " " + df["reviewText"],
).drop(columns=["overall", "summary"])
df.head()

Unnamed: 0,reviewText,verified,sentiment,reviewTextWithSummary
0,As advertised. Reasonably priced,True,1,Five Stars As advertised. Reasonably priced
1,Like the oder and the feel when I put it on my...,True,1,Good for the face Like the oder and the feel w...
2,I bought this to smell nice after I shave. Wh...,True,-1,Smells awful I bought this to smell nice after...
3,HEY!! I am an Aqua Velva Man and absolutely lo...,False,1,Truth is There IS Nothing Like an AQUA VELVA M...
4,If you ever want to feel pampered by a shampoo...,True,1,Bvlgari Shampoo If you ever want to feel pampe...


In [12]:
# Class balance: rows per sentiment label (the recorded output is heavily skewed toward 1).
df["sentiment"].value_counts()

sentiment
 1    657241
 0     47109
-1     44501
Name: count, dtype: int64

In [13]:
# Confirm the frame layout after the sentiment / combined-text columns were added.
df.head()

Unnamed: 0,reviewText,verified,sentiment,reviewTextWithSummary
0,As advertised. Reasonably priced,True,1,Five Stars As advertised. Reasonably priced
1,Like the oder and the feel when I put it on my...,True,1,Good for the face Like the oder and the feel w...
2,I bought this to smell nice after I shave. Wh...,True,-1,Smells awful I bought this to smell nice after...
3,HEY!! I am an Aqua Velva Man and absolutely lo...,False,1,Truth is There IS Nothing Like an AQUA VELVA M...
4,If you ever want to feel pampered by a shampoo...,True,1,Bvlgari Shampoo If you ever want to feel pampe...


# Model experimentation

In [14]:
def _sample_per_class(group):
    """Draw ``testing_n`` rows from one sentiment group using the shared seed."""
    # NOTE(review): replace=True samples with replacement, so the same review can be drawn
    # more than once and later end up in both the train and the test split — confirm intended.
    return group.sample(n=testing_n, random_state=random_state, replace=True)


# Small class-balanced subset used for the quick experiments below.
df_testing = df.groupby("sentiment").apply(_sample_per_class).reset_index(drop=True)
df_testing["sentiment"].value_counts()

sentiment
-1    50
 0    50
 1    50
Name: count, dtype: int64

In [15]:
# Preview the sampled experimentation subset.
df_testing.head()

Unnamed: 0,reviewText,verified,sentiment,reviewTextWithSummary
0,Intuit is a despicable company now. This is th...,False,-1,Intuit is One Unethical Company. Any alternati...
1,Very disappointed guitar came damaged with den...,True,-1,Very disappointed Very disappointed guitar cam...
2,The rings did not perform as I had hoped. They...,True,-1,Not what I was hoping for The rings did not pe...
3,My ProFX8 purchased from Amazon in 2015 was po...,False,-1,Mine broke. Very light use. Padded case. Maybe...
4,"Not the greatest, really flimsy.",True,-1,"Two Stars Not the greatest, really flimsy."


In [16]:
# Total rows in the subset: testing_n rows for each of the three sentiment classes.
print("Dataset size:", len(df_testing))

Dataset size: 150


In [17]:
# English stopword list, stored as a set so the per-token membership test in
# preprocess_text is a hash lookup rather than a list scan.
STOP_WORDS = set(stopwords.words("english"))

In [18]:
# Compiled once and written as raw strings: the previous non-raw "\d" / "[^\w\s]" literals
# relied on invalid escape sequences, which Python warns about.
_DIGIT_RE = re.compile(r"\d")
_NON_WORD_RE = re.compile(r"[^\w\s]")


def preprocess_text(sentence, stop, type_proc=None):
    """Normalize one piece of text into a space-joined string of cleaned tokens.

    The text is lower-cased and split on whitespace. Each token has its digits
    removed, then every character that is neither a word character nor
    whitespace. Tokens that end up empty or that are in ``stop`` are dropped;
    the rest are passed through ``preprocess_type``.

    Args:
        sentence: Text to clean (a ``str``).
        stop: Container of stopwords; membership is tested on the *cleaned* token,
            so a stopword containing an apostrophe only matches if ``stop`` also
            holds its apostrophe-free form.
        type_proc: Preprocessing mode forwarded to ``preprocess_type``.

    Returns:
        The surviving tokens joined by single spaces ("" if none survive).

    Raises:
        ValueError: Propagated from ``preprocess_type`` for an unknown mode. This
            includes the default ``None`` as soon as at least one token survives.
    """
    words = []
    # split() without arguments already ignores leading/trailing whitespace.
    for token in sentence.lower().split():
        token = _NON_WORD_RE.sub("", _DIGIT_RE.sub("", token))
        if token and token not in stop:
            words.append(preprocess_type(token, type_proc))

    return " ".join(words)

In [19]:
# Lazily created, reused NLTK processors. Building a fresh PorterStemmer /
# WordNetLemmatizer for every single word is pure per-token overhead.
_PROCESSOR_CACHE = {}


def preprocess_type(word, type_proc):
    """Apply the selected word-level normalization to a single token.

    Args:
        word: The token to normalize.
        type_proc: One of the module-level selectors ``baseline`` (return the
            token unchanged), ``stem`` (Porter stemming) or ``lem`` (WordNet
            lemmatization).

    Returns:
        The normalized token.

    Raises:
        ValueError: If ``type_proc`` is none of the three known selectors.
    """
    if type_proc == baseline:
        return word
    if type_proc == stem:
        if "stemmer" not in _PROCESSOR_CACHE:
            _PROCESSOR_CACHE["stemmer"] = PorterStemmer()
        return _PROCESSOR_CACHE["stemmer"].stem(word)
    if type_proc == lem:
        if "lemmatizer" not in _PROCESSOR_CACHE:
            _PROCESSOR_CACHE["lemmatizer"] = WordNetLemmatizer()
        return _PROCESSOR_CACHE["lemmatizer"].lemmatize(word)
    raise ValueError("Invalid Preprocessing Type")

In [20]:
def my_train_test_split(cols, test_size, df=df, random_state=random_state):
    """Split ``df`` into train/test features and ``sentiment`` labels.

    Args:
        cols: Column labels of ``df`` to use as features.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        df: Source frame; must contain ``cols`` and a ``sentiment`` column.
            The default is whatever ``df`` was bound to when this cell ran.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        ``(x_train, x_test, y_train, y_test)``.
    """
    features = df[cols]
    labels = df["sentiment"]
    parts = train_test_split(
        features, labels, test_size=test_size, random_state=random_state
    )
    return tuple(parts)

In [21]:
def apply_preprocessing(proc, x_train, x_test):
    """Run ``preprocess_text`` over the text column of both feature frames.

    The text column is ``reviewText`` unless it is absent and
    ``reviewTextWithSummary`` is present. The inputs are never modified: the
    cleaned text is written into copies, so the caller's frames (typically
    slices produced by ``train_test_split``) stay untouched and no
    ``SettingWithCopyWarning`` is triggered.

    Args:
        proc: Preprocessing mode forwarded to ``preprocess_text``; ``None``
            disables preprocessing entirely.
        x_train: Training feature frame.
        x_test: Test feature frame.

    Returns:
        ``(x_train, x_test)``. These are the very same objects when ``proc`` is
        ``None``, otherwise copies with the text column replaced.

    Raises:
        KeyError: If neither text column exists in the frames.
    """
    if proc is None:
        return x_train, x_test

    cols = x_train.columns
    textcol = "reviewText"
    if "reviewText" not in cols and "reviewTextWithSummary" in cols:
        textcol = "reviewTextWithSummary"

    def clean(text):
        return preprocess_text(text, STOP_WORDS, proc)

    # Copy before assigning so the caller's frames are not mutated as a side effect.
    x_train = x_train.copy()
    x_test = x_test.copy()
    x_train[textcol] = x_train[textcol].apply(clean)
    x_test[textcol] = x_test[textcol].apply(clean)
    return x_train, x_test

In [22]:
def add_col(x, col):
    """Append ``col`` as an extra right-most column of the sparse matrix ``x``.

    Args:
        x: Sparse feature matrix.
        col: Values for the new column, one per row of ``x``.

    Returns:
        The horizontally stacked sparse matrix produced by ``scipy.sparse.hstack``.
    """
    # For a 1-D ``col`` the wrapping list gives shape (1, n); transposing yields (n, 1).
    as_row = np.array([col])
    as_column = np.transpose(as_row)
    blocks = [x, as_column]
    return hstack(blocks)

In [23]:
def pipeline(cols, test_size, proc, vectorizer, df=df, random_state=random_state):
    """Turn a reviews frame into vectorized train/test matrices and labels.

    Steps: split ``df`` with ``my_train_test_split``, clean the text column
    with ``apply_preprocessing``, fit the chosen vectorizer on the training
    text only and transform both splits, then optionally append the
    ``verified`` column as one extra feature.

    Args:
        cols: Feature columns; must include ``reviewText`` or
            ``reviewTextWithSummary`` and may include ``verified``.
        test_size: Forwarded to the train/test split.
        proc: Preprocessing mode (``None`` skips preprocessing).
        vectorizer: ``bow`` for ``CountVectorizer`` or ``tfidf`` for
            ``TfidfVectorizer``.
        df: Source frame; defaults to ``df`` as bound when this cell ran.
        random_state: Seed for the split.

    Returns:
        ``(x_train, x_test, y_train, y_test)`` with sparse feature matrices.

    Raises:
        ValueError: If no text column is requested, or the vectorizer selector
            is unknown (the latter is only detected after the split and
            preprocessing have run).
    """
    wants_text = "reviewText" in cols
    wants_text_with_summary = "reviewTextWithSummary" in cols
    if not (wants_text or wants_text_with_summary):
        raise ValueError("Must contain reviewText or reviewTextWithSummary")

    # reviewText takes precedence when both text columns are requested.
    textcol = "reviewText" if wants_text else "reviewTextWithSummary"

    train_frame, test_frame, y_train, y_test = my_train_test_split(
        cols, test_size, df, random_state
    )
    train_frame, test_frame = apply_preprocessing(proc, train_frame, test_frame)

    if vectorizer == bow:
        text_model = CountVectorizer()
    elif vectorizer == tfidf:
        text_model = TfidfVectorizer()
    else:
        raise ValueError("Invalid Vectorizer")

    # Vocabulary is learned from the training text only; the test text is just transformed.
    train_matrix = text_model.fit_transform(train_frame[textcol])
    test_matrix = text_model.transform(test_frame[textcol])

    if "verified" not in cols:
        return train_matrix, test_matrix, y_train, y_test

    return (
        add_col(train_matrix, train_frame["verified"]),
        add_col(test_matrix, test_frame["verified"]),
        y_train,
        y_test,
    )

## Testing different configs

In [24]:
# Hyper-parameter grid searched by GridSearchCV for the SVC classifier.
# NOTE(review): gamma is a kernel coefficient that the "linear" kernel does not use, so the
# linear half of the grid repeats the same fit for every gamma value.
param_grid = dict(
    C=[0.1, 1, 10, 100, 1000],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001],
    kernel=["rbf", "linear"],
)

# Shared GridSearchCV settings: n_jobs=-1 uses all processors, verbose=0 is silent,
# cv=3 requests 3-fold cross-validation.
n_jobs, verbose, cv = -1, 0, 3

In [25]:
# Results table: each experiment cell below appends one row holding the best grid-search
# parameters, the configuration dict, and the test-set score.
compare_list = pd.DataFrame(columns=["Params", "Config", "Accuracy Score"])

In [26]:
# Flip to True to print one ready-to-paste experiment cell per configuration.
code_gen = False

# Feature-column sets to compare.
col_comb = [
    ["reviewText"],
    ["reviewText", "verified"],
    ["reviewTextWithSummary"],
    ["reviewTextWithSummary", "verified"],
]
# Preprocessing modes (None skips custom preprocessing) and vectorizers to compare.
proc_comb = [None, baseline, stem, lem]
vectorizer_comb = [bow, tfidf]

if code_gen:
    configs = [
        (col, proc, vectorizer)
        for col in col_comb
        for proc in proc_comb
        for vectorizer in vectorizer_comb
    ]
    for col, proc, vectorizer in configs:
        params = dict(col=col, test_size=0.25, proc=proc, vectorizer=vectorizer)
        # {proc} and {vectorizer} are formatted without quotes on purpose: in the emitted
        # code they read as the module-level selector variables (baseline, stem, bow, ...).
        # {params} is formatted as a dict literal and therefore keeps the quoted strings.
        print(
            f"""
# %%
x_train, x_test, y_train, y_test = pipeline({col}, 0.25, {proc}, {vectorizer}, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {params}, accuracy]
"""
        )

In [27]:
# The code below was generated by the cell above. To change it, edit that cell, re-run it,
# and paste its output between the markers.

In [28]:
####### START OF GENERATED CODE #######
# Experiment: features = reviewText | preprocessing = none (raw text) | vectorizer = bag-of-words counts
# Every generated cell follows the same recipe: build the split and features with pipeline(),
# grid-search an SVC over param_grid, print per-class metrics on the held-out split, and
# append (best params, config, test score) as a new row of compare_list.
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, None, bow, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText'], 'test_size': 0.25, 'proc': None, 'vectorizer': 'bow'}, accuracy]

              precision    recall  f1-score   support

          -1       0.71      0.33      0.45        15
           0       0.50      0.45      0.48        11
           1       0.52      0.92      0.67        12

    accuracy                           0.55        38
   macro avg       0.58      0.57      0.53        38
weighted avg       0.59      0.55      0.53        38



In [29]:
# Experiment: features = reviewText | preprocessing = none (raw text) | vectorizer = TF-IDF
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, None, tfidf, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText'], 'test_size': 0.25, 'proc': None, 'vectorizer': 'tfidf'}, accuracy]

              precision    recall  f1-score   support

          -1       0.62      0.33      0.43        15
           0       0.47      0.73      0.57        11
           1       0.62      0.67      0.64        12

    accuracy                           0.55        38
   macro avg       0.57      0.58      0.55        38
weighted avg       0.58      0.55      0.54        38



In [30]:
# Experiment: features = reviewText | preprocessing = baseline cleaning | vectorizer = bag-of-words counts
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, baseline, bow, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText'], 'test_size': 0.25, 'proc': 'baseline', 'vectorizer': 'bow'}, accuracy]

              precision    recall  f1-score   support

          -1       0.36      0.27      0.31        15
           0       0.27      0.55      0.36        11
           1       0.60      0.25      0.35        12

    accuracy                           0.34        38
   macro avg       0.41      0.35      0.34        38
weighted avg       0.41      0.34      0.34        38



In [31]:
# Experiment: features = reviewText | preprocessing = baseline cleaning | vectorizer = TF-IDF
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, baseline, tfidf, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText'], 'test_size': 0.25, 'proc': 'baseline', 'vectorizer': 'tfidf'}, accuracy]

              precision    recall  f1-score   support

          -1       0.40      0.13      0.20        15
           0       0.35      0.64      0.45        11
           1       0.62      0.67      0.64        12

    accuracy                           0.45        38
   macro avg       0.46      0.48      0.43        38
weighted avg       0.45      0.45      0.41        38



In [32]:
# Experiment: features = reviewText | preprocessing = baseline cleaning + Porter stemming | vectorizer = bag-of-words counts
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, stem, bow, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText'], 'test_size': 0.25, 'proc': 'stem', 'vectorizer': 'bow'}, accuracy]

              precision    recall  f1-score   support

          -1       0.56      0.33      0.42        15
           0       0.28      0.45      0.34        11
           1       0.64      0.58      0.61        12

    accuracy                           0.45        38
   macro avg       0.49      0.46      0.46        38
weighted avg       0.50      0.45      0.46        38



In [33]:
# Experiment: features = reviewText | preprocessing = baseline cleaning + Porter stemming | vectorizer = TF-IDF
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, stem, tfidf, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText'], 'test_size': 0.25, 'proc': 'stem', 'vectorizer': 'tfidf'}, accuracy]

              precision    recall  f1-score   support

          -1       0.44      0.27      0.33        15
           0       0.32      0.55      0.40        11
           1       0.60      0.50      0.55        12

    accuracy                           0.42        38
   macro avg       0.45      0.44      0.43        38
weighted avg       0.46      0.42      0.42        38



In [34]:
# Experiment: features = reviewText | preprocessing = baseline cleaning + WordNet lemmatization | vectorizer = bag-of-words counts
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, lem, bow, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText'], 'test_size': 0.25, 'proc': 'lem', 'vectorizer': 'bow'}, accuracy]

              precision    recall  f1-score   support

          -1       0.30      0.20      0.24        15
           0       0.29      0.64      0.40        11
           1       0.75      0.25      0.38        12

    accuracy                           0.34        38
   macro avg       0.45      0.36      0.34        38
weighted avg       0.44      0.34      0.33        38



In [35]:
# Experiment: features = reviewText | preprocessing = baseline cleaning + WordNet lemmatization | vectorizer = TF-IDF
x_train, x_test, y_train, y_test = pipeline(['reviewText'], 0.25, lem, tfidf, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText'], 'test_size': 0.25, 'proc': 'lem', 'vectorizer': 'tfidf'}, accuracy]

              precision    recall  f1-score   support

          -1       0.40      0.13      0.20        15
           0       0.35      0.73      0.47        11
           1       0.70      0.58      0.64        12

    accuracy                           0.45        38
   macro avg       0.48      0.48      0.44        38
weighted avg       0.48      0.45      0.42        38



In [36]:
# Experiment: features = reviewText + verified | preprocessing = none (raw text) | vectorizer = bag-of-words counts
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, None, bow, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText', 'verified'], 'test_size': 0.25, 'proc': None, 'vectorizer': 'bow'}, accuracy]

              precision    recall  f1-score   support

          -1       0.71      0.33      0.45        15
           0       0.50      0.45      0.48        11
           1       0.52      0.92      0.67        12

    accuracy                           0.55        38
   macro avg       0.58      0.57      0.53        38
weighted avg       0.59      0.55      0.53        38



In [37]:
# Experiment: features = reviewText + verified | preprocessing = none (raw text) | vectorizer = TF-IDF
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, None, tfidf, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText', 'verified'], 'test_size': 0.25, 'proc': None, 'vectorizer': 'tfidf'}, accuracy]

              precision    recall  f1-score   support

          -1       0.50      0.27      0.35        15
           0       0.44      0.64      0.52        11
           1       0.57      0.67      0.62        12

    accuracy                           0.50        38
   macro avg       0.50      0.52      0.49        38
weighted avg       0.50      0.50      0.48        38



In [38]:
# Experiment: features = reviewText + verified | preprocessing = baseline cleaning | vectorizer = bag-of-words counts
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, baseline, bow, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText', 'verified'], 'test_size': 0.25, 'proc': 'baseline', 'vectorizer': 'bow'}, accuracy]

              precision    recall  f1-score   support

          -1       0.36      0.27      0.31        15
           0       0.27      0.55      0.36        11
           1       0.60      0.25      0.35        12

    accuracy                           0.34        38
   macro avg       0.41      0.35      0.34        38
weighted avg       0.41      0.34      0.34        38



In [39]:
# Experiment: features = reviewText + verified | preprocessing = baseline cleaning | vectorizer = TF-IDF
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, baseline, tfidf, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText', 'verified'], 'test_size': 0.25, 'proc': 'baseline', 'vectorizer': 'tfidf'}, accuracy]

              precision    recall  f1-score   support

          -1       0.50      0.27      0.35        15
           0       0.31      0.45      0.37        11
           1       0.57      0.67      0.62        12

    accuracy                           0.45        38
   macro avg       0.46      0.46      0.44        38
weighted avg       0.47      0.45      0.44        38



In [40]:
# Experiment: features = reviewText + verified | preprocessing = baseline cleaning + Porter stemming | vectorizer = bag-of-words counts
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, stem, bow, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText', 'verified'], 'test_size': 0.25, 'proc': 'stem', 'vectorizer': 'bow'}, accuracy]

              precision    recall  f1-score   support

          -1       0.56      0.33      0.42        15
           0       0.28      0.45      0.34        11
           1       0.64      0.58      0.61        12

    accuracy                           0.45        38
   macro avg       0.49      0.46      0.46        38
weighted avg       0.50      0.45      0.46        38



In [41]:
# Experiment: features = reviewText + verified | preprocessing = baseline cleaning + Porter stemming | vectorizer = TF-IDF
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, stem, tfidf, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText', 'verified'], 'test_size': 0.25, 'proc': 'stem', 'vectorizer': 'tfidf'}, accuracy]

              precision    recall  f1-score   support

          -1       0.50      0.27      0.35        15
           0       0.32      0.55      0.40        11
           1       0.55      0.50      0.52        12

    accuracy                           0.42        38
   macro avg       0.45      0.44      0.42        38
weighted avg       0.46      0.42      0.42        38



In [42]:
# Experiment: features = reviewText + verified | preprocessing = baseline cleaning + WordNet lemmatization | vectorizer = bag-of-words counts
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, lem, bow, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText', 'verified'], 'test_size': 0.25, 'proc': 'lem', 'vectorizer': 'bow'}, accuracy]

              precision    recall  f1-score   support

          -1       0.30      0.20      0.24        15
           0       0.29      0.64      0.40        11
           1       0.75      0.25      0.38        12

    accuracy                           0.34        38
   macro avg       0.45      0.36      0.34        38
weighted avg       0.44      0.34      0.33        38



In [43]:
# Experiment: features = reviewText + verified | preprocessing = baseline cleaning + WordNet lemmatization | vectorizer = TF-IDF
x_train, x_test, y_train, y_test = pipeline(['reviewText', 'verified'], 0.25, lem, tfidf, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewText', 'verified'], 'test_size': 0.25, 'proc': 'lem', 'vectorizer': 'tfidf'}, accuracy]

              precision    recall  f1-score   support

          -1       0.44      0.27      0.33        15
           0       0.33      0.55      0.41        11
           1       0.73      0.67      0.70        12

    accuracy                           0.47        38
   macro avg       0.50      0.49      0.48        38
weighted avg       0.50      0.47      0.47        38



In [44]:
# Experiment: features = reviewTextWithSummary | preprocessing = none (raw text) | vectorizer = bag-of-words counts
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, None, bow, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary'], 'test_size': 0.25, 'proc': None, 'vectorizer': 'bow'}, accuracy]

              precision    recall  f1-score   support

          -1       0.80      0.80      0.80        15
           0       0.80      0.36      0.50        11
           1       0.67      1.00      0.80        12

    accuracy                           0.74        38
   macro avg       0.76      0.72      0.70        38
weighted avg       0.76      0.74      0.71        38



In [45]:
# Experiment: features = reviewTextWithSummary | preprocessing = none (raw text) | vectorizer = TF-IDF
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, None, tfidf, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary'], 'test_size': 0.25, 'proc': None, 'vectorizer': 'tfidf'}, accuracy]

              precision    recall  f1-score   support

          -1       0.70      0.47      0.56        15
           0       0.42      0.45      0.43        11
           1       0.69      0.92      0.79        12

    accuracy                           0.61        38
   macro avg       0.60      0.61      0.59        38
weighted avg       0.61      0.61      0.60        38



In [46]:
# Experiment: features = reviewTextWithSummary | preprocessing = baseline cleaning | vectorizer = bag-of-words counts
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, baseline, bow, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary'], 'test_size': 0.25, 'proc': 'baseline', 'vectorizer': 'bow'}, accuracy]

              precision    recall  f1-score   support

          -1       0.58      0.73      0.65        15
           0       0.50      0.27      0.35        11
           1       0.77      0.83      0.80        12

    accuracy                           0.63        38
   macro avg       0.62      0.61      0.60        38
weighted avg       0.62      0.63      0.61        38



In [47]:
# Experiment: features = reviewTextWithSummary | preprocessing = baseline cleaning | vectorizer = TF-IDF
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, baseline, tfidf, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary'], 'test_size': 0.25, 'proc': 'baseline', 'vectorizer': 'tfidf'}, accuracy]

              precision    recall  f1-score   support

          -1       0.62      0.33      0.43        15
           0       0.38      0.45      0.42        11
           1       0.71      1.00      0.83        12

    accuracy                           0.58        38
   macro avg       0.57      0.60      0.56        38
weighted avg       0.58      0.58      0.55        38



In [48]:
# Experiment: features = reviewTextWithSummary | preprocessing = baseline cleaning + Porter stemming | vectorizer = bag-of-words counts
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, stem, bow, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary'], 'test_size': 0.25, 'proc': 'stem', 'vectorizer': 'bow'}, accuracy]

              precision    recall  f1-score   support

          -1       0.50      0.67      0.57        15
           0       0.38      0.27      0.32        11
           1       0.90      0.75      0.82        12

    accuracy                           0.58        38
   macro avg       0.59      0.56      0.57        38
weighted avg       0.59      0.58      0.58        38



In [49]:
# Experiment: features = reviewTextWithSummary | preprocessing = baseline cleaning + Porter stemming | vectorizer = TF-IDF
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, stem, tfidf, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary'], 'test_size': 0.25, 'proc': 'stem', 'vectorizer': 'tfidf'}, accuracy]

              precision    recall  f1-score   support

          -1       0.56      0.33      0.42        15
           0       0.33      0.45      0.38        11
           1       0.71      0.83      0.77        12

    accuracy                           0.53        38
   macro avg       0.53      0.54      0.52        38
weighted avg       0.54      0.53      0.52        38



In [50]:
# Experiment: features = reviewTextWithSummary | preprocessing = baseline cleaning + WordNet lemmatization | vectorizer = bag-of-words counts
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, lem, bow, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary'], 'test_size': 0.25, 'proc': 'lem', 'vectorizer': 'bow'}, accuracy]

              precision    recall  f1-score   support

          -1       0.57      0.80      0.67        15
           0       0.60      0.27      0.37        11
           1       0.83      0.83      0.83        12

    accuracy                           0.66        38
   macro avg       0.67      0.64      0.62        38
weighted avg       0.66      0.66      0.63        38



In [51]:
# Experiment: features = reviewTextWithSummary | preprocessing = baseline cleaning + WordNet lemmatization | vectorizer = TF-IDF
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary'], 0.25, lem, tfidf, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary'], 'test_size': 0.25, 'proc': 'lem', 'vectorizer': 'tfidf'}, accuracy]

              precision    recall  f1-score   support

          -1       0.50      0.33      0.40        15
           0       0.36      0.45      0.40        11
           1       0.71      0.83      0.77        12

    accuracy                           0.53        38
   macro avg       0.52      0.54      0.52        38
weighted avg       0.53      0.53      0.52        38



In [52]:
# Experiment: features = reviewTextWithSummary + verified | preprocessing = none (raw text) | vectorizer = bag-of-words counts
x_train, x_test, y_train, y_test = pipeline(['reviewTextWithSummary', 'verified'], 0.25, None, bow, df_testing)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=verbose, n_jobs=n_jobs, cv=cv)
grid.fit(x_train, y_train)
y_pred = grid.predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
compare_list.loc[len(compare_list)] = [grid.best_params_, {'col': ['reviewTextWithSummary', 'verified'], 'test_size': 0.25, 'proc': None, 'vectorizer': 'bow'}, accuracy]

              precision    recall  f1-score   support

          -1       0.80      0.80      0.80        15
           0       0.80      0.36      0.50        11
           1       0.67      1.00      0.80        12

    accuracy                           0.74        38
   macro avg       0.76      0.72      0.70        38
weighted avg       0.76      0.74      0.71        38



In [53]:
# Grid-search an SVC on the small testing sample for the configuration recorded
# below (text+summary and `verified` columns, proc=None, vectorizer='tfidf',
# 25% test split), then log the best parameters and the held-out score.
x_train, x_test, y_train, y_test = pipeline(
    ['reviewTextWithSummary', 'verified'], 0.25, None, tfidf, df_testing
)
grid = GridSearchCV(
    SVC(), param_grid, cv=cv, n_jobs=n_jobs, refit=True, verbose=verbose
)
# fit() returns the search object itself, so predictions can be chained onto it.
y_pred = grid.fit(x_train, y_train).predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
# Append one row: [best SVC params, data configuration, test score].
compare_list.loc[len(compare_list)] = [
    grid.best_params_,
    {'col': ['reviewTextWithSummary', 'verified'], 'test_size': 0.25, 'proc': None, 'vectorizer': 'tfidf'},
    accuracy,
]

              precision    recall  f1-score   support

          -1       0.64      0.47      0.54        15
           0       0.42      0.45      0.43        11
           1       0.73      0.92      0.81        12

    accuracy                           0.61        38
   macro avg       0.60      0.61      0.60        38
weighted avg       0.60      0.61      0.60        38



In [54]:
# Grid-search an SVC on the small testing sample for the configuration recorded
# below (text+summary and `verified` columns, proc='baseline', vectorizer='bow',
# 25% test split), then log the best parameters and the held-out score.
x_train, x_test, y_train, y_test = pipeline(
    ['reviewTextWithSummary', 'verified'], 0.25, baseline, bow, df_testing
)
grid = GridSearchCV(
    SVC(), param_grid, cv=cv, n_jobs=n_jobs, refit=True, verbose=verbose
)
# fit() returns the search object itself, so predictions can be chained onto it.
y_pred = grid.fit(x_train, y_train).predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
# Append one row: [best SVC params, data configuration, test score].
compare_list.loc[len(compare_list)] = [
    grid.best_params_,
    {'col': ['reviewTextWithSummary', 'verified'], 'test_size': 0.25, 'proc': 'baseline', 'vectorizer': 'bow'},
    accuracy,
]

              precision    recall  f1-score   support

          -1       0.58      0.73      0.65        15
           0       0.50      0.27      0.35        11
           1       0.77      0.83      0.80        12

    accuracy                           0.63        38
   macro avg       0.62      0.61      0.60        38
weighted avg       0.62      0.63      0.61        38



In [55]:
# Grid-search an SVC on the small testing sample for the configuration recorded
# below (text+summary and `verified` columns, proc='baseline', vectorizer='tfidf',
# 25% test split), then log the best parameters and the held-out score.
x_train, x_test, y_train, y_test = pipeline(
    ['reviewTextWithSummary', 'verified'], 0.25, baseline, tfidf, df_testing
)
grid = GridSearchCV(
    SVC(), param_grid, cv=cv, n_jobs=n_jobs, refit=True, verbose=verbose
)
# fit() returns the search object itself, so predictions can be chained onto it.
y_pred = grid.fit(x_train, y_train).predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
# Append one row: [best SVC params, data configuration, test score].
compare_list.loc[len(compare_list)] = [
    grid.best_params_,
    {'col': ['reviewTextWithSummary', 'verified'], 'test_size': 0.25, 'proc': 'baseline', 'vectorizer': 'tfidf'},
    accuracy,
]

              precision    recall  f1-score   support

          -1       0.62      0.33      0.43        15
           0       0.38      0.45      0.42        11
           1       0.65      0.92      0.76        12

    accuracy                           0.55        38
   macro avg       0.55      0.57      0.54        38
weighted avg       0.56      0.55      0.53        38



In [56]:
# Grid-search an SVC on the small testing sample for the configuration recorded
# below (text+summary and `verified` columns, proc='stem', vectorizer='bow',
# 25% test split), then log the best parameters and the held-out score.
x_train, x_test, y_train, y_test = pipeline(
    ['reviewTextWithSummary', 'verified'], 0.25, stem, bow, df_testing
)
grid = GridSearchCV(
    SVC(), param_grid, cv=cv, n_jobs=n_jobs, refit=True, verbose=verbose
)
# fit() returns the search object itself, so predictions can be chained onto it.
y_pred = grid.fit(x_train, y_train).predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
# Append one row: [best SVC params, data configuration, test score].
compare_list.loc[len(compare_list)] = [
    grid.best_params_,
    {'col': ['reviewTextWithSummary', 'verified'], 'test_size': 0.25, 'proc': 'stem', 'vectorizer': 'bow'},
    accuracy,
]

              precision    recall  f1-score   support

          -1       0.56      0.67      0.61        15
           0       0.38      0.27      0.32        11
           1       0.92      0.92      0.92        12

    accuracy                           0.63        38
   macro avg       0.62      0.62      0.61        38
weighted avg       0.62      0.63      0.62        38



In [57]:
# Grid-search an SVC on the small testing sample for the configuration recorded
# below (text+summary and `verified` columns, proc='stem', vectorizer='tfidf',
# 25% test split), then log the best parameters and the held-out score.
x_train, x_test, y_train, y_test = pipeline(
    ['reviewTextWithSummary', 'verified'], 0.25, stem, tfidf, df_testing
)
grid = GridSearchCV(
    SVC(), param_grid, cv=cv, n_jobs=n_jobs, refit=True, verbose=verbose
)
# fit() returns the search object itself, so predictions can be chained onto it.
y_pred = grid.fit(x_train, y_train).predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
# Append one row: [best SVC params, data configuration, test score].
compare_list.loc[len(compare_list)] = [
    grid.best_params_,
    {'col': ['reviewTextWithSummary', 'verified'], 'test_size': 0.25, 'proc': 'stem', 'vectorizer': 'tfidf'},
    accuracy,
]

              precision    recall  f1-score   support

          -1       0.45      0.33      0.38        15
           0       0.29      0.36      0.32        11
           1       0.77      0.83      0.80        12

    accuracy                           0.50        38
   macro avg       0.50      0.51      0.50        38
weighted avg       0.51      0.50      0.50        38



In [58]:
# Grid-search an SVC on the small testing sample for the configuration recorded
# below (text+summary and `verified` columns, proc='lem', vectorizer='bow',
# 25% test split), then log the best parameters and the held-out score.
x_train, x_test, y_train, y_test = pipeline(
    ['reviewTextWithSummary', 'verified'], 0.25, lem, bow, df_testing
)
grid = GridSearchCV(
    SVC(), param_grid, cv=cv, n_jobs=n_jobs, refit=True, verbose=verbose
)
# fit() returns the search object itself, so predictions can be chained onto it.
y_pred = grid.fit(x_train, y_train).predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
# Append one row: [best SVC params, data configuration, test score].
compare_list.loc[len(compare_list)] = [
    grid.best_params_,
    {'col': ['reviewTextWithSummary', 'verified'], 'test_size': 0.25, 'proc': 'lem', 'vectorizer': 'bow'},
    accuracy,
]

              precision    recall  f1-score   support

          -1       0.57      0.80      0.67        15
           0       0.60      0.27      0.37        11
           1       0.83      0.83      0.83        12

    accuracy                           0.66        38
   macro avg       0.67      0.64      0.62        38
weighted avg       0.66      0.66      0.63        38



In [59]:
# Grid-search an SVC on the small testing sample for the configuration recorded
# below (text+summary and `verified` columns, proc='lem', vectorizer='tfidf',
# 25% test split), then log the best parameters and the held-out score.
x_train, x_test, y_train, y_test = pipeline(
    ['reviewTextWithSummary', 'verified'], 0.25, lem, tfidf, df_testing
)
grid = GridSearchCV(
    SVC(), param_grid, cv=cv, n_jobs=n_jobs, refit=True, verbose=verbose
)
# fit() returns the search object itself, so predictions can be chained onto it.
y_pred = grid.fit(x_train, y_train).predict(x_test)
print(classification_report(y_test, y_pred))
accuracy = grid.score(x_test, y_test)
# Append one row: [best SVC params, data configuration, test score].
compare_list.loc[len(compare_list)] = [
    grid.best_params_,
    {'col': ['reviewTextWithSummary', 'verified'], 'test_size': 0.25, 'proc': 'lem', 'vectorizer': 'tfidf'},
    accuracy,
]
#######  END OF GENERATED CODE  #######

              precision    recall  f1-score   support

          -1       0.55      0.40      0.46        15
           0       0.38      0.45      0.42        11
           1       0.71      0.83      0.77        12

    accuracy                           0.55        38
   macro avg       0.55      0.56      0.55        38
weighted avg       0.55      0.55      0.55        38



In [60]:
# Rank every evaluated configuration from highest to lowest "Accuracy Score"
# and renumber the rows so that index 0 is the best one.
compare_list = compare_list.sort_values("Accuracy Score", ascending=False).reset_index(drop=True)
display(compare_list)

Unnamed: 0,Params,Config,Accuracy Score
0,"{'C': 1, 'gamma': 1, 'kernel': 'linear'}","{'col': ['reviewTextWithSummary'], 'test_size'...",0.736842
1,"{'C': 1, 'gamma': 1, 'kernel': 'linear'}","{'col': ['reviewTextWithSummary', 'verified'],...",0.736842
2,"{'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}","{'col': ['reviewTextWithSummary', 'verified'],...",0.657895
3,"{'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}","{'col': ['reviewTextWithSummary'], 'test_size'...",0.657895
4,"{'C': 1, 'gamma': 1, 'kernel': 'linear'}","{'col': ['reviewTextWithSummary', 'verified'],...",0.631579
5,"{'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}","{'col': ['reviewTextWithSummary', 'verified'],...",0.631579
6,"{'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}","{'col': ['reviewTextWithSummary'], 'test_size'...",0.631579
7,"{'C': 10, 'gamma': 1, 'kernel': 'linear'}","{'col': ['reviewTextWithSummary', 'verified'],...",0.605263
8,"{'C': 1, 'gamma': 1, 'kernel': 'linear'}","{'col': ['reviewTextWithSummary'], 'test_size'...",0.605263
9,"{'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}","{'col': ['reviewTextWithSummary'], 'test_size'...",0.578947


In [61]:
# Report the row labelled 0, i.e. the best configuration once compare_list has
# been sorted by accuracy and re-indexed.
print(f"Best Configuration on testing dataset (size={len(df_testing)}):")
print("Score :: ", compare_list.loc[0, "Accuracy Score"])
print("SVC   :: ", compare_list.loc[0, "Params"])
print("data  :: ", compare_list.loc[0, "Config"])

Best Configuration on testing dataset (size=150):
Score ::  0.7368421052631579
SVC   ::  {'C': 1, 'gamma': 1, 'kernel': 'linear'}
data  ::  {'col': ['reviewTextWithSummary'], 'test_size': 0.25, 'proc': None, 'vectorizer': 'bow'}


In [62]:
# Persist the ranked comparison table (without the row index) for later reference.
# NOTE(review): this assumes the ./results directory already exists.
compare_list.to_csv(path_or_buf="./results/svm_compare_list.csv", index=False)

- Across all tests, reviewText with summary performed better than reviewText without summary.
- The RBF kernel performed better than the linear kernel in almost all cases.
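- The top configuration was as follows (NOTE: the run printed above selected `bow` with a `linear` kernel instead, so the values below presumably come from a different, larger run; TODO confirm):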
  - Data::
    - Columns used: reviewTextWithSummary
    - Text preprocessing step: None
    - Text vectorizer: tfidf
  - SVC::
    - C=1
    - gamma=1
    - kernel=rbf

Using these parameters, let's build a model on a larger dataset.

## Building larger model

In [63]:
# Build a class-balanced sample: draw `large_n` rows from every sentiment group
# using the shared seed, then drop the group index so rows are numbered 0..N-1.
# NOTE(review): replace=True can draw the same review more than once, and such
# duplicates may end up on both sides of the later train/test split.
df_large = df.groupby("sentiment").apply(
    lambda group: group.sample(n=large_n, replace=True, random_state=random_state)
).reset_index(drop=True)

In [64]:
# Hold out 25% of the balanced sample for evaluation; the fixed seed keeps the
# split repeatable. Features stay a one-column DataFrame, labels are a Series.
x_train, x_test, y_train, y_test = train_test_split(
    df_large[["reviewTextWithSummary"]], df_large["sentiment"],
    random_state=random_state, test_size=0.25,
)

In [65]:
# Fit the TF-IDF vocabulary on the training text only, then reuse it for the
# test text. x_train/x_test are rebound from DataFrames to the vectorized
# matrices, so the split cell must be re-run before this cell is run again.
vectorizer = TfidfVectorizer()
x_train, x_test = (
    vectorizer.fit_transform(x_train["reviewTextWithSummary"]),
    vectorizer.transform(x_test["reviewTextWithSummary"]),
)

In [66]:
# SVC for the unpreprocessed-text model; the hyper-parameters are hard-coded
# here rather than read back from compare_list.
svc_testing_df_large = SVC(kernel="rbf", C=1, gamma=1)

In [67]:
# Train on the TF-IDF features of the training split.
svc_testing_df_large.fit(X=x_train, y=y_train)

In [68]:
# Predicted sentiment labels for the held-out split, scored further below.
y_pred = svc_testing_df_large.predict(X=x_test)

In [69]:
# Three hand-written sentences used as a quick smoke test of the fitted model;
# informally they read as positive, negative and neutral (not dataset labels).
sample = [
    "I loved this product, it was amazing",
    "I hated this product, it was terrible",
    "This product was okay, it was fine",
]

In [70]:
# Vectorize each sentence on its own (one single-row matrix per sample) with the
# vectorizer fitted on the training text.
sample_ = [vectorizer.transform([text]) for text in sample]

In [72]:
# For every sample print the raw sentence, the predicted label array and a
# blank separator line.
for s, p in zip(sample, sample_):
    print(s, svc_testing_df_large.predict(p), "", sep="\n")

I loved this product, it was amazing
[1]

I hated this product, it was terrible
[-1]

This product was okay, it was fine
[-1]



In [73]:
# Per-class precision/recall/F1 on the held-out split.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

          -1       0.69      0.67      0.68        27
           0       0.47      0.65      0.55        23
           1       0.94      0.64      0.76        25

    accuracy                           0.65        75
   macro avg       0.70      0.65      0.66        75
weighted avg       0.71      0.65      0.67        75



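- Results seem very good with an F1 Score of 0.87 (NOTE: the classification report printed above shows a weighted-average F1 of 0.67 on 75 test rows, so the 0.87 figure presumably comes from a run with a larger `large_n`; TODO confirm)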
- Interestingly, the top performing model did not use any text preprocessing
- The next best performing models that used different text preprocessing were as follows:
  - baseline text preprocessing with tfidf vectorizer: score of 0.7728
  - lemmatized text preprocessing with tfidf vectorizer: score of 0.7677
  - stemmed text preprocessing with tfidf vectorizer: score of 0.7642
- These results are within 2% of the selected model, so it may be worth exploring these models further

### Baseline with tfidf

In [74]:
# Re-create the 75/25 split of df_large with the shared seed so this model
# starts again from raw text columns (x_train/x_test are rebound by the
# vectorizing cells).
x_train, x_test, y_train, y_test = train_test_split(
    df_large[["reviewTextWithSummary"]], df_large["sentiment"],
    random_state=random_state, test_size=0.25,
)

In [75]:
# Run preprocess_text over the train and test text, passing STOP_WORDS and the
# `baseline` mode constant as its extra positional arguments.
x_train["reviewTextWithSummary"] = x_train["reviewTextWithSummary"].apply(
    preprocess_text, args=(STOP_WORDS, baseline)
)
x_test["reviewTextWithSummary"] = x_test["reviewTextWithSummary"].apply(
    preprocess_text, args=(STOP_WORDS, baseline)
)

In [76]:
# Fit a fresh TF-IDF vocabulary on the preprocessed training text only, then
# reuse it for the test text. x_train/x_test are rebound to the vectorized
# matrices, so the split cell must be re-run before this cell is run again.
vectorizer = TfidfVectorizer()
x_train, x_test = (
    vectorizer.fit_transform(x_train["reviewTextWithSummary"]),
    vectorizer.transform(x_test["reviewTextWithSummary"]),
)

In [77]:
# SVC for the baseline-preprocessed model, with the same hard-coded
# hyper-parameters (C=1, gamma=1, rbf) as the other large-sample models.
svc_baseline_tfidf = SVC(kernel="rbf", C=1, gamma=1)

In [78]:
# Train on the TF-IDF features of the training split.
svc_baseline_tfidf.fit(X=x_train, y=y_train)

In [79]:
# Predicted sentiment labels for the held-out split, scored further below.
y_pred = svc_baseline_tfidf.predict(X=x_test)

In [80]:
# Three hand-written sentences used as a quick smoke test of the fitted model;
# informally they read as positive, negative and neutral (not dataset labels).
sample = [
    "I loved this product, it was amazing",
    "I hated this product, it was terrible",
    "This product was okay, it was fine",
]

In [81]:
# Give the smoke-test sentences the same `baseline` preprocessing call that was
# applied to the training text.
sample_ = [preprocess_text(text, STOP_WORDS, baseline) for text in sample]

In [82]:
# Vectorize each smoke-test sentence on its own (one single-row matrix each).
# The text is preprocessed from the raw `sample` strings here instead of reading
# `sample_` back: the previous self-referential form (sample_ = f(sample_))
# broke when the cell was re-run, because `sample_` then already held matrices.
sample_ = [
    vectorizer.transform([preprocess_text(text, STOP_WORDS, baseline)])
    for text in sample
]

In [83]:
# For every sample print the raw sentence, the predicted label array and a
# blank separator line.
for s, p in zip(sample, sample_):
    print(s, svc_baseline_tfidf.predict(p), "", sep="\n")

I loved this product, it was amazing
[1]

I hated this product, it was terrible
[-1]

This product was okay, it was fine
[0]



In [84]:
# Per-class precision/recall/F1 on the held-out split.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

          -1       0.54      0.70      0.61        27
           0       0.44      0.48      0.46        23
           1       0.87      0.52      0.65        25

    accuracy                           0.57        75
   macro avg       0.62      0.57      0.57        75
weighted avg       0.62      0.57      0.58        75



### Lemmatized with tfidf

In [85]:
# Re-create the 75/25 split of df_large with the shared seed so this model
# starts again from raw text columns (x_train/x_test are rebound by the
# vectorizing cells).
x_train, x_test, y_train, y_test = train_test_split(
    df_large[["reviewTextWithSummary"]], df_large["sentiment"],
    random_state=random_state, test_size=0.25,
)

In [86]:
# Run preprocess_text over the train and test text, passing STOP_WORDS and the
# `lem` mode constant as its extra positional arguments.
x_train["reviewTextWithSummary"] = x_train["reviewTextWithSummary"].apply(
    preprocess_text, args=(STOP_WORDS, lem)
)
x_test["reviewTextWithSummary"] = x_test["reviewTextWithSummary"].apply(
    preprocess_text, args=(STOP_WORDS, lem)
)

In [87]:
# Fit a fresh TF-IDF vocabulary on the preprocessed training text only, then
# reuse it for the test text. x_train/x_test are rebound to the vectorized
# matrices, so the split cell must be re-run before this cell is run again.
vectorizer = TfidfVectorizer()
x_train, x_test = (
    vectorizer.fit_transform(x_train["reviewTextWithSummary"]),
    vectorizer.transform(x_test["reviewTextWithSummary"]),
)

In [88]:
# SVC for the lemmatized-text model, with the same hard-coded hyper-parameters
# (C=1, gamma=1, rbf) as the other large-sample models.
svc_lematized_tfidf = SVC(kernel="rbf", C=1, gamma=1)

In [89]:
# Train on the TF-IDF features of the training split.
svc_lematized_tfidf.fit(X=x_train, y=y_train)

In [90]:
# Predicted sentiment labels for the held-out split, scored further below.
y_pred = svc_lematized_tfidf.predict(X=x_test)

In [91]:
# Three hand-written sentences used as a quick smoke test of the fitted model;
# informally they read as positive, negative and neutral (not dataset labels).
sample = [
    "I loved this product, it was amazing",
    "I hated this product, it was terrible",
    "This product was okay, it was fine",
]

In [92]:
# Give the smoke-test sentences the same `lem` preprocessing call that was
# applied to the training text.
sample_ = [preprocess_text(text, STOP_WORDS, lem) for text in sample]

In [93]:
# Vectorize each smoke-test sentence on its own (one single-row matrix each).
# The text is preprocessed from the raw `sample` strings here instead of reading
# `sample_` back: the previous self-referential form (sample_ = f(sample_))
# broke when the cell was re-run, because `sample_` then already held matrices.
sample_ = [
    vectorizer.transform([preprocess_text(text, STOP_WORDS, lem)])
    for text in sample
]

In [94]:
# For every sample print the raw sentence, the predicted label array and a
# blank separator line.
for s, p in zip(sample, sample_):
    print(s, svc_lematized_tfidf.predict(p), "", sep="\n")

I loved this product, it was amazing
[1]

I hated this product, it was terrible
[-1]

This product was okay, it was fine
[0]



In [95]:
# Per-class precision/recall/F1 on the held-out split.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

          -1       0.54      0.70      0.61        27
           0       0.44      0.48      0.46        23
           1       0.80      0.48      0.60        25

    accuracy                           0.56        75
   macro avg       0.59      0.55      0.56        75
weighted avg       0.60      0.56      0.56        75



### Stemmed with tfidf

In [96]:
# Re-create the 75/25 split of df_large with the shared seed so this model
# starts again from raw text columns (x_train/x_test are rebound by the
# vectorizing cells).
x_train, x_test, y_train, y_test = train_test_split(
    df_large[["reviewTextWithSummary"]], df_large["sentiment"],
    random_state=random_state, test_size=0.25,
)

In [97]:
# Run preprocess_text over the train and test text, passing STOP_WORDS and the
# `stem` mode constant as its extra positional arguments.
x_train["reviewTextWithSummary"] = x_train["reviewTextWithSummary"].apply(
    preprocess_text, args=(STOP_WORDS, stem)
)
x_test["reviewTextWithSummary"] = x_test["reviewTextWithSummary"].apply(
    preprocess_text, args=(STOP_WORDS, stem)
)

In [98]:
# Fit a fresh TF-IDF vocabulary on the preprocessed training text only, then
# reuse it for the test text. x_train/x_test are rebound to the vectorized
# matrices, so the split cell must be re-run before this cell is run again.
vectorizer = TfidfVectorizer()
x_train, x_test = (
    vectorizer.fit_transform(x_train["reviewTextWithSummary"]),
    vectorizer.transform(x_test["reviewTextWithSummary"]),
)

In [99]:
# SVC for the stemmed-text model, with the same hard-coded hyper-parameters
# (C=1, gamma=1, rbf) as the other large-sample models.
svc_stemmed_tfidf = SVC(kernel="rbf", C=1, gamma=1)

In [100]:
# Train on the TF-IDF features of the training split.
svc_stemmed_tfidf.fit(X=x_train, y=y_train)

In [101]:
# Predicted sentiment labels for the held-out split, scored further below.
y_pred = svc_stemmed_tfidf.predict(X=x_test)

In [102]:
# Three hand-written sentences used as a quick smoke test of the fitted model;
# informally they read as positive, negative and neutral (not dataset labels).
sample = [
    "I loved this product, it was amazing",
    "I hated this product, it was terrible",
    "This product was okay, it was fine",
]

In [103]:
# Give the smoke-test sentences the same `stem` preprocessing call that was
# applied to the training text.
sample_ = [preprocess_text(text, STOP_WORDS, stem) for text in sample]

In [104]:
# Vectorize each smoke-test sentence on its own (one single-row matrix each).
# The text is preprocessed from the raw `sample` strings here instead of reading
# `sample_` back: the previous self-referential form (sample_ = f(sample_))
# broke when the cell was re-run, because `sample_` then already held matrices.
sample_ = [
    vectorizer.transform([preprocess_text(text, STOP_WORDS, stem)])
    for text in sample
]

In [105]:
# For every sample print the raw sentence, the predicted label array and a
# blank separator line.
for s, p in zip(sample, sample_):
    print(s, svc_stemmed_tfidf.predict(p), "", sep="\n")

I loved this product, it was amazing
[1]

I hated this product, it was terrible
[-1]

This product was okay, it was fine
[0]



In [106]:
# Per-class precision/recall/F1 on the held-out split.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

          -1       0.54      0.74      0.62        27
           0       0.48      0.43      0.45        23
           1       0.82      0.56      0.67        25

    accuracy                           0.59        75
   macro avg       0.61      0.58      0.58        75
weighted avg       0.62      0.59      0.59        75

