In [1]:
import pandas as pd
import nltk
import re
import random
import numpy as np
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import GridSearchCV
from nltk.corpus import stopwords
import warnings

warnings.filterwarnings("ignore")

nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package punkt to /home/ibrahim/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ibrahim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ibrahim/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
random_state = 42
testing_n = 5000
random.seed(random_state)
data_path = "./data/combined.csv"

In [3]:
df = pd.read_csv(data_path, low_memory=False)
df = df[["overall", "reviewText", "summary"]]
df.dropna(inplace=True)
df["sentiment"] = df["overall"].apply(lambda x: 1 if x > 3 else -1 if x < 3 else 0)
df["reviewTextWithSummary"] = df["summary"] + " " + df["reviewText"]
df.drop(["overall", "summary", "reviewText"], axis=1, inplace=True)
df.head()

Unnamed: 0,sentiment,reviewTextWithSummary
0,1,Five Stars As advertised. Reasonably priced
1,1,Good for the face Like the oder and the feel w...
2,-1,Smells awful I bought this to smell nice after...
3,1,Truth is There IS Nothing Like an AQUA VELVA M...
4,1,Bvlgari Shampoo If you ever want to feel pampe...


# Model experimentation

In [4]:
df_testing = (
    df.groupby("sentiment")
    .apply(lambda x: x.sample(n=testing_n, random_state=random_state, replace=True))
    .reset_index(drop=True)
)
df_testing["sentiment"].value_counts()

sentiment
-1    5000
 0    5000
 1    5000
Name: count, dtype: int64

In [5]:
df_testing.head()

Unnamed: 0,sentiment,reviewTextWithSummary
0,-1,Intuit is One Unethical Company. Any alternati...
1,-1,Very disappointed Very disappointed guitar cam...
2,-1,Not what I was hoping for The rings did not pe...
3,-1,Mine broke. Very light use. Padded case. Maybe...
4,-1,"Two Stars Not the greatest, really flimsy."


In [6]:
print("Dataset size:", len(df_testing))

Dataset size: 15000


In [7]:
STOP_WORDS = set(stopwords.words("english"))

In [8]:
def preprocess_text(sentence, stop, type_proc=None):
    words = []
    for word in sentence.lower().strip().split():

        word = re.sub("\d", "", word)
        word = re.sub("[^\w\s]", "", word)

        if word not in stop and word != "":
            words.append(preprocess_type(word, type_proc))

    return " ".join(words)

In [9]:
def preprocess_type(word, type_proc):
    if type_proc == "Baseline":
        return word
    elif type_proc == "Stemmed":
        return PorterStemmer().stem(word)
    elif type_proc == "Lemmatized":
        return WordNetLemmatizer().lemmatize(word)
    else:
        raise ValueError("Invalid Preprocessing Type")

In [10]:
def train_val_test_split(df=df, random_state=random_state):
    x = df[["reviewTextWithSummary"]]
    y = df["sentiment"]
    x_train, x_tmp, y_train, y_tmp = train_test_split(
        x, y, test_size=0.3, random_state=random_state
    )
    x_val, x_test, y_val, y_test = train_test_split(
        x_tmp, y_tmp, test_size=0.5, random_state=random_state
    )
    return x_train, x_val, x_test, y_train, y_val, y_test

In [11]:
def pipeline(proc, df, random_state=random_state):
    df_ = df.copy()
    if proc is not None:
        df_["reviewTextWithSummary"] = df_["reviewTextWithSummary"].apply(
            lambda x: preprocess_text(x, STOP_WORDS, proc)
        )

    x_train, x_val, x_test, y_train, y_val, y_test = train_val_test_split(
        df_, random_state
    )
    vectorizer = TfidfVectorizer()
    vectorizer.fit(df_["reviewTextWithSummary"])
    x_train = vectorizer.transform(x_train["reviewTextWithSummary"])
    x_val = vectorizer.transform(x_val["reviewTextWithSummary"])
    x_test = vectorizer.transform(x_test["reviewTextWithSummary"])

    return x_train, x_val, x_test, y_train, y_val, y_test, vectorizer

## Testing different configs

In [12]:
param_grid = {
    "C": [0.01, 0.1, 1, 10, 100, 1000],
    "gamma": [1, 0.1, 0.01, 0.001, "scale", "auto"],
    "kernel": ["rbf", "linear", "poly", "sigmoid"],
}
n_jobs = None
verbose = 3
cv = 3

In [13]:
compare_list = pd.DataFrame(
    columns=[
        "tuning",
        "dataset",
        "proc",
        "C",
        "gamma",
        "kernel",
        "grid_score",
        "f1_score",
        "accuracy",
    ]
)

### No preprocessing

In [14]:
x_train, x_val, x_test, y_train, y_val, y_test, vec_noproc = pipeline(None, df_testing)

In [15]:
svc_noproc_prelim = SVC()
svc_noproc_prelim.fit(x_train, y_train)

In [16]:
y_val_pred = svc_noproc_prelim.predict(x_val)
print(classification_report(y_val, y_val_pred))

              precision    recall  f1-score   support

          -1       0.78      0.78      0.78       751
           0       0.71      0.74      0.72       751
           1       0.88      0.85      0.86       748

    accuracy                           0.79      2250
   macro avg       0.79      0.79      0.79      2250
weighted avg       0.79      0.79      0.79      2250



In [17]:
compare_list.loc[len(compare_list)] = [
    "before",
    "validation",
    None,
    "default",
    "default",
    "default",
    None,
    f1_score(y_val, y_val_pred, average="weighted"),
    accuracy_score(y_val, y_val_pred),
]

In [18]:
svc_noproc_grid = GridSearchCV(SVC(), param_grid, cv=cv, verbose=verbose, n_jobs=n_jobs)
svc_noproc_grid.fit(x_val, y_val)

Fitting 3 folds for each of 144 candidates, totalling 432 fits
[CV 1/3] END .......C=0.01, gamma=1, kernel=rbf;, score=0.333 total time=   0.7s
[CV 2/3] END .......C=0.01, gamma=1, kernel=rbf;, score=0.333 total time=   0.7s
[CV 3/3] END .......C=0.01, gamma=1, kernel=rbf;, score=0.361 total time=   0.7s
[CV 1/3] END ....C=0.01, gamma=1, kernel=linear;, score=0.333 total time=   0.7s
[CV 2/3] END ....C=0.01, gamma=1, kernel=linear;, score=0.333 total time=   0.7s
[CV 3/3] END ....C=0.01, gamma=1, kernel=linear;, score=0.385 total time=   0.7s
[CV 1/3] END ......C=0.01, gamma=1, kernel=poly;, score=0.333 total time=   0.7s
[CV 2/3] END ......C=0.01, gamma=1, kernel=poly;, score=0.333 total time=   0.7s
[CV 3/3] END ......C=0.01, gamma=1, kernel=poly;, score=0.341 total time=   0.7s
[CV 1/3] END ...C=0.01, gamma=1, kernel=sigmoid;, score=0.333 total time=   0.7s
[CV 2/3] END ...C=0.01, gamma=1, kernel=sigmoid;, score=0.333 total time=   0.7s
[CV 3/3] END ...C=0.01, gamma=1, kernel=sigmoi

[CV 3/3] END ..C=0.1, gamma=0.01, kernel=linear;, score=0.563 total time=   0.7s
[CV 1/3] END ....C=0.1, gamma=0.01, kernel=poly;, score=0.333 total time=   0.7s
[CV 2/3] END ....C=0.1, gamma=0.01, kernel=poly;, score=0.333 total time=   0.7s
[CV 3/3] END ....C=0.1, gamma=0.01, kernel=poly;, score=0.341 total time=   0.7s
[CV 1/3] END .C=0.1, gamma=0.01, kernel=sigmoid;, score=0.333 total time=   0.7s
[CV 2/3] END .C=0.1, gamma=0.01, kernel=sigmoid;, score=0.333 total time=   0.8s
[CV 3/3] END .C=0.1, gamma=0.01, kernel=sigmoid;, score=0.385 total time=   0.7s
[CV 1/3] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.333 total time=   0.7s
[CV 2/3] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.333 total time=   0.7s
[CV 3/3] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.385 total time=   0.7s
[CV 1/3] END .C=0.1, gamma=0.001, kernel=linear;, score=0.547 total time=   0.7s
[CV 2/3] END .C=0.1, gamma=0.001, kernel=linear;, score=0.577 total time=   0.7s
[CV 3/3] END .C=0.1, gamma=0

[CV 3/3] END ..C=1, gamma=scale, kernel=sigmoid;, score=0.736 total time=   0.6s
[CV 1/3] END .......C=1, gamma=auto, kernel=rbf;, score=0.333 total time=   0.7s
[CV 2/3] END .......C=1, gamma=auto, kernel=rbf;, score=0.333 total time=   0.8s
[CV 3/3] END .......C=1, gamma=auto, kernel=rbf;, score=0.385 total time=   0.7s
[CV 1/3] END ....C=1, gamma=auto, kernel=linear;, score=0.721 total time=   0.6s
[CV 2/3] END ....C=1, gamma=auto, kernel=linear;, score=0.737 total time=   0.6s
[CV 3/3] END ....C=1, gamma=auto, kernel=linear;, score=0.736 total time=   0.6s
[CV 1/3] END ......C=1, gamma=auto, kernel=poly;, score=0.333 total time=   0.7s
[CV 2/3] END ......C=1, gamma=auto, kernel=poly;, score=0.333 total time=   0.7s
[CV 3/3] END ......C=1, gamma=auto, kernel=poly;, score=0.341 total time=   0.7s
[CV 1/3] END ...C=1, gamma=auto, kernel=sigmoid;, score=0.333 total time=   0.7s
[CV 2/3] END ...C=1, gamma=auto, kernel=sigmoid;, score=0.333 total time=   0.8s
[CV 3/3] END ...C=1, gamma=a

[CV 3/3] END ...C=100, gamma=0.1, kernel=linear;, score=0.703 total time=   0.7s
[CV 1/3] END .....C=100, gamma=0.1, kernel=poly;, score=0.368 total time=   0.7s
[CV 2/3] END .....C=100, gamma=0.1, kernel=poly;, score=0.359 total time=   0.7s
[CV 3/3] END .....C=100, gamma=0.1, kernel=poly;, score=0.361 total time=   0.7s
[CV 1/3] END ..C=100, gamma=0.1, kernel=sigmoid;, score=0.713 total time=   0.7s
[CV 2/3] END ..C=100, gamma=0.1, kernel=sigmoid;, score=0.720 total time=   0.7s
[CV 3/3] END ..C=100, gamma=0.1, kernel=sigmoid;, score=0.701 total time=   0.7s
[CV 1/3] END .....C=100, gamma=0.01, kernel=rbf;, score=0.713 total time=   0.7s
[CV 2/3] END .....C=100, gamma=0.01, kernel=rbf;, score=0.728 total time=   0.7s
[CV 3/3] END .....C=100, gamma=0.01, kernel=rbf;, score=0.717 total time=   0.7s
[CV 1/3] END ..C=100, gamma=0.01, kernel=linear;, score=0.715 total time=   0.7s
[CV 2/3] END ..C=100, gamma=0.01, kernel=linear;, score=0.720 total time=   0.7s
[CV 3/3] END ..C=100, gamma=

[CV 3/3] END C=1000, gamma=0.001, kernel=sigmoid;, score=0.736 total time=   0.7s
[CV 1/3] END ...C=1000, gamma=scale, kernel=rbf;, score=0.723 total time=   0.8s
[CV 2/3] END ...C=1000, gamma=scale, kernel=rbf;, score=0.749 total time=   0.8s
[CV 3/3] END ...C=1000, gamma=scale, kernel=rbf;, score=0.739 total time=   0.8s
[CV 1/3] END C=1000, gamma=scale, kernel=linear;, score=0.715 total time=   0.7s
[CV 2/3] END C=1000, gamma=scale, kernel=linear;, score=0.720 total time=   0.7s
[CV 3/3] END C=1000, gamma=scale, kernel=linear;, score=0.703 total time=   0.7s
[CV 1/3] END ..C=1000, gamma=scale, kernel=poly;, score=0.605 total time=   0.7s
[CV 2/3] END ..C=1000, gamma=scale, kernel=poly;, score=0.668 total time=   0.7s
[CV 3/3] END ..C=1000, gamma=scale, kernel=poly;, score=0.597 total time=   0.7s
[CV 1/3] END C=1000, gamma=scale, kernel=sigmoid;, score=0.643 total time=   0.4s
[CV 2/3] END C=1000, gamma=scale, kernel=sigmoid;, score=0.659 total time=   0.5s
[CV 3/3] END C=1000, gamm

In [19]:
print(classification_report(y_val, svc_noproc_grid.predict(x_val)))

              precision    recall  f1-score   support

          -1       1.00      1.00      1.00       751
           0       1.00      1.00      1.00       751
           1       1.00      1.00      1.00       748

    accuracy                           1.00      2250
   macro avg       1.00      1.00      1.00      2250
weighted avg       1.00      1.00      1.00      2250



In [20]:
print("best params for noproc")
print(svc_noproc_grid.best_params_)

best params for noproc
{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}


In [21]:
svc_noproc = SVC(**svc_noproc_grid.best_params_)
svc_noproc.fit(x_train, y_train)

In [22]:
y_test_pred = svc_noproc.predict(x_test)
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

          -1       0.77      0.77      0.77       773
           0       0.70      0.72      0.71       741
           1       0.88      0.86      0.87       736

    accuracy                           0.78      2250
   macro avg       0.78      0.78      0.78      2250
weighted avg       0.78      0.78      0.78      2250



In [23]:
compare_list.loc[len(compare_list)] = [
    "after",
    "testing",
    None,
    svc_noproc_grid.best_params_["C"],
    svc_noproc_grid.best_params_["gamma"],
    svc_noproc_grid.best_params_["kernel"],
    svc_noproc_grid.best_score_,
    f1_score(y_test, y_test_pred, average="weighted"),
    accuracy_score(y_test, y_test_pred),
]

In [24]:
display(compare_list)

Unnamed: 0,tuning,dataset,proc,C,gamma,kernel,grid_score,f1_score,accuracy
0,before,validation,,default,default,default,,0.789808,0.788889
1,after,testing,,10,scale,rbf,0.736889,0.7833,0.782667


### Baseline preprocessing

In [25]:
x_train, x_val, x_test, y_train, y_val, y_test, vec_baseline = pipeline(
    "Baseline", df_testing
)

In [26]:
svc_baseline_prelim = SVC()
svc_baseline_prelim.fit(x_train, y_train)

In [27]:
y_val_pred = svc_baseline_prelim.predict(x_val)
print(classification_report(y_val, y_val_pred))

              precision    recall  f1-score   support

          -1       0.75      0.77      0.76       751
           0       0.70      0.73      0.71       751
           1       0.87      0.81      0.84       748

    accuracy                           0.77      2250
   macro avg       0.77      0.77      0.77      2250
weighted avg       0.77      0.77      0.77      2250



In [28]:
compare_list.loc[len(compare_list)] = [
    "before",
    "validation",
    "Baseline",
    "default",
    "default",
    "default",
    None,
    f1_score(y_val, y_val_pred, average="weighted"),
    accuracy_score(y_val, y_val_pred),
]

In [29]:
svc_baseline_grid = GridSearchCV(
    SVC(), param_grid, cv=cv, verbose=verbose, n_jobs=n_jobs
)
svc_baseline_grid.fit(x_val, y_val)

Fitting 3 folds for each of 144 candidates, totalling 432 fits
[CV 1/3] END .......C=0.01, gamma=1, kernel=rbf;, score=0.333 total time=   0.5s
[CV 2/3] END .......C=0.01, gamma=1, kernel=rbf;, score=0.333 total time=   0.5s
[CV 3/3] END .......C=0.01, gamma=1, kernel=rbf;, score=0.365 total time=   0.5s
[CV 1/3] END ....C=0.01, gamma=1, kernel=linear;, score=0.333 total time=   0.5s
[CV 2/3] END ....C=0.01, gamma=1, kernel=linear;, score=0.333 total time=   0.5s
[CV 3/3] END ....C=0.01, gamma=1, kernel=linear;, score=0.396 total time=   0.5s
[CV 1/3] END ......C=0.01, gamma=1, kernel=poly;, score=0.333 total time=   0.5s
[CV 2/3] END ......C=0.01, gamma=1, kernel=poly;, score=0.333 total time=   0.5s
[CV 3/3] END ......C=0.01, gamma=1, kernel=poly;, score=0.341 total time=   0.5s
[CV 1/3] END ...C=0.01, gamma=1, kernel=sigmoid;, score=0.333 total time=   0.5s
[CV 2/3] END ...C=0.01, gamma=1, kernel=sigmoid;, score=0.333 total time=   0.5s
[CV 3/3] END ...C=0.01, gamma=1, kernel=sigmoi

[CV 3/3] END ..C=0.1, gamma=0.01, kernel=linear;, score=0.523 total time=   0.5s
[CV 1/3] END ....C=0.1, gamma=0.01, kernel=poly;, score=0.333 total time=   0.5s
[CV 2/3] END ....C=0.1, gamma=0.01, kernel=poly;, score=0.333 total time=   0.5s
[CV 3/3] END ....C=0.1, gamma=0.01, kernel=poly;, score=0.341 total time=   0.5s
[CV 1/3] END .C=0.1, gamma=0.01, kernel=sigmoid;, score=0.333 total time=   0.5s
[CV 2/3] END .C=0.1, gamma=0.01, kernel=sigmoid;, score=0.333 total time=   0.5s
[CV 3/3] END .C=0.1, gamma=0.01, kernel=sigmoid;, score=0.396 total time=   0.5s
[CV 1/3] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.333 total time=   0.5s
[CV 2/3] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.333 total time=   0.5s
[CV 3/3] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.396 total time=   0.5s
[CV 1/3] END .C=0.1, gamma=0.001, kernel=linear;, score=0.524 total time=   0.5s
[CV 2/3] END .C=0.1, gamma=0.001, kernel=linear;, score=0.553 total time=   0.5s
[CV 3/3] END .C=0.1, gamma=0

[CV 3/3] END ..C=1, gamma=scale, kernel=sigmoid;, score=0.696 total time=   0.4s
[CV 1/3] END .......C=1, gamma=auto, kernel=rbf;, score=0.333 total time=   0.5s
[CV 2/3] END .......C=1, gamma=auto, kernel=rbf;, score=0.333 total time=   0.5s
[CV 3/3] END .......C=1, gamma=auto, kernel=rbf;, score=0.396 total time=   0.5s
[CV 1/3] END ....C=1, gamma=auto, kernel=linear;, score=0.692 total time=   0.4s
[CV 2/3] END ....C=1, gamma=auto, kernel=linear;, score=0.709 total time=   0.4s
[CV 3/3] END ....C=1, gamma=auto, kernel=linear;, score=0.692 total time=   0.4s
[CV 1/3] END ......C=1, gamma=auto, kernel=poly;, score=0.333 total time=   0.5s
[CV 2/3] END ......C=1, gamma=auto, kernel=poly;, score=0.333 total time=   0.5s
[CV 3/3] END ......C=1, gamma=auto, kernel=poly;, score=0.341 total time=   0.5s
[CV 1/3] END ...C=1, gamma=auto, kernel=sigmoid;, score=0.333 total time=   0.5s
[CV 2/3] END ...C=1, gamma=auto, kernel=sigmoid;, score=0.333 total time=   0.5s
[CV 3/3] END ...C=1, gamma=a

[CV 3/3] END ...C=100, gamma=0.1, kernel=linear;, score=0.681 total time=   0.5s
[CV 1/3] END .....C=100, gamma=0.1, kernel=poly;, score=0.373 total time=   0.5s
[CV 2/3] END .....C=100, gamma=0.1, kernel=poly;, score=0.360 total time=   0.5s
[CV 3/3] END .....C=100, gamma=0.1, kernel=poly;, score=0.363 total time=   0.5s
[CV 1/3] END ..C=100, gamma=0.1, kernel=sigmoid;, score=0.693 total time=   0.5s
[CV 2/3] END ..C=100, gamma=0.1, kernel=sigmoid;, score=0.696 total time=   0.5s
[CV 3/3] END ..C=100, gamma=0.1, kernel=sigmoid;, score=0.680 total time=   0.5s
[CV 1/3] END .....C=100, gamma=0.01, kernel=rbf;, score=0.697 total time=   0.5s
[CV 2/3] END .....C=100, gamma=0.01, kernel=rbf;, score=0.709 total time=   0.5s
[CV 3/3] END .....C=100, gamma=0.01, kernel=rbf;, score=0.683 total time=   0.5s
[CV 1/3] END ..C=100, gamma=0.01, kernel=linear;, score=0.691 total time=   0.5s
[CV 2/3] END ..C=100, gamma=0.01, kernel=linear;, score=0.696 total time=   0.5s
[CV 3/3] END ..C=100, gamma=

[CV 3/3] END C=1000, gamma=0.001, kernel=sigmoid;, score=0.692 total time=   0.4s
[CV 1/3] END ...C=1000, gamma=scale, kernel=rbf;, score=0.696 total time=   0.5s
[CV 2/3] END ...C=1000, gamma=scale, kernel=rbf;, score=0.719 total time=   0.5s
[CV 3/3] END ...C=1000, gamma=scale, kernel=rbf;, score=0.704 total time=   0.5s
[CV 1/3] END C=1000, gamma=scale, kernel=linear;, score=0.691 total time=   0.5s
[CV 2/3] END C=1000, gamma=scale, kernel=linear;, score=0.696 total time=   0.5s
[CV 3/3] END C=1000, gamma=scale, kernel=linear;, score=0.681 total time=   0.5s
[CV 1/3] END ..C=1000, gamma=scale, kernel=poly;, score=0.541 total time=   0.5s
[CV 2/3] END ..C=1000, gamma=scale, kernel=poly;, score=0.579 total time=   0.5s
[CV 3/3] END ..C=1000, gamma=scale, kernel=poly;, score=0.565 total time=   0.5s
[CV 1/3] END C=1000, gamma=scale, kernel=sigmoid;, score=0.649 total time=   0.4s
[CV 2/3] END C=1000, gamma=scale, kernel=sigmoid;, score=0.663 total time=   0.4s
[CV 3/3] END C=1000, gamm

In [30]:
print(classification_report(y_val, svc_baseline_grid.predict(x_val)))

              precision    recall  f1-score   support

          -1       1.00      1.00      1.00       751
           0       1.00      1.00      1.00       751
           1       1.00      1.00      1.00       748

    accuracy                           1.00      2250
   macro avg       1.00      1.00      1.00      2250
weighted avg       1.00      1.00      1.00      2250



In [31]:
print("best params for baseline")
print(svc_baseline_grid.best_params_)

best params for baseline
{'C': 10, 'gamma': 1, 'kernel': 'rbf'}


In [32]:
svc_baseline = SVC(**svc_baseline_grid.best_params_)
svc_baseline.fit(x_train, y_train)

In [33]:
y_test_pred = svc_baseline.predict(x_test)
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

          -1       0.75      0.75      0.75       773
           0       0.67      0.70      0.69       741
           1       0.87      0.83      0.85       736

    accuracy                           0.76      2250
   macro avg       0.76      0.76      0.76      2250
weighted avg       0.76      0.76      0.76      2250



In [34]:
compare_list.loc[len(compare_list)] = [
    "after",
    "testing",
    "Baseline",
    svc_baseline_grid.best_params_["C"],
    svc_baseline_grid.best_params_["gamma"],
    svc_baseline_grid.best_params_["kernel"],
    svc_baseline_grid.best_score_,
    f1_score(y_test, y_test_pred, average="weighted"),
    accuracy_score(y_test, y_test_pred),
]

In [35]:
display(compare_list)

Unnamed: 0,tuning,dataset,proc,C,gamma,kernel,grid_score,f1_score,accuracy
0,before,validation,,default,default,default,,0.789808,0.788889
1,after,testing,,10,scale,rbf,0.736889,0.7833,0.782667
2,before,validation,Baseline,default,default,default,,0.771851,0.770667
3,after,testing,Baseline,10,1,rbf,0.706222,0.760602,0.759556


### Stemmed + baseline preprocessing

In [36]:
x_train, x_val, x_test, y_train, y_val, y_test, vec_stem = pipeline("Stemmed", df_testing)

In [37]:
svc_stem_prelim = SVC()
svc_stem_prelim.fit(x_train, y_train)

In [38]:
y_val_pred = svc_stem_prelim.predict(x_val)
print(classification_report(y_val, y_val_pred))

              precision    recall  f1-score   support

          -1       0.75      0.76      0.76       751
           0       0.69      0.74      0.71       751
           1       0.87      0.81      0.84       748

    accuracy                           0.77      2250
   macro avg       0.77      0.77      0.77      2250
weighted avg       0.77      0.77      0.77      2250



In [39]:
compare_list.loc[len(compare_list)] = [
    "before",
    "validation",
    "Stemmed",
    "default",
    "default",
    "default",
    None,
    f1_score(y_val, y_val_pred, average="weighted"),
    accuracy_score(y_val, y_val_pred),
]

In [40]:
svc_stem_grid = GridSearchCV(SVC(), param_grid, cv=cv, verbose=verbose, n_jobs=n_jobs)
svc_stem_grid.fit(x_val, y_val)

Fitting 3 folds for each of 144 candidates, totalling 432 fits
[CV 1/3] END .......C=0.01, gamma=1, kernel=rbf;, score=0.333 total time=   0.5s
[CV 2/3] END .......C=0.01, gamma=1, kernel=rbf;, score=0.333 total time=   0.5s
[CV 3/3] END .......C=0.01, gamma=1, kernel=rbf;, score=0.355 total time=   0.5s
[CV 1/3] END ....C=0.01, gamma=1, kernel=linear;, score=0.333 total time=   0.5s
[CV 2/3] END ....C=0.01, gamma=1, kernel=linear;, score=0.333 total time=   0.5s
[CV 3/3] END ....C=0.01, gamma=1, kernel=linear;, score=0.376 total time=   0.5s
[CV 1/3] END ......C=0.01, gamma=1, kernel=poly;, score=0.333 total time=   0.5s
[CV 2/3] END ......C=0.01, gamma=1, kernel=poly;, score=0.333 total time=   0.5s
[CV 3/3] END ......C=0.01, gamma=1, kernel=poly;, score=0.341 total time=   0.5s
[CV 1/3] END ...C=0.01, gamma=1, kernel=sigmoid;, score=0.333 total time=   0.5s
[CV 2/3] END ...C=0.01, gamma=1, kernel=sigmoid;, score=0.333 total time=   0.5s
[CV 3/3] END ...C=0.01, gamma=1, kernel=sigmoi

[CV 3/3] END ..C=0.1, gamma=0.01, kernel=linear;, score=0.531 total time=   0.4s
[CV 1/3] END ....C=0.1, gamma=0.01, kernel=poly;, score=0.333 total time=   0.5s
[CV 2/3] END ....C=0.1, gamma=0.01, kernel=poly;, score=0.333 total time=   0.5s
[CV 3/3] END ....C=0.1, gamma=0.01, kernel=poly;, score=0.341 total time=   0.5s
[CV 1/3] END .C=0.1, gamma=0.01, kernel=sigmoid;, score=0.333 total time=   0.5s
[CV 2/3] END .C=0.1, gamma=0.01, kernel=sigmoid;, score=0.333 total time=   0.5s
[CV 3/3] END .C=0.1, gamma=0.01, kernel=sigmoid;, score=0.376 total time=   0.5s
[CV 1/3] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.333 total time=   0.5s
[CV 2/3] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.333 total time=   0.5s
[CV 3/3] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.376 total time=   0.5s
[CV 1/3] END .C=0.1, gamma=0.001, kernel=linear;, score=0.529 total time=   0.4s
[CV 2/3] END .C=0.1, gamma=0.001, kernel=linear;, score=0.563 total time=   0.5s
[CV 3/3] END .C=0.1, gamma=0

[CV 3/3] END ..C=1, gamma=scale, kernel=sigmoid;, score=0.695 total time=   0.4s
[CV 1/3] END .......C=1, gamma=auto, kernel=rbf;, score=0.333 total time=   0.5s
[CV 2/3] END .......C=1, gamma=auto, kernel=rbf;, score=0.333 total time=   0.5s
[CV 3/3] END .......C=1, gamma=auto, kernel=rbf;, score=0.376 total time=   0.5s
[CV 1/3] END ....C=1, gamma=auto, kernel=linear;, score=0.695 total time=   0.4s
[CV 2/3] END ....C=1, gamma=auto, kernel=linear;, score=0.707 total time=   0.4s
[CV 3/3] END ....C=1, gamma=auto, kernel=linear;, score=0.696 total time=   0.4s
[CV 1/3] END ......C=1, gamma=auto, kernel=poly;, score=0.333 total time=   0.5s
[CV 2/3] END ......C=1, gamma=auto, kernel=poly;, score=0.333 total time=   0.5s
[CV 3/3] END ......C=1, gamma=auto, kernel=poly;, score=0.341 total time=   0.5s
[CV 1/3] END ...C=1, gamma=auto, kernel=sigmoid;, score=0.333 total time=   0.5s
[CV 2/3] END ...C=1, gamma=auto, kernel=sigmoid;, score=0.333 total time=   0.5s
[CV 3/3] END ...C=1, gamma=a

[CV 3/3] END ...C=100, gamma=0.1, kernel=linear;, score=0.664 total time=   0.5s
[CV 1/3] END .....C=100, gamma=0.1, kernel=poly;, score=0.376 total time=   0.5s
[CV 2/3] END .....C=100, gamma=0.1, kernel=poly;, score=0.371 total time=   0.5s
[CV 3/3] END .....C=100, gamma=0.1, kernel=poly;, score=0.367 total time=   0.5s
[CV 1/3] END ..C=100, gamma=0.1, kernel=sigmoid;, score=0.684 total time=   0.5s
[CV 2/3] END ..C=100, gamma=0.1, kernel=sigmoid;, score=0.669 total time=   0.5s
[CV 3/3] END ..C=100, gamma=0.1, kernel=sigmoid;, score=0.669 total time=   0.5s
[CV 1/3] END .....C=100, gamma=0.01, kernel=rbf;, score=0.688 total time=   0.5s
[CV 2/3] END .....C=100, gamma=0.01, kernel=rbf;, score=0.696 total time=   0.5s
[CV 3/3] END .....C=100, gamma=0.01, kernel=rbf;, score=0.684 total time=   0.5s
[CV 1/3] END ..C=100, gamma=0.01, kernel=linear;, score=0.683 total time=   0.5s
[CV 2/3] END ..C=100, gamma=0.01, kernel=linear;, score=0.672 total time=   0.5s
[CV 3/3] END ..C=100, gamma=

[CV 3/3] END C=1000, gamma=0.001, kernel=sigmoid;, score=0.696 total time=   0.4s
[CV 1/3] END ...C=1000, gamma=scale, kernel=rbf;, score=0.688 total time=   0.5s
[CV 2/3] END ...C=1000, gamma=scale, kernel=rbf;, score=0.713 total time=   0.5s
[CV 3/3] END ...C=1000, gamma=scale, kernel=rbf;, score=0.695 total time=   0.5s
[CV 1/3] END C=1000, gamma=scale, kernel=linear;, score=0.683 total time=   0.4s
[CV 2/3] END C=1000, gamma=scale, kernel=linear;, score=0.672 total time=   0.5s
[CV 3/3] END C=1000, gamma=scale, kernel=linear;, score=0.664 total time=   0.5s
[CV 1/3] END ..C=1000, gamma=scale, kernel=poly;, score=0.547 total time=   0.5s
[CV 2/3] END ..C=1000, gamma=scale, kernel=poly;, score=0.588 total time=   0.5s
[CV 3/3] END ..C=1000, gamma=scale, kernel=poly;, score=0.575 total time=   0.5s
[CV 1/3] END C=1000, gamma=scale, kernel=sigmoid;, score=0.629 total time=   0.3s
[CV 2/3] END C=1000, gamma=scale, kernel=sigmoid;, score=0.613 total time=   0.3s
[CV 3/3] END C=1000, gamm

In [41]:
print(classification_report(y_val, svc_stem_grid.predict(x_val)))

              precision    recall  f1-score   support

          -1       0.94      0.95      0.94       751
           0       0.92      0.93      0.93       751
           1       0.97      0.95      0.96       748

    accuracy                           0.94      2250
   macro avg       0.94      0.94      0.94      2250
weighted avg       0.94      0.94      0.94      2250



In [42]:
print("best params for stem")
print(svc_stem_grid.best_params_)

best params for stem
{'C': 10, 'gamma': 0.1, 'kernel': 'sigmoid'}


In [43]:
svc_stem = SVC(**svc_stem_grid.best_params_)
svc_stem.fit(x_train, y_train)

In [44]:
y_test_pred = svc_stem.predict(x_test)
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

          -1       0.75      0.72      0.73       773
           0       0.66      0.70      0.68       741
           1       0.85      0.83      0.84       736

    accuracy                           0.75      2250
   macro avg       0.75      0.75      0.75      2250
weighted avg       0.75      0.75      0.75      2250



In [45]:
compare_list.loc[len(compare_list)] = [
    "after",
    "testing",
    "Stemmed",
    svc_stem_grid.best_params_["C"],
    svc_stem_grid.best_params_["gamma"],
    svc_stem_grid.best_params_["kernel"],
    svc_stem_grid.best_score_,
    f1_score(y_test, y_test_pred, average="weighted"),
    accuracy_score(y_test, y_test_pred),
]

In [46]:
display(compare_list)

Unnamed: 0,tuning,dataset,proc,C,gamma,kernel,grid_score,f1_score,accuracy
0,before,validation,,default,default,default,,0.789808,0.788889
1,after,testing,,10,scale,rbf,0.736889,0.7833,0.782667
2,before,validation,Baseline,default,default,default,,0.771851,0.770667
3,after,testing,Baseline,10,1,rbf,0.706222,0.760602,0.759556
4,before,validation,Stemmed,default,default,default,,0.769903,0.768444
5,after,testing,Stemmed,10,0.1,sigmoid,0.699556,0.748964,0.748


### Lemmatized + baseline preprocessing

In [47]:
x_train, x_val, x_test, y_train, y_val, y_test, vec_lem = pipeline("Lemmatized", df_testing)

In [48]:
svc_lem_prelim = SVC()
svc_lem_prelim.fit(x_train, y_train)

In [49]:
y_val_pred = svc_lem_prelim.predict(x_val)
print(classification_report(y_val, y_val_pred))

              precision    recall  f1-score   support

          -1       0.74      0.77      0.76       751
           0       0.70      0.73      0.71       751
           1       0.87      0.80      0.83       748

    accuracy                           0.77      2250
   macro avg       0.77      0.77      0.77      2250
weighted avg       0.77      0.77      0.77      2250



In [50]:
compare_list.loc[len(compare_list)] = [
    "before",
    "validation",
    "Lemmatized",
    "default",
    "default",
    "default",
    None,
    f1_score(y_val, y_val_pred, average="weighted"),
    accuracy_score(y_val, y_val_pred),
]

In [51]:
svc_lem_grid = GridSearchCV(SVC(), param_grid, cv=cv, verbose=verbose, n_jobs=n_jobs)
svc_lem_grid.fit(x_val, y_val)

Fitting 3 folds for each of 144 candidates, totalling 432 fits
[CV 1/3] END .......C=0.01, gamma=1, kernel=rbf;, score=0.333 total time=   0.5s
[CV 2/3] END .......C=0.01, gamma=1, kernel=rbf;, score=0.333 total time=   0.5s
[CV 3/3] END .......C=0.01, gamma=1, kernel=rbf;, score=0.355 total time=   0.5s
[CV 1/3] END ....C=0.01, gamma=1, kernel=linear;, score=0.333 total time=   0.5s
[CV 2/3] END ....C=0.01, gamma=1, kernel=linear;, score=0.333 total time=   0.5s
[CV 3/3] END ....C=0.01, gamma=1, kernel=linear;, score=0.373 total time=   0.5s
[CV 1/3] END ......C=0.01, gamma=1, kernel=poly;, score=0.333 total time=   0.5s
[CV 2/3] END ......C=0.01, gamma=1, kernel=poly;, score=0.333 total time=   0.5s
[CV 3/3] END ......C=0.01, gamma=1, kernel=poly;, score=0.341 total time=   0.5s
[CV 1/3] END ...C=0.01, gamma=1, kernel=sigmoid;, score=0.333 total time=   0.5s
[CV 2/3] END ...C=0.01, gamma=1, kernel=sigmoid;, score=0.333 total time=   0.5s
[CV 3/3] END ...C=0.01, gamma=1, kernel=sigmoi

[CV 3/3] END ..C=0.1, gamma=0.01, kernel=linear;, score=0.525 total time=   0.5s
[CV 1/3] END ....C=0.1, gamma=0.01, kernel=poly;, score=0.333 total time=   0.5s
[CV 2/3] END ....C=0.1, gamma=0.01, kernel=poly;, score=0.333 total time=   0.5s
[CV 3/3] END ....C=0.1, gamma=0.01, kernel=poly;, score=0.341 total time=   0.5s
[CV 1/3] END .C=0.1, gamma=0.01, kernel=sigmoid;, score=0.333 total time=   0.5s
[CV 2/3] END .C=0.1, gamma=0.01, kernel=sigmoid;, score=0.333 total time=   0.5s
[CV 3/3] END .C=0.1, gamma=0.01, kernel=sigmoid;, score=0.373 total time=   0.5s
[CV 1/3] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.333 total time=   0.5s
[CV 2/3] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.333 total time=   0.5s
[CV 3/3] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.373 total time=   0.5s
[CV 1/3] END .C=0.1, gamma=0.001, kernel=linear;, score=0.520 total time=   0.5s
[CV 2/3] END .C=0.1, gamma=0.001, kernel=linear;, score=0.557 total time=   0.5s
[CV 3/3] END .C=0.1, gamma=0

[CV 3/3] END ..C=1, gamma=scale, kernel=sigmoid;, score=0.696 total time=   0.4s
[CV 1/3] END .......C=1, gamma=auto, kernel=rbf;, score=0.333 total time=   0.5s
[CV 2/3] END .......C=1, gamma=auto, kernel=rbf;, score=0.333 total time=   0.5s
[CV 3/3] END .......C=1, gamma=auto, kernel=rbf;, score=0.373 total time=   0.5s
[CV 1/3] END ....C=1, gamma=auto, kernel=linear;, score=0.684 total time=   0.4s
[CV 2/3] END ....C=1, gamma=auto, kernel=linear;, score=0.703 total time=   0.4s
[CV 3/3] END ....C=1, gamma=auto, kernel=linear;, score=0.696 total time=   0.4s
[CV 1/3] END ......C=1, gamma=auto, kernel=poly;, score=0.333 total time=   0.5s
[CV 2/3] END ......C=1, gamma=auto, kernel=poly;, score=0.333 total time=   0.5s
[CV 3/3] END ......C=1, gamma=auto, kernel=poly;, score=0.341 total time=   0.5s
[CV 1/3] END ...C=1, gamma=auto, kernel=sigmoid;, score=0.333 total time=   0.5s
[CV 2/3] END ...C=1, gamma=auto, kernel=sigmoid;, score=0.333 total time=   0.5s
[CV 3/3] END ...C=1, gamma=a

[CV 3/3] END ...C=100, gamma=0.1, kernel=linear;, score=0.669 total time=   0.5s
[CV 1/3] END .....C=100, gamma=0.1, kernel=poly;, score=0.369 total time=   0.5s
[CV 2/3] END .....C=100, gamma=0.1, kernel=poly;, score=0.360 total time=   0.5s
[CV 3/3] END .....C=100, gamma=0.1, kernel=poly;, score=0.363 total time=   0.5s
[CV 1/3] END ..C=100, gamma=0.1, kernel=sigmoid;, score=0.700 total time=   0.5s
[CV 2/3] END ..C=100, gamma=0.1, kernel=sigmoid;, score=0.673 total time=   0.5s
[CV 3/3] END ..C=100, gamma=0.1, kernel=sigmoid;, score=0.668 total time=   0.5s
[CV 1/3] END .....C=100, gamma=0.01, kernel=rbf;, score=0.700 total time=   0.5s
[CV 2/3] END .....C=100, gamma=0.01, kernel=rbf;, score=0.696 total time=   0.5s
[CV 3/3] END .....C=100, gamma=0.01, kernel=rbf;, score=0.675 total time=   0.5s
[CV 1/3] END ..C=100, gamma=0.01, kernel=linear;, score=0.696 total time=   0.5s
[CV 2/3] END ..C=100, gamma=0.01, kernel=linear;, score=0.673 total time=   0.5s
[CV 3/3] END ..C=100, gamma=

[CV 3/3] END C=1000, gamma=0.001, kernel=sigmoid;, score=0.696 total time=   0.5s
[CV 1/3] END ...C=1000, gamma=scale, kernel=rbf;, score=0.685 total time=   0.5s
[CV 2/3] END ...C=1000, gamma=scale, kernel=rbf;, score=0.695 total time=   0.5s
[CV 3/3] END ...C=1000, gamma=scale, kernel=rbf;, score=0.693 total time=   0.5s
[CV 1/3] END C=1000, gamma=scale, kernel=linear;, score=0.696 total time=   0.5s
[CV 2/3] END C=1000, gamma=scale, kernel=linear;, score=0.673 total time=   0.5s
[CV 3/3] END C=1000, gamma=scale, kernel=linear;, score=0.669 total time=   0.5s
[CV 1/3] END ..C=1000, gamma=scale, kernel=poly;, score=0.536 total time=   0.5s
[CV 2/3] END ..C=1000, gamma=scale, kernel=poly;, score=0.577 total time=   0.5s
[CV 3/3] END ..C=1000, gamma=scale, kernel=poly;, score=0.568 total time=   0.5s
[CV 1/3] END C=1000, gamma=scale, kernel=sigmoid;, score=0.643 total time=   0.3s
[CV 2/3] END C=1000, gamma=scale, kernel=sigmoid;, score=0.607 total time=   0.3s
[CV 3/3] END C=1000, gamm

In [52]:
print(classification_report(y_val, svc_lem_grid.predict(x_val)))

              precision    recall  f1-score   support

          -1       0.88      0.92      0.90       751
           0       0.89      0.89      0.89       751
           1       0.96      0.91      0.93       748

    accuracy                           0.91      2250
   macro avg       0.91      0.91      0.91      2250
weighted avg       0.91      0.91      0.91      2250



In [53]:
print("best params for lem")
print(svc_lem_grid.best_params_)

best params for lem
{'C': 1, 'gamma': 1, 'kernel': 'sigmoid'}


In [54]:
svc_lem = SVC(**svc_lem_grid.best_params_)
svc_lem.fit(x_train, y_train)

In [55]:
y_test_pred = svc_lem.predict(x_test)
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

          -1       0.73      0.71      0.72       773
           0       0.64      0.68      0.66       741
           1       0.86      0.82      0.84       736

    accuracy                           0.74      2250
   macro avg       0.74      0.74      0.74      2250
weighted avg       0.74      0.74      0.74      2250



In [56]:
compare_list.loc[len(compare_list)] = [
    "after",
    "testing",
    "Lemmatized",
    svc_lem_grid.best_params_["C"],
    svc_lem_grid.best_params_["gamma"],
    svc_lem_grid.best_params_["kernel"],
    svc_lem_grid.best_score_,
    f1_score(y_test, y_test_pred, average="weighted"),
    accuracy_score(y_test, y_test_pred),
]

In [57]:
display(compare_list)

Unnamed: 0,tuning,dataset,proc,C,gamma,kernel,grid_score,f1_score,accuracy
0,before,validation,,default,default,default,,0.789808,0.788889
1,after,testing,,10,scale,rbf,0.736889,0.7833,0.782667
2,before,validation,Baseline,default,default,default,,0.771851,0.770667
3,after,testing,Baseline,10,1,rbf,0.706222,0.760602,0.759556
4,before,validation,Stemmed,default,default,default,,0.769903,0.768444
5,after,testing,Stemmed,10,0.1,sigmoid,0.699556,0.748964,0.748
6,before,validation,Lemmatized,default,default,default,,0.768739,0.767556
7,after,testing,Lemmatized,1,1,sigmoid,0.695111,0.739669,0.738222
