In [39]:
import pandas as pd
import nltk
import re
import random
import numpy as np
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score, accuracy_score, precision_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
from nltk.corpus import stopwords
import warnings

warnings.filterwarnings("ignore")

nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package punkt to /home/ibrahim/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ibrahim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ibrahim/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [40]:
random_state = 42
testing_n = 500
testing_frac = 0.001
random.seed(random_state)
data_path = "./data/combined.csv"

In [41]:
df = pd.read_csv(data_path, low_memory=False)
df = df.drop_duplicates()
df = df[["overall", "reviewText", "summary"]]
df.dropna(inplace=True)
df["sentiment"] = df["overall"].apply(lambda x: 1 if x > 3 else -1 if x < 3 else 0)
df["reviewTextWithSummary"] = df["summary"] + " " + df["reviewText"]
df.drop(["overall", "summary", "reviewText"], axis=1, inplace=True)
df.head()

Unnamed: 0,sentiment,reviewTextWithSummary
0,1,Five Stars As advertised. Reasonably priced
1,1,Good for the face Like the oder and the feel w...
2,-1,Smells awful I bought this to smell nice after...
3,1,Truth is There IS Nothing Like an AQUA VELVA M...
4,1,Bvlgari Shampoo If you ever want to feel pampe...


# Model experimentation

In [42]:
df_testing = (
    df.groupby("sentiment")
    .apply(lambda x: x.sample(n=testing_n, random_state=random_state, replace=True))
    .reset_index(drop=True)
)
df_testing = df.sample(frac=testing_frac)
df_testing["sentiment"].value_counts()

sentiment
 1    609
 0     50
-1     32
Name: count, dtype: int64

In [43]:
df_testing.head()

Unnamed: 0,sentiment,reviewTextWithSummary
293094,0,"Faulty earring pieces They're alright, many pi..."
264039,1,Five Stars I love this tool. I am getting stra...
735880,1,Five Stars Perfect
67836,1,Fantastic for the price. I ordered two of thes...
410324,1,Five Stars Great price


In [44]:
print("Dataset size:", len(df_testing))

Dataset size: 691


In [45]:
STOP_WORDS = set(stopwords.words("english"))

In [46]:
def preprocess_text(sentence, stop, type_proc=None):
    words = []
    for word in sentence.lower().strip().split():

        word = re.sub("\d", "", word)
        word = re.sub("[^\w\s]", "", word)

        if word not in stop and word != "":
            words.append(preprocess_type(word, type_proc))

    return " ".join(words)

In [47]:
def preprocess_type(word, type_proc):
    if type_proc == "Baseline":
        return word
    elif type_proc == "Stemmed":
        return PorterStemmer().stem(word)
    elif type_proc == "Lemmatized":
        return WordNetLemmatizer().lemmatize(word)
    else:
        raise ValueError("Invalid Preprocessing Type")

In [48]:
def train_val_test_split(df=df, random_state=random_state):
    x = df[["reviewTextWithSummary"]]
    y = df["sentiment"]
    x_train, x_tmp, y_train, y_tmp = train_test_split(
        x, y, test_size=0.3, random_state=random_state
    )
    x_val, x_test, y_val, y_test = train_test_split(
        x_tmp, y_tmp, test_size=0.5, random_state=random_state
    )
    return x_train, x_val, x_test, y_train, y_val, y_test

In [49]:
def pipeline(proc, df, random_state=random_state):
    df_ = df.copy()
    if proc is not None:
        df_["reviewTextWithSummary"] = df_["reviewTextWithSummary"].apply(
            lambda x: preprocess_text(x, STOP_WORDS, proc)
        )

    x_train, x_val, x_test, y_train, y_val, y_test = train_val_test_split(
        df_, random_state
    )
    vectorizer = TfidfVectorizer()
    vectorizer.fit(df_["reviewTextWithSummary"])
    x_train = vectorizer.transform(x_train["reviewTextWithSummary"])
    x_val = vectorizer.transform(x_val["reviewTextWithSummary"])
    x_test = vectorizer.transform(x_test["reviewTextWithSummary"])

    return x_train, x_val, x_test, y_train, y_val, y_test, vectorizer

## Testing different configs

In [50]:
param_grid = {
    "C": [0.01, 0.1, 1, 10, 100, 1000],
    "gamma": [1, 0.1, 0.01, 0.001, "scale", "auto"],
    "kernel": ["rbf", "linear", "poly", "sigmoid"],
}
n_jobs = None
verbose = 3
cv = 5

In [51]:
compare_list = pd.DataFrame(
    columns=[
        "tuning",
        "dataset",
        "proc",
        "C",
        "gamma",
        "kernel",
        "grid_score",
        "f1_score",
        "accuracy",
        "precision",
        "roc_auc",
    ]
)

### No preprocessing

In [52]:
x_train, x_val, x_test, y_train, y_val, y_test, vec_noproc = pipeline(None, df_testing)

In [53]:
svc_noproc_prelim = SVC()
svc_noproc_prelim.fit(x_train, y_train)

In [54]:
y_val_pred = svc_noproc_prelim.predict(x_val)
print(classification_report(y_val, y_val_pred))

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         1
           0       0.00      0.00      0.00         7
           1       0.92      1.00      0.96        96

    accuracy                           0.92       104
   macro avg       0.31      0.33      0.32       104
weighted avg       0.85      0.92      0.89       104



In [55]:
compare_list.loc[len(compare_list)] = [
    "before",
    "validation",
    None,
    "default",
    "default",
    "default",
    None,
    f1_score(y_val, y_val_pred, average="weighted"),
    accuracy_score(y_val, y_val_pred),
    precision_score(y_val, y_val_pred, average="weighted"),
    None,
]

In [56]:
svc_noproc_grid = GridSearchCV(SVC(), param_grid, cv=cv, verbose=verbose, n_jobs=n_jobs)
svc_noproc_grid.fit(x_val, y_val)

Fitting 5 folds for each of 144 candidates, totalling 720 fits
[CV 1/5] END .......C=0.01, gamma=1, kernel=rbf;, score=0.952 total time=   0.0s
[CV 2/5] END .......C=0.01, gamma=1, kernel=rbf;, score=0.905 total time=   0.0s
[CV 3/5] END .......C=0.01, gamma=1, kernel=rbf;, score=0.905 total time=   0.0s
[CV 4/5] END .......C=0.01, gamma=1, kernel=rbf;, score=0.905 total time=   0.0s
[CV 5/5] END .......C=0.01, gamma=1, kernel=rbf;, score=0.950 total time=   0.0s
[CV 1/5] END ....C=0.01, gamma=1, kernel=linear;, score=0.952 total time=   0.0s
[CV 2/5] END ....C=0.01, gamma=1, kernel=linear;, score=0.905 total time=   0.0s
[CV 3/5] END ....C=0.01, gamma=1, kernel=linear;, score=0.905 total time=   0.0s
[CV 4/5] END ....C=0.01, gamma=1, kernel=linear;, score=0.905 total time=   0.0s
[CV 5/5] END ....C=0.01, gamma=1, kernel=linear;, score=0.950 total time=   0.0s
[CV 1/5] END ......C=0.01, gamma=1, kernel=poly;, score=0.952 total time=   0.0s
[CV 2/5] END ......C=0.01, gamma=1, kernel=pol

[CV 4/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.905 total time=   0.0s
[CV 5/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.950 total time=   0.0s
[CV 1/5] END ..C=0.1, gamma=0.01, kernel=linear;, score=0.952 total time=   0.0s
[CV 2/5] END ..C=0.1, gamma=0.01, kernel=linear;, score=0.905 total time=   0.0s
[CV 3/5] END ..C=0.1, gamma=0.01, kernel=linear;, score=0.905 total time=   0.0s
[CV 4/5] END ..C=0.1, gamma=0.01, kernel=linear;, score=0.905 total time=   0.0s
[CV 5/5] END ..C=0.1, gamma=0.01, kernel=linear;, score=0.950 total time=   0.0s
[CV 1/5] END ....C=0.1, gamma=0.01, kernel=poly;, score=0.952 total time=   0.0s
[CV 2/5] END ....C=0.1, gamma=0.01, kernel=poly;, score=0.905 total time=   0.0s
[CV 3/5] END ....C=0.1, gamma=0.01, kernel=poly;, score=0.905 total time=   0.0s
[CV 4/5] END ....C=0.1, gamma=0.01, kernel=poly;, score=0.905 total time=   0.0s
[CV 5/5] END ....C=0.1, gamma=0.01, kernel=poly;, score=0.950 total time=   0.0s
[CV 1/5] END .C=0.1, gamma=0

[CV 5/5] END .....C=1, gamma=scale, kernel=poly;, score=0.950 total time=   0.0s
[CV 1/5] END ..C=1, gamma=scale, kernel=sigmoid;, score=0.952 total time=   0.0s
[CV 2/5] END ..C=1, gamma=scale, kernel=sigmoid;, score=0.905 total time=   0.0s
[CV 3/5] END ..C=1, gamma=scale, kernel=sigmoid;, score=0.905 total time=   0.0s
[CV 4/5] END ..C=1, gamma=scale, kernel=sigmoid;, score=0.905 total time=   0.0s
[CV 5/5] END ..C=1, gamma=scale, kernel=sigmoid;, score=0.950 total time=   0.0s
[CV 1/5] END .......C=1, gamma=auto, kernel=rbf;, score=0.952 total time=   0.0s
[CV 2/5] END .......C=1, gamma=auto, kernel=rbf;, score=0.905 total time=   0.0s
[CV 3/5] END .......C=1, gamma=auto, kernel=rbf;, score=0.905 total time=   0.0s
[CV 4/5] END .......C=1, gamma=auto, kernel=rbf;, score=0.905 total time=   0.0s
[CV 5/5] END .......C=1, gamma=auto, kernel=rbf;, score=0.950 total time=   0.0s
[CV 1/5] END ....C=1, gamma=auto, kernel=linear;, score=0.952 total time=   0.0s
[CV 2/5] END ....C=1, gamma=

[CV 1/5] END .....C=100, gamma=1, kernel=linear;, score=0.952 total time=   0.0s
[CV 2/5] END .....C=100, gamma=1, kernel=linear;, score=0.905 total time=   0.0s
[CV 3/5] END .....C=100, gamma=1, kernel=linear;, score=0.905 total time=   0.0s
[CV 4/5] END .....C=100, gamma=1, kernel=linear;, score=0.905 total time=   0.0s
[CV 5/5] END .....C=100, gamma=1, kernel=linear;, score=0.950 total time=   0.0s
[CV 1/5] END .......C=100, gamma=1, kernel=poly;, score=0.952 total time=   0.0s
[CV 2/5] END .......C=100, gamma=1, kernel=poly;, score=0.905 total time=   0.0s
[CV 3/5] END .......C=100, gamma=1, kernel=poly;, score=0.905 total time=   0.0s
[CV 4/5] END .......C=100, gamma=1, kernel=poly;, score=0.905 total time=   0.0s
[CV 5/5] END .......C=100, gamma=1, kernel=poly;, score=0.950 total time=   0.0s
[CV 1/5] END ....C=100, gamma=1, kernel=sigmoid;, score=0.952 total time=   0.0s
[CV 2/5] END ....C=100, gamma=1, kernel=sigmoid;, score=0.905 total time=   0.0s
[CV 3/5] END ....C=100, gamm

[CV 1/5] END ...C=1000, gamma=1, kernel=sigmoid;, score=0.952 total time=   0.0s
[CV 2/5] END ...C=1000, gamma=1, kernel=sigmoid;, score=0.905 total time=   0.0s
[CV 3/5] END ...C=1000, gamma=1, kernel=sigmoid;, score=0.905 total time=   0.0s
[CV 4/5] END ...C=1000, gamma=1, kernel=sigmoid;, score=0.905 total time=   0.0s
[CV 5/5] END ...C=1000, gamma=1, kernel=sigmoid;, score=0.950 total time=   0.0s
[CV 1/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.952 total time=   0.0s
[CV 2/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.905 total time=   0.0s
[CV 3/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.905 total time=   0.0s
[CV 4/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.905 total time=   0.0s
[CV 5/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.950 total time=   0.0s
[CV 1/5] END ..C=1000, gamma=0.1, kernel=linear;, score=0.952 total time=   0.0s
[CV 2/5] END ..C=1000, gamma=0.1, kernel=linear;, score=0.905 total time=   0.0s
[CV 3/5] END ..C=1000, gamma

In [57]:
print(classification_report(y_val, svc_noproc_grid.predict(x_val)))

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         1
           0       0.00      0.00      0.00         7
           1       0.92      1.00      0.96        96

    accuracy                           0.92       104
   macro avg       0.31      0.33      0.32       104
weighted avg       0.85      0.92      0.89       104



In [58]:
print("best params for noproc")
print(svc_noproc_grid.best_params_)

best params for noproc
{'C': 0.01, 'gamma': 1, 'kernel': 'rbf'}


In [59]:
svc_noproc = SVC(**svc_noproc_grid.best_params_, probability=True)
svc_noproc.fit(x_train, y_train)

In [60]:
y_test_pred = svc_noproc.predict(x_test)
y_test_pred_proba = svc_noproc.predict_proba(x_test)
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         4
           0       0.00      0.00      0.00         7
           1       0.89      1.00      0.94        93

    accuracy                           0.89       104
   macro avg       0.30      0.33      0.31       104
weighted avg       0.80      0.89      0.84       104



In [61]:
compare_list.loc[len(compare_list)] = [
    "after",
    "testing",
    None,
    svc_noproc_grid.best_params_["C"],
    svc_noproc_grid.best_params_["gamma"],
    svc_noproc_grid.best_params_["kernel"],
    svc_noproc_grid.best_score_,
    f1_score(y_test, y_test_pred, average="weighted"),
    accuracy_score(y_test, y_test_pred),
    precision_score(y_test, y_test_pred, average="weighted"),
    roc_auc_score(y_test, y_test_pred_proba, multi_class="ovr")
]

In [62]:
display(compare_list)

Unnamed: 0,tuning,dataset,proc,C,gamma,kernel,grid_score,f1_score,accuracy,precision,roc_auc
0,before,validation,,default,default,default,,0.886154,0.923077,0.852071,
1,after,testing,,0.01,1,rbf,0.923333,0.844299,0.894231,0.799649,0.932369


### Baseline preprocessing

In [63]:
x_train, x_val, x_test, y_train, y_val, y_test, vec_baseline = pipeline(
    "Baseline", df_testing
)

In [64]:
svc_baseline_prelim = SVC()
svc_baseline_prelim.fit(x_train, y_train)

In [65]:
y_val_pred = svc_baseline_prelim.predict(x_val)
print(classification_report(y_val, y_val_pred))

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         1
           0       0.00      0.00      0.00         7
           1       0.92      1.00      0.96        96

    accuracy                           0.92       104
   macro avg       0.31      0.33      0.32       104
weighted avg       0.85      0.92      0.89       104



In [66]:
compare_list.loc[len(compare_list)] = [
    "before",
    "validation",
    "Baseline",
    "default",
    "default",
    "default",
    None,
    f1_score(y_val, y_val_pred, average="weighted"),
    accuracy_score(y_val, y_val_pred),
    precision_score(y_val, y_val_pred, average="weighted"),
    None,
]

In [67]:
svc_baseline_grid = GridSearchCV(
    SVC(), param_grid, cv=cv, verbose=verbose, n_jobs=n_jobs
)
svc_baseline_grid.fit(x_val, y_val)

Fitting 5 folds for each of 144 candidates, totalling 720 fits
[CV 1/5] END .......C=0.01, gamma=1, kernel=rbf;, score=0.952 total time=   0.0s
[CV 2/5] END .......C=0.01, gamma=1, kernel=rbf;, score=0.905 total time=   0.0s
[CV 3/5] END .......C=0.01, gamma=1, kernel=rbf;, score=0.905 total time=   0.0s
[CV 4/5] END .......C=0.01, gamma=1, kernel=rbf;, score=0.905 total time=   0.0s
[CV 5/5] END .......C=0.01, gamma=1, kernel=rbf;, score=0.950 total time=   0.0s
[CV 1/5] END ....C=0.01, gamma=1, kernel=linear;, score=0.952 total time=   0.0s
[CV 2/5] END ....C=0.01, gamma=1, kernel=linear;, score=0.905 total time=   0.0s
[CV 3/5] END ....C=0.01, gamma=1, kernel=linear;, score=0.905 total time=   0.0s
[CV 4/5] END ....C=0.01, gamma=1, kernel=linear;, score=0.905 total time=   0.0s
[CV 5/5] END ....C=0.01, gamma=1, kernel=linear;, score=0.950 total time=   0.0s
[CV 1/5] END ......C=0.01, gamma=1, kernel=poly;, score=0.952 total time=   0.0s
[CV 2/5] END ......C=0.01, gamma=1, kernel=pol

[CV 4/5] END C=0.01, gamma=auto, kernel=sigmoid;, score=0.905 total time=   0.0s
[CV 5/5] END C=0.01, gamma=auto, kernel=sigmoid;, score=0.950 total time=   0.0s
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.952 total time=   0.0s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.905 total time=   0.0s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.905 total time=   0.0s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.905 total time=   0.0s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.950 total time=   0.0s
[CV 1/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.952 total time=   0.0s
[CV 2/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.905 total time=   0.0s
[CV 3/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.905 total time=   0.0s
[CV 4/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.905 total time=   0.0s
[CV 5/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.950 total time=   0.0s
[CV 1/5] END .......C=0.1, g

[CV 1/5] END ....C=1, gamma=0.1, kernel=sigmoid;, score=0.952 total time=   0.0s
[CV 2/5] END ....C=1, gamma=0.1, kernel=sigmoid;, score=0.905 total time=   0.0s
[CV 3/5] END ....C=1, gamma=0.1, kernel=sigmoid;, score=0.905 total time=   0.0s
[CV 4/5] END ....C=1, gamma=0.1, kernel=sigmoid;, score=0.905 total time=   0.0s
[CV 5/5] END ....C=1, gamma=0.1, kernel=sigmoid;, score=0.950 total time=   0.0s
[CV 1/5] END .......C=1, gamma=0.01, kernel=rbf;, score=0.952 total time=   0.0s
[CV 2/5] END .......C=1, gamma=0.01, kernel=rbf;, score=0.905 total time=   0.0s
[CV 3/5] END .......C=1, gamma=0.01, kernel=rbf;, score=0.905 total time=   0.0s
[CV 4/5] END .......C=1, gamma=0.01, kernel=rbf;, score=0.905 total time=   0.0s
[CV 5/5] END .......C=1, gamma=0.01, kernel=rbf;, score=0.950 total time=   0.0s
[CV 1/5] END ....C=1, gamma=0.01, kernel=linear;, score=0.952 total time=   0.0s
[CV 2/5] END ....C=1, gamma=0.01, kernel=linear;, score=0.905 total time=   0.0s
[CV 3/5] END ....C=1, gamma=

[CV 5/5] END .......C=10, gamma=0.1, kernel=rbf;, score=0.950 total time=   0.0s
[CV 1/5] END ....C=10, gamma=0.1, kernel=linear;, score=0.952 total time=   0.0s
[CV 2/5] END ....C=10, gamma=0.1, kernel=linear;, score=0.905 total time=   0.0s
[CV 3/5] END ....C=10, gamma=0.1, kernel=linear;, score=0.905 total time=   0.0s
[CV 4/5] END ....C=10, gamma=0.1, kernel=linear;, score=0.905 total time=   0.0s
[CV 5/5] END ....C=10, gamma=0.1, kernel=linear;, score=0.950 total time=   0.0s
[CV 1/5] END ......C=10, gamma=0.1, kernel=poly;, score=0.952 total time=   0.0s
[CV 2/5] END ......C=10, gamma=0.1, kernel=poly;, score=0.905 total time=   0.0s
[CV 3/5] END ......C=10, gamma=0.1, kernel=poly;, score=0.905 total time=   0.0s
[CV 4/5] END ......C=10, gamma=0.1, kernel=poly;, score=0.905 total time=   0.0s
[CV 5/5] END ......C=10, gamma=0.1, kernel=poly;, score=0.950 total time=   0.0s
[CV 1/5] END ...C=10, gamma=0.1, kernel=sigmoid;, score=0.952 total time=   0.0s
[CV 2/5] END ...C=10, gamma=

[CV 5/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.950 total time=   0.0s
[CV 1/5] END ..C=100, gamma=0.01, kernel=linear;, score=0.952 total time=   0.0s
[CV 2/5] END ..C=100, gamma=0.01, kernel=linear;, score=0.905 total time=   0.0s
[CV 3/5] END ..C=100, gamma=0.01, kernel=linear;, score=0.905 total time=   0.0s
[CV 4/5] END ..C=100, gamma=0.01, kernel=linear;, score=0.905 total time=   0.0s
[CV 5/5] END ..C=100, gamma=0.01, kernel=linear;, score=0.950 total time=   0.0s
[CV 1/5] END ....C=100, gamma=0.01, kernel=poly;, score=0.952 total time=   0.0s
[CV 2/5] END ....C=100, gamma=0.01, kernel=poly;, score=0.905 total time=   0.0s
[CV 3/5] END ....C=100, gamma=0.01, kernel=poly;, score=0.905 total time=   0.0s
[CV 4/5] END ....C=100, gamma=0.01, kernel=poly;, score=0.905 total time=   0.0s
[CV 5/5] END ....C=100, gamma=0.01, kernel=poly;, score=0.950 total time=   0.0s
[CV 1/5] END .C=100, gamma=0.01, kernel=sigmoid;, score=0.952 total time=   0.0s
[CV 2/5] END .C=100, gamma=0

[CV 5/5] END ..C=1000, gamma=0.001, kernel=poly;, score=0.950 total time=   0.0s
[CV 1/5] END C=1000, gamma=0.001, kernel=sigmoid;, score=0.952 total time=   0.0s
[CV 2/5] END C=1000, gamma=0.001, kernel=sigmoid;, score=0.905 total time=   0.0s
[CV 3/5] END C=1000, gamma=0.001, kernel=sigmoid;, score=0.905 total time=   0.0s
[CV 4/5] END C=1000, gamma=0.001, kernel=sigmoid;, score=0.905 total time=   0.0s
[CV 5/5] END C=1000, gamma=0.001, kernel=sigmoid;, score=0.950 total time=   0.0s
[CV 1/5] END ...C=1000, gamma=scale, kernel=rbf;, score=0.952 total time=   0.0s
[CV 2/5] END ...C=1000, gamma=scale, kernel=rbf;, score=0.905 total time=   0.0s
[CV 3/5] END ...C=1000, gamma=scale, kernel=rbf;, score=0.905 total time=   0.0s
[CV 4/5] END ...C=1000, gamma=scale, kernel=rbf;, score=0.905 total time=   0.0s
[CV 5/5] END ...C=1000, gamma=scale, kernel=rbf;, score=0.950 total time=   0.0s
[CV 1/5] END C=1000, gamma=scale, kernel=linear;, score=0.952 total time=   0.0s
[CV 2/5] END C=1000, ga

In [68]:
print(classification_report(y_val, svc_baseline_grid.predict(x_val)))

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         1
           0       0.00      0.00      0.00         7
           1       0.92      1.00      0.96        96

    accuracy                           0.92       104
   macro avg       0.31      0.33      0.32       104
weighted avg       0.85      0.92      0.89       104



In [69]:
print("best params for baseline")
print(svc_baseline_grid.best_params_)

best params for baseline
{'C': 0.01, 'gamma': 1, 'kernel': 'rbf'}


In [70]:
svc_baseline = SVC(**svc_baseline_grid.best_params_, probability=True)
svc_baseline.fit(x_train, y_train)

In [71]:
y_test_pred = svc_baseline.predict(x_test)
y_test_pred_proba = svc_baseline.predict_proba(x_test)
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         4
           0       0.00      0.00      0.00         7
           1       0.89      1.00      0.94        93

    accuracy                           0.89       104
   macro avg       0.30      0.33      0.31       104
weighted avg       0.80      0.89      0.84       104



In [72]:
compare_list.loc[len(compare_list)] = [
    "after",
    "testing",
    "Baseline",
    svc_baseline_grid.best_params_["C"],
    svc_baseline_grid.best_params_["gamma"],
    svc_baseline_grid.best_params_["kernel"],
    svc_baseline_grid.best_score_,
    f1_score(y_test, y_test_pred, average="weighted"),
    accuracy_score(y_test, y_test_pred),
    precision_score(y_test, y_test_pred, average="weighted"),
    roc_auc_score(y_test, y_test_pred_proba, multi_class="ovr")
]

In [73]:
display(compare_list)

Unnamed: 0,tuning,dataset,proc,C,gamma,kernel,grid_score,f1_score,accuracy,precision,roc_auc
0,before,validation,,default,default,default,,0.886154,0.923077,0.852071,
1,after,testing,,0.01,1,rbf,0.923333,0.844299,0.894231,0.799649,0.932369
2,before,validation,Baseline,default,default,default,,0.886154,0.923077,0.852071,
3,after,testing,Baseline,0.01,1,rbf,0.923333,0.844299,0.894231,0.799649,0.833617


### Stemmed + baseline preprocessing

In [74]:
x_train, x_val, x_test, y_train, y_val, y_test, vec_stem = pipeline("Stemmed", df_testing)

In [75]:
svc_stem_prelim = SVC()
svc_stem_prelim.fit(x_train, y_train)

In [76]:
y_val_pred = svc_stem_prelim.predict(x_val)
print(classification_report(y_val, y_val_pred))

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         1
           0       0.00      0.00      0.00         7
           1       0.92      1.00      0.96        96

    accuracy                           0.92       104
   macro avg       0.31      0.33      0.32       104
weighted avg       0.85      0.92      0.89       104



In [77]:
compare_list.loc[len(compare_list)] = [
    "before",
    "validation",
    "Stemmed",
    "default",
    "default",
    "default",
    None,
    f1_score(y_val, y_val_pred, average="weighted"),
    accuracy_score(y_val, y_val_pred),
    precision_score(y_val, y_val_pred, average="weighted"),
    None,
]

In [78]:
svc_stem_grid = GridSearchCV(SVC(), param_grid, cv=cv, verbose=verbose, n_jobs=n_jobs)
svc_stem_grid.fit(x_val, y_val)

Fitting 5 folds for each of 144 candidates, totalling 720 fits
[CV 1/5] END .......C=0.01, gamma=1, kernel=rbf;, score=0.952 total time=   0.0s
[CV 2/5] END .......C=0.01, gamma=1, kernel=rbf;, score=0.905 total time=   0.0s
[CV 3/5] END .......C=0.01, gamma=1, kernel=rbf;, score=0.905 total time=   0.0s
[CV 4/5] END .......C=0.01, gamma=1, kernel=rbf;, score=0.905 total time=   0.0s
[CV 5/5] END .......C=0.01, gamma=1, kernel=rbf;, score=0.950 total time=   0.0s
[CV 1/5] END ....C=0.01, gamma=1, kernel=linear;, score=0.952 total time=   0.0s
[CV 2/5] END ....C=0.01, gamma=1, kernel=linear;, score=0.905 total time=   0.0s
[CV 3/5] END ....C=0.01, gamma=1, kernel=linear;, score=0.905 total time=   0.0s
[CV 4/5] END ....C=0.01, gamma=1, kernel=linear;, score=0.905 total time=   0.0s
[CV 5/5] END ....C=0.01, gamma=1, kernel=linear;, score=0.950 total time=   0.0s
[CV 1/5] END ......C=0.01, gamma=1, kernel=poly;, score=0.952 total time=   0.0s
[CV 2/5] END ......C=0.01, gamma=1, kernel=pol

[CV 2/5] END ....C=0.01, gamma=auto, kernel=rbf;, score=0.905 total time=   0.0s
[CV 3/5] END ....C=0.01, gamma=auto, kernel=rbf;, score=0.905 total time=   0.0s
[CV 4/5] END ....C=0.01, gamma=auto, kernel=rbf;, score=0.905 total time=   0.0s
[CV 5/5] END ....C=0.01, gamma=auto, kernel=rbf;, score=0.950 total time=   0.0s
[CV 1/5] END .C=0.01, gamma=auto, kernel=linear;, score=0.952 total time=   0.0s
[CV 2/5] END .C=0.01, gamma=auto, kernel=linear;, score=0.905 total time=   0.0s
[CV 3/5] END .C=0.01, gamma=auto, kernel=linear;, score=0.905 total time=   0.0s
[CV 4/5] END .C=0.01, gamma=auto, kernel=linear;, score=0.905 total time=   0.0s
[CV 5/5] END .C=0.01, gamma=auto, kernel=linear;, score=0.950 total time=   0.0s
[CV 1/5] END ...C=0.01, gamma=auto, kernel=poly;, score=0.952 total time=   0.0s
[CV 2/5] END ...C=0.01, gamma=auto, kernel=poly;, score=0.905 total time=   0.0s
[CV 3/5] END ...C=0.01, gamma=auto, kernel=poly;, score=0.905 total time=   0.0s
[CV 4/5] END ...C=0.01, gamm

[CV 3/5] END .......C=1, gamma=0.01, kernel=rbf;, score=0.905 total time=   0.0s
[CV 4/5] END .......C=1, gamma=0.01, kernel=rbf;, score=0.905 total time=   0.0s
[CV 5/5] END .......C=1, gamma=0.01, kernel=rbf;, score=0.950 total time=   0.0s
[CV 1/5] END ....C=1, gamma=0.01, kernel=linear;, score=0.952 total time=   0.0s
[CV 2/5] END ....C=1, gamma=0.01, kernel=linear;, score=0.905 total time=   0.0s
[CV 3/5] END ....C=1, gamma=0.01, kernel=linear;, score=0.905 total time=   0.0s
[CV 4/5] END ....C=1, gamma=0.01, kernel=linear;, score=0.905 total time=   0.0s
[CV 5/5] END ....C=1, gamma=0.01, kernel=linear;, score=0.950 total time=   0.0s
[CV 1/5] END ......C=1, gamma=0.01, kernel=poly;, score=0.952 total time=   0.0s
[CV 2/5] END ......C=1, gamma=0.01, kernel=poly;, score=0.905 total time=   0.0s
[CV 3/5] END ......C=1, gamma=0.01, kernel=poly;, score=0.905 total time=   0.0s
[CV 4/5] END ......C=1, gamma=0.01, kernel=poly;, score=0.905 total time=   0.0s
[CV 5/5] END ......C=1, gamm

[CV 4/5] END .C=10, gamma=scale, kernel=sigmoid;, score=0.905 total time=   0.0s
[CV 5/5] END .C=10, gamma=scale, kernel=sigmoid;, score=0.950 total time=   0.0s
[CV 1/5] END ......C=10, gamma=auto, kernel=rbf;, score=0.952 total time=   0.0s
[CV 2/5] END ......C=10, gamma=auto, kernel=rbf;, score=0.905 total time=   0.0s
[CV 3/5] END ......C=10, gamma=auto, kernel=rbf;, score=0.905 total time=   0.0s
[CV 4/5] END ......C=10, gamma=auto, kernel=rbf;, score=0.905 total time=   0.0s
[CV 5/5] END ......C=10, gamma=auto, kernel=rbf;, score=0.950 total time=   0.0s
[CV 1/5] END ...C=10, gamma=auto, kernel=linear;, score=0.952 total time=   0.0s
[CV 2/5] END ...C=10, gamma=auto, kernel=linear;, score=0.905 total time=   0.0s
[CV 3/5] END ...C=10, gamma=auto, kernel=linear;, score=0.905 total time=   0.0s
[CV 4/5] END ...C=10, gamma=auto, kernel=linear;, score=0.905 total time=   0.0s
[CV 5/5] END ...C=10, gamma=auto, kernel=linear;, score=0.950 total time=   0.0s
[CV 1/5] END .....C=10, gamm

[CV 3/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.905 total time=   0.0s
[CV 4/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.905 total time=   0.0s
[CV 5/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.950 total time=   0.0s
[CV 1/5] END ..C=1000, gamma=0.1, kernel=linear;, score=0.952 total time=   0.0s
[CV 2/5] END ..C=1000, gamma=0.1, kernel=linear;, score=0.905 total time=   0.0s
[CV 3/5] END ..C=1000, gamma=0.1, kernel=linear;, score=0.905 total time=   0.0s
[CV 4/5] END ..C=1000, gamma=0.1, kernel=linear;, score=0.905 total time=   0.0s
[CV 5/5] END ..C=1000, gamma=0.1, kernel=linear;, score=0.950 total time=   0.0s
[CV 1/5] END ....C=1000, gamma=0.1, kernel=poly;, score=0.952 total time=   0.0s
[CV 2/5] END ....C=1000, gamma=0.1, kernel=poly;, score=0.905 total time=   0.0s
[CV 3/5] END ....C=1000, gamma=0.1, kernel=poly;, score=0.905 total time=   0.0s
[CV 4/5] END ....C=1000, gamma=0.1, kernel=poly;, score=0.905 total time=   0.0s
[CV 5/5] END ....C=1000, gam

In [79]:
print(classification_report(y_val, svc_stem_grid.predict(x_val)))

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         1
           0       0.00      0.00      0.00         7
           1       0.92      1.00      0.96        96

    accuracy                           0.92       104
   macro avg       0.31      0.33      0.32       104
weighted avg       0.85      0.92      0.89       104



In [80]:
print("best params for stem")
print(svc_stem_grid.best_params_)

best params for stem
{'C': 0.01, 'gamma': 1, 'kernel': 'rbf'}


In [81]:
svc_stem = SVC(**svc_stem_grid.best_params_, probability=True)
svc_stem.fit(x_train, y_train)

In [82]:
y_test_pred = svc_stem.predict(x_test)
y_test_pred_proba = svc_stem.predict_proba(x_test)
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         4
           0       0.00      0.00      0.00         7
           1       0.89      1.00      0.94        93

    accuracy                           0.89       104
   macro avg       0.30      0.33      0.31       104
weighted avg       0.80      0.89      0.84       104



In [83]:
compare_list.loc[len(compare_list)] = [
    "after",
    "testing",
    "Stemmed",
    svc_stem_grid.best_params_["C"],
    svc_stem_grid.best_params_["gamma"],
    svc_stem_grid.best_params_["kernel"],
    svc_stem_grid.best_score_,
    f1_score(y_test, y_test_pred, average="weighted"),
    accuracy_score(y_test, y_test_pred),
    precision_score(y_test, y_test_pred, average="weighted"),
    roc_auc_score(y_test, y_test_pred_proba, multi_class="ovr")
]

In [84]:
display(compare_list)

Unnamed: 0,tuning,dataset,proc,C,gamma,kernel,grid_score,f1_score,accuracy,precision,roc_auc
0,before,validation,,default,default,default,,0.886154,0.923077,0.852071,
1,after,testing,,0.01,1,rbf,0.923333,0.844299,0.894231,0.799649,0.932369
2,before,validation,Baseline,default,default,default,,0.886154,0.923077,0.852071,
3,after,testing,Baseline,0.01,1,rbf,0.923333,0.844299,0.894231,0.799649,0.833617
4,before,validation,Stemmed,default,default,default,,0.886154,0.923077,0.852071,
5,after,testing,Stemmed,0.01,1,rbf,0.923333,0.844299,0.894231,0.799649,0.89625


### Lemmatized + baseline preprocessing

In [85]:
x_train, x_val, x_test, y_train, y_val, y_test, vec_lem = pipeline("Lemmatized", df_testing)

In [86]:
svc_lem_prelim = SVC()
svc_lem_prelim.fit(x_train, y_train)

In [87]:
y_val_pred = svc_lem_prelim.predict(x_val)
print(classification_report(y_val, y_val_pred))

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         1
           0       0.00      0.00      0.00         7
           1       0.92      1.00      0.96        96

    accuracy                           0.92       104
   macro avg       0.31      0.33      0.32       104
weighted avg       0.85      0.92      0.89       104



In [88]:
compare_list.loc[len(compare_list)] = [
    "before",
    "validation",
    "Lemmatized",
    "default",
    "default",
    "default",
    None,
    f1_score(y_val, y_val_pred, average="weighted"),
    accuracy_score(y_val, y_val_pred),
    precision_score(y_val, y_val_pred, average="weighted"),
    None,
]

In [89]:
svc_lem_grid = GridSearchCV(SVC(), param_grid, cv=cv, verbose=verbose, n_jobs=n_jobs)
svc_lem_grid.fit(x_val, y_val)

Fitting 5 folds for each of 144 candidates, totalling 720 fits
[CV 1/5] END .......C=0.01, gamma=1, kernel=rbf;, score=0.952 total time=   0.0s
[CV 2/5] END .......C=0.01, gamma=1, kernel=rbf;, score=0.905 total time=   0.0s
[CV 3/5] END .......C=0.01, gamma=1, kernel=rbf;, score=0.905 total time=   0.0s
[CV 4/5] END .......C=0.01, gamma=1, kernel=rbf;, score=0.905 total time=   0.0s
[CV 5/5] END .......C=0.01, gamma=1, kernel=rbf;, score=0.950 total time=   0.0s
[CV 1/5] END ....C=0.01, gamma=1, kernel=linear;, score=0.952 total time=   0.0s
[CV 2/5] END ....C=0.01, gamma=1, kernel=linear;, score=0.905 total time=   0.0s
[CV 3/5] END ....C=0.01, gamma=1, kernel=linear;, score=0.905 total time=   0.0s
[CV 4/5] END ....C=0.01, gamma=1, kernel=linear;, score=0.905 total time=   0.0s
[CV 5/5] END ....C=0.01, gamma=1, kernel=linear;, score=0.950 total time=   0.0s
[CV 1/5] END ......C=0.01, gamma=1, kernel=poly;, score=0.952 total time=   0.0s
[CV 2/5] END ......C=0.01, gamma=1, kernel=pol

[CV 3/5] END ...C=0.1, gamma=0.001, kernel=poly;, score=0.905 total time=   0.0s
[CV 4/5] END ...C=0.1, gamma=0.001, kernel=poly;, score=0.905 total time=   0.0s
[CV 5/5] END ...C=0.1, gamma=0.001, kernel=poly;, score=0.950 total time=   0.0s
[CV 1/5] END C=0.1, gamma=0.001, kernel=sigmoid;, score=0.952 total time=   0.0s
[CV 2/5] END C=0.1, gamma=0.001, kernel=sigmoid;, score=0.905 total time=   0.0s
[CV 3/5] END C=0.1, gamma=0.001, kernel=sigmoid;, score=0.905 total time=   0.0s
[CV 4/5] END C=0.1, gamma=0.001, kernel=sigmoid;, score=0.905 total time=   0.0s
[CV 5/5] END C=0.1, gamma=0.001, kernel=sigmoid;, score=0.950 total time=   0.0s
[CV 1/5] END ....C=0.1, gamma=scale, kernel=rbf;, score=0.952 total time=   0.0s
[CV 2/5] END ....C=0.1, gamma=scale, kernel=rbf;, score=0.905 total time=   0.0s
[CV 3/5] END ....C=0.1, gamma=scale, kernel=rbf;, score=0.905 total time=   0.0s
[CV 4/5] END ....C=0.1, gamma=scale, kernel=rbf;, score=0.905 total time=   0.0s
[CV 5/5] END ....C=0.1, gamm

[CV 5/5] END .......C=1, gamma=auto, kernel=rbf;, score=0.950 total time=   0.0s
[CV 1/5] END ....C=1, gamma=auto, kernel=linear;, score=0.952 total time=   0.0s
[CV 2/5] END ....C=1, gamma=auto, kernel=linear;, score=0.905 total time=   0.0s
[CV 3/5] END ....C=1, gamma=auto, kernel=linear;, score=0.905 total time=   0.0s
[CV 4/5] END ....C=1, gamma=auto, kernel=linear;, score=0.905 total time=   0.0s
[CV 5/5] END ....C=1, gamma=auto, kernel=linear;, score=0.950 total time=   0.0s
[CV 1/5] END ......C=1, gamma=auto, kernel=poly;, score=0.952 total time=   0.0s
[CV 2/5] END ......C=1, gamma=auto, kernel=poly;, score=0.905 total time=   0.0s
[CV 3/5] END ......C=1, gamma=auto, kernel=poly;, score=0.905 total time=   0.0s
[CV 4/5] END ......C=1, gamma=auto, kernel=poly;, score=0.905 total time=   0.0s
[CV 5/5] END ......C=1, gamma=auto, kernel=poly;, score=0.950 total time=   0.0s
[CV 1/5] END ...C=1, gamma=auto, kernel=sigmoid;, score=0.952 total time=   0.0s
[CV 2/5] END ...C=1, gamma=a

[CV 2/5] END ...C=100, gamma=0.1, kernel=linear;, score=0.905 total time=   0.0s
[CV 3/5] END ...C=100, gamma=0.1, kernel=linear;, score=0.905 total time=   0.0s
[CV 4/5] END ...C=100, gamma=0.1, kernel=linear;, score=0.905 total time=   0.0s
[CV 5/5] END ...C=100, gamma=0.1, kernel=linear;, score=0.950 total time=   0.0s
[CV 1/5] END .....C=100, gamma=0.1, kernel=poly;, score=0.952 total time=   0.0s
[CV 2/5] END .....C=100, gamma=0.1, kernel=poly;, score=0.905 total time=   0.0s
[CV 3/5] END .....C=100, gamma=0.1, kernel=poly;, score=0.905 total time=   0.0s
[CV 4/5] END .....C=100, gamma=0.1, kernel=poly;, score=0.905 total time=   0.0s
[CV 5/5] END .....C=100, gamma=0.1, kernel=poly;, score=0.950 total time=   0.0s
[CV 1/5] END ..C=100, gamma=0.1, kernel=sigmoid;, score=0.952 total time=   0.0s
[CV 2/5] END ..C=100, gamma=0.1, kernel=sigmoid;, score=0.905 total time=   0.0s
[CV 3/5] END ..C=100, gamma=0.1, kernel=sigmoid;, score=0.905 total time=   0.0s
[CV 4/5] END ..C=100, gamma=

[CV 2/5] END ...C=1000, gamma=0.01, kernel=poly;, score=0.905 total time=   0.0s
[CV 3/5] END ...C=1000, gamma=0.01, kernel=poly;, score=0.905 total time=   0.0s
[CV 4/5] END ...C=1000, gamma=0.01, kernel=poly;, score=0.905 total time=   0.0s
[CV 5/5] END ...C=1000, gamma=0.01, kernel=poly;, score=0.950 total time=   0.0s
[CV 1/5] END C=1000, gamma=0.01, kernel=sigmoid;, score=0.952 total time=   0.0s
[CV 2/5] END C=1000, gamma=0.01, kernel=sigmoid;, score=0.905 total time=   0.0s
[CV 3/5] END C=1000, gamma=0.01, kernel=sigmoid;, score=0.905 total time=   0.0s
[CV 4/5] END C=1000, gamma=0.01, kernel=sigmoid;, score=0.905 total time=   0.0s
[CV 5/5] END C=1000, gamma=0.01, kernel=sigmoid;, score=0.950 total time=   0.0s
[CV 1/5] END ...C=1000, gamma=0.001, kernel=rbf;, score=0.952 total time=   0.0s
[CV 2/5] END ...C=1000, gamma=0.001, kernel=rbf;, score=0.905 total time=   0.0s
[CV 3/5] END ...C=1000, gamma=0.001, kernel=rbf;, score=0.905 total time=   0.0s
[CV 4/5] END ...C=1000, gamm

In [90]:
print(classification_report(y_val, svc_lem_grid.predict(x_val)))

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         1
           0       0.00      0.00      0.00         7
           1       0.92      1.00      0.96        96

    accuracy                           0.92       104
   macro avg       0.31      0.33      0.32       104
weighted avg       0.85      0.92      0.89       104



In [91]:
print("best params for lem")
print(svc_lem_grid.best_params_)

best params for lem
{'C': 0.01, 'gamma': 1, 'kernel': 'rbf'}


In [92]:
svc_lem = SVC(**svc_lem_grid.best_params_, probability=True)
svc_lem.fit(x_train, y_train)

In [93]:
y_test_pred = svc_lem.predict(x_test)
y_test_pred_proba = svc_lem.predict_proba(x_test)
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         4
           0       0.00      0.00      0.00         7
           1       0.89      1.00      0.94        93

    accuracy                           0.89       104
   macro avg       0.30      0.33      0.31       104
weighted avg       0.80      0.89      0.84       104



In [94]:
compare_list.loc[len(compare_list)] = [
    "after",
    "testing",
    "Lemmatized",
    svc_lem_grid.best_params_["C"],
    svc_lem_grid.best_params_["gamma"],
    svc_lem_grid.best_params_["kernel"],
    svc_lem_grid.best_score_,
    f1_score(y_test, y_test_pred, average="weighted"),
    accuracy_score(y_test, y_test_pred),
    precision_score(y_test, y_test_pred, average="weighted"),
    roc_auc_score(y_test, y_test_pred_proba, multi_class="ovr")
]

In [95]:
display(compare_list)

Unnamed: 0,tuning,dataset,proc,C,gamma,kernel,grid_score,f1_score,accuracy,precision,roc_auc
0,before,validation,,default,default,default,,0.886154,0.923077,0.852071,
1,after,testing,,0.01,1,rbf,0.923333,0.844299,0.894231,0.799649,0.932369
2,before,validation,Baseline,default,default,default,,0.886154,0.923077,0.852071,
3,after,testing,Baseline,0.01,1,rbf,0.923333,0.844299,0.894231,0.799649,0.833617
4,before,validation,Stemmed,default,default,default,,0.886154,0.923077,0.852071,
5,after,testing,Stemmed,0.01,1,rbf,0.923333,0.844299,0.894231,0.799649,0.89625
6,before,validation,Lemmatized,default,default,default,,0.886154,0.923077,0.852071,
7,after,testing,Lemmatized,0.01,1,rbf,0.923333,0.844299,0.894231,0.799649,0.856928
