In [1]:
import pandas as pd
import nltk
import re
import random
import numpy as np
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import GridSearchCV
from nltk.corpus import stopwords
import warnings

warnings.filterwarnings("ignore")

nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package punkt to /home/ibrahim/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ibrahim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ibrahim/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
random_state = 42
testing_n = 500
testing_frac = 0.01
random.seed(random_state)
data_path = "./data/combined.csv"

In [3]:
df = pd.read_csv(data_path, low_memory=False)
df = df[["overall", "reviewText", "summary"]]
df.dropna(inplace=True)
df["sentiment"] = df["overall"].apply(lambda x: 1 if x > 3 else -1 if x < 3 else 0)
df["reviewTextWithSummary"] = df["summary"] + " " + df["reviewText"]
df.drop(["overall", "summary", "reviewText"], axis=1, inplace=True)
df.head()

Unnamed: 0,sentiment,reviewTextWithSummary
0,1,Five Stars As advertised. Reasonably priced
1,1,Good for the face Like the oder and the feel w...
2,-1,Smells awful I bought this to smell nice after...
3,1,Truth is There IS Nothing Like an AQUA VELVA M...
4,1,Bvlgari Shampoo If you ever want to feel pampe...


# Model experimentation

In [4]:
df_testing = (
    df.groupby("sentiment")
    .apply(lambda x: x.sample(n=testing_n, random_state=random_state, replace=True))
    .reset_index(drop=True)
)
df_testing = df.sample(frac=testing_frac)
df_testing["sentiment"].value_counts()

sentiment
 1    6610
 0     467
-1     412
Name: count, dtype: int64

In [5]:
df_testing.head()

Unnamed: 0,sentiment,reviewTextWithSummary
642853,1,Can't Beat the Price As expected.
139966,1,"Five Stars Perfect for my needs, fits a shure ..."
659227,0,Disappointed in claims Love glitter pens but h...
728087,1,"Five Stars Love it, have already changed the r..."
555851,-1,Returned a bought a better winder. Flimsy cons...


In [6]:
print("Dataset size:", len(df_testing))

Dataset size: 7489


In [7]:
STOP_WORDS = set(stopwords.words("english"))

In [8]:
def preprocess_text(sentence, stop, type_proc=None):
    words = []
    for word in sentence.lower().strip().split():

        word = re.sub("\d", "", word)
        word = re.sub("[^\w\s]", "", word)

        if word not in stop and word != "":
            words.append(preprocess_type(word, type_proc))

    return " ".join(words)

In [9]:
def preprocess_type(word, type_proc):
    if type_proc == "Baseline":
        return word
    elif type_proc == "Stemmed":
        return PorterStemmer().stem(word)
    elif type_proc == "Lemmatized":
        return WordNetLemmatizer().lemmatize(word)
    else:
        raise ValueError("Invalid Preprocessing Type")

In [10]:
def train_val_test_split(df=df, random_state=random_state):
    x = df[["reviewTextWithSummary"]]
    y = df["sentiment"]
    x_train, x_tmp, y_train, y_tmp = train_test_split(
        x, y, test_size=0.3, random_state=random_state
    )
    x_val, x_test, y_val, y_test = train_test_split(
        x_tmp, y_tmp, test_size=0.5, random_state=random_state
    )
    return x_train, x_val, x_test, y_train, y_val, y_test

In [11]:
def pipeline(proc, df, random_state=random_state):
    df_ = df.copy()
    if proc is not None:
        df_["reviewTextWithSummary"] = df_["reviewTextWithSummary"].apply(
            lambda x: preprocess_text(x, STOP_WORDS, proc)
        )

    x_train, x_val, x_test, y_train, y_val, y_test = train_val_test_split(
        df_, random_state
    )
    vectorizer = TfidfVectorizer()
    vectorizer.fit(df_["reviewTextWithSummary"])
    x_train = vectorizer.transform(x_train["reviewTextWithSummary"])
    x_val = vectorizer.transform(x_val["reviewTextWithSummary"])
    x_test = vectorizer.transform(x_test["reviewTextWithSummary"])

    return x_train, x_val, x_test, y_train, y_val, y_test, vectorizer

## Testing different configs

In [12]:
param_grid = {
    "C": [0.01, 0.1, 1, 10, 100, 1000],
    "gamma": [1, 0.1, 0.01, 0.001, "scale", "auto"],
    "kernel": ["rbf", "linear", "poly", "sigmoid"],
}
n_jobs = None
verbose = 3
cv = 5

In [13]:
compare_list = pd.DataFrame(
    columns=[
        "tuning",
        "dataset",
        "proc",
        "C",
        "gamma",
        "kernel",
        "grid_score",
        "f1_score",
        "accuracy",
    ]
)

### No preprocessing

In [14]:
x_train, x_val, x_test, y_train, y_val, y_test, vec_noproc = pipeline(None, df_testing)

In [15]:
svc_noproc_prelim = SVC()
svc_noproc_prelim.fit(x_train, y_train)

In [16]:
y_val_pred = svc_noproc_prelim.predict(x_val)
print(classification_report(y_val, y_val_pred))

              precision    recall  f1-score   support

          -1       0.91      0.16      0.27        64
           0       1.00      0.18      0.30        74
           1       0.90      1.00      0.94       985

    accuracy                           0.90      1123
   macro avg       0.93      0.44      0.50      1123
weighted avg       0.90      0.90      0.86      1123



In [17]:
compare_list.loc[len(compare_list)] = [
    "before",
    "validation",
    None,
    "default",
    "default",
    "default",
    None,
    f1_score(y_val, y_val_pred, average="weighted"),
    accuracy_score(y_val, y_val_pred),
]

In [18]:
svc_noproc_grid = GridSearchCV(SVC(), param_grid, cv=cv, verbose=verbose, n_jobs=n_jobs)
svc_noproc_grid.fit(x_val, y_val)

Fitting 5 folds for each of 144 candidates, totalling 720 fits
[CV 1/5] END .......C=0.01, gamma=1, kernel=rbf;, score=0.876 total time=   0.1s
[CV 2/5] END .......C=0.01, gamma=1, kernel=rbf;, score=0.876 total time=   0.1s
[CV 3/5] END .......C=0.01, gamma=1, kernel=rbf;, score=0.876 total time=   0.1s
[CV 4/5] END .......C=0.01, gamma=1, kernel=rbf;, score=0.879 total time=   0.1s
[CV 5/5] END .......C=0.01, gamma=1, kernel=rbf;, score=0.879 total time=   0.1s
[CV 1/5] END ....C=0.01, gamma=1, kernel=linear;, score=0.876 total time=   0.1s
[CV 2/5] END ....C=0.01, gamma=1, kernel=linear;, score=0.876 total time=   0.1s
[CV 3/5] END ....C=0.01, gamma=1, kernel=linear;, score=0.876 total time=   0.1s
[CV 4/5] END ....C=0.01, gamma=1, kernel=linear;, score=0.879 total time=   0.1s
[CV 5/5] END ....C=0.01, gamma=1, kernel=linear;, score=0.879 total time=   0.1s
[CV 1/5] END ......C=0.01, gamma=1, kernel=poly;, score=0.876 total time=   0.2s
[CV 2/5] END ......C=0.01, gamma=1, kernel=pol

[CV 1/5] END .C=0.01, gamma=auto, kernel=linear;, score=0.876 total time=   0.1s
[CV 2/5] END .C=0.01, gamma=auto, kernel=linear;, score=0.876 total time=   0.1s
[CV 3/5] END .C=0.01, gamma=auto, kernel=linear;, score=0.876 total time=   0.1s
[CV 4/5] END .C=0.01, gamma=auto, kernel=linear;, score=0.879 total time=   0.1s
[CV 5/5] END .C=0.01, gamma=auto, kernel=linear;, score=0.879 total time=   0.1s
[CV 1/5] END ...C=0.01, gamma=auto, kernel=poly;, score=0.876 total time=   0.0s
[CV 2/5] END ...C=0.01, gamma=auto, kernel=poly;, score=0.876 total time=   0.0s
[CV 3/5] END ...C=0.01, gamma=auto, kernel=poly;, score=0.876 total time=   0.0s
[CV 4/5] END ...C=0.01, gamma=auto, kernel=poly;, score=0.879 total time=   0.0s
[CV 5/5] END ...C=0.01, gamma=auto, kernel=poly;, score=0.879 total time=   0.0s
[CV 1/5] END C=0.01, gamma=auto, kernel=sigmoid;, score=0.876 total time=   0.0s
[CV 2/5] END C=0.01, gamma=auto, kernel=sigmoid;, score=0.876 total time=   0.0s
[CV 3/5] END C=0.01, gamma=a

[CV 4/5] END .C=0.1, gamma=scale, kernel=linear;, score=0.879 total time=   0.1s
[CV 5/5] END .C=0.1, gamma=scale, kernel=linear;, score=0.879 total time=   0.1s
[CV 1/5] END ...C=0.1, gamma=scale, kernel=poly;, score=0.876 total time=   0.2s
[CV 2/5] END ...C=0.1, gamma=scale, kernel=poly;, score=0.876 total time=   0.2s
[CV 3/5] END ...C=0.1, gamma=scale, kernel=poly;, score=0.876 total time=   0.2s
[CV 4/5] END ...C=0.1, gamma=scale, kernel=poly;, score=0.879 total time=   0.2s
[CV 5/5] END ...C=0.1, gamma=scale, kernel=poly;, score=0.879 total time=   0.2s
[CV 1/5] END C=0.1, gamma=scale, kernel=sigmoid;, score=0.876 total time=   0.1s
[CV 2/5] END C=0.1, gamma=scale, kernel=sigmoid;, score=0.876 total time=   0.1s
[CV 3/5] END C=0.1, gamma=scale, kernel=sigmoid;, score=0.876 total time=   0.1s
[CV 4/5] END C=0.1, gamma=scale, kernel=sigmoid;, score=0.879 total time=   0.1s
[CV 5/5] END C=0.1, gamma=scale, kernel=sigmoid;, score=0.879 total time=   0.1s
[CV 1/5] END .....C=0.1, gam

[CV 5/5] END .....C=1, gamma=0.001, kernel=poly;, score=0.879 total time=   0.0s
[CV 1/5] END ..C=1, gamma=0.001, kernel=sigmoid;, score=0.876 total time=   0.1s
[CV 2/5] END ..C=1, gamma=0.001, kernel=sigmoid;, score=0.876 total time=   0.1s
[CV 3/5] END ..C=1, gamma=0.001, kernel=sigmoid;, score=0.876 total time=   0.0s
[CV 4/5] END ..C=1, gamma=0.001, kernel=sigmoid;, score=0.879 total time=   0.0s
[CV 5/5] END ..C=1, gamma=0.001, kernel=sigmoid;, score=0.879 total time=   0.1s
[CV 1/5] END ......C=1, gamma=scale, kernel=rbf;, score=0.884 total time=   0.2s
[CV 2/5] END ......C=1, gamma=scale, kernel=rbf;, score=0.880 total time=   0.2s
[CV 3/5] END ......C=1, gamma=scale, kernel=rbf;, score=0.876 total time=   0.2s
[CV 4/5] END ......C=1, gamma=scale, kernel=rbf;, score=0.879 total time=   0.2s
[CV 5/5] END ......C=1, gamma=scale, kernel=rbf;, score=0.884 total time=   0.2s
[CV 1/5] END ...C=1, gamma=scale, kernel=linear;, score=0.889 total time=   0.1s
[CV 2/5] END ...C=1, gamma=s

[CV 3/5] END ..C=10, gamma=0.01, kernel=sigmoid;, score=0.876 total time=   0.1s
[CV 4/5] END ..C=10, gamma=0.01, kernel=sigmoid;, score=0.879 total time=   0.1s
[CV 5/5] END ..C=10, gamma=0.01, kernel=sigmoid;, score=0.879 total time=   0.1s
[CV 1/5] END .....C=10, gamma=0.001, kernel=rbf;, score=0.876 total time=   0.1s
[CV 2/5] END .....C=10, gamma=0.001, kernel=rbf;, score=0.876 total time=   0.1s
[CV 3/5] END .....C=10, gamma=0.001, kernel=rbf;, score=0.876 total time=   0.1s
[CV 4/5] END .....C=10, gamma=0.001, kernel=rbf;, score=0.879 total time=   0.1s
[CV 5/5] END .....C=10, gamma=0.001, kernel=rbf;, score=0.879 total time=   0.1s
[CV 1/5] END ..C=10, gamma=0.001, kernel=linear;, score=0.902 total time=   0.1s
[CV 2/5] END ..C=10, gamma=0.001, kernel=linear;, score=0.884 total time=   0.1s
[CV 3/5] END ..C=10, gamma=0.001, kernel=linear;, score=0.893 total time=   0.1s
[CV 4/5] END ..C=10, gamma=0.001, kernel=linear;, score=0.879 total time=   0.1s
[CV 5/5] END ..C=10, gamma=0

[CV 1/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.911 total time=   0.1s
[CV 2/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.889 total time=   0.1s
[CV 3/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.893 total time=   0.1s
[CV 4/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.884 total time=   0.1s
[CV 5/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.902 total time=   0.1s
[CV 1/5] END ..C=100, gamma=0.01, kernel=linear;, score=0.902 total time=   0.1s
[CV 2/5] END ..C=100, gamma=0.01, kernel=linear;, score=0.884 total time=   0.1s
[CV 3/5] END ..C=100, gamma=0.01, kernel=linear;, score=0.893 total time=   0.1s
[CV 4/5] END ..C=100, gamma=0.01, kernel=linear;, score=0.857 total time=   0.1s
[CV 5/5] END ..C=100, gamma=0.01, kernel=linear;, score=0.888 total time=   0.1s
[CV 1/5] END ....C=100, gamma=0.01, kernel=poly;, score=0.876 total time=   0.0s
[CV 2/5] END ....C=100, gamma=0.01, kernel=poly;, score=0.876 total time=   0.0s
[CV 3/5] END ....C=100, gamm

[CV 3/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.893 total time=   0.1s
[CV 4/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.875 total time=   0.1s
[CV 5/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.893 total time=   0.1s
[CV 1/5] END ..C=1000, gamma=0.1, kernel=linear;, score=0.902 total time=   0.1s
[CV 2/5] END ..C=1000, gamma=0.1, kernel=linear;, score=0.884 total time=   0.1s
[CV 3/5] END ..C=1000, gamma=0.1, kernel=linear;, score=0.893 total time=   0.1s
[CV 4/5] END ..C=1000, gamma=0.1, kernel=linear;, score=0.857 total time=   0.1s
[CV 5/5] END ..C=1000, gamma=0.1, kernel=linear;, score=0.888 total time=   0.1s
[CV 1/5] END ....C=1000, gamma=0.1, kernel=poly;, score=0.876 total time=   0.2s
[CV 2/5] END ....C=1000, gamma=0.1, kernel=poly;, score=0.876 total time=   0.2s
[CV 3/5] END ....C=1000, gamma=0.1, kernel=poly;, score=0.876 total time=   0.2s
[CV 4/5] END ....C=1000, gamma=0.1, kernel=poly;, score=0.879 total time=   0.2s
[CV 5/5] END ....C=1000, gam

In [19]:
print(classification_report(y_val, svc_noproc_grid.predict(x_val)))

              precision    recall  f1-score   support

          -1       1.00      1.00      1.00        64
           0       1.00      0.96      0.98        74
           1       1.00      1.00      1.00       985

    accuracy                           1.00      1123
   macro avg       1.00      0.99      0.99      1123
weighted avg       1.00      1.00      1.00      1123



In [20]:
print("best params for noproc")
print(svc_noproc_grid.best_params_)

best params for noproc
{'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}


In [21]:
svc_noproc = SVC(**svc_noproc_grid.best_params_)
svc_noproc.fit(x_train, y_train)

In [22]:
y_test_pred = svc_noproc.predict(x_test)
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

          -1       0.71      0.39      0.50        57
           0       0.59      0.24      0.34        55
           1       0.93      0.99      0.96      1012

    accuracy                           0.92      1124
   macro avg       0.74      0.54      0.60      1124
weighted avg       0.91      0.92      0.91      1124



In [23]:
compare_list.loc[len(compare_list)] = [
    "after",
    "testing",
    None,
    svc_noproc_grid.best_params_["C"],
    svc_noproc_grid.best_params_["gamma"],
    svc_noproc_grid.best_params_["kernel"],
    svc_noproc_grid.best_score_,
    f1_score(y_test, y_test_pred, average="weighted"),
    accuracy_score(y_test, y_test_pred),
]

In [24]:
display(compare_list)

Unnamed: 0,tuning,dataset,proc,C,gamma,kernel,grid_score,f1_score,accuracy
0,before,validation,,default,default,default,,0.863183,0.896705
1,after,testing,,100,0.01,rbf,0.89581,0.906358,0.920819


### Baseline preprocessing

In [25]:
x_train, x_val, x_test, y_train, y_val, y_test, vec_baseline = pipeline(
    "Baseline", df_testing
)

In [26]:
svc_baseline_prelim = SVC()
svc_baseline_prelim.fit(x_train, y_train)

In [27]:
y_val_pred = svc_baseline_prelim.predict(x_val)
print(classification_report(y_val, y_val_pred))

              precision    recall  f1-score   support

          -1       0.91      0.16      0.27        64
           0       1.00      0.16      0.28        74
           1       0.89      1.00      0.94       985

    accuracy                           0.90      1123
   macro avg       0.93      0.44      0.50      1123
weighted avg       0.90      0.90      0.86      1123



In [28]:
compare_list.loc[len(compare_list)] = [
    "before",
    "validation",
    "Baseline",
    "default",
    "default",
    "default",
    None,
    f1_score(y_val, y_val_pred, average="weighted"),
    accuracy_score(y_val, y_val_pred),
]

In [29]:
svc_baseline_grid = GridSearchCV(
    SVC(), param_grid, cv=cv, verbose=verbose, n_jobs=n_jobs
)
svc_baseline_grid.fit(x_val, y_val)

Fitting 5 folds for each of 144 candidates, totalling 720 fits
[CV 1/5] END .......C=0.01, gamma=1, kernel=rbf;, score=0.876 total time=   0.2s
[CV 2/5] END .......C=0.01, gamma=1, kernel=rbf;, score=0.876 total time=   0.1s
[CV 3/5] END .......C=0.01, gamma=1, kernel=rbf;, score=0.876 total time=   0.1s
[CV 4/5] END .......C=0.01, gamma=1, kernel=rbf;, score=0.879 total time=   0.1s
[CV 5/5] END .......C=0.01, gamma=1, kernel=rbf;, score=0.879 total time=   0.1s
[CV 1/5] END ....C=0.01, gamma=1, kernel=linear;, score=0.876 total time=   0.1s
[CV 2/5] END ....C=0.01, gamma=1, kernel=linear;, score=0.876 total time=   0.1s
[CV 3/5] END ....C=0.01, gamma=1, kernel=linear;, score=0.876 total time=   0.1s
[CV 4/5] END ....C=0.01, gamma=1, kernel=linear;, score=0.879 total time=   0.1s
[CV 5/5] END ....C=0.01, gamma=1, kernel=linear;, score=0.879 total time=   0.1s
[CV 1/5] END ......C=0.01, gamma=1, kernel=poly;, score=0.876 total time=   0.2s
[CV 2/5] END ......C=0.01, gamma=1, kernel=pol

[CV 2/5] END ....C=0.01, gamma=auto, kernel=rbf;, score=0.876 total time=   0.0s
[CV 3/5] END ....C=0.01, gamma=auto, kernel=rbf;, score=0.876 total time=   0.0s
[CV 4/5] END ....C=0.01, gamma=auto, kernel=rbf;, score=0.879 total time=   0.0s
[CV 5/5] END ....C=0.01, gamma=auto, kernel=rbf;, score=0.879 total time=   0.0s
[CV 1/5] END .C=0.01, gamma=auto, kernel=linear;, score=0.876 total time=   0.1s
[CV 2/5] END .C=0.01, gamma=auto, kernel=linear;, score=0.876 total time=   0.1s
[CV 3/5] END .C=0.01, gamma=auto, kernel=linear;, score=0.876 total time=   0.1s
[CV 4/5] END .C=0.01, gamma=auto, kernel=linear;, score=0.879 total time=   0.1s
[CV 5/5] END .C=0.01, gamma=auto, kernel=linear;, score=0.879 total time=   0.1s
[CV 1/5] END ...C=0.01, gamma=auto, kernel=poly;, score=0.876 total time=   0.0s
[CV 2/5] END ...C=0.01, gamma=auto, kernel=poly;, score=0.876 total time=   0.0s
[CV 3/5] END ...C=0.01, gamma=auto, kernel=poly;, score=0.876 total time=   0.0s
[CV 4/5] END ...C=0.01, gamm

[CV 5/5] END ....C=0.1, gamma=scale, kernel=rbf;, score=0.879 total time=   0.1s
[CV 1/5] END .C=0.1, gamma=scale, kernel=linear;, score=0.876 total time=   0.1s
[CV 2/5] END .C=0.1, gamma=scale, kernel=linear;, score=0.876 total time=   0.1s
[CV 3/5] END .C=0.1, gamma=scale, kernel=linear;, score=0.876 total time=   0.1s
[CV 4/5] END .C=0.1, gamma=scale, kernel=linear;, score=0.879 total time=   0.1s
[CV 5/5] END .C=0.1, gamma=scale, kernel=linear;, score=0.879 total time=   0.1s
[CV 1/5] END ...C=0.1, gamma=scale, kernel=poly;, score=0.876 total time=   0.2s
[CV 2/5] END ...C=0.1, gamma=scale, kernel=poly;, score=0.876 total time=   0.2s
[CV 3/5] END ...C=0.1, gamma=scale, kernel=poly;, score=0.876 total time=   0.2s
[CV 4/5] END ...C=0.1, gamma=scale, kernel=poly;, score=0.879 total time=   0.2s
[CV 5/5] END ...C=0.1, gamma=scale, kernel=poly;, score=0.879 total time=   0.2s
[CV 1/5] END C=0.1, gamma=scale, kernel=sigmoid;, score=0.876 total time=   0.1s
[CV 2/5] END C=0.1, gamma=sc

[CV 2/5] END ...C=1, gamma=0.001, kernel=linear;, score=0.893 total time=   0.1s
[CV 3/5] END ...C=1, gamma=0.001, kernel=linear;, score=0.889 total time=   0.1s
[CV 4/5] END ...C=1, gamma=0.001, kernel=linear;, score=0.875 total time=   0.1s
[CV 5/5] END ...C=1, gamma=0.001, kernel=linear;, score=0.902 total time=   0.1s
[CV 1/5] END .....C=1, gamma=0.001, kernel=poly;, score=0.876 total time=   0.0s
[CV 2/5] END .....C=1, gamma=0.001, kernel=poly;, score=0.876 total time=   0.0s
[CV 3/5] END .....C=1, gamma=0.001, kernel=poly;, score=0.876 total time=   0.0s
[CV 4/5] END .....C=1, gamma=0.001, kernel=poly;, score=0.879 total time=   0.0s
[CV 5/5] END .....C=1, gamma=0.001, kernel=poly;, score=0.879 total time=   0.0s
[CV 1/5] END ..C=1, gamma=0.001, kernel=sigmoid;, score=0.876 total time=   0.0s
[CV 2/5] END ..C=1, gamma=0.001, kernel=sigmoid;, score=0.876 total time=   0.0s
[CV 3/5] END ..C=1, gamma=0.001, kernel=sigmoid;, score=0.876 total time=   0.0s
[CV 4/5] END ..C=1, gamma=0.

[CV 5/5] END ...C=10, gamma=0.01, kernel=linear;, score=0.884 total time=   0.1s
[CV 1/5] END .....C=10, gamma=0.01, kernel=poly;, score=0.876 total time=   0.0s
[CV 2/5] END .....C=10, gamma=0.01, kernel=poly;, score=0.876 total time=   0.0s
[CV 3/5] END .....C=10, gamma=0.01, kernel=poly;, score=0.876 total time=   0.0s
[CV 4/5] END .....C=10, gamma=0.01, kernel=poly;, score=0.879 total time=   0.0s
[CV 5/5] END .....C=10, gamma=0.01, kernel=poly;, score=0.879 total time=   0.0s
[CV 1/5] END ..C=10, gamma=0.01, kernel=sigmoid;, score=0.876 total time=   0.1s
[CV 2/5] END ..C=10, gamma=0.01, kernel=sigmoid;, score=0.876 total time=   0.1s
[CV 3/5] END ..C=10, gamma=0.01, kernel=sigmoid;, score=0.876 total time=   0.1s
[CV 4/5] END ..C=10, gamma=0.01, kernel=sigmoid;, score=0.879 total time=   0.1s
[CV 5/5] END ..C=10, gamma=0.01, kernel=sigmoid;, score=0.879 total time=   0.1s
[CV 1/5] END .....C=10, gamma=0.001, kernel=rbf;, score=0.876 total time=   0.1s
[CV 2/5] END .....C=10, gamm

[CV 3/5] END .....C=100, gamma=0.1, kernel=poly;, score=0.876 total time=   0.2s
[CV 4/5] END .....C=100, gamma=0.1, kernel=poly;, score=0.879 total time=   0.2s
[CV 5/5] END .....C=100, gamma=0.1, kernel=poly;, score=0.879 total time=   0.2s
[CV 1/5] END ..C=100, gamma=0.1, kernel=sigmoid;, score=0.916 total time=   0.1s
[CV 2/5] END ..C=100, gamma=0.1, kernel=sigmoid;, score=0.889 total time=   0.1s
[CV 3/5] END ..C=100, gamma=0.1, kernel=sigmoid;, score=0.898 total time=   0.1s
[CV 4/5] END ..C=100, gamma=0.1, kernel=sigmoid;, score=0.893 total time=   0.1s
[CV 5/5] END ..C=100, gamma=0.1, kernel=sigmoid;, score=0.884 total time=   0.1s
[CV 1/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.907 total time=   0.1s
[CV 2/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.889 total time=   0.1s
[CV 3/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.898 total time=   0.1s
[CV 4/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.884 total time=   0.1s
[CV 5/5] END .....C=100, gam

[CV 5/5] END ......C=1000, gamma=1, kernel=poly;, score=0.884 total time=   0.2s
[CV 1/5] END ...C=1000, gamma=1, kernel=sigmoid;, score=0.791 total time=   0.1s
[CV 2/5] END ...C=1000, gamma=1, kernel=sigmoid;, score=0.867 total time=   0.1s
[CV 3/5] END ...C=1000, gamma=1, kernel=sigmoid;, score=0.840 total time=   0.1s
[CV 4/5] END ...C=1000, gamma=1, kernel=sigmoid;, score=0.804 total time=   0.1s
[CV 5/5] END ...C=1000, gamma=1, kernel=sigmoid;, score=0.830 total time=   0.1s
[CV 1/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.907 total time=   0.1s
[CV 2/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.889 total time=   0.1s
[CV 3/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.898 total time=   0.1s
[CV 4/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.884 total time=   0.1s
[CV 5/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.888 total time=   0.1s
[CV 1/5] END ..C=1000, gamma=0.1, kernel=linear;, score=0.920 total time=   0.1s
[CV 2/5] END ..C=1000, gamma

[CV 4/5] END C=1000, gamma=auto, kernel=sigmoid;, score=0.879 total time=   0.1s
[CV 5/5] END C=1000, gamma=auto, kernel=sigmoid;, score=0.879 total time=   0.1s


In [30]:
print(classification_report(y_val, svc_baseline_grid.predict(x_val)))

              precision    recall  f1-score   support

          -1       1.00      1.00      1.00        64
           0       1.00      0.97      0.99        74
           1       1.00      1.00      1.00       985

    accuracy                           1.00      1123
   macro avg       1.00      0.99      1.00      1123
weighted avg       1.00      1.00      1.00      1123



In [31]:
print("best params for baseline")
print(svc_baseline_grid.best_params_)

best params for baseline
{'C': 10, 'gamma': 1, 'kernel': 'linear'}


In [32]:
svc_baseline = SVC(**svc_baseline_grid.best_params_)
svc_baseline.fit(x_train, y_train)

In [33]:
y_test_pred = svc_baseline.predict(x_test)
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

          -1       0.52      0.46      0.49        57
           0       0.38      0.29      0.33        55
           1       0.94      0.96      0.95      1012

    accuracy                           0.90      1124
   macro avg       0.61      0.57      0.59      1124
weighted avg       0.89      0.90      0.90      1124



In [34]:
compare_list.loc[len(compare_list)] = [
    "after",
    "testing",
    "Baseline",
    svc_baseline_grid.best_params_["C"],
    svc_baseline_grid.best_params_["gamma"],
    svc_baseline_grid.best_params_["kernel"],
    svc_baseline_grid.best_score_,
    f1_score(y_test, y_test_pred, average="weighted"),
    accuracy_score(y_test, y_test_pred),
]

In [35]:
display(compare_list)

Unnamed: 0,tuning,dataset,proc,C,gamma,kernel,grid_score,f1_score,accuracy
0,before,validation,,default,default,default,,0.863183,0.896705
1,after,testing,,100,0.01,rbf,0.89581,0.906358,0.920819
2,before,validation,Baseline,default,default,default,,0.861482,0.895815
3,after,testing,Baseline,10,1,linear,0.895802,0.897976,0.903025


### Stemmed + baseline preprocessing

In [36]:
x_train, x_val, x_test, y_train, y_val, y_test, vec_stem = pipeline("Stemmed", df_testing)

In [37]:
svc_stem_prelim = SVC()
svc_stem_prelim.fit(x_train, y_train)

In [38]:
y_val_pred = svc_stem_prelim.predict(x_val)
print(classification_report(y_val, y_val_pred))

              precision    recall  f1-score   support

          -1       1.00      0.14      0.25        64
           0       1.00      0.19      0.32        74
           1       0.90      1.00      0.94       985

    accuracy                           0.90      1123
   macro avg       0.97      0.44      0.50      1123
weighted avg       0.91      0.90      0.86      1123



In [39]:
compare_list.loc[len(compare_list)] = [
    "before",
    "validation",
    "Stemmed",
    "default",
    "default",
    "default",
    None,
    f1_score(y_val, y_val_pred, average="weighted"),
    accuracy_score(y_val, y_val_pred),
]

In [40]:
svc_stem_grid = GridSearchCV(SVC(), param_grid, cv=cv, verbose=verbose, n_jobs=n_jobs)
svc_stem_grid.fit(x_val, y_val)

Fitting 5 folds for each of 144 candidates, totalling 720 fits
[CV 1/5] END .......C=0.01, gamma=1, kernel=rbf;, score=0.876 total time=   0.1s
[CV 2/5] END .......C=0.01, gamma=1, kernel=rbf;, score=0.876 total time=   0.1s
[CV 3/5] END .......C=0.01, gamma=1, kernel=rbf;, score=0.876 total time=   0.1s
[CV 4/5] END .......C=0.01, gamma=1, kernel=rbf;, score=0.879 total time=   0.1s
[CV 5/5] END .......C=0.01, gamma=1, kernel=rbf;, score=0.879 total time=   0.1s
[CV 1/5] END ....C=0.01, gamma=1, kernel=linear;, score=0.876 total time=   0.1s
[CV 2/5] END ....C=0.01, gamma=1, kernel=linear;, score=0.876 total time=   0.1s
[CV 3/5] END ....C=0.01, gamma=1, kernel=linear;, score=0.876 total time=   0.1s
[CV 4/5] END ....C=0.01, gamma=1, kernel=linear;, score=0.879 total time=   0.1s
[CV 5/5] END ....C=0.01, gamma=1, kernel=linear;, score=0.879 total time=   0.1s
[CV 1/5] END ......C=0.01, gamma=1, kernel=poly;, score=0.876 total time=   0.2s
[CV 2/5] END ......C=0.01, gamma=1, kernel=pol

[CV 1/5] END .C=0.01, gamma=auto, kernel=linear;, score=0.876 total time=   0.1s
[CV 2/5] END .C=0.01, gamma=auto, kernel=linear;, score=0.876 total time=   0.1s
[CV 3/5] END .C=0.01, gamma=auto, kernel=linear;, score=0.876 total time=   0.1s
[CV 4/5] END .C=0.01, gamma=auto, kernel=linear;, score=0.879 total time=   0.1s
[CV 5/5] END .C=0.01, gamma=auto, kernel=linear;, score=0.879 total time=   0.1s
[CV 1/5] END ...C=0.01, gamma=auto, kernel=poly;, score=0.876 total time=   0.0s
[CV 2/5] END ...C=0.01, gamma=auto, kernel=poly;, score=0.876 total time=   0.0s
[CV 3/5] END ...C=0.01, gamma=auto, kernel=poly;, score=0.876 total time=   0.0s
[CV 4/5] END ...C=0.01, gamma=auto, kernel=poly;, score=0.879 total time=   0.0s
[CV 5/5] END ...C=0.01, gamma=auto, kernel=poly;, score=0.879 total time=   0.0s
[CV 1/5] END C=0.01, gamma=auto, kernel=sigmoid;, score=0.876 total time=   0.0s
[CV 2/5] END C=0.01, gamma=auto, kernel=sigmoid;, score=0.876 total time=   0.0s
[CV 3/5] END C=0.01, gamma=a

[CV 3/5] END .C=0.1, gamma=scale, kernel=linear;, score=0.876 total time=   0.1s
[CV 4/5] END .C=0.1, gamma=scale, kernel=linear;, score=0.879 total time=   0.1s
[CV 5/5] END .C=0.1, gamma=scale, kernel=linear;, score=0.879 total time=   0.1s
[CV 1/5] END ...C=0.1, gamma=scale, kernel=poly;, score=0.876 total time=   0.2s
[CV 2/5] END ...C=0.1, gamma=scale, kernel=poly;, score=0.876 total time=   0.2s
[CV 3/5] END ...C=0.1, gamma=scale, kernel=poly;, score=0.876 total time=   0.2s
[CV 4/5] END ...C=0.1, gamma=scale, kernel=poly;, score=0.879 total time=   0.2s
[CV 5/5] END ...C=0.1, gamma=scale, kernel=poly;, score=0.879 total time=   0.2s
[CV 1/5] END C=0.1, gamma=scale, kernel=sigmoid;, score=0.876 total time=   0.1s
[CV 2/5] END C=0.1, gamma=scale, kernel=sigmoid;, score=0.876 total time=   0.1s
[CV 3/5] END C=0.1, gamma=scale, kernel=sigmoid;, score=0.876 total time=   0.1s
[CV 4/5] END C=0.1, gamma=scale, kernel=sigmoid;, score=0.879 total time=   0.1s
[CV 5/5] END C=0.1, gamma=sc

[CV 2/5] END .....C=1, gamma=0.001, kernel=poly;, score=0.876 total time=   0.0s
[CV 3/5] END .....C=1, gamma=0.001, kernel=poly;, score=0.876 total time=   0.0s
[CV 4/5] END .....C=1, gamma=0.001, kernel=poly;, score=0.879 total time=   0.0s
[CV 5/5] END .....C=1, gamma=0.001, kernel=poly;, score=0.879 total time=   0.0s
[CV 1/5] END ..C=1, gamma=0.001, kernel=sigmoid;, score=0.876 total time=   0.0s
[CV 2/5] END ..C=1, gamma=0.001, kernel=sigmoid;, score=0.876 total time=   0.0s
[CV 3/5] END ..C=1, gamma=0.001, kernel=sigmoid;, score=0.876 total time=   0.0s
[CV 4/5] END ..C=1, gamma=0.001, kernel=sigmoid;, score=0.879 total time=   0.0s
[CV 5/5] END ..C=1, gamma=0.001, kernel=sigmoid;, score=0.879 total time=   0.0s
[CV 1/5] END ......C=1, gamma=scale, kernel=rbf;, score=0.884 total time=   0.1s
[CV 2/5] END ......C=1, gamma=scale, kernel=rbf;, score=0.884 total time=   0.1s
[CV 3/5] END ......C=1, gamma=scale, kernel=rbf;, score=0.876 total time=   0.1s
[CV 4/5] END ......C=1, gamm

[CV 2/5] END ..C=10, gamma=0.01, kernel=sigmoid;, score=0.876 total time=   0.1s
[CV 3/5] END ..C=10, gamma=0.01, kernel=sigmoid;, score=0.876 total time=   0.1s
[CV 4/5] END ..C=10, gamma=0.01, kernel=sigmoid;, score=0.879 total time=   0.1s
[CV 5/5] END ..C=10, gamma=0.01, kernel=sigmoid;, score=0.879 total time=   0.1s
[CV 1/5] END .....C=10, gamma=0.001, kernel=rbf;, score=0.876 total time=   0.1s
[CV 2/5] END .....C=10, gamma=0.001, kernel=rbf;, score=0.876 total time=   0.1s
[CV 3/5] END .....C=10, gamma=0.001, kernel=rbf;, score=0.876 total time=   0.1s
[CV 4/5] END .....C=10, gamma=0.001, kernel=rbf;, score=0.879 total time=   0.1s
[CV 5/5] END .....C=10, gamma=0.001, kernel=rbf;, score=0.879 total time=   0.1s
[CV 1/5] END ..C=10, gamma=0.001, kernel=linear;, score=0.907 total time=   0.1s
[CV 2/5] END ..C=10, gamma=0.001, kernel=linear;, score=0.893 total time=   0.1s
[CV 3/5] END ..C=10, gamma=0.001, kernel=linear;, score=0.907 total time=   0.1s
[CV 4/5] END ..C=10, gamma=0

[CV 1/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.902 total time=   0.1s
[CV 2/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.898 total time=   0.1s
[CV 3/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.893 total time=   0.1s
[CV 4/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.902 total time=   0.1s
[CV 5/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.897 total time=   0.1s
[CV 1/5] END ..C=100, gamma=0.01, kernel=linear;, score=0.907 total time=   0.1s
[CV 2/5] END ..C=100, gamma=0.01, kernel=linear;, score=0.893 total time=   0.1s
[CV 3/5] END ..C=100, gamma=0.01, kernel=linear;, score=0.907 total time=   0.1s
[CV 4/5] END ..C=100, gamma=0.01, kernel=linear;, score=0.866 total time=   0.1s
[CV 5/5] END ..C=100, gamma=0.01, kernel=linear;, score=0.884 total time=   0.1s
[CV 1/5] END ....C=100, gamma=0.01, kernel=poly;, score=0.876 total time=   0.0s
[CV 2/5] END ....C=100, gamma=0.01, kernel=poly;, score=0.876 total time=   0.0s
[CV 3/5] END ....C=100, gamm

[CV 5/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.884 total time=   0.1s
[CV 1/5] END ..C=1000, gamma=0.1, kernel=linear;, score=0.907 total time=   0.1s
[CV 2/5] END ..C=1000, gamma=0.1, kernel=linear;, score=0.893 total time=   0.1s
[CV 3/5] END ..C=1000, gamma=0.1, kernel=linear;, score=0.907 total time=   0.1s
[CV 4/5] END ..C=1000, gamma=0.1, kernel=linear;, score=0.866 total time=   0.1s
[CV 5/5] END ..C=1000, gamma=0.1, kernel=linear;, score=0.884 total time=   0.1s
[CV 1/5] END ....C=1000, gamma=0.1, kernel=poly;, score=0.880 total time=   0.2s
[CV 2/5] END ....C=1000, gamma=0.1, kernel=poly;, score=0.880 total time=   0.2s
[CV 3/5] END ....C=1000, gamma=0.1, kernel=poly;, score=0.876 total time=   0.2s
[CV 4/5] END ....C=1000, gamma=0.1, kernel=poly;, score=0.879 total time=   0.2s
[CV 5/5] END ....C=1000, gamma=0.1, kernel=poly;, score=0.884 total time=   0.2s
[CV 1/5] END .C=1000, gamma=0.1, kernel=sigmoid;, score=0.907 total time=   0.1s
[CV 2/5] END .C=1000, gamma=

In [41]:
print(classification_report(y_val, svc_stem_grid.predict(x_val)))

              precision    recall  f1-score   support

          -1       1.00      1.00      1.00        64
           0       1.00      0.97      0.99        74
           1       1.00      1.00      1.00       985

    accuracy                           1.00      1123
   macro avg       1.00      0.99      1.00      1123
weighted avg       1.00      1.00      1.00      1123



In [42]:
print("best params for stem")
print(svc_stem_grid.best_params_)

best params for stem
{'C': 10, 'gamma': 1, 'kernel': 'linear'}


In [43]:
svc_stem = SVC(**svc_stem_grid.best_params_)
svc_stem.fit(x_train, y_train)

In [44]:
y_test_pred = svc_stem.predict(x_test)
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

          -1       0.56      0.44      0.49        57
           0       0.39      0.38      0.39        55
           1       0.94      0.96      0.95      1012

    accuracy                           0.90      1124
   macro avg       0.63      0.59      0.61      1124
weighted avg       0.90      0.90      0.90      1124



In [45]:
compare_list.loc[len(compare_list)] = [
    "after",
    "testing",
    "Stemmed",
    svc_stem_grid.best_params_["C"],
    svc_stem_grid.best_params_["gamma"],
    svc_stem_grid.best_params_["kernel"],
    svc_stem_grid.best_score_,
    f1_score(y_test, y_test_pred, average="weighted"),
    accuracy_score(y_test, y_test_pred),
]

In [46]:
display(compare_list)

Unnamed: 0,tuning,dataset,proc,C,gamma,kernel,grid_score,f1_score,accuracy
0,before,validation,,default,default,default,,0.863183,0.896705
1,after,testing,,100,0.01,rbf,0.89581,0.906358,0.920819
2,before,validation,Baseline,default,default,default,,0.861482,0.895815
3,after,testing,Baseline,10,1,linear,0.895802,0.897976,0.903025
4,before,validation,Stemmed,default,default,default,,0.863756,0.897596
5,after,testing,Stemmed,10,1,linear,0.899369,0.899427,0.902135


### Lemmatized + baseline preprocessing

In [47]:
x_train, x_val, x_test, y_train, y_val, y_test, vec_lem = pipeline("Lemmatized", df_testing)

In [48]:
svc_lem_prelim = SVC()
svc_lem_prelim.fit(x_train, y_train)

In [49]:
y_val_pred = svc_lem_prelim.predict(x_val)
print(classification_report(y_val, y_val_pred))

              precision    recall  f1-score   support

          -1       1.00      0.14      0.25        64
           0       1.00      0.18      0.30        74
           1       0.89      1.00      0.94       985

    accuracy                           0.90      1123
   macro avg       0.96      0.44      0.50      1123
weighted avg       0.91      0.90      0.86      1123



In [50]:
compare_list.loc[len(compare_list)] = [
    "before",
    "validation",
    "Lemmatized",
    "default",
    "default",
    "default",
    None,
    f1_score(y_val, y_val_pred, average="weighted"),
    accuracy_score(y_val, y_val_pred),
]

In [51]:
svc_lem_grid = GridSearchCV(SVC(), param_grid, cv=cv, verbose=verbose, n_jobs=n_jobs)
svc_lem_grid.fit(x_val, y_val)

Fitting 5 folds for each of 144 candidates, totalling 720 fits
[CV 1/5] END .......C=0.01, gamma=1, kernel=rbf;, score=0.876 total time=   0.1s
[CV 2/5] END .......C=0.01, gamma=1, kernel=rbf;, score=0.876 total time=   0.1s
[CV 3/5] END .......C=0.01, gamma=1, kernel=rbf;, score=0.876 total time=   0.1s
[CV 4/5] END .......C=0.01, gamma=1, kernel=rbf;, score=0.879 total time=   0.1s
[CV 5/5] END .......C=0.01, gamma=1, kernel=rbf;, score=0.879 total time=   0.1s
[CV 1/5] END ....C=0.01, gamma=1, kernel=linear;, score=0.876 total time=   0.1s
[CV 2/5] END ....C=0.01, gamma=1, kernel=linear;, score=0.876 total time=   0.1s
[CV 3/5] END ....C=0.01, gamma=1, kernel=linear;, score=0.876 total time=   0.1s
[CV 4/5] END ....C=0.01, gamma=1, kernel=linear;, score=0.879 total time=   0.1s
[CV 5/5] END ....C=0.01, gamma=1, kernel=linear;, score=0.879 total time=   0.1s
[CV 1/5] END ......C=0.01, gamma=1, kernel=poly;, score=0.876 total time=   0.2s
[CV 2/5] END ......C=0.01, gamma=1, kernel=pol

[CV 5/5] END ....C=0.01, gamma=auto, kernel=rbf;, score=0.879 total time=   0.0s
[CV 1/5] END .C=0.01, gamma=auto, kernel=linear;, score=0.876 total time=   0.1s
[CV 2/5] END .C=0.01, gamma=auto, kernel=linear;, score=0.876 total time=   0.1s
[CV 3/5] END .C=0.01, gamma=auto, kernel=linear;, score=0.876 total time=   0.1s
[CV 4/5] END .C=0.01, gamma=auto, kernel=linear;, score=0.879 total time=   0.1s
[CV 5/5] END .C=0.01, gamma=auto, kernel=linear;, score=0.879 total time=   0.1s
[CV 1/5] END ...C=0.01, gamma=auto, kernel=poly;, score=0.876 total time=   0.0s
[CV 2/5] END ...C=0.01, gamma=auto, kernel=poly;, score=0.876 total time=   0.0s
[CV 3/5] END ...C=0.01, gamma=auto, kernel=poly;, score=0.876 total time=   0.0s
[CV 4/5] END ...C=0.01, gamma=auto, kernel=poly;, score=0.879 total time=   0.0s
[CV 5/5] END ...C=0.01, gamma=auto, kernel=poly;, score=0.879 total time=   0.0s
[CV 1/5] END C=0.01, gamma=auto, kernel=sigmoid;, score=0.876 total time=   0.0s
[CV 2/5] END C=0.01, gamma=a

[CV 4/5] END .C=0.1, gamma=scale, kernel=linear;, score=0.879 total time=   0.1s
[CV 5/5] END .C=0.1, gamma=scale, kernel=linear;, score=0.879 total time=   0.1s
[CV 1/5] END ...C=0.1, gamma=scale, kernel=poly;, score=0.876 total time=   0.2s
[CV 2/5] END ...C=0.1, gamma=scale, kernel=poly;, score=0.876 total time=   0.2s
[CV 3/5] END ...C=0.1, gamma=scale, kernel=poly;, score=0.876 total time=   0.2s
[CV 4/5] END ...C=0.1, gamma=scale, kernel=poly;, score=0.879 total time=   0.2s
[CV 5/5] END ...C=0.1, gamma=scale, kernel=poly;, score=0.879 total time=   0.2s
[CV 1/5] END C=0.1, gamma=scale, kernel=sigmoid;, score=0.876 total time=   0.1s
[CV 2/5] END C=0.1, gamma=scale, kernel=sigmoid;, score=0.876 total time=   0.1s
[CV 3/5] END C=0.1, gamma=scale, kernel=sigmoid;, score=0.876 total time=   0.1s
[CV 4/5] END C=0.1, gamma=scale, kernel=sigmoid;, score=0.879 total time=   0.1s
[CV 5/5] END C=0.1, gamma=scale, kernel=sigmoid;, score=0.879 total time=   0.1s
[CV 1/5] END .....C=0.1, gam

[CV 4/5] END .....C=1, gamma=0.001, kernel=poly;, score=0.879 total time=   0.0s
[CV 5/5] END .....C=1, gamma=0.001, kernel=poly;, score=0.879 total time=   0.0s
[CV 1/5] END ..C=1, gamma=0.001, kernel=sigmoid;, score=0.876 total time=   0.0s
[CV 2/5] END ..C=1, gamma=0.001, kernel=sigmoid;, score=0.876 total time=   0.0s
[CV 3/5] END ..C=1, gamma=0.001, kernel=sigmoid;, score=0.876 total time=   0.0s
[CV 4/5] END ..C=1, gamma=0.001, kernel=sigmoid;, score=0.879 total time=   0.0s
[CV 5/5] END ..C=1, gamma=0.001, kernel=sigmoid;, score=0.879 total time=   0.0s
[CV 1/5] END ......C=1, gamma=scale, kernel=rbf;, score=0.884 total time=   0.2s
[CV 2/5] END ......C=1, gamma=scale, kernel=rbf;, score=0.884 total time=   0.2s
[CV 3/5] END ......C=1, gamma=scale, kernel=rbf;, score=0.876 total time=   0.2s
[CV 4/5] END ......C=1, gamma=scale, kernel=rbf;, score=0.879 total time=   0.2s
[CV 5/5] END ......C=1, gamma=scale, kernel=rbf;, score=0.884 total time=   0.2s
[CV 1/5] END ...C=1, gamma=s

[CV 2/5] END ..C=10, gamma=0.01, kernel=sigmoid;, score=0.876 total time=   0.1s
[CV 3/5] END ..C=10, gamma=0.01, kernel=sigmoid;, score=0.876 total time=   0.1s
[CV 4/5] END ..C=10, gamma=0.01, kernel=sigmoid;, score=0.879 total time=   0.1s
[CV 5/5] END ..C=10, gamma=0.01, kernel=sigmoid;, score=0.879 total time=   0.1s
[CV 1/5] END .....C=10, gamma=0.001, kernel=rbf;, score=0.876 total time=   0.1s
[CV 2/5] END .....C=10, gamma=0.001, kernel=rbf;, score=0.876 total time=   0.1s
[CV 3/5] END .....C=10, gamma=0.001, kernel=rbf;, score=0.876 total time=   0.1s
[CV 4/5] END .....C=10, gamma=0.001, kernel=rbf;, score=0.879 total time=   0.1s
[CV 5/5] END .....C=10, gamma=0.001, kernel=rbf;, score=0.879 total time=   0.1s
[CV 1/5] END ..C=10, gamma=0.001, kernel=linear;, score=0.911 total time=   0.1s
[CV 2/5] END ..C=10, gamma=0.001, kernel=linear;, score=0.889 total time=   0.1s
[CV 3/5] END ..C=10, gamma=0.001, kernel=linear;, score=0.902 total time=   0.1s
[CV 4/5] END ..C=10, gamma=0

[CV 1/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.902 total time=   0.1s
[CV 2/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.893 total time=   0.1s
[CV 3/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.893 total time=   0.1s
[CV 4/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.902 total time=   0.1s
[CV 5/5] END .....C=100, gamma=0.01, kernel=rbf;, score=0.897 total time=   0.1s
[CV 1/5] END ..C=100, gamma=0.01, kernel=linear;, score=0.911 total time=   0.1s
[CV 2/5] END ..C=100, gamma=0.01, kernel=linear;, score=0.889 total time=   0.1s
[CV 3/5] END ..C=100, gamma=0.01, kernel=linear;, score=0.902 total time=   0.1s
[CV 4/5] END ..C=100, gamma=0.01, kernel=linear;, score=0.879 total time=   0.1s
[CV 5/5] END ..C=100, gamma=0.01, kernel=linear;, score=0.884 total time=   0.1s
[CV 1/5] END ....C=100, gamma=0.01, kernel=poly;, score=0.876 total time=   0.0s
[CV 2/5] END ....C=100, gamma=0.01, kernel=poly;, score=0.876 total time=   0.0s
[CV 3/5] END ....C=100, gamm

[CV 5/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.884 total time=   0.1s
[CV 1/5] END ..C=1000, gamma=0.1, kernel=linear;, score=0.911 total time=   0.1s
[CV 2/5] END ..C=1000, gamma=0.1, kernel=linear;, score=0.889 total time=   0.1s
[CV 3/5] END ..C=1000, gamma=0.1, kernel=linear;, score=0.902 total time=   0.1s
[CV 4/5] END ..C=1000, gamma=0.1, kernel=linear;, score=0.879 total time=   0.1s
[CV 5/5] END ..C=1000, gamma=0.1, kernel=linear;, score=0.884 total time=   0.1s
[CV 1/5] END ....C=1000, gamma=0.1, kernel=poly;, score=0.880 total time=   0.2s
[CV 2/5] END ....C=1000, gamma=0.1, kernel=poly;, score=0.880 total time=   0.2s
[CV 3/5] END ....C=1000, gamma=0.1, kernel=poly;, score=0.876 total time=   0.2s
[CV 4/5] END ....C=1000, gamma=0.1, kernel=poly;, score=0.879 total time=   0.2s
[CV 5/5] END ....C=1000, gamma=0.1, kernel=poly;, score=0.884 total time=   0.2s
[CV 1/5] END .C=1000, gamma=0.1, kernel=sigmoid;, score=0.911 total time=   0.1s
[CV 2/5] END .C=1000, gamma=

In [52]:
print(classification_report(y_val, svc_lem_grid.predict(x_val)))

              precision    recall  f1-score   support

          -1       1.00      1.00      1.00        64
           0       1.00      0.96      0.98        74
           1       1.00      1.00      1.00       985

    accuracy                           1.00      1123
   macro avg       1.00      0.99      0.99      1123
weighted avg       1.00      1.00      1.00      1123



In [53]:
print("best params for lem")
print(svc_lem_grid.best_params_)

best params for lem
{'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}


In [54]:
svc_lem = SVC(**svc_lem_grid.best_params_)
svc_lem.fit(x_train, y_train)

In [55]:
y_test_pred = svc_lem.predict(x_test)
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

          -1       0.62      0.35      0.45        57
           0       0.52      0.25      0.34        55
           1       0.93      0.98      0.96      1012

    accuracy                           0.91      1124
   macro avg       0.69      0.53      0.58      1124
weighted avg       0.90      0.91      0.90      1124



In [56]:
compare_list.loc[len(compare_list)] = [
    "after",
    "testing",
    "Lemmatized",
    svc_lem_grid.best_params_["C"],
    svc_lem_grid.best_params_["gamma"],
    svc_lem_grid.best_params_["kernel"],
    svc_lem_grid.best_score_,
    f1_score(y_test, y_test_pred, average="weighted"),
    accuracy_score(y_test, y_test_pred),
]

In [57]:
display(compare_list)

Unnamed: 0,tuning,dataset,proc,C,gamma,kernel,grid_score,f1_score,accuracy
0,before,validation,,default,default,default,,0.863183,0.896705
1,after,testing,,100,0.01,rbf,0.89581,0.906358,0.920819
2,before,validation,Baseline,default,default,default,,0.861482,0.895815
3,after,testing,Baseline,10,1,linear,0.895802,0.897976,0.903025
4,before,validation,Stemmed,default,default,default,,0.863756,0.897596
5,after,testing,Stemmed,10,1,linear,0.899369,0.899427,0.902135
6,before,validation,Lemmatized,default,default,default,,0.862085,0.896705
7,after,testing,Lemmatized,1000,0.001,rbf,0.898488,0.901276,0.914591
