In [1]:
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
!nvidia-smi

Sat May  4 20:05:30 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   30C    P0              45W / 400W |      2MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [3]:
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py

Cloning into 'rapidsai-csp-utils'...
remote: Enumerating objects: 476, done.[K
remote: Counting objects: 100% (207/207), done.[K
remote: Compressing objects: 100% (116/116), done.[K
remote: Total 476 (delta 141), reused 124 (delta 91), pack-reused 269[K
Receiving objects: 100% (476/476), 131.59 KiB | 14.62 MiB/s, done.
Resolving deltas: 100% (243/243), done.
Collecting pynvml
  Downloading pynvml-11.5.0-py3-none-any.whl (53 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 53.1/53.1 kB 2.0 MB/s eta 0:00:00
Installing collected packages: pynvml
Successfully installed pynvml-11.5.0
***********************************************************************
Woo! Your instance has a NVIDIA A100-SXM4-40GB GPU!
We will install the latest stable RAPIDS via pip 24.4.*!  Please stand by, should be quick...
***********************************************************************

Looking in indexes: https://pypi.org/simple, https://pypi.nvidia.com
Collecting cudf-cu12==24.4.*
  Downloading https:

In [4]:
from cuml.svm import SVC
import cudf

import pandas as pd
import nltk
import re
import random
import numpy as np
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import classification_report
from sklearn.metrics import f1_score, accuracy_score, precision_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
from nltk.corpus import stopwords
import warnings

warnings.filterwarnings("ignore")

nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
random_state = 42
testing_frac = 1
random.seed(random_state)
data_path = "/content/drive/MyDrive/Syncable/projects/data270/data/combined.csv"

In [6]:
df = pd.read_csv(data_path, low_memory=False)
df = df.drop_duplicates()
df = df[["overall", "reviewText", "summary"]]
df.dropna(inplace=True)
df["sentiment"] = df["overall"].apply(lambda x: 1 if x > 3 else -1 if x < 3 else 0)
df["reviewTextWithSummary"] = df["summary"] + " " + df["reviewText"]
df.drop(["overall", "summary", "reviewText"], axis=1, inplace=True)
df.head()

Unnamed: 0,sentiment,reviewTextWithSummary
0,1,Five Stars As advertised. Reasonably priced
1,1,Good for the face Like the oder and the feel w...
2,-1,Smells awful I bought this to smell nice after...
3,1,Truth is There IS Nothing Like an AQUA VELVA M...
4,1,Bvlgari Shampoo If you ever want to feel pampe...


# Model experimentation

In [7]:
df_testing = df.sample(frac=testing_frac)
df_testing["sentiment"].value_counts()

sentiment
 1    606258
 0     43210
-1     41158
Name: count, dtype: int64

In [8]:
df_testing.head()

Unnamed: 0,sentiment,reviewTextWithSummary
117135,1,Interesting bridge - nice sound played from th...
538556,1,"sturdy desk! Very nice looking, easy to put to..."
716582,1,Five Stars Some of the best stamps I've used. ...
14294,1,its perfect. I don't actually use a microphone...
215671,1,Five Stars Great item from a wonderful seller....


In [9]:
print("Dataset size:", len(df_testing))

Dataset size: 690626


In [10]:
STOP_WORDS = set(stopwords.words("english"))

In [11]:
def preprocess_text(sentence, stop, type_proc=None):
    words = []
    for word in sentence.lower().strip().split():

        word = re.sub("\d", "", word)
        word = re.sub("[^\w\s]", "", word)

        if word not in stop and word != "":
            words.append(preprocess_type(word, type_proc))

    return " ".join(words)

In [12]:
def preprocess_type(word, type_proc):
    if type_proc == "Baseline":
        return word
    elif type_proc == "Stemmed":
        return PorterStemmer().stem(word)
    elif type_proc == "Lemmatized":
        return WordNetLemmatizer().lemmatize(word)
    else:
        raise ValueError("Invalid Preprocessing Type")

In [13]:
def train_val_test_split(df=df, random_state=random_state):
    x = df[["reviewTextWithSummary"]]
    y = df["sentiment"]
    x_train, x_tmp, y_train, y_tmp = train_test_split(
        x, y, test_size=0.3, random_state=random_state
    )
    x_val, x_test, y_val, y_test = train_test_split(
        x_tmp, y_tmp, test_size=0.5, random_state=random_state
    )
    return x_train, x_val, x_test, y_train, y_val, y_test

In [14]:
def pipeline(proc, df, random_state=random_state):
    df_ = df.copy()
    if proc is not None:
        df_["reviewTextWithSummary"] = df_["reviewTextWithSummary"].apply(
            lambda x: preprocess_text(x, STOP_WORDS, proc)
        )

    x_train, x_val, x_test, y_train, y_val, y_test = train_val_test_split(
        df_, random_state
    )
    vectorizer = TfidfVectorizer()
    vectorizer.fit(df_["reviewTextWithSummary"])
    x_train = vectorizer.transform(x_train["reviewTextWithSummary"])
    x_val = vectorizer.transform(x_val["reviewTextWithSummary"])
    x_test = vectorizer.transform(x_test["reviewTextWithSummary"])

    return x_train, x_val, x_test, y_train, y_val, y_test, vectorizer

## Testing different configs

In [15]:
param_grid = {
    "C": [1, 0.1, 0.01, 0.001],
    "gamma": ["scale", 0.1, 0.01],
    "kernel": ["rbf", "linear"],
}
n_jobs = None
verbose = 3
cv = 3

In [16]:
compare_list = pd.DataFrame(
    columns=[
        "tuning",
        "dataset",
        "proc",
        "C",
        "gamma",
        "kernel",
        "grid_score",
        "f1_score",
        "accuracy",
        "precision",
        "roc_auc",
    ]
)

### No preprocessing

In [17]:
x_train, x_val, x_test, y_train, y_val, y_test, vec_noproc = pipeline(None, df_testing)

In [18]:
print(np.shape(x_train))
print(np.shape(x_val))
print(np.shape(x_test))

print(np.shape(y_train))
print(np.shape(y_val))
print(np.shape(y_test))

(483438, 118754)
(103594, 118754)
(103594, 118754)
(483438,)
(103594,)
(103594,)


In [19]:
svc_noproc_prelim = SVC(verbose=True)
svc_noproc_prelim.fit(x_train, y_train)

[D] [13:17:47.370255] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [13:17:52.630120] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 181 outer iterations, total inner 75821 iterations, and diff 0.000996
[D] [13:17:52.856838] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [13:18:24.092684] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 313 outer iterations, total inner 131862 iterations, and diff 0.001000
[D] [13:18:24.220598] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [13:19:12.574683] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 495 outer iterations, total inner 203918 iterations, and diff 0.000993


In [20]:
y_val_pred = svc_noproc_prelim.predict(x_val)
print(classification_report(y_val, y_val_pred))

              precision    recall  f1-score   support

          -1       0.82      0.70      0.75      6218
           0       0.79      0.38      0.52      6392
           1       0.95      0.99      0.97     90984

    accuracy                           0.94    103594
   macro avg       0.85      0.69      0.75    103594
weighted avg       0.93      0.94      0.93    103594



In [21]:
compare_list.loc[len(compare_list)] = [
    "before",
    "validation",
    None,
    "default",
    "default",
    "default",
    None,
    f1_score(y_val, y_val_pred, average="weighted"),
    accuracy_score(y_val, y_val_pred),
    precision_score(y_val, y_val_pred, average="weighted"),
    None,
]

In [22]:
svc_noproc_grid = GridSearchCV(SVC(), param_grid, cv=cv, verbose=verbose, n_jobs=n_jobs)
svc_noproc_grid.fit(x_val, y_val)

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV 1/3] END ......C=1, gamma=scale, kernel=rbf;, score=0.924 total time=   6.0s
[CV 2/3] END ......C=1, gamma=scale, kernel=rbf;, score=0.924 total time=   5.8s
[CV 3/3] END ......C=1, gamma=scale, kernel=rbf;, score=0.925 total time=   5.9s
[W] [13:19:46.365464] SVC with the linear kernel can be much faster using the specialized solver provided by LinearSVC. Consider switching to LinearSVC if tranining takes too long.
[CV 1/3] END ...C=1, gamma=scale, kernel=linear;, score=0.926 total time=   4.8s
[CV 2/3] END ...C=1, gamma=scale, kernel=linear;, score=0.926 total time=   4.6s
[CV 3/3] END ...C=1, gamma=scale, kernel=linear;, score=0.926 total time=   4.8s
[CV 1/3] END ........C=1, gamma=0.1, kernel=rbf;, score=0.913 total time=   3.0s
[CV 2/3] END ........C=1, gamma=0.1, kernel=rbf;, score=0.913 total time=   3.0s
[CV 3/3] END ........C=1, gamma=0.1, kernel=rbf;, score=0.914 total time=   3.1s
[CV 1/3] END .....C=1, gamma=

In [23]:
print(classification_report(y_val, svc_noproc_grid.predict(x_val)))

              precision    recall  f1-score   support

          -1       0.87      0.76      0.81      6218
           0       0.87      0.42      0.57      6392
           1       0.95      1.00      0.97     90984

    accuracy                           0.95    103594
   macro avg       0.90      0.72      0.78    103594
weighted avg       0.94      0.95      0.94    103594



In [24]:
print("best params for noproc")
print(svc_noproc_grid.best_params_)

best params for noproc
{'C': 1, 'gamma': 0.01, 'kernel': 'linear'}


In [25]:
svc_noproc = SVC(**svc_noproc_grid.best_params_, probability=True, verbose=True)
svc_noproc.fit(x_train, y_train)

[D] [13:23:28.417387] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [13:23:32.164202] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 162 outer iterations, total inner 70697 iterations, and diff 0.000991
[D] [13:23:32.252050] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [13:23:52.514173] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 291 outer iterations, total inner 123413 iterations, and diff 0.000985
[D] [13:23:52.610985] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [13:24:26.182949] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 487 outer iterations, total inner 207268 iterations, and diff 0.000978
[D] [13:24:35.018734] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [13:24:38.796716] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver

In [26]:
y_test_pred = svc_noproc.predict(x_test)
y_test_pred_proba = svc_noproc.predict_proba(x_test)
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

          -1       0.76      0.67      0.71      6129
           0       0.77      0.34      0.47      6442
           1       0.95      0.99      0.97     91023

    accuracy                           0.93    103594
   macro avg       0.83      0.67      0.72    103594
weighted avg       0.92      0.93      0.92    103594



In [27]:
compare_list.loc[len(compare_list)] = [
    "after",
    "testing",
    None,
    svc_noproc_grid.best_params_["C"],
    svc_noproc_grid.best_params_["gamma"],
    svc_noproc_grid.best_params_["kernel"],
    svc_noproc_grid.best_score_,
    f1_score(y_test, y_test_pred, average="weighted"),
    accuracy_score(y_test, y_test_pred),
    precision_score(y_test, y_test_pred, average="weighted"),
    roc_auc_score(y_test, y_test_pred_proba, multi_class="ovr")
]

In [28]:
display(compare_list)

Unnamed: 0,tuning,dataset,proc,C,gamma,kernel,grid_score,f1_score,accuracy,precision,roc_auc
0,before,validation,,default,default,default,,0.929112,0.937303,0.931095,
1,after,testing,,1,0.01,linear,0.926067,0.921103,0.930759,0.923284,0.90334


### Baseline preprocessing

In [29]:
x_train, x_val, x_test, y_train, y_val, y_test, vec_baseline = pipeline(
    "Baseline", df_testing
)

In [30]:
print(np.shape(x_train))
print(np.shape(x_val))
print(np.shape(x_test))

print(np.shape(y_train))
print(np.shape(y_val))
print(np.shape(y_test))

(483438, 183934)
(103594, 183934)
(103594, 183934)
(483438,)
(103594,)
(103594,)


In [31]:
svc_baseline_prelim = SVC(verbose=True)
svc_baseline_prelim.fit(x_train, y_train)

[D] [13:32:01.187191] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [13:32:04.711570] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 179 outer iterations, total inner 71283 iterations, and diff 0.000998
[D] [13:32:04.798489] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [13:32:44.164114] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 334 outer iterations, total inner 136652 iterations, and diff 0.001000
[D] [13:32:44.254223] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [13:33:43.831764] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:252 SMO iteration 500, diff 0.002889
[D] [13:33:47.196799] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 547 outer iterations, total inner 224192 iterations, and diff 0.001000


In [32]:
y_val_pred = svc_baseline_prelim.predict(x_val)
print(classification_report(y_val, y_val_pred))

              precision    recall  f1-score   support

          -1       0.82      0.64      0.72      6218
           0       0.82      0.36      0.50      6392
           1       0.94      0.99      0.97     90984

    accuracy                           0.93    103594
   macro avg       0.86      0.66      0.73    103594
weighted avg       0.93      0.93      0.92    103594



In [33]:
compare_list.loc[len(compare_list)] = [
    "before",
    "validation",
    "Baseline",
    "default",
    "default",
    "default",
    None,
    f1_score(y_val, y_val_pred, average="weighted"),
    accuracy_score(y_val, y_val_pred),
    precision_score(y_val, y_val_pred, average="weighted"),
    None,
]

In [34]:
svc_baseline_grid = GridSearchCV(
    SVC(), param_grid, cv=cv, verbose=verbose, n_jobs=n_jobs
)
svc_baseline_grid.fit(x_val, y_val)

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV 1/3] END ......C=1, gamma=scale, kernel=rbf;, score=0.919 total time=   3.8s
[CV 2/3] END ......C=1, gamma=scale, kernel=rbf;, score=0.919 total time=   3.8s
[CV 3/3] END ......C=1, gamma=scale, kernel=rbf;, score=0.920 total time=   3.8s
[CV 1/3] END ...C=1, gamma=scale, kernel=linear;, score=0.922 total time=   2.9s
[CV 2/3] END ...C=1, gamma=scale, kernel=linear;, score=0.922 total time=   2.9s
[CV 3/3] END ...C=1, gamma=scale, kernel=linear;, score=0.923 total time=   3.0s
[CV 1/3] END ........C=1, gamma=0.1, kernel=rbf;, score=0.909 total time=   1.9s
[CV 2/3] END ........C=1, gamma=0.1, kernel=rbf;, score=0.908 total time=   1.9s
[CV 3/3] END ........C=1, gamma=0.1, kernel=rbf;, score=0.910 total time=   1.9s
[CV 1/3] END .....C=1, gamma=0.1, kernel=linear;, score=0.922 total time=   2.9s
[CV 2/3] END .....C=1, gamma=0.1, kernel=linear;, score=0.922 total time=   2.9s
[CV 3/3] END .....C=1, gamma=0.1, kernel=linear;

In [35]:
print(classification_report(y_val, svc_baseline_grid.predict(x_val)))

              precision    recall  f1-score   support

          -1       0.88      0.73      0.80      6218
           0       0.89      0.39      0.55      6392
           1       0.95      1.00      0.97     90984

    accuracy                           0.94    103594
   macro avg       0.91      0.71      0.77    103594
weighted avg       0.94      0.94      0.93    103594



In [36]:
print("best params for baseline")
print(svc_baseline_grid.best_params_)

best params for baseline
{'C': 1, 'gamma': 0.1, 'kernel': 'linear'}


In [37]:
svc_baseline = SVC(**svc_baseline_grid.best_params_, probability=True, verbose=True)
svc_baseline.fit(x_train, y_train)

[D] [13:36:19.314442] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [13:36:21.503654] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 169 outer iterations, total inner 72185 iterations, and diff 0.000985
[D] [13:36:21.567991] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [13:36:47.997452] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 329 outer iterations, total inner 138187 iterations, and diff 0.000998
[D] [13:36:48.071693] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [13:37:31.450996] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:252 SMO iteration 500, diff 0.003322
[D] [13:37:37.404752] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 639 outer iterations, total inner 257021 iterations, and diff 0.000992
[D] [13:37:43.475742] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working 

In [38]:
y_test_pred = svc_baseline.predict(x_test)
y_test_pred_proba = svc_baseline.predict_proba(x_test)
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

          -1       0.76      0.63      0.69      6129
           0       0.79      0.30      0.44      6442
           1       0.94      0.99      0.96     91023

    accuracy                           0.93    103594
   macro avg       0.83      0.64      0.70    103594
weighted avg       0.92      0.93      0.92    103594



In [39]:
compare_list.loc[len(compare_list)] = [
    "after",
    "testing",
    "Baseline",
    svc_baseline_grid.best_params_["C"],
    svc_baseline_grid.best_params_["gamma"],
    svc_baseline_grid.best_params_["kernel"],
    svc_baseline_grid.best_score_,
    f1_score(y_test, y_test_pred, average="weighted"),
    accuracy_score(y_test, y_test_pred),
    precision_score(y_test, y_test_pred, average="weighted"),
    roc_auc_score(y_test, y_test_pred_proba, multi_class="ovr")
]

In [40]:
display(compare_list)

Unnamed: 0,tuning,dataset,proc,C,gamma,kernel,grid_score,f1_score,accuracy,precision,roc_auc
0,before,validation,,default,default,default,,0.929112,0.937303,0.931095,
1,after,testing,,1,0.01,linear,0.926067,0.921103,0.930759,0.923284,0.90334
2,before,validation,Baseline,default,default,default,,0.923309,0.933037,0.927202,
3,after,testing,Baseline,1,0.1,linear,0.922254,0.915934,0.927322,0.919752,0.876626


### Stemmed + baseline preprocessing

In [41]:
x_train, x_val, x_test, y_train, y_val, y_test, vec_stem = pipeline("Stemmed", df_testing)

In [42]:
print(np.shape(x_train))
print(np.shape(x_val))
print(np.shape(x_test))

print(np.shape(y_train))
print(np.shape(y_val))
print(np.shape(y_test))

(483438, 150189)
(103594, 150189)
(103594, 150189)
(483438,)
(103594,)
(103594,)


In [43]:
svc_stem_prelim = SVC(verbose=True)
svc_stem_prelim.fit(x_train, y_train)

[D] [13:52:08.254134] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [13:52:11.669345] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 175 outer iterations, total inner 71689 iterations, and diff 0.000997
[D] [13:52:11.753730] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [13:52:49.814857] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 335 outer iterations, total inner 139208 iterations, and diff 0.000990
[D] [13:52:49.905583] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [13:53:47.805139] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:252 SMO iteration 500, diff 0.003163
[D] [13:53:51.892168] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 559 outer iterations, total inner 228942 iterations, and diff 0.000998


In [41]:
y_val_pred = svc_stem_prelim.predict(x_val)
print(classification_report(y_val, y_val_pred))

In [42]:
compare_list.loc[len(compare_list)] = [
    "before",
    "validation",
    "Stemmed",
    "default",
    "default",
    "default",
    None,
    f1_score(y_val, y_val_pred, average="weighted"),
    accuracy_score(y_val, y_val_pred),
    precision_score(y_val, y_val_pred, average="weighted"),
    None,
]

In [43]:
svc_stem_grid = GridSearchCV(SVC(), param_grid, cv=cv, verbose=verbose, n_jobs=n_jobs)
svc_stem_grid.fit(x_val, y_val)

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV 1/3] END ......C=1, gamma=scale, kernel=rbf;, score=0.919 total time=   3.6s
[CV 2/3] END ......C=1, gamma=scale, kernel=rbf;, score=0.919 total time=   3.6s
[CV 3/3] END ......C=1, gamma=scale, kernel=rbf;, score=0.920 total time=   3.6s
[CV 1/3] END ...C=1, gamma=scale, kernel=linear;, score=0.920 total time=   2.7s
[CV 2/3] END ...C=1, gamma=scale, kernel=linear;, score=0.921 total time=   2.7s
[CV 3/3] END ...C=1, gamma=scale, kernel=linear;, score=0.921 total time=   2.7s
[CV 1/3] END ........C=1, gamma=0.1, kernel=rbf;, score=0.908 total time=   1.8s
[CV 2/3] END ........C=1, gamma=0.1, kernel=rbf;, score=0.908 total time=   1.7s
[CV 3/3] END ........C=1, gamma=0.1, kernel=rbf;, score=0.910 total time=   1.8s
[CV 1/3] END .....C=1, gamma=0.1, kernel=linear;, score=0.920 total time=   2.7s
[CV 2/3] END .....C=1, gamma=0.1, kernel=linear;, score=0.921 total time=   2.7s
[CV 3/3] END .....C=1, gamma=0.1, kernel=linear;

In [44]:
print(classification_report(y_val, svc_stem_grid.predict(x_val)))

              precision    recall  f1-score   support

          -1       0.86      0.68      0.76      6218
           0       0.88      0.35      0.50      6392
           1       0.94      1.00      0.97     90984

    accuracy                           0.94    103594
   macro avg       0.90      0.67      0.74    103594
weighted avg       0.93      0.94      0.93    103594



In [45]:
print("best params for stem")
print(svc_stem_grid.best_params_)

best params for stem
{'C': 1, 'gamma': 0.1, 'kernel': 'linear'}


In [46]:
svc_stem = SVC(**svc_stem_grid.best_params_, probability=True, verbose=True)
svc_stem.fit(x_train, y_train)

[D] [13:56:18.745243] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [13:56:20.775834] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 180 outer iterations, total inner 78083 iterations, and diff 0.000983
[D] [13:56:20.843088] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [13:56:47.916874] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 372 outer iterations, total inner 154707 iterations, and diff 0.000998
[D] [13:56:47.987800] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [13:57:29.475821] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:252 SMO iteration 500, diff 0.005929
[D] [13:57:40.100401] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 728 outer iterations, total inner 302488 iterations, and diff 0.000999
[D] [13:57:46.050727] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working 

In [47]:
y_test_pred = svc_stem.predict(x_test)
y_test_pred_proba = svc_stem.predict_proba(x_test)
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

          -1       0.76      0.61      0.67      6129
           0       0.80      0.29      0.42      6442
           1       0.94      0.99      0.96     91023

    accuracy                           0.92    103594
   macro avg       0.83      0.63      0.69    103594
weighted avg       0.92      0.92      0.91    103594



In [48]:
compare_list.loc[len(compare_list)] = [
    "after",
    "testing",
    "Stemmed",
    svc_stem_grid.best_params_["C"],
    svc_stem_grid.best_params_["gamma"],
    svc_stem_grid.best_params_["kernel"],
    svc_stem_grid.best_score_,
    f1_score(y_test, y_test_pred, average="weighted"),
    accuracy_score(y_test, y_test_pred),
    precision_score(y_test, y_test_pred, average="weighted"),
    roc_auc_score(y_test, y_test_pred_proba, multi_class="ovr")
]

In [49]:
display(compare_list)

Unnamed: 0,tuning,dataset,proc,C,gamma,kernel,grid_score,f1_score,accuracy,precision,roc_auc
0,before,validation,,default,default,default,,0.929112,0.937303,0.931095,
1,after,testing,,1,0.01,linear,0.926067,0.921103,0.930759,0.923284,0.90334
2,before,validation,Baseline,default,default,default,,0.923309,0.933037,0.927202,
3,after,testing,Baseline,1,0.1,linear,0.922254,0.915934,0.927322,0.919752,0.876626
4,before,validation,Stemmed,default,default,default,,0.921369,0.931463,0.925463,
5,after,testing,Stemmed,1,0.1,linear,0.920758,0.912371,0.924783,0.917112,0.865609


### Lemmatized + baseline preprocessing

In [50]:
x_train, x_val, x_test, y_train, y_val, y_test, vec_lem = pipeline("Lemmatized", df_testing)

In [51]:
print(np.shape(x_train))
print(np.shape(x_val))
print(np.shape(x_test))

print(np.shape(y_train))
print(np.shape(y_val))
print(np.shape(y_test))

(483438, 175340)
(103594, 175340)
(103594, 175340)
(483438,)
(103594,)
(103594,)


In [52]:
svc_lem_prelim = SVC(verbose=True)
svc_lem_prelim.fit(x_train, y_train)

[D] [14:06:47.603291] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [14:06:51.141700] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 178 outer iterations, total inner 72367 iterations, and diff 0.000998
[D] [14:06:51.213623] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [14:07:30.788815] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 342 outer iterations, total inner 142103 iterations, and diff 0.000995
[D] [14:07:30.865534] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [14:08:29.799955] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:252 SMO iteration 500, diff 0.003061
[D] [14:08:33.591671] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 556 outer iterations, total inner 226056 iterations, and diff 0.000997


In [53]:
y_val_pred = svc_lem_prelim.predict(x_val)
print(classification_report(y_val, y_val_pred))

              precision    recall  f1-score   support

          -1       0.82      0.63      0.72      6218
           0       0.82      0.36      0.50      6392
           1       0.94      0.99      0.97     90984

    accuracy                           0.93    103594
   macro avg       0.86      0.66      0.73    103594
weighted avg       0.93      0.93      0.92    103594



In [54]:
compare_list.loc[len(compare_list)] = [
    "before",
    "validation",
    "Lemmatized",
    "default",
    "default",
    "default",
    None,
    f1_score(y_val, y_val_pred, average="weighted"),
    accuracy_score(y_val, y_val_pred),
    precision_score(y_val, y_val_pred, average="weighted"),
    None,
]

In [55]:
svc_lem_grid = GridSearchCV(SVC(), param_grid, cv=cv, verbose=verbose, n_jobs=n_jobs)
svc_lem_grid.fit(x_val, y_val)

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV 1/3] END ......C=1, gamma=scale, kernel=rbf;, score=0.918 total time=   3.8s
[CV 2/3] END ......C=1, gamma=scale, kernel=rbf;, score=0.918 total time=   3.7s
[CV 3/3] END ......C=1, gamma=scale, kernel=rbf;, score=0.920 total time=   3.8s
[CV 1/3] END ...C=1, gamma=scale, kernel=linear;, score=0.922 total time=   2.9s
[CV 2/3] END ...C=1, gamma=scale, kernel=linear;, score=0.921 total time=   2.9s
[CV 3/3] END ...C=1, gamma=scale, kernel=linear;, score=0.922 total time=   2.9s
[CV 1/3] END ........C=1, gamma=0.1, kernel=rbf;, score=0.907 total time=   1.9s
[CV 2/3] END ........C=1, gamma=0.1, kernel=rbf;, score=0.907 total time=   1.9s
[CV 3/3] END ........C=1, gamma=0.1, kernel=rbf;, score=0.909 total time=   1.9s
[CV 1/3] END .....C=1, gamma=0.1, kernel=linear;, score=0.922 total time=   2.8s
[CV 2/3] END .....C=1, gamma=0.1, kernel=linear;, score=0.921 total time=   2.9s
[CV 3/3] END .....C=1, gamma=0.1, kernel=linear;

In [56]:
print(classification_report(y_val, svc_lem_grid.predict(x_val)))

              precision    recall  f1-score   support

          -1       0.88      0.70      0.78      6218
           0       0.89      0.37      0.53      6392
           1       0.94      1.00      0.97     90984

    accuracy                           0.94    103594
   macro avg       0.90      0.69      0.76    103594
weighted avg       0.94      0.94      0.93    103594



In [57]:
print("best params for lem")
print(svc_lem_grid.best_params_)

best params for lem
{'C': 1, 'gamma': 0.1, 'kernel': 'linear'}


In [58]:
svc_lem = SVC(**svc_lem_grid.best_params_, probability=True, verbose=True)
svc_lem.fit(x_train, y_train)

[D] [14:11:05.050931] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [14:11:07.251111] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 179 outer iterations, total inner 76994 iterations, and diff 0.000995
[D] [14:11:07.306369] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [14:11:34.345948] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 355 outer iterations, total inner 146580 iterations, and diff 0.000999
[D] [14:11:34.412878] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [14:12:17.407405] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:252 SMO iteration 500, diff 0.004179
[D] [14:12:24.999042] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 672 outer iterations, total inner 274411 iterations, and diff 0.000997
[D] [14:12:31.049731] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working 

In [59]:
y_test_pred = svc_lem.predict(x_test)
y_test_pred_proba = svc_lem.predict_proba(x_test)
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

          -1       0.76      0.61      0.68      6129
           0       0.79      0.30      0.43      6442
           1       0.94      0.99      0.96     91023

    accuracy                           0.93    103594
   macro avg       0.83      0.63      0.69    103594
weighted avg       0.92      0.93      0.91    103594



In [60]:
compare_list.loc[len(compare_list)] = [
    "after",
    "testing",
    "Lemmatized",
    svc_lem_grid.best_params_["C"],
    svc_lem_grid.best_params_["gamma"],
    svc_lem_grid.best_params_["kernel"],
    svc_lem_grid.best_score_,
    f1_score(y_test, y_test_pred, average="weighted"),
    accuracy_score(y_test, y_test_pred),
    precision_score(y_test, y_test_pred, average="weighted"),
    roc_auc_score(y_test, y_test_pred_proba, multi_class="ovr")
]

In [61]:
display(compare_list)

Unnamed: 0,tuning,dataset,proc,C,gamma,kernel,grid_score,f1_score,accuracy,precision,roc_auc
0,before,validation,,default,default,default,,0.929112,0.937303,0.931095,
1,after,testing,,1,0.01,linear,0.926067,0.921103,0.930759,0.923284,0.90334
2,before,validation,Baseline,default,default,default,,0.923309,0.933037,0.927202,
3,after,testing,Baseline,1,0.1,linear,0.922254,0.915934,0.927322,0.919752,0.876626
4,before,validation,Stemmed,default,default,default,,0.921369,0.931463,0.925463,
5,after,testing,Stemmed,1,0.1,linear,0.920758,0.912371,0.924783,0.917112,0.865609
6,before,validation,Lemmatized,default,default,default,,0.922313,0.932235,0.926326,
7,after,testing,Lemmatized,1,0.1,linear,0.921607,0.913989,0.925884,0.917947,0.873091


In [62]:
compare_list.to_csv("/content/drive/MyDrive/Syncable/projects/data270/results/svm_final_results.csv", index=False)