In [1]:
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
!nvidia-smi

Sat May  4 20:35:35 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   31C    P0              44W / 400W |      2MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [3]:
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py

Cloning into 'rapidsai-csp-utils'...
remote: Enumerating objects: 476, done.[K
remote: Counting objects: 100% (207/207), done.[K
remote: Compressing objects: 100% (116/116), done.[K
remote: Total 476 (delta 141), reused 124 (delta 91), pack-reused 269[K
Receiving objects: 100% (476/476), 131.59 KiB | 14.62 MiB/s, done.
Resolving deltas: 100% (243/243), done.
Collecting pynvml
  Downloading pynvml-11.5.0-py3-none-any.whl (53 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 53.1/53.1 kB 2.0 MB/s eta 0:00:00
Installing collected packages: pynvml
Successfully installed pynvml-11.5.0
***********************************************************************
Woo! Your instance has a NVIDIA A100-SXM4-40GB GPU!
We will install the latest stable RAPIDS via pip 24.4.*!  Please stand by, should be quick...
***********************************************************************

Looking in indexes: https://pypi.org/simple, https://pypi.nvidia.com
Collecting cudf-cu12==24.4.*
  Downloading https:

In [4]:
from cuml.svm import SVC
import cudf

import pandas as pd
import nltk
import re
import random
import numpy as np
import matplotlib.pyplot as plt
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score, accuracy_score, precision_score, roc_auc_score, roc_curve
from sklearn.model_selection import GridSearchCV
from nltk.corpus import stopwords
import pickle
import warnings

warnings.filterwarnings("ignore")

nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
random_state = 42
testing_frac = 1
random.seed(random_state)
data_path = "/content/drive/MyDrive/Syncable/projects/data270/data/combined.csv"

In [6]:
df = pd.read_csv(data_path, low_memory=False)
df = df.drop_duplicates()
df = df[["overall", "reviewText", "summary"]]
df.dropna(inplace=True)
df["sentiment"] = df["overall"].apply(lambda x: 1 if x > 3 else -1 if x < 3 else 0)
df["reviewTextWithSummary"] = df["summary"] + " " + df["reviewText"]
df.drop(["overall", "summary", "reviewText"], axis=1, inplace=True)
df.head()

Unnamed: 0,sentiment,reviewTextWithSummary
0,1,Five Stars As advertised. Reasonably priced
1,1,Good for the face Like the oder and the feel w...
2,-1,Smells awful I bought this to smell nice after...
3,1,Truth is There IS Nothing Like an AQUA VELVA M...
4,1,Bvlgari Shampoo If you ever want to feel pampe...


# Model experimentation

In [7]:
df_testing = df.sample(frac=testing_frac)
df_testing["sentiment"].value_counts()

sentiment
 1    606258
 0     43210
-1     41158
Name: count, dtype: int64

In [8]:
df_testing.head()

Unnamed: 0,sentiment,reviewTextWithSummary
318228,1,Beadalon Stringing Wire 49-Strand .018-Inch Fa...
247183,1,A virtual playground of creativity! I've been ...
608865,1,The best money I have spent!!! Love It! Works...
527067,0,Did not fit properly with card base die cut Bo...
734439,0,way bigger than i expected but will find a way...


In [9]:
print("Dataset size:", len(df_testing))

Dataset size: 690626


In [10]:
STOP_WORDS = set(stopwords.words("english"))

In [11]:
def preprocess_text(sentence, stop, type_proc=None):
    words = []
    for word in sentence.lower().strip().split():

        word = re.sub("\d", "", word)
        word = re.sub("[^\w\s]", "", word)

        if word not in stop and word != "":
            words.append(preprocess_type(word, type_proc))

    return " ".join(words)

In [12]:
def preprocess_type(word, type_proc):
    if type_proc == "Baseline":
        return word
    elif type_proc == "Stemmed":
        return PorterStemmer().stem(word)
    elif type_proc == "Lemmatized":
        return WordNetLemmatizer().lemmatize(word)
    else:
        raise ValueError("Invalid Preprocessing Type")

In [13]:
def train_val_test_split(df=df, random_state=random_state):
    x = df[["reviewTextWithSummary"]]
    y = df["sentiment"]
    x_train, x_tmp, y_train, y_tmp = train_test_split(
        x, y, test_size=0.3, random_state=random_state
    )
    x_val, x_test, y_val, y_test = train_test_split(
        x_tmp, y_tmp, test_size=0.5, random_state=random_state
    )
    return x_train, x_val, x_test, y_train, y_val, y_test

In [14]:
def pipeline(proc, df, vectorizer=None, random_state=random_state):
    df_ = df.copy()
    if proc is not None:
        df_["reviewTextWithSummary"] = df_["reviewTextWithSummary"].apply(
            lambda x: preprocess_text(x, STOP_WORDS, proc)
        )

    x_train, x_val, x_test, y_train, y_val, y_test = train_val_test_split(
        df_, random_state
    )
    if vectorizer is None:
        vectorizer = TfidfVectorizer()
        vectorizer.fit(df_["reviewTextWithSummary"])
    x_train = vectorizer.transform(x_train["reviewTextWithSummary"])
    x_val = vectorizer.transform(x_val["reviewTextWithSummary"])
    x_test = vectorizer.transform(x_test["reviewTextWithSummary"])

    return x_train, x_val, x_test, y_train, y_val, y_test, vectorizer

In [None]:
model_found = False
model = None
vectorizer = None
try:
    model = pickle.load(open("/content/drive/MyDrive/Syncable/projects/data270/models/svm_final_model.pkl", "rb"))
    vectorizer = pickle.load(open("/content/drive/MyDrive/Syncable/projects/data270/models/svm_final_vectorizer.pkl", "rb"))
    model_found = True
    print("Model and vectorizer found")
except FileNotFoundError:
    print("Model and vectorizer not found")

In [None]:
if model_found:
    x_train, x_val, x_test, y_train, y_val, y_test, vectorizer = pipeline(None, df_testing, vectorizer)
    y_test_pred = model.predict(x_test)
    y_test_pred_proba = model.predict_proba(x_test)
    print(classification_report(y_test, y_test_pred))
    print()
    print("f1_score:", f1_score(y_test, y_test_pred, average="weighted"))
    print("accuracy:", accuracy_score(y_test, y_test_pred))
    print("precision:", precision_score(y_test, y_test_pred, average="weighted"))
    print("roc_auc:", roc_auc_score(y_test, y_test_pred_proba, multi_class="ovr"))

    plt.figure(figsize=(10, 10))
    fpr, tpr, _ = roc_curve(y_test, y_test_pred_proba, pos_label=1)
    plt.plot(fpr, tpr)
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve")
    plt.show()
    
    print("Stopping the execution of the notebook to avoid retraining the model and overwriting the existing notebook results")
    raise SystemExit("Stopping the execution of the notebook to avoid retraining the model and overwriting the existing notebook results")

## Testing different configs

In [15]:
param_grid = {
    "C": [10, 1, 0.1, 0.01],
    "gamma": ["scale", 0.1, 0.01],
    "kernel": ["rbf", "linear"],
}
n_jobs = None
verbose = 3
cv = 3

In [16]:
compare_list = pd.DataFrame(
    columns=[
        "tuning",
        "dataset",
        "proc",
        "C",
        "gamma",
        "kernel",
        "grid_score",
        "f1_score",
        "accuracy",
        "precision",
        "roc_auc",
    ]
)

### No preprocessing

In [17]:
x_train, x_val, x_test, y_train, y_val, y_test, vec_noproc = pipeline(None, df_testing)

In [18]:
print(np.shape(x_train))
print(np.shape(x_val))
print(np.shape(x_test))

print(np.shape(y_train))
print(np.shape(y_val))
print(np.shape(y_test))

(483438, 118754)
(103594, 118754)
(103594, 118754)
(483438,)
(103594,)
(103594,)


In [19]:
svc_noproc_prelim = SVC(verbose=True)
svc_noproc_prelim.fit(x_train, y_train)

[D] [20:36:32.017105] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [20:36:37.193110] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 178 outer iterations, total inner 74653 iterations, and diff 0.000998
[D] [20:36:37.418633] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [20:37:08.562763] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 314 outer iterations, total inner 132016 iterations, and diff 0.000998
[D] [20:37:08.693621] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [20:37:56.513337] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 491 outer iterations, total inner 202669 iterations, and diff 0.000998


In [20]:
y_val_pred = svc_noproc_prelim.predict(x_val)
print(classification_report(y_val, y_val_pred))

              precision    recall  f1-score   support

          -1       0.81      0.69      0.74      6302
           0       0.79      0.38      0.52      6608
           1       0.95      0.99      0.97     90684

    accuracy                           0.94    103594
   macro avg       0.85      0.69      0.74    103594
weighted avg       0.93      0.94      0.93    103594



In [21]:
compare_list.loc[len(compare_list)] = [
    "before",
    "validation",
    None,
    "default",
    "default",
    "default",
    None,
    f1_score(y_val, y_val_pred, average="weighted"),
    accuracy_score(y_val, y_val_pred),
    precision_score(y_val, y_val_pred, average="weighted"),
    None,
]

In [22]:
svc_noproc_grid = GridSearchCV(SVC(), param_grid, cv=cv, verbose=verbose, n_jobs=n_jobs)
svc_noproc_grid.fit(x_val, y_val)

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV 1/3] END .....C=10, gamma=scale, kernel=rbf;, score=0.926 total time=  11.2s
[CV 2/3] END .....C=10, gamma=scale, kernel=rbf;, score=0.926 total time=   9.3s
[CV 3/3] END .....C=10, gamma=scale, kernel=rbf;, score=0.929 total time=   9.3s
[W] [20:38:42.744497] SVC with the linear kernel can be much faster using the specialized solver provided by LinearSVC. Consider switching to LinearSVC if tranining takes too long.
[CV 1/3] END ..C=10, gamma=scale, kernel=linear;, score=0.913 total time=  20.3s
[CV 2/3] END ..C=10, gamma=scale, kernel=linear;, score=0.913 total time=  20.8s
[CV 3/3] END ..C=10, gamma=scale, kernel=linear;, score=0.914 total time=  20.6s
[CV 1/3] END .......C=10, gamma=0.1, kernel=rbf;, score=0.924 total time=   6.3s
[CV 2/3] END .......C=10, gamma=0.1, kernel=rbf;, score=0.925 total time=   6.3s
[CV 3/3] END .......C=10, gamma=0.1, kernel=rbf;, score=0.926 total time=   6.3s
[CV 1/3] END ....C=10, gamma=

In [23]:
print(classification_report(y_val, svc_noproc_grid.predict(x_val)))

              precision    recall  f1-score   support

          -1       1.00      1.00      1.00      6302
           0       1.00      1.00      1.00      6608
           1       1.00      1.00      1.00     90684

    accuracy                           1.00    103594
   macro avg       1.00      1.00      1.00    103594
weighted avg       1.00      1.00      1.00    103594



In [24]:
print("best params for noproc")
print(svc_noproc_grid.best_params_)

best params for noproc
{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}


In [25]:
svc_noproc = SVC(**svc_noproc_grid.best_params_, probability=True, verbose=True)
svc_noproc.fit(x_train, y_train)

[D] [20:45:41.239887] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [20:45:49.172198] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 300 outer iterations, total inner 131822 iterations, and diff 0.000997
[D] [20:45:49.268917] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [20:46:29.950836] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 487 outer iterations, total inner 211887 iterations, and diff 0.000999
[D] [20:46:30.054239] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [20:47:12.558161] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:252 SMO iteration 500, diff 0.054266
[D] [20:47:39.470817] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 861 outer iterations, total inner 362447 iterations, and diff 0.000999
[D] [20:47:56.043687] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working

In [26]:
y_test_pred = svc_noproc.predict(x_test)
y_test_pred_proba = svc_noproc.predict_proba(x_test)
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

          -1       0.81      0.71      0.76      6208
           0       0.76      0.42      0.54      6507
           1       0.95      0.99      0.97     90879

    accuracy                           0.94    103594
   macro avg       0.84      0.71      0.76    103594
weighted avg       0.93      0.94      0.93    103594



In [27]:
compare_list.loc[len(compare_list)] = [
    "after",
    "testing",
    None,
    svc_noproc_grid.best_params_["C"],
    svc_noproc_grid.best_params_["gamma"],
    svc_noproc_grid.best_params_["kernel"],
    svc_noproc_grid.best_score_,
    f1_score(y_test, y_test_pred, average="weighted"),
    accuracy_score(y_test, y_test_pred),
    precision_score(y_test, y_test_pred, average="weighted"),
    roc_auc_score(y_test, y_test_pred_proba, multi_class="ovr")
]

In [28]:
display(compare_list)

Unnamed: 0,tuning,dataset,proc,C,gamma,kernel,grid_score,f1_score,accuracy,precision,roc_auc
0,before,validation,,default,default,default,,0.926572,0.935083,0.928717,
1,after,testing,,10,scale,rbf,0.927197,0.931802,0.938529,0.932113,0.920846


### Baseline preprocessing

In [29]:
x_train, x_val, x_test, y_train, y_val, y_test, vec_baseline = pipeline(
    "Baseline", df_testing
)

In [30]:
print(np.shape(x_train))
print(np.shape(x_val))
print(np.shape(x_test))

print(np.shape(y_train))
print(np.shape(y_val))
print(np.shape(y_test))

(483438, 183934)
(103594, 183934)
(103594, 183934)
(483438,)
(103594,)
(103594,)


In [31]:
svc_baseline_prelim = SVC(verbose=True)
svc_baseline_prelim.fit(x_train, y_train)

[D] [21:01:05.243937] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [21:01:08.769010] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 175 outer iterations, total inner 70400 iterations, and diff 0.000999
[D] [21:01:08.855542] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [21:01:48.266718] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 332 outer iterations, total inner 137096 iterations, and diff 0.000998
[D] [21:01:48.361262] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [21:02:48.226311] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:252 SMO iteration 500, diff 0.002200
[D] [21:02:50.591742] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 536 outer iterations, total inner 220385 iterations, and diff 0.000999


In [32]:
y_val_pred = svc_baseline_prelim.predict(x_val)
print(classification_report(y_val, y_val_pred))

              precision    recall  f1-score   support

          -1       0.81      0.64      0.71      6302
           0       0.82      0.36      0.50      6608
           1       0.94      0.99      0.97     90684

    accuracy                           0.93    103594
   macro avg       0.86      0.66      0.73    103594
weighted avg       0.92      0.93      0.92    103594



In [33]:
compare_list.loc[len(compare_list)] = [
    "before",
    "validation",
    "Baseline",
    "default",
    "default",
    "default",
    None,
    f1_score(y_val, y_val_pred, average="weighted"),
    accuracy_score(y_val, y_val_pred),
    precision_score(y_val, y_val_pred, average="weighted"),
    None,
]

In [34]:
svc_baseline_grid = GridSearchCV(
    SVC(), param_grid, cv=cv, verbose=verbose, n_jobs=n_jobs
)
svc_baseline_grid.fit(x_val, y_val)

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV 1/3] END .....C=10, gamma=scale, kernel=rbf;, score=0.924 total time=   5.7s
[CV 2/3] END .....C=10, gamma=scale, kernel=rbf;, score=0.921 total time=   5.7s
[CV 3/3] END .....C=10, gamma=scale, kernel=rbf;, score=0.924 total time=   5.7s
[CV 1/3] END ..C=10, gamma=scale, kernel=linear;, score=0.910 total time=  11.9s
[CV 2/3] END ..C=10, gamma=scale, kernel=linear;, score=0.908 total time=  12.1s
[CV 3/3] END ..C=10, gamma=scale, kernel=linear;, score=0.909 total time=  11.8s
[CV 1/3] END .......C=10, gamma=0.1, kernel=rbf;, score=0.922 total time=   4.1s
[CV 2/3] END .......C=10, gamma=0.1, kernel=rbf;, score=0.920 total time=   4.0s
[CV 3/3] END .......C=10, gamma=0.1, kernel=rbf;, score=0.922 total time=   4.0s
[CV 1/3] END ....C=10, gamma=0.1, kernel=linear;, score=0.910 total time=  11.9s
[CV 2/3] END ....C=10, gamma=0.1, kernel=linear;, score=0.908 total time=  12.1s
[CV 3/3] END ....C=10, gamma=0.1, kernel=linear;

In [35]:
print(classification_report(y_val, svc_baseline_grid.predict(x_val)))

              precision    recall  f1-score   support

          -1       1.00      1.00      1.00      6302
           0       1.00      1.00      1.00      6608
           1       1.00      1.00      1.00     90684

    accuracy                           1.00    103594
   macro avg       1.00      1.00      1.00    103594
weighted avg       1.00      1.00      1.00    103594



In [36]:
print("best params for baseline")
print(svc_baseline_grid.best_params_)

best params for baseline
{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}


In [37]:
svc_baseline = SVC(**svc_baseline_grid.best_params_, probability=True, verbose=True)
svc_baseline.fit(x_train, y_train)

[D] [21:07:28.684981] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [21:07:33.252585] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 276 outer iterations, total inner 118036 iterations, and diff 0.000996
[D] [21:07:33.324696] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [21:08:22.038078] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:252 SMO iteration 500, diff 0.001044
[D] [21:08:22.055718] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 502 outer iterations, total inner 215229 iterations, and diff 0.000996
[D] [21:08:22.135275] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [21:09:11.440506] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:252 SMO iteration 500, diff 0.042247
[D] [21:09:42.095075] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 887 outer iterations, total inner 363480 iterations, an

In [38]:
y_test_pred = svc_baseline.predict(x_test)
y_test_pred_proba = svc_baseline.predict_proba(x_test)
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

          -1       0.81      0.66      0.73      6208
           0       0.78      0.40      0.53      6507
           1       0.95      0.99      0.97     90879

    accuracy                           0.93    103594
   macro avg       0.85      0.69      0.74    103594
weighted avg       0.93      0.93      0.93    103594



In [39]:
compare_list.loc[len(compare_list)] = [
    "after",
    "testing",
    "Baseline",
    svc_baseline_grid.best_params_["C"],
    svc_baseline_grid.best_params_["gamma"],
    svc_baseline_grid.best_params_["kernel"],
    svc_baseline_grid.best_score_,
    f1_score(y_test, y_test_pred, average="weighted"),
    accuracy_score(y_test, y_test_pred),
    precision_score(y_test, y_test_pred, average="weighted"),
    roc_auc_score(y_test, y_test_pred_proba, multi_class="ovr")
]

In [40]:
display(compare_list)

Unnamed: 0,tuning,dataset,proc,C,gamma,kernel,grid_score,f1_score,accuracy,precision,roc_auc
0,before,validation,,default,default,default,,0.926572,0.935083,0.928717,
1,after,testing,,10,scale,rbf,0.927197,0.931802,0.938529,0.932113,0.920846
2,before,validation,Baseline,default,default,default,,0.920955,0.931038,0.924932,
3,after,testing,Baseline,10,scale,rbf,0.922766,0.926995,0.934996,0.928525,0.901149


### Stemmed + baseline preprocessing

In [41]:
x_train, x_val, x_test, y_train, y_val, y_test, vec_stem = pipeline("Stemmed", df_testing)

In [42]:
print(np.shape(x_train))
print(np.shape(x_val))
print(np.shape(x_test))

print(np.shape(y_train))
print(np.shape(y_val))
print(np.shape(y_test))

(483438, 150189)
(103594, 150189)
(103594, 150189)
(483438,)
(103594,)
(103594,)


In [43]:
svc_stem_prelim = SVC(verbose=True)
svc_stem_prelim.fit(x_train, y_train)

[D] [21:29:29.300987] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [21:29:32.666477] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 173 outer iterations, total inner 71335 iterations, and diff 0.000994
[D] [21:29:32.750336] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [21:30:11.249019] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 338 outer iterations, total inner 139602 iterations, and diff 0.000999
[D] [21:30:11.341683] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [21:31:09.594312] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:252 SMO iteration 500, diff 0.002575
[D] [21:31:12.651869] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 549 outer iterations, total inner 225043 iterations, and diff 0.000994


In [44]:
y_val_pred = svc_stem_prelim.predict(x_val)
print(classification_report(y_val, y_val_pred))

              precision    recall  f1-score   support

          -1       0.81      0.63      0.71      6302
           0       0.82      0.35      0.49      6608
           1       0.94      0.99      0.97     90684

    accuracy                           0.93    103594
   macro avg       0.86      0.66      0.72    103594
weighted avg       0.92      0.93      0.92    103594



In [45]:
compare_list.loc[len(compare_list)] = [
    "before",
    "validation",
    "Stemmed",
    "default",
    "default",
    "default",
    None,
    f1_score(y_val, y_val_pred, average="weighted"),
    accuracy_score(y_val, y_val_pred),
    precision_score(y_val, y_val_pred, average="weighted"),
    None,
]

In [46]:
svc_stem_grid = GridSearchCV(SVC(), param_grid, cv=cv, verbose=verbose, n_jobs=n_jobs)
svc_stem_grid.fit(x_val, y_val)

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV 1/3] END .....C=10, gamma=scale, kernel=rbf;, score=0.922 total time=   5.6s
[CV 2/3] END .....C=10, gamma=scale, kernel=rbf;, score=0.920 total time=   5.6s
[CV 3/3] END .....C=10, gamma=scale, kernel=rbf;, score=0.923 total time=   5.6s
[CV 1/3] END ..C=10, gamma=scale, kernel=linear;, score=0.910 total time=  12.8s
[CV 2/3] END ..C=10, gamma=scale, kernel=linear;, score=0.908 total time=  12.6s
[CV 3/3] END ..C=10, gamma=scale, kernel=linear;, score=0.909 total time=  13.0s
[CV 1/3] END .......C=10, gamma=0.1, kernel=rbf;, score=0.920 total time=   3.8s
[CV 2/3] END .......C=10, gamma=0.1, kernel=rbf;, score=0.920 total time=   3.7s
[CV 3/3] END .......C=10, gamma=0.1, kernel=rbf;, score=0.921 total time=   3.8s
[CV 1/3] END ....C=10, gamma=0.1, kernel=linear;, score=0.910 total time=  12.7s
[CV 2/3] END ....C=10, gamma=0.1, kernel=linear;, score=0.908 total time=  12.7s
[CV 3/3] END ....C=10, gamma=0.1, kernel=linear;

In [47]:
print(classification_report(y_val, svc_stem_grid.predict(x_val)))

              precision    recall  f1-score   support

          -1       1.00      1.00      1.00      6302
           0       1.00      1.00      1.00      6608
           1       1.00      1.00      1.00     90684

    accuracy                           1.00    103594
   macro avg       1.00      1.00      1.00    103594
weighted avg       1.00      1.00      1.00    103594



In [48]:
print("best params for stem")
print(svc_stem_grid.best_params_)

best params for stem
{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}


In [49]:
svc_stem = SVC(**svc_stem_grid.best_params_, probability=True, verbose=True)
svc_stem.fit(x_train, y_train)

[D] [21:35:51.588375] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [21:35:56.302957] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 292 outer iterations, total inner 128298 iterations, and diff 0.001000
[D] [21:35:56.372285] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [21:36:45.143383] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:252 SMO iteration 500, diff 0.002319
[D] [21:36:47.382989] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 543 outer iterations, total inner 234896 iterations, and diff 0.000994
[D] [21:36:47.462005] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [21:37:35.580954] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:252 SMO iteration 500, diff 0.058602
[D] [21:38:09.653222] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 917 outer iterations, total inner 386363 iterations, an

In [50]:
y_test_pred = svc_stem.predict(x_test)
y_test_pred_proba = svc_stem.predict_proba(x_test)
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

          -1       0.81      0.65      0.72      6208
           0       0.79      0.41      0.54      6507
           1       0.95      0.99      0.97     90879

    accuracy                           0.93    103594
   macro avg       0.85      0.68      0.74    103594
weighted avg       0.93      0.93      0.93    103594



In [51]:
compare_list.loc[len(compare_list)] = [
    "after",
    "testing",
    "Stemmed",
    svc_stem_grid.best_params_["C"],
    svc_stem_grid.best_params_["gamma"],
    svc_stem_grid.best_params_["kernel"],
    svc_stem_grid.best_score_,
    f1_score(y_test, y_test_pred, average="weighted"),
    accuracy_score(y_test, y_test_pred),
    precision_score(y_test, y_test_pred, average="weighted"),
    roc_auc_score(y_test, y_test_pred_proba, multi_class="ovr")
]

In [52]:
display(compare_list)

Unnamed: 0,tuning,dataset,proc,C,gamma,kernel,grid_score,f1_score,accuracy,precision,roc_auc
0,before,validation,,default,default,default,,0.926572,0.935083,0.928717,
1,after,testing,,10,scale,rbf,0.927197,0.931802,0.938529,0.932113,0.920846
2,before,validation,Baseline,default,default,default,,0.920955,0.931038,0.924932,
3,after,testing,Baseline,10,scale,rbf,0.922766,0.926995,0.934996,0.928525,0.901149
4,before,validation,Stemmed,default,default,default,,0.919281,0.929735,0.923648,
5,after,testing,Stemmed,10,scale,rbf,0.921714,0.92666,0.934697,0.928322,0.898719


### Lemmatized + baseline preprocessing

In [53]:
x_train, x_val, x_test, y_train, y_val, y_test, vec_lem = pipeline("Lemmatized", df_testing)

In [54]:
print(np.shape(x_train))
print(np.shape(x_val))
print(np.shape(x_test))

print(np.shape(y_train))
print(np.shape(y_val))
print(np.shape(y_test))

(483438, 175340)
(103594, 175340)
(103594, 175340)
(483438,)
(103594,)
(103594,)


In [55]:
svc_lem_prelim = SVC(verbose=True)
svc_lem_prelim.fit(x_train, y_train)

[D] [21:52:39.949400] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [21:52:43.459632] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 177 outer iterations, total inner 72148 iterations, and diff 0.001000
[D] [21:52:43.529375] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [21:53:23.469496] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 342 outer iterations, total inner 141881 iterations, and diff 0.001000
[D] [21:53:23.557342] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [21:54:22.365765] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:252 SMO iteration 500, diff 0.002646
[D] [21:54:25.448193] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 544 outer iterations, total inner 225092 iterations, and diff 0.000998


In [56]:
y_val_pred = svc_lem_prelim.predict(x_val)
print(classification_report(y_val, y_val_pred))

              precision    recall  f1-score   support

          -1       0.81      0.62      0.71      6302
           0       0.81      0.36      0.49      6608
           1       0.94      0.99      0.97     90684

    accuracy                           0.93    103594
   macro avg       0.86      0.66      0.72    103594
weighted avg       0.92      0.93      0.92    103594



In [57]:
compare_list.loc[len(compare_list)] = [
    "before",
    "validation",
    "Lemmatized",
    "default",
    "default",
    "default",
    None,
    f1_score(y_val, y_val_pred, average="weighted"),
    accuracy_score(y_val, y_val_pred),
    precision_score(y_val, y_val_pred, average="weighted"),
    None,
]

In [58]:
svc_lem_grid = GridSearchCV(SVC(), param_grid, cv=cv, verbose=verbose, n_jobs=n_jobs)
svc_lem_grid.fit(x_val, y_val)

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV 1/3] END .....C=10, gamma=scale, kernel=rbf;, score=0.923 total time=   5.6s
[CV 2/3] END .....C=10, gamma=scale, kernel=rbf;, score=0.921 total time=   5.6s
[CV 3/3] END .....C=10, gamma=scale, kernel=rbf;, score=0.923 total time=   5.7s
[CV 1/3] END ..C=10, gamma=scale, kernel=linear;, score=0.910 total time=  12.4s
[CV 2/3] END ..C=10, gamma=scale, kernel=linear;, score=0.909 total time=  12.7s
[CV 3/3] END ..C=10, gamma=scale, kernel=linear;, score=0.910 total time=  12.6s
[CV 1/3] END .......C=10, gamma=0.1, kernel=rbf;, score=0.920 total time=   3.9s
[CV 2/3] END .......C=10, gamma=0.1, kernel=rbf;, score=0.920 total time=   3.9s
[CV 3/3] END .......C=10, gamma=0.1, kernel=rbf;, score=0.921 total time=   4.0s
[CV 1/3] END ....C=10, gamma=0.1, kernel=linear;, score=0.910 total time=  12.6s
[CV 2/3] END ....C=10, gamma=0.1, kernel=linear;, score=0.909 total time=  12.6s
[CV 3/3] END ....C=10, gamma=0.1, kernel=linear;

In [59]:
print(classification_report(y_val, svc_lem_grid.predict(x_val)))

              precision    recall  f1-score   support

          -1       1.00      1.00      1.00      6302
           0       1.00      1.00      1.00      6608
           1       1.00      1.00      1.00     90684

    accuracy                           1.00    103594
   macro avg       1.00      1.00      1.00    103594
weighted avg       1.00      1.00      1.00    103594



In [60]:
print("best params for lem")
print(svc_lem_grid.best_params_)

best params for lem
{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}


In [61]:
svc_lem = SVC(**svc_lem_grid.best_params_, probability=True, verbose=True)
svc_lem.fit(x_train, y_train)

[D] [21:59:08.775937] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [21:59:13.461086] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 286 outer iterations, total inner 123323 iterations, and diff 0.000997
[D] [21:59:13.519722] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [22:00:02.673834] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:252 SMO iteration 500, diff 0.001997
[D] [22:00:03.972107] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 527 outer iterations, total inner 228006 iterations, and diff 0.000998
[D] [22:00:04.042806] /__w/cuml/cuml/cpp/src/svm/workingset.cuh:118 Creating working set with 1024 elements
[D] [22:00:52.955191] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:252 SMO iteration 500, diff 0.051262
[D] [22:01:24.642549] /__w/cuml/cuml/cpp/src/svm/smosolver.cuh:255 SMO solver finished after 881 outer iterations, total inner 372399 iterations, an

In [62]:
y_test_pred = svc_lem.predict(x_test)
y_test_pred_proba = svc_lem.predict_proba(x_test)
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

          -1       0.81      0.65      0.72      6208
           0       0.78      0.40      0.53      6507
           1       0.95      0.99      0.97     90879

    accuracy                           0.93    103594
   macro avg       0.85      0.68      0.74    103594
weighted avg       0.93      0.93      0.93    103594



In [63]:
compare_list.loc[len(compare_list)] = [
    "after",
    "testing",
    "Lemmatized",
    svc_lem_grid.best_params_["C"],
    svc_lem_grid.best_params_["gamma"],
    svc_lem_grid.best_params_["kernel"],
    svc_lem_grid.best_score_,
    f1_score(y_test, y_test_pred, average="weighted"),
    accuracy_score(y_test, y_test_pred),
    precision_score(y_test, y_test_pred, average="weighted"),
    roc_auc_score(y_test, y_test_pred_proba, multi_class="ovr")
]

In [64]:
display(compare_list)

Unnamed: 0,tuning,dataset,proc,C,gamma,kernel,grid_score,f1_score,accuracy,precision,roc_auc
0,before,validation,,default,default,default,,0.926572,0.935083,0.928717,
1,after,testing,,10,scale,rbf,0.927197,0.931802,0.938529,0.932113,0.920846
2,before,validation,Baseline,default,default,default,,0.920955,0.931038,0.924932,
3,after,testing,Baseline,10,scale,rbf,0.922766,0.926995,0.934996,0.928525,0.901149
4,before,validation,Stemmed,default,default,default,,0.919281,0.929735,0.923648,
5,after,testing,Stemmed,10,scale,rbf,0.921714,0.92666,0.934697,0.928322,0.898719
6,before,validation,Lemmatized,default,default,default,,0.919798,0.930083,0.923782,
7,after,testing,Lemmatized,10,scale,rbf,0.922302,0.926326,0.934407,0.927716,0.899914


In [65]:
compare_list.to_csv("/content/drive/MyDrive/Syncable/projects/data270/results/svm_final_results.csv", index=False)

# Conclusion
- The best model based on the testing above is the model with no preprocessing applied to the text data.
- Though the model with no preprocessing applied to the text data is the best, the difference in performance between the models is not significant.

In [None]:
pickle.dump(svc_noproc, open("/content/drive/MyDrive/Syncable/projects/data270/models/svm_final_model.pkl", "wb"))
pickle.dump(vec_noproc, open("/content/drive/MyDrive/Syncable/projects/data270/models/svm_final_vectorizer.pkl", "wb"))

In [None]:
model = pickle.load(open("/content/drive/MyDrive/Syncable/projects/data270/models/svm_final_model.pkl", "rb"))
vectorizer = pickle.load(open("/content/drive/MyDrive/Syncable/projects/data270/models/svm_final_vectorizer.pkl", "rb"))

In [None]:
x_train, x_val, x_test, y_train, y_val, y_test, vectorizer = pipeline(None, df_testing, vectorizer)
y_test_pred = model.predict(x_test)
y_test_pred_proba = model.predict_proba(x_test)
print(classification_report(y_test, y_test_pred))
print()
print("f1_score:", f1_score(y_test, y_test_pred, average="weighted"))
print("accuracy:", accuracy_score(y_test, y_test_pred))
print("precision:", precision_score(y_test, y_test_pred, average="weighted"))
print("roc_auc:", roc_auc_score(y_test, y_test_pred_proba, multi_class="ovr"))

plt.figure(figsize=(10, 10))
fpr, tpr, _ = roc_curve(y_test, y_test_pred_proba, pos_label=1)
plt.plot(fpr, tpr)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.show()