In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV

from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, f1_score

In [None]:
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


## Reading in data sources

###**Spam** data sets

In [None]:
## Full features
X_train_full = pd.read_pickle('/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/Full Text SPAM/X_train_fSP.pkl')
X_test_full = pd.read_pickle('/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/Full Text SPAM/X_test_fSP.pkl')

## Vanilla features
X_train_v = X_train_full.drop(['text', 'cleaned_text', 'cleaned_text_full'], axis=1)
X_test_v = X_test_full.drop(['text', 'cleaned_text', 'cleaned_text_full'], axis=1)
scaler = MinMaxScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train_v), index = X_train_v.index, columns = X_train_v.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test_v), index = X_test_v.index, columns = X_test_v.columns)

## target label
y_train = pd.read_pickle('/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/Full Text SPAM/y_train_fSP.pkl')
y_test = pd.read_pickle('/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/Full Text SPAM/y_test_fSP.pkl')

## TF-IDF with top 15 features
X_train_tfidf = pd.read_pickle('/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/TFIDF SPAM/X_train_tfSP.pkl')
X_test_tfidf = pd.read_pickle('/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/TFIDF SPAM/X_test_tfSP.pkl')

## TF-IDF GloVe with top 15 features
X_train_tfidf_glove = pd.read_pickle('/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/TFIDF Glove SPAM/X_train_tfglSP.pkl')
X_test_tfidf_glove = pd.read_pickle('/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/TFIDF Glove SPAM/X_test_tfglSP.pkl')

## TF-IDF FastText (cc) with top 15 features
X_train_tfidf_cc = pd.read_pickle('/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/TFIDF CC SPAM/X_train_tfccSP.pkl')
X_test_tfidf_cc = pd.read_pickle('/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/TFIDF CC SPAM/X_test_tfccSP.pkl')

## TF-IDF InferSent with top 15 features
X_train_tfidf_inf = pd.read_pickle('/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/InferSent SPAM/X_train_infSP.pkl')
X_test_tfidf_inf = pd.read_pickle('/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/InferSent SPAM/X_test_infSP.pkl')
y_train_tfidf_inf = pd.read_pickle('/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/InferSent SPAM/y_train_infSP.pkl')
y_test_tfidf_inf = pd.read_pickle('/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/InferSent SPAM/y_test_infSP.pkl')

##Support Vector Classifier

### Hyperparameter Tuning

In [None]:
def show_results(y_actual, y_pred):
  print(classification_report(y_actual, y_pred, digits=3))
  print(confusion_matrix(y_actual, y_pred))
  print("Accuracy: " + str(accuracy_score(y_actual, y_pred)))
  print("f1 score: " + str(f1_score(y_actual, y_pred)))
  tn, fp, fn, tp = confusion_matrix(y_actual, y_pred).ravel()
  fpr = fp/(fp+tn)
  print("False Postive Rate: " + str(fpr) + "\n")

In [None]:
# Specifies the norm used in the penalization.
penalty = ['l1', 'l2']
# Inverse of regularization strength: Smaller values specify stronger regularization.
C = [0.001, 0.01, 0.1, 1, 10]
# Weights associated with classes in the form {class_label: weight}. If not given, all classes are supposed to have weight one.
class_weight = [None, 'balanced']
# Maximum number of iterations taken for the solvers to converge. - Set at 10,000 after testing in order to ensure the logreg process converges.
max_iter = [10000]

# Create the param grid
param_test = {'penalty': penalty,
              'C': C,
              'class_weight': class_weight,
              'max_iter': max_iter}

###Training best model for scaled Vanilla features

In [None]:
clf_1 = LinearSVC(random_state=1)
rs_1 = RandomizedSearchCV(estimator = clf_1, 
                            param_distributions = param_test, 
                            n_iter = 50,
                            refit = True,
                            random_state = 1,
                            n_jobs = -1
                            )
%time rs_1.fit(X_train_scaled, y_train)
print('Best score reached: {} with params: {} '.format(rs_1.best_score_, rs_1.best_params_))



CPU times: user 3.78 s, sys: 112 ms, total: 3.89 s
Wall time: 24 s
Best score reached: 0.7624440280485665 with params: {'penalty': 'l2', 'max_iter': 10000, 'class_weight': 'balanced', 'C': 10} 


In [None]:
%time y_train_pred = rs_1.predict(X_train_scaled)
print("Train Results:")
show_results(y_train, y_train_pred)

%time y_test_pred = rs_1.predict(X_test_scaled)
print("Test Results:")
show_results(y_test, y_test_pred)

CPU times: user 8.05 ms, sys: 0 ns, total: 8.05 ms
Wall time: 7.44 ms
Train Results:
              precision    recall  f1-score   support

           0      0.761     0.834     0.796     12462
           1      0.766     0.675     0.718     10059

    accuracy                          0.763     22521
   macro avg      0.764     0.755     0.757     22521
weighted avg      0.763     0.763     0.761     22521

[[10391  2071]
 [ 3266  6793]]
Accuracy: 0.7630211802317837
f1 score: 0.7179622681393013
False Postive Rate: 0.1661852030171722

CPU times: user 6.48 ms, sys: 74 µs, total: 6.55 ms
Wall time: 6.23 ms
Test Results:
              precision    recall  f1-score   support

           0      0.761     0.840     0.799      6138
           1      0.773     0.673     0.720      4955

    accuracy                          0.766     11093
   macro avg      0.767     0.757     0.759     11093
weighted avg      0.766     0.766     0.764     11093

[[5158  980]
 [1618 3337]]
Accuracy: 0.76579825

###Training best model for Top 15 Features + TF-IDF

In [None]:
clf_2 = LinearSVC(random_state=1)
rs_2 = RandomizedSearchCV(estimator = clf_2, 
                      param_distributions = param_test, 
                      n_iter = 50,
                      refit = True,
                      random_state = 1,
                      n_jobs = -1
                      )

%time rs_2.fit(X_train_tfidf, y_train)
print('Best score reached: {} with params: {} '.format(rs_2.best_score_, rs_2.best_params_))



CPU times: user 1.45 s, sys: 591 ms, total: 2.04 s
Wall time: 30.6 s
Best score reached: 0.9260246065650364 with params: {'penalty': 'l2', 'max_iter': 10000, 'class_weight': None, 'C': 1} 


In [None]:
%time y_train_pred = rs_2.predict(X_train_tfidf)
print("Train Results:")
show_results(y_train, y_train_pred)

%time y_test_pred = rs_2.predict(X_test_tfidf)
print("Test Results:")
show_results(y_test, y_test_pred)

CPU times: user 47 ms, sys: 4.15 ms, total: 51.2 ms
Wall time: 37.6 ms
Train Results:
              precision    recall  f1-score   support

           0      0.935     0.961     0.947     12462
           1      0.950     0.917     0.933     10059

    accuracy                          0.941     22521
   macro avg      0.942     0.939     0.940     22521
weighted avg      0.941     0.941     0.941     22521

[[11974   488]
 [  839  9220]]
Accuracy: 0.941077216819857
f1 score: 0.9328679111650731
False Postive Rate: 0.039159043492216335

CPU times: user 92.8 ms, sys: 34.6 ms, total: 127 ms
Wall time: 122 ms
Test Results:
              precision    recall  f1-score   support

           0      0.924     0.945     0.934      6138
           1      0.930     0.904     0.917      4955

    accuracy                          0.927     11093
   macro avg      0.927     0.924     0.926     11093
weighted avg      0.927     0.927     0.927     11093

[[5799  339]
 [ 475 4480]]
Accuracy: 0.926620

###Training best model for Top 15 Features + TF-IDF weighted GloVe vectors

In [None]:
clf_3 = LinearSVC(random_state=1)
rs_3 = RandomizedSearchCV(estimator=clf_3, 
                      param_distributions = param_test, 
                      n_iter = 50,
                      refit = True,
                      random_state = 1,
                      n_jobs = -1
                      )

%time rs_3.fit(X_train_tfidf_glove, y_train)
print('Best score reached: {} with params: {} '.format(rs_3.best_score_, rs_3.best_params_))



CPU times: user 4min 23s, sys: 1.63 s, total: 4min 25s
Wall time: 41min 4s
Best score reached: 0.9007149151426381 with params: {'penalty': 'l2', 'max_iter': 10000, 'class_weight': None, 'C': 10} 




In [None]:
%time y_train_pred = rs_3.predict(X_train_tfidf_glove)
print("Train Results:")
show_results(y_train, y_train_pred)

%time y_test_pred = rs_3.predict(X_test_tfidf_glove)
print("Test Results:")
show_results(y_test, y_test_pred)

CPU times: user 25.8 ms, sys: 0 ns, total: 25.8 ms
Wall time: 22 ms
Train Results:
              precision    recall  f1-score   support

           0      0.898     0.935     0.916     12462
           1      0.915     0.869     0.891     10059

    accuracy                          0.905     22521
   macro avg      0.906     0.902     0.904     22521
weighted avg      0.906     0.905     0.905     22521

[[11649   813]
 [ 1322  8737]]
Accuracy: 0.9051995914923849
f1 score: 0.8911214238359937
False Postive Rate: 0.06523832450649976

CPU times: user 57.7 ms, sys: 12 ms, total: 69.7 ms
Wall time: 65.3 ms
Test Results:
              precision    recall  f1-score   support

           0      0.891     0.935     0.913      6138
           1      0.914     0.858     0.885      4955

    accuracy                          0.901     11093
   macro avg      0.903     0.897     0.899     11093
weighted avg      0.901     0.901     0.900     11093

[[5740  398]
 [ 702 4253]]
Accuracy: 0.900838366

###Training best model for Top 15 Features + TF-IDF weighted FastText

In [None]:
clf_4 = LinearSVC(random_state=1)
rs_4 = RandomizedSearchCV(estimator=clf_4, 
                      param_distributions = param_test,  
                      n_iter = 50,
                      refit = True,
                      random_state = 1,
                      n_jobs = -1
                      )

%time rs_4.fit(X_train_tfidf_cc, y_train)
print('Best score reached: {} with params: {} '.format(rs_4.best_score_, rs_4.best_params_))



CPU times: user 2min 38s, sys: 542 ms, total: 2min 39s
Wall time: 25min 52s
Best score reached: 0.9179434238255106 with params: {'penalty': 'l2', 'max_iter': 10000, 'class_weight': None, 'C': 10} 




In [None]:
%time y_train_pred = rs_4.predict(X_train_tfidf_cc)
print("Train Results:")
show_results(y_train, y_train_pred)

%time y_test_pred = rs_4.predict(X_test_tfidf_cc)
print("Test Results:")
show_results(y_test, y_test_pred)

CPU times: user 21.8 ms, sys: 4.01 ms, total: 25.8 ms
Wall time: 22.6 ms
Train Results:
              precision    recall  f1-score   support

           0      0.921     0.942     0.931     12462
           1      0.926     0.901     0.913     10059

    accuracy                          0.923     22521
   macro avg      0.924     0.921     0.922     22521
weighted avg      0.923     0.923     0.923     22521

[[11734   728]
 [ 1000  9059]]
Accuracy: 0.9232716131610497
f1 score: 0.9129295575934697
False Postive Rate: 0.058417589471994866

CPU times: user 61.3 ms, sys: 6.02 ms, total: 67.4 ms
Wall time: 63.7 ms
Test Results:
              precision    recall  f1-score   support

           0      0.917     0.939     0.928      6138
           1      0.922     0.895     0.908      4955

    accuracy                          0.919     11093
   macro avg      0.920     0.917     0.918     11093
weighted avg      0.919     0.919     0.919     11093

[[5764  374]
 [ 521 4434]]
Accuracy: 0.9

###Training best model for Top 15 Features + TF-IDF InferSent

In [None]:
clf_5 = LinearSVC(random_state=1)
rs_5 = RandomizedSearchCV(estimator=clf_5, 
                      param_distributions = param_test,  
                      n_iter = 50,
                      refit = True,
                      random_state = 1,
                      n_jobs = -1
                      )

%time rs_5.fit(X_train_tfidf_inf, y_train_tfidf_inf)
print('Best score reached: {} with params: {} '.format(rs_5.best_score_, rs_5.best_params_))



CPU times: user 29.5 s, sys: 3.96 s, total: 33.5 s
Wall time: 1h 49min 28s
Best score reached: 0.7087064996899832 with params: {'penalty': 'l2', 'max_iter': 10000, 'class_weight': None, 'C': 0.1} 


In [None]:
%time y_train_pred = rs_5.predict(X_train_tfidf_inf)
print("Train Results:")
show_results(y_train_tfidf_inf, y_train_pred)

%time y_test_pred = rs_5.predict(X_test_tfidf_inf)
print("Test Results:")
show_results(y_test_tfidf_inf, y_test_pred)

CPU times: user 243 ms, sys: 23.1 ms, total: 266 ms
Wall time: 205 ms
Train Results:
              precision    recall  f1-score   support

           0      0.758     0.928     0.835     10560
           1      0.835     0.553     0.665      6979

    accuracy                          0.779     17539
   macro avg      0.797     0.740     0.750     17539
weighted avg      0.789     0.779     0.767     17539

[[9799  761]
 [3122 3857]]
Accuracy: 0.7786076743257883
f1 score: 0.6651720272484263
False Postive Rate: 0.07206439393939394

CPU times: user 295 ms, sys: 25.1 ms, total: 321 ms
Wall time: 286 ms
Test Results:
              precision    recall  f1-score   support

           0      0.709     0.882     0.786      5191
           1      0.716     0.451     0.554      3422

    accuracy                          0.711      8613
   macro avg      0.713     0.667     0.670      8613
weighted avg      0.712     0.711     0.694      8613

[[4578  613]
 [1877 1545]]
Accuracy: 0.710902124695