In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV

from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, f1_score

In [None]:
# Colab-specific: mount Google Drive at /content/gdrive so the pickled
# datasets read later in the notebook are reachable as ordinary files.
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


## Reading in data sources

### **Scam** data sets

In [None]:
# Every cleaned SCAM input lives under one Drive folder; keeping the root in a
# single constant makes relocating the data a one-line change.
DATA_DIR = '/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data'

# Raw/cleaned text columns that are dropped to obtain the numeric "vanilla" features.
TEXT_COLUMNS = ['text', 'cleaned_text', 'cleaned_text_full']


def load_split_pair(folder, prefix, tag):
    """Read the train and test pickles of one dataset.

    Files are looked up as ``DATA_DIR/<folder>/<prefix>_train_<tag>.pkl`` and
    ``DATA_DIR/<folder>/<prefix>_test_<tag>.pkl``.

    NOTE: unpickling can execute arbitrary code, so only load files that were
    produced by this project's own preprocessing notebooks.

    Returns:
        A ``(train, test)`` tuple of the unpickled objects.
    """
    train, test = (
        pd.read_pickle('{}/{}/{}_{}_{}.pkl'.format(DATA_DIR, folder, prefix, split, tag))
        for split in ('train', 'test')
    )
    return train, test


## Full features
X_train_full, X_test_full = load_split_pair('Full Text SCAM', 'X', 'fSC')

## Vanilla features
X_train_v = X_train_full.drop(columns=TEXT_COLUMNS)
X_test_v = X_test_full.drop(columns=TEXT_COLUMNS)
# Fit the scaler on the training split only and reuse it for the test split,
# so no test-set statistics leak into training.
scaler = MinMaxScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train_v), index=X_train_v.index, columns=X_train_v.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test_v), index=X_test_v.index, columns=X_test_v.columns)

## target label
y_train, y_test = load_split_pair('Full Text SCAM', 'y', 'fSC')

## TF-IDF with top 15 features
X_train_tfidf, X_test_tfidf = load_split_pair('TFIDF SCAM', 'X', 'tfSC')

## TF-IDF GloVe with top 15 features
X_train_tfidf_glove, X_test_tfidf_glove = load_split_pair('TFIDF Glove SCAM', 'X', 'tfglSC')

## TF-IDF FastText (cc) with top 15 features
X_train_tfidf_cc, X_test_tfidf_cc = load_split_pair('TFIDF CC SCAM', 'X', 'tfccSC')

## TF-IDF InferSent with top 15 features
# The InferSent data ships with its own label files, which are used instead of
# y_train / y_test whenever these features are fitted or evaluated.
X_train_tfidf_inf, X_test_tfidf_inf = load_split_pair('InferSent SCAM', 'X', 'infSC')
y_train_tfidf_inf, y_test_tfidf_inf = load_split_pair('InferSent SCAM', 'y', 'infSC')

# Fail early, with a clear message, if any feature matrix does not line up
# with the label vector it is paired with later in the notebook.
assert all(
    len(features) == len(labels)
    for features, labels in [
        (X_train_scaled, y_train), (X_test_scaled, y_test),
        (X_train_tfidf, y_train), (X_test_tfidf, y_test),
        (X_train_tfidf_glove, y_train), (X_test_tfidf_glove, y_test),
        (X_train_tfidf_cc, y_train), (X_test_tfidf_cc, y_test),
        (X_train_tfidf_inf, y_train_tfidf_inf), (X_test_tfidf_inf, y_test_tfidf_inf),
    ]
), 'a feature matrix and its label vector have different row counts'

## Support Vector Classifier

### Hyperparameter Tuning

In [None]:
def show_results(y_actual, y_pred, y_prob=None):
  """Print a standard set of binary-classification metrics.

  Prints, in order: the classification report, the confusion matrix, accuracy,
  ROC AUC (only when scores are supplied), F1 score and the false positive rate.

  Args:
    y_actual: ground-truth labels.
    y_pred: predicted hard labels.
    y_prob: optional continuous scores for the positive class (probabilities
      or decision-function margins). When omitted, the AUC_ROC line is
      skipped, so two-argument calls work.

  Raises:
    ValueError: if the confusion matrix is not 2x2 (the tn/fp/fn/tp unpacking
      below only makes sense for binary labels).
  """
  print(classification_report(y_actual, y_pred, digits=3))
  # Compute the confusion matrix once and reuse it for the FPR below.
  cm = confusion_matrix(y_actual, y_pred)
  print(cm)
  print("Accuracy: " + str(accuracy_score(y_actual, y_pred)))
  if y_prob is not None:
    print("AUC_ROC: " + str(roc_auc_score(y_actual, y_prob)))
  print("f1 score: " + str(f1_score(y_actual, y_pred)))
  tn, fp, fn, tp = cm.ravel()
  # False positive rate = FP / (FP + TN): share of actual negatives flagged positive.
  fpr = fp/(fp+tn)
  print("False Postive Rate: " + str(fpr) + "\n")

In [None]:
# Specifies the norm used in the penalization.
penalty = ['l1', 'l2']
# Regularization parameter: smaller values specify stronger regularization.
C = [0.001, 0.01, 0.1, 1, 10]
# Weights associated with classes in the form {class_label: weight}. If not given, all classes are supposed to have weight one.
class_weight = [None, 'balanced']
# Maximum number of solver iterations - fixed at 10,000 after testing so that LinearSVC converges.
max_iter = [10000]

# Extra settings a penalty needs to be a valid LinearSVC configuration.
# With the default squared-hinge loss, penalty='l1' is only supported by the
# primal solver (dual=False); without this every l1 candidate fails to fit
# and is scored NaN, silently wasting half of the search.
penalty_requirements = {'l1': {'dual': [False]}}

# Create the param grid: one sub-grid per penalty, so the l1-only 'dual'
# override does not change how the l2 candidates are solved.
param_test = [
    {'penalty': [norm],
     'C': C,
     'class_weight': class_weight,
     'max_iter': max_iter,
     **penalty_requirements.get(norm, {})}
    for norm in penalty
]

### Training best model for scaled Vanilla features

In [None]:
clf_1 = LinearSVC(random_state=1)
rs_1 = RandomizedSearchCV(estimator = clf_1, 
                            param_distributions = param_test, 
                            n_iter = 50,
                            refit = True,
                            random_state = 1,
                            n_jobs = -1
                            )
%time rs_1.fit(X_train_scaled, y_train)
print('Best score reached: {} with params: {} '.format(rs_1.best_score_, rs_1.best_params_))



CPU times: user 411 ms, sys: 30.4 ms, total: 442 ms
Wall time: 2.64 s
Best score reached: 0.891377887788779 with params: {'penalty': 'l2', 'max_iter': 10000, 'class_weight': None, 'C': 10} 


In [None]:
%time y_train_pred = rs_1.predict(X_train_scaled)
print("Train Results:")
show_results(y_train, y_train_pred)

%time y_test_pred = rs_1.predict(X_test_scaled)
print("Test Results:")
show_results(y_test, y_test_pred)

CPU times: user 1.37 ms, sys: 4.04 ms, total: 5.41 ms
Wall time: 7.06 ms
Train Results:
              precision    recall  f1-score   support

           0      0.871     0.829     0.850       538
           1      0.909     0.933     0.921       981

    accuracy                          0.896      1519
   macro avg      0.890     0.881     0.885      1519
weighted avg      0.895     0.896     0.895      1519

[[446  92]
 [ 66 915]]
Accuracy: 0.8959842001316656
f1 score: 0.9205231388329981
False Postive Rate: 0.12890625

CPU times: user 1.82 ms, sys: 510 µs, total: 2.33 ms
Wall time: 1.17 ms
Test Results:
              precision    recall  f1-score   support

           0      0.840     0.853     0.846       265
           1      0.919     0.911     0.915       484

    accuracy                          0.891       749
   macro avg      0.879     0.882     0.881       749
weighted avg      0.891     0.891     0.891       749

[[226  39]
 [ 43 441]]
Accuracy: 0.890520694259012
f1 score

### Training best model for Top 15 Features + TF-IDF

In [None]:
clf_2 = LinearSVC(random_state=1)
rs_2 = RandomizedSearchCV(estimator = clf_2, 
                      param_distributions = param_test, 
                      n_iter = 50,
                      refit = True,
                      random_state = 1,
                      n_jobs = -1
                      )

%time rs_2.fit(X_train_tfidf, y_train)
print('Best score reached: {} with params: {} '.format(rs_2.best_score_, rs_2.best_params_))



CPU times: user 225 ms, sys: 55.8 ms, total: 281 ms
Wall time: 1.04 s
Best score reached: 0.9756405245787738 with params: {'penalty': 'l2', 'max_iter': 10000, 'class_weight': 'balanced', 'C': 1} 


In [None]:
%time y_train_pred = rs_2.predict(X_train_tfidf)
print("Train Results:")
show_results(y_train, y_train_pred)

%time y_test_pred = rs_2.predict(X_test_tfidf)
print("Test Results:")
show_results(y_test, y_test_pred)

CPU times: user 2.48 ms, sys: 0 ns, total: 2.48 ms
Wall time: 2.43 ms
Train Results:
              precision    recall  f1-score   support

           0      0.994     0.998     0.996       538
           1      0.999     0.997     0.998       981

    accuracy                          0.997      1519
   macro avg      0.997     0.998     0.997      1519
weighted avg      0.997     0.997     0.997      1519

[[537   1]
 [  3 978]]
Accuracy: 0.9973666886109283
f1 score: 0.9979591836734694
False Postive Rate: 0.005555555555555556

CPU times: user 4.57 ms, sys: 2.99 ms, total: 7.56 ms
Wall time: 3.8 ms
Test Results:
              precision    recall  f1-score   support

           0      0.949     0.981     0.965       265
           1      0.989     0.971     0.980       484

    accuracy                          0.975       749
   macro avg      0.969     0.976     0.972       749
weighted avg      0.975     0.975     0.975       749

[[260   5]
 [ 14 470]]
Accuracy: 0.9746328437917223


### Training best model for Top 15 Features + TF-IDF weighted GloVe vectors

In [None]:
clf_3 = LinearSVC(random_state=1)
rs_3 = RandomizedSearchCV(estimator=clf_3, 
                      param_distributions = param_test, 
                      n_iter = 50,
                      refit = True,
                      random_state = 1,
                      n_jobs = -1
                      )

%time rs_3.fit(X_train_tfidf_glove, y_train)
print('Best score reached: {} with params: {} '.format(rs_3.best_score_, rs_3.best_params_))



CPU times: user 390 ms, sys: 44.8 ms, total: 435 ms
Wall time: 11.5 s
Best score reached: 0.9828751954142783 with params: {'penalty': 'l2', 'max_iter': 10000, 'class_weight': None, 'C': 0.1} 


In [None]:
%time y_train_pred = rs_3.predict(X_train_tfidf_glove)
print("Train Results:")
show_results(y_train, y_train_pred)

%time y_test_pred = rs_3.predict(X_test_tfidf_glove)
print("Test Results:")
show_results(y_test, y_test_pred)

CPU times: user 2.35 ms, sys: 59 µs, total: 2.41 ms
Wall time: 1.87 ms
Train Results:
              precision    recall  f1-score   support

           0      1.000     1.000     1.000       538
           1      1.000     1.000     1.000       981

    accuracy                          1.000      1519
   macro avg      1.000     1.000     1.000      1519
weighted avg      1.000     1.000     1.000      1519

[[538   0]
 [  0 981]]
Accuracy: 1.0
f1 score: 1.0
False Postive Rate: 0.0

CPU times: user 3.09 ms, sys: 3.72 ms, total: 6.81 ms
Wall time: 3.42 ms
Test Results:
              precision    recall  f1-score   support

           0      0.966     0.955     0.960       265
           1      0.975     0.981     0.978       484

    accuracy                          0.972       749
   macro avg      0.971     0.968     0.969       749
weighted avg      0.972     0.972     0.972       749

[[253  12]
 [  9 475]]
Accuracy: 0.9719626168224299
f1 score: 0.9783728115345005
False Postive Ra

### Training best model for Top 15 Features + TF-IDF weighted FastText

In [None]:
clf_4 = LinearSVC(random_state=1)
rs_4 = RandomizedSearchCV(estimator=clf_4, 
                      param_distributions = param_test,  
                      n_iter = 50,
                      refit = True,
                      random_state = 1,
                      n_jobs = -1
                      )

%time rs_4.fit(X_train_tfidf_cc, y_train)
print('Best score reached: {} with params: {} '.format(rs_4.best_score_, rs_4.best_params_))



CPU times: user 252 ms, sys: 60.7 ms, total: 313 ms
Wall time: 3.49 s
Best score reached: 0.9907807886051764 with params: {'penalty': 'l2', 'max_iter': 10000, 'class_weight': None, 'C': 0.01} 


In [None]:
%time y_train_pred = rs_4.predict(X_train_tfidf_cc)
print("Train Results:")
show_results(y_train, y_train_pred)

%time y_test_pred = rs_4.predict(X_test_tfidf_cc)
print("Test Results:")
show_results(y_test, y_test_pred)

CPU times: user 4.57 ms, sys: 4.02 ms, total: 8.59 ms
Wall time: 7.12 ms
Train Results:
              precision    recall  f1-score   support

           0      0.996     0.993     0.994       538
           1      0.996     0.998     0.997       981

    accuracy                          0.996      1519
   macro avg      0.996     0.995     0.996      1519
weighted avg      0.996     0.996     0.996      1519

[[534   4]
 [  2 979]]
Accuracy: 0.9960500329163924
f1 score: 0.9969450101832993
False Postive Rate: 0.0037313432835820895

CPU times: user 3.25 ms, sys: 3.94 ms, total: 7.19 ms
Wall time: 3.81 ms
Test Results:
              precision    recall  f1-score   support

           0      0.981     0.974     0.977       265
           1      0.986     0.990     0.988       484

    accuracy                          0.984       749
   macro avg      0.983     0.982     0.982       749
weighted avg      0.984     0.984     0.984       749

[[258   7]
 [  5 479]]
Accuracy: 0.983978638184

### Training best model for Top 15 Features + TF-IDF InferSent

In [None]:
clf_5 = LinearSVC(random_state=1)
rs_5 = RandomizedSearchCV(estimator=clf_5, 
                      param_distributions = param_test,  
                      n_iter = 50,
                      refit = True,
                      random_state = 1,
                      n_jobs = -1
                      )

%time rs_5.fit(X_train_tfidf_inf, y_train_tfidf_inf)
print('Best score reached: {} with params: {} '.format(rs_5.best_score_, rs_5.best_params_))



CPU times: user 2.34 s, sys: 279 ms, total: 2.61 s
Wall time: 3min 50s
Best score reached: 0.8577446943300601 with params: {'penalty': 'l2', 'max_iter': 10000, 'class_weight': None, 'C': 0.1} 


In [None]:
%time y_train_pred = rs_5.predict(X_train_tfidf_inf)
print("Train Results:")
show_results(y_train_tfidf_inf, y_train_pred)

%time y_test_pred = rs_5.predict(X_test_tfidf_inf)
print("Test Results:")
show_results(y_test_tfidf_inf, y_test_pred)

CPU times: user 19.8 ms, sys: 1.83 ms, total: 21.6 ms
Wall time: 20.1 ms
Train Results:
              precision    recall  f1-score   support

           0      0.925     0.821     0.870       453
           1      0.922     0.969     0.945       981

    accuracy                          0.923      1434
   macro avg      0.923     0.895     0.908      1434
weighted avg      0.923     0.923     0.921      1434

[[372  81]
 [ 30 951]]
Accuracy: 0.9225941422594143
f1 score: 0.9448584202682563
False Postive Rate: 0.07462686567164178

CPU times: user 17.9 ms, sys: 12.2 ms, total: 30.1 ms
Wall time: 15.9 ms
Test Results:
              precision    recall  f1-score   support

           0      0.787     0.695     0.738       223
           1      0.867     0.913     0.889       484

    accuracy                          0.844       707
   macro avg      0.827     0.804     0.814       707
weighted avg      0.841     0.844     0.842       707

[[155  68]
 [ 42 442]]
Accuracy: 0.84441301272984