In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, f1_score

In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab VM; the pickled data sets read later in
# this notebook are addressed through this mount point.
gdrive_mount_point = "/content/gdrive"
drive.mount(gdrive_mount_point)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Reading in data sources

### **Scam** data sets

In [None]:
## Root folder (on the mounted Drive) that holds every cleaned input data set read below
DATA_DIR = '/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data'

## NOTE(review): unpickling can execute arbitrary code, so these .pkl files must come from a trusted source.

## Full features
X_train_full = pd.read_pickle(DATA_DIR + '/Full Text SCAM/X_train_fSC.pkl')
X_test_full = pd.read_pickle(DATA_DIR + '/Full Text SCAM/X_test_fSC.pkl')

## Vanilla features: the full feature set without the raw / cleaned text columns
TEXT_COLUMNS = ['text', 'cleaned_text', 'cleaned_text_full']
X_train_v = X_train_full.drop(TEXT_COLUMNS, axis=1)
X_test_v = X_test_full.drop(TEXT_COLUMNS, axis=1)
## The scaler is fitted on the training split only and then applied unchanged to the test split
scaler = MinMaxScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train_v), index = X_train_v.index, columns = X_train_v.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test_v), index = X_test_v.index, columns = X_test_v.columns)

## target label
y_train = pd.read_pickle(DATA_DIR + '/Full Text SCAM/y_train_fSC.pkl')
y_test = pd.read_pickle(DATA_DIR + '/Full Text SCAM/y_test_fSC.pkl')

## TF-IDF with top 15 features
X_train_tfidf = pd.read_pickle(DATA_DIR + '/TFIDF SCAM/X_train_tfSC.pkl')
X_test_tfidf = pd.read_pickle(DATA_DIR + '/TFIDF SCAM/X_test_tfSC.pkl')

## TF-IDF GloVe with top 15 features
X_train_tfidf_glove = pd.read_pickle(DATA_DIR + '/TFIDF Glove SCAM/X_train_tfglSC.pkl')
X_test_tfidf_glove = pd.read_pickle(DATA_DIR + '/TFIDF Glove SCAM/X_test_tfglSC.pkl')

## TF-IDF FastText (cc) with top 15 features
X_train_tfidf_cc = pd.read_pickle(DATA_DIR + '/TFIDF CC SCAM/X_train_tfccSC.pkl')
X_test_tfidf_cc = pd.read_pickle(DATA_DIR + '/TFIDF CC SCAM/X_test_tfccSC.pkl')

## TF-IDF InferSent with top 15 features
## (this set ships with its own label files, so it must be paired with y_*_tfidf_inf rather than y_train / y_test)
X_train_tfidf_inf = pd.read_pickle(DATA_DIR + '/InferSent SCAM/X_train_infSC.pkl')
X_test_tfidf_inf = pd.read_pickle(DATA_DIR + '/InferSent SCAM/X_test_infSC.pkl')
y_train_tfidf_inf = pd.read_pickle(DATA_DIR + '/InferSent SCAM/y_train_infSC.pkl')
y_test_tfidf_inf = pd.read_pickle(DATA_DIR + '/InferSent SCAM/y_test_infSC.pkl')

## Sanity checks: every feature matrix must have exactly as many rows as the labels it is
## fitted / evaluated against further down, otherwise a mismatch only surfaces deep inside sklearn.
for _chk_name, _chk_features, _chk_labels in [
    ('vanilla train', X_train_scaled, y_train),
    ('vanilla test', X_test_scaled, y_test),
    ('tfidf train', X_train_tfidf, y_train),
    ('tfidf test', X_test_tfidf, y_test),
    ('tfidf glove train', X_train_tfidf_glove, y_train),
    ('tfidf glove test', X_test_tfidf_glove, y_test),
    ('tfidf cc train', X_train_tfidf_cc, y_train),
    ('tfidf cc test', X_test_tfidf_cc, y_test),
    ('infersent train', X_train_tfidf_inf, y_train_tfidf_inf),
    ('infersent test', X_test_tfidf_inf, y_test_tfidf_inf),
]:
    assert _chk_features.shape[0] == len(_chk_labels), (
        '{}: {} feature rows but {} labels'.format(_chk_name, _chk_features.shape[0], len(_chk_labels))
    )

## Logistic Regression Classifier

### Hyperparameter Tuning

In [None]:
def show_results(y_actual, y_pred, y_prob):
  """Print a binary-classification summary for one set of predictions.

  Printed, in order: the classification report (3 decimals), the confusion
  matrix, accuracy, ROC AUC, F1 score and the false positive rate.

  Args:
    y_actual: ground-truth labels.
    y_pred: predicted labels, used for every metric except the ROC AUC.
    y_prob: scores passed to roc_auc_score (callers in this notebook pass the
      predicted probability of class 1).

  Returns:
    None. Everything is written to stdout.

  NOTE(review): the "False Postive Rate" values in the outputs saved with this
  notebook equal fn / (fn + tn) for the confusion matrices printed next to
  them, not fp / (fp + tn) as computed here, so those outputs appear to come
  from an earlier version of this helper. Re-run the cells to refresh them.
  """
  print(classification_report(y_actual, y_pred, digits=3))

  # Build the matrix once and reuse it for both the printout and the FPR.
  cm = confusion_matrix(y_actual, y_pred)
  print(cm)

  accuracy = accuracy_score(y_actual, y_pred)
  print("Accuracy: " + str(accuracy))
  auc = roc_auc_score(y_actual, y_prob)
  print("AUC_ROC: " + str(auc))
  f1 = f1_score(y_actual, y_pred)
  print("f1 score: " + str(f1))

  # Rows are actual classes and columns are predicted classes, so flattening
  # a 2x2 matrix yields tn, fp, fn, tp.
  tn, fp, fn, tp = cm.ravel()
  fpr = fp / (fp + tn)
  print("False Postive Rate: " + str(fpr) + "\n")

In [None]:
# Candidate values for the logistic-regression hyperparameter search.

# max_iter: iteration cap for the solvers. Fixed at 30,000 (chosen after
# testing) so that the logistic regression fit converges.
max_iter = [30000]

# solver: optimisation algorithm used to fit the model.
solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']

# class_weight: None gives every class weight one; 'balanced' re-weights the classes.
class_weight = [None, 'balanced']

# C: inverse of the regularization strength (smaller value = stronger regularization).
C = [0.001, 0.01, 0.1, 1, 10]

# Search space handed to the hyperparameter search, keyed by estimator argument name.
param_test = dict(
    C=C,
    class_weight=class_weight,
    solver=solver,
    max_iter=max_iter,
)

### Training best model for scaled Vanilla features

In [None]:
clf_1 = LogisticRegression(random_state=1)
rs_1 = RandomizedSearchCV(estimator = clf_1, 
                            param_distributions = param_test, 
                            n_iter = 50,
                            refit = True,
                            random_state = 1,
                            n_jobs = -1
                            )
%time rs_1.fit(X_train_scaled, y_train)
print('Best score reached: {} with params: {} '.format(rs_1.best_score_, rs_1.best_params_))

CPU times: user 552 ms, sys: 16.4 ms, total: 568 ms
Wall time: 3.56 s
Best score reached: 0.878867031439986 with params: {'solver': 'saga', 'max_iter': 30000, 'class_weight': 'balanced', 'C': 10} 


In [None]:
# Evaluate the refit best model from rs_1 on the scaled vanilla features.
# NOTE: y_train_pred / y_train_prob / y_test_pred / y_test_prob are reused by
# every evaluation cell, so they always hold the most recently evaluated model.

# Training split: hard labels, then the second predict_proba column as the score for ROC AUC.
%time y_train_pred = rs_1.predict(X_train_scaled)
%time y_train_prob = rs_1.predict_proba(X_train_scaled)[:, 1]
print("Train Results:")
show_results(y_train, y_train_pred, y_train_prob)

# Test split: same procedure, scored against y_test.
%time y_test_pred = rs_1.predict(X_test_scaled)
%time y_test_prob = rs_1.predict_proba(X_test_scaled)[:, 1]
print("Test Results:")
show_results(y_test, y_test_pred, y_test_prob)

CPU times: user 1.52 ms, sys: 103 µs, total: 1.62 ms
Wall time: 1.47 ms
CPU times: user 2.22 ms, sys: 24 µs, total: 2.24 ms
Wall time: 1.14 ms
Train Results:
              precision    recall  f1-score   support

           0      0.802     0.883     0.841       538
           1      0.932     0.881     0.906       981

    accuracy                          0.882      1519
   macro avg      0.867     0.882     0.873      1519
weighted avg      0.886     0.882     0.883      1519

[[475  63]
 [117 864]]
Accuracy: 0.8815009874917709
AUC_ROC: 0.950344652486462
f1 score: 0.9056603773584906
False Postive Rate: 0.19763513513513514

CPU times: user 552 µs, sys: 2 ms, total: 2.55 ms
Wall time: 2.6 ms
CPU times: user 2.14 ms, sys: 237 µs, total: 2.38 ms
Wall time: 2.55 ms
Test Results:
              precision    recall  f1-score   support

           0      0.796     0.883     0.837       265
           1      0.932     0.876     0.903       484

    accuracy                          0.879     

### Training best model for Top 15 Features + TF-IDF

In [None]:
clf_2 = LogisticRegression(random_state=1)
rs_2 = RandomizedSearchCV(estimator = clf_2, 
                      param_distributions = param_test, 
                      n_iter = 50,
                      refit = True,
                      random_state = 1,
                      n_jobs = -1
                      )

%time rs_2.fit(X_train_tfidf, y_train)
print('Best score reached: {} with params: {} '.format(rs_2.best_score_, rs_2.best_params_))

CPU times: user 735 ms, sys: 147 ms, total: 881 ms
Wall time: 15.6 s
Best score reached: 0.9769563140524579 with params: {'solver': 'newton-cg', 'max_iter': 30000, 'class_weight': None, 'C': 10} 


In [None]:
# Evaluate the refit best model from rs_2 on the TF-IDF feature set.
# NOTE: the y_*_pred / y_*_prob names are shared with the other evaluation
# cells and are overwritten here.

# Training split: hard labels, then the second predict_proba column as the score for ROC AUC.
%time y_train_pred = rs_2.predict(X_train_tfidf)
%time y_train_prob = rs_2.predict_proba(X_train_tfidf)[:, 1]
print("Train Results:")
show_results(y_train, y_train_pred, y_train_prob)

# Test split: same procedure, scored against y_test.
%time y_test_pred = rs_2.predict(X_test_tfidf)
%time y_test_prob = rs_2.predict_proba(X_test_tfidf)[:, 1]
print("Test Results:")
show_results(y_test, y_test_pred, y_test_prob)

CPU times: user 2.35 ms, sys: 2.15 ms, total: 4.5 ms
Wall time: 2.26 ms
CPU times: user 2.38 ms, sys: 3.65 ms, total: 6.04 ms
Wall time: 3.09 ms
Train Results:
              precision    recall  f1-score   support

           0      1.000     0.998     0.999       538
           1      0.999     1.000     0.999       981

    accuracy                          0.999      1519
   macro avg      0.999     0.999     0.999      1519
weighted avg      0.999     0.999     0.999      1519

[[537   1]
 [  0 981]]
Accuracy: 0.9993416721527321
AUC_ROC: 0.9999507368628476
f1 score: 0.999490575649516
False Postive Rate: 0.0

CPU times: user 3.45 ms, sys: 2.97 ms, total: 6.43 ms
Wall time: 4.14 ms
CPU times: user 2.82 ms, sys: 975 µs, total: 3.8 ms
Wall time: 1.93 ms
Test Results:
              precision    recall  f1-score   support

           0      0.955     0.970     0.963       265
           1      0.983     0.975     0.979       484

    accuracy                          0.973       749
   m

### Training best model for Top 15 Features + TF-IDF weighted GloVe vectors

In [None]:
clf_3 = LogisticRegression(random_state=1)
rs_3 = RandomizedSearchCV(estimator=clf_3, 
                      param_distributions = param_test, 
                      n_iter = 50,
                      refit = True,
                      random_state = 1,
                      n_jobs = -1
                      )

%time rs_3.fit(X_train_tfidf_glove, y_train)
print('Best score reached: {} with params: {} '.format(rs_3.best_score_, rs_3.best_params_))

CPU times: user 2.17 s, sys: 201 ms, total: 2.37 s
Wall time: 3min 18s
Best score reached: 0.9822173006774362 with params: {'solver': 'liblinear', 'max_iter': 30000, 'class_weight': None, 'C': 10} 


In [None]:
# Evaluate the refit best model from rs_3 on the TF-IDF weighted GloVe feature set.
# NOTE: the y_*_pred / y_*_prob names are shared with the other evaluation
# cells and are overwritten here.

# Training split: hard labels, then the second predict_proba column as the score for ROC AUC.
%time y_train_pred = rs_3.predict(X_train_tfidf_glove)
%time y_train_prob = rs_3.predict_proba(X_train_tfidf_glove)[:, 1]
print("Train Results:")
show_results(y_train, y_train_pred, y_train_prob)

# Test split: same procedure, scored against y_test.
%time y_test_pred = rs_3.predict(X_test_tfidf_glove)
%time y_test_prob = rs_3.predict_proba(X_test_tfidf_glove)[:, 1]
print("Test Results:")
show_results(y_test, y_test_pred, y_test_prob)

CPU times: user 1.63 ms, sys: 3.04 ms, total: 4.66 ms
Wall time: 4.35 ms
CPU times: user 3.21 ms, sys: 5.21 ms, total: 8.41 ms
Wall time: 5.32 ms
Train Results:
              precision    recall  f1-score   support

           0      1.000     1.000     1.000       538
           1      1.000     1.000     1.000       981

    accuracy                          1.000      1519
   macro avg      1.000     1.000     1.000      1519
weighted avg      1.000     1.000     1.000      1519

[[538   0]
 [  0 981]]
Accuracy: 1.0
AUC_ROC: 1.0
f1 score: 1.0
False Postive Rate: 0.0

CPU times: user 3.29 ms, sys: 3.98 ms, total: 7.26 ms
Wall time: 3.64 ms
CPU times: user 1.75 ms, sys: 2.01 ms, total: 3.76 ms
Wall time: 1.89 ms
Test Results:
              precision    recall  f1-score   support

           0      0.969     0.958     0.964       265
           1      0.977     0.983     0.980       484

    accuracy                          0.975       749
   macro avg      0.973     0.971     0.972  

### Training best model for Top 15 Features + TF-IDF weighted FastText

In [None]:
clf_4 = LogisticRegression(random_state=1)
rs_4 = RandomizedSearchCV(estimator=clf_4, 
                      param_distributions = param_test,  
                      n_iter = 50,
                      refit = True,
                      random_state = 1,
                      n_jobs = -1
                      )

%time rs_4.fit(X_train_tfidf_cc, y_train)
print('Best score reached: {} with params: {} '.format(rs_4.best_score_, rs_4.best_params_))

CPU times: user 1.64 s, sys: 206 ms, total: 1.84 s
Wall time: 1min 58s
Best score reached: 0.9901207225985755 with params: {'solver': 'newton-cg', 'max_iter': 30000, 'class_weight': None, 'C': 1} 


In [None]:
# Evaluate the refit best model from rs_4 on the TF-IDF weighted FastText (cc) feature set.
# NOTE: the y_*_pred / y_*_prob names are shared with the other evaluation
# cells and are overwritten here.

# Training split: hard labels, then the second predict_proba column as the score for ROC AUC.
%time y_train_pred = rs_4.predict(X_train_tfidf_cc)
%time y_train_prob = rs_4.predict_proba(X_train_tfidf_cc)[:, 1]
print("Train Results:")
show_results(y_train, y_train_pred, y_train_prob)

# Test split: same procedure, scored against y_test.
%time y_test_pred = rs_4.predict(X_test_tfidf_cc)
%time y_test_prob = rs_4.predict_proba(X_test_tfidf_cc)[:, 1]
print("Test Results:")
show_results(y_test, y_test_pred, y_test_prob)

CPU times: user 2.49 ms, sys: 2.15 ms, total: 4.64 ms
Wall time: 2.35 ms
CPU times: user 9.35 ms, sys: 3.96 ms, total: 13.3 ms
Wall time: 6.82 ms
Train Results:
              precision    recall  f1-score   support

           0      0.998     1.000     0.999       538
           1      1.000     0.999     0.999       981

    accuracy                          0.999      1519
   macro avg      0.999     0.999     0.999      1519
weighted avg      0.999     0.999     0.999      1519

[[538   0]
 [  1 980]]
Accuracy: 0.9993416721527321
AUC_ROC: 1.0
f1 score: 0.9994900560938297
False Postive Rate: 0.0018552875695732839

CPU times: user 3.55 ms, sys: 4.02 ms, total: 7.57 ms
Wall time: 3.8 ms
CPU times: user 3 ms, sys: 1.01 ms, total: 4.01 ms
Wall time: 2.01 ms
Test Results:
              precision    recall  f1-score   support

           0      0.977     0.970     0.973       265
           1      0.984     0.988     0.986       484

    accuracy                          0.981       749
 

### Training best model for Top 15 Features + TF-IDF InferSent

In [None]:
clf_5 = LogisticRegression(random_state=1)
rs_5 = RandomizedSearchCV(estimator=clf_5, 
                      param_distributions = param_test,  
                      n_iter = 50,
                      refit = True,
                      random_state = 1,
                      n_jobs = -1
                      )

%time rs_5.fit(X_train_tfidf_inf, y_train_tfidf_inf)
print('Best score reached: {} with params: {} '.format(rs_5.best_score_, rs_5.best_params_))

CPU times: user 4.63 s, sys: 403 ms, total: 5.04 s
Wall time: 5min 50s
Best score reached: 0.8563485295192612 with params: {'solver': 'liblinear', 'max_iter': 30000, 'class_weight': None, 'C': 1} 


In [None]:
# Evaluate the refit best model from rs_5 on the InferSent feature set.
# This set is scored against its own labels (y_*_tfidf_inf), not y_train / y_test.
# NOTE: the y_*_pred / y_*_prob names are shared with the other evaluation
# cells and are overwritten here.

# Training split: hard labels, then the second predict_proba column as the score for ROC AUC.
%time y_train_pred = rs_5.predict(X_train_tfidf_inf)
%time y_train_prob = rs_5.predict_proba(X_train_tfidf_inf)[:, 1]
print("Train Results:")
show_results(y_train_tfidf_inf, y_train_pred, y_train_prob)

# Test split: same procedure, scored against y_test_tfidf_inf.
%time y_test_pred = rs_5.predict(X_test_tfidf_inf)
%time y_test_prob = rs_5.predict_proba(X_test_tfidf_inf)[:, 1]
print("Test Results:")
show_results(y_test_tfidf_inf, y_test_pred, y_test_prob)

CPU times: user 25.1 ms, sys: 1.75 ms, total: 26.8 ms
Wall time: 21.9 ms
CPU times: user 27.3 ms, sys: 19.3 ms, total: 46.6 ms
Wall time: 24.5 ms
Train Results:
              precision    recall  f1-score   support

           0      0.926     0.804     0.861       453
           1      0.915     0.970     0.942       981

    accuracy                          0.918      1434
   macro avg      0.920     0.887     0.901      1434
weighted avg      0.918     0.918     0.916      1434

[[364  89]
 [ 29 952]]
Accuracy: 0.9177126917712691
AUC_ROC: 0.9657330335986392
f1 score: 0.9416419386745796
False Postive Rate: 0.0737913486005089

CPU times: user 17.9 ms, sys: 16.3 ms, total: 34.3 ms
Wall time: 17.1 ms
CPU times: user 18.7 ms, sys: 15.3 ms, total: 33.9 ms
Wall time: 17 ms
Test Results:
              precision    recall  f1-score   support

           0      0.794     0.691     0.739       223
           1      0.865     0.917     0.891       484

    accuracy                          0.8