In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, f1_score

In [None]:
# Colab-only step: mount Google Drive at /content/gdrive, the location the
# pickled datasets are read from in the data-loading cell.
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


## Reading in data sources

### **Spam** data sets

In [None]:
## Root folder (on the mounted Google Drive) that holds every cleaned SPAM input.
## Point this single constant elsewhere to run the notebook against another copy of the data.
DATA_DIR = '/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data'

## NOTE: pd.read_pickle can execute arbitrary code while unpickling;
## only load these files from a trusted location.

## Full features
X_train_full = pd.read_pickle(DATA_DIR + '/Full Text SPAM/X_train_fSP.pkl')
X_test_full = pd.read_pickle(DATA_DIR + '/Full Text SPAM/X_test_fSP.pkl')

## Vanilla features: the full features without the raw/cleaned text columns
X_train_v = X_train_full.drop(['text', 'cleaned_text', 'cleaned_text_full'], axis=1)
X_test_v = X_test_full.drop(['text', 'cleaned_text', 'cleaned_text_full'], axis=1)
## The scaler is fitted on the training split only and then applied to the test split,
## so no test-set statistics leak into the scaling.
scaler = MinMaxScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train_v), index = X_train_v.index, columns = X_train_v.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test_v), index = X_test_v.index, columns = X_test_v.columns)

## target label
y_train = pd.read_pickle(DATA_DIR + '/Full Text SPAM/y_train_fSP.pkl')
y_test = pd.read_pickle(DATA_DIR + '/Full Text SPAM/y_test_fSP.pkl')

## TF-IDF with top 15 features
X_train_tfidf = pd.read_pickle(DATA_DIR + '/TFIDF SPAM/X_train_tfSP.pkl')
X_test_tfidf = pd.read_pickle(DATA_DIR + '/TFIDF SPAM/X_test_tfSP.pkl')

## TF-IDF GloVe with top 15 features
X_train_tfidf_glove = pd.read_pickle(DATA_DIR + '/TFIDF Glove SPAM/X_train_tfglSP.pkl')
X_test_tfidf_glove = pd.read_pickle(DATA_DIR + '/TFIDF Glove SPAM/X_test_tfglSP.pkl')

## TF-IDF FastText (cc) with top 15 features
X_train_tfidf_cc = pd.read_pickle(DATA_DIR + '/TFIDF CC SPAM/X_train_tfccSP.pkl')
X_test_tfidf_cc = pd.read_pickle(DATA_DIR + '/TFIDF CC SPAM/X_test_tfccSP.pkl')

## TF-IDF InferSent with top 15 features
## This data set ships its own target labels; the InferSent model is trained and
## evaluated against them rather than against y_train / y_test.
X_train_tfidf_inf = pd.read_pickle(DATA_DIR + '/InferSent SPAM/X_train_infSP.pkl')
X_test_tfidf_inf = pd.read_pickle(DATA_DIR + '/InferSent SPAM/X_test_infSP.pkl')
y_train_tfidf_inf = pd.read_pickle(DATA_DIR + '/InferSent SPAM/y_train_infSP.pkl')
y_test_tfidf_inf = pd.read_pickle(DATA_DIR + '/InferSent SPAM/y_test_infSP.pkl')

## Logistic Regression Classifier

### Hyperparameter Tuning

In [None]:
def show_results(y_actual, y_pred, y_prob):
  """Print a fixed set of classification metrics for one set of predictions.

  Prints, in order: the classification report (3 digits), the confusion
  matrix, accuracy, ROC AUC, F1 score and the false positive rate.

  Args:
    y_actual: true labels.
    y_pred: predicted labels, compared against ``y_actual``.
    y_prob: scores passed to ``roc_auc_score`` (callers in this notebook pass
      the second column of ``predict_proba``).

  Returns:
    None; all results are written to stdout.
  """
  print(classification_report(y_actual, y_pred, digits=3))
  # Build the confusion matrix once and reuse it for the printout and the FPR.
  cm = confusion_matrix(y_actual, y_pred)
  print(cm)
  print("Accuracy: " + str(accuracy_score(y_actual, y_pred)))
  print("AUC_ROC: " + str(roc_auc_score(y_actual, y_prob)))
  print("f1 score: " + str(f1_score(y_actual, y_pred)))
  # ravel() on a 2x2 matrix yields tn, fp, fn, tp; only tn and fp are needed here.
  tn, fp, _, _ = cm.ravel()
  fpr = fp/(fp+tn)
  print("False Postive Rate: " + str(fpr) + "\n")

In [None]:
# Candidate values for each LogisticRegression hyperparameter explored by the search.

# C: inverse of regularization strength (smaller value => stronger regularization).
C = [0.001, 0.01, 0.1, 1, 10]
# class_weight: None gives every class weight one; 'balanced' reweights the classes.
class_weight = [None, 'balanced']
# solver: optimization algorithm used to fit the model.
solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
# max_iter: iteration cap for the solvers; fixed at 30,000 after testing so that
# the logistic regression fit converges.
max_iter = [30000]

# Search space handed to RandomizedSearchCV (5 * 2 * 5 * 1 = 50 combinations).
param_test = dict(
    C=C,
    class_weight=class_weight,
    solver=solver,
    max_iter=max_iter,
)

### Training best model for scaled Vanilla features

In [None]:
clf_1 = LogisticRegression(random_state=1)
rs_1 = RandomizedSearchCV(estimator = clf_1, 
                            param_distributions = param_test, 
                            n_iter = 50,
                            refit = True,
                            random_state = 1,
                            n_jobs = -1
                            )
%time rs_1.fit(X_train_scaled, y_train)
print('Best score reached: {} with params: {} '.format(rs_1.best_score_, rs_1.best_params_))

CPU times: user 2.88 s, sys: 1.03 s, total: 3.91 s
Wall time: 39.5 s
Best score reached: 0.7577817522665757 with params: {'solver': 'lbfgs', 'max_iter': 30000, 'class_weight': 'balanced', 'C': 10} 


In [None]:
# Evaluate the tuned vanilla-feature model on both splits.
# predict() gives hard labels; column 1 of predict_proba() is used as the score for ROC AUC.
# Each call is timed with %time, so the timing lines precede the metrics in the output.
%time y_train_pred = rs_1.predict(X_train_scaled)
%time y_train_prob = rs_1.predict_proba(X_train_scaled)[:, 1]
print("Train Results:")
show_results(y_train, y_train_pred, y_train_prob)

# Same procedure on the held-out test split.
%time y_test_pred = rs_1.predict(X_test_scaled)
%time y_test_prob = rs_1.predict_proba(X_test_scaled)[:, 1]
print("Test Results:")
show_results(y_test, y_test_pred, y_test_prob)

CPU times: user 3.08 ms, sys: 1.54 ms, total: 4.63 ms
Wall time: 2.33 ms
CPU times: user 4.49 ms, sys: 180 µs, total: 4.67 ms
Wall time: 2.98 ms
Train Results:
              precision    recall  f1-score   support

           0      0.761     0.823     0.790     12462
           1      0.756     0.679     0.715     10059

    accuracy                          0.759     22521
   macro avg      0.758     0.751     0.753     22521
weighted avg      0.758     0.759     0.757     22521

[[10252  2210]
 [ 3225  6834]]
Accuracy: 0.7586696860707783
AUC_ROC: 0.8200823454888505
f1 score: 0.7154897136575407
False Postive Rate: 0.17733911089712726

CPU times: user 2.67 ms, sys: 304 µs, total: 2.97 ms
Wall time: 2.2 ms
CPU times: user 4.44 ms, sys: 675 µs, total: 5.11 ms
Wall time: 2.87 ms
Test Results:
              precision    recall  f1-score   support

           0      0.761     0.832     0.795      6138
           1      0.764     0.676     0.717      4955

    accuracy                      

### Training best model for Top 15 Features + TF-IDF

In [None]:
clf_2 = LogisticRegression(random_state=1)
rs_2 = RandomizedSearchCV(estimator = clf_2, 
                      param_distributions = param_test, 
                      n_iter = 50,
                      refit = True,
                      random_state = 1,
                      n_jobs = -1
                      )

%time rs_2.fit(X_train_tfidf, y_train)
print('Best score reached: {} with params: {} '.format(rs_2.best_score_, rs_2.best_params_))



CPU times: user 16.9 s, sys: 2.54 s, total: 19.4 s
Wall time: 11min 31s
Best score reached: 0.9272233831365583 with params: {'solver': 'newton-cg', 'max_iter': 30000, 'class_weight': None, 'C': 10} 


In [None]:
# Evaluate the tuned TF-IDF model on both splits.
# predict() gives hard labels; column 1 of predict_proba() is used as the score for ROC AUC.
# NOTE: y_train_pred / y_test_pred (and the *_prob names) are overwritten by every evaluation cell.
%time y_train_pred = rs_2.predict(X_train_tfidf)
%time y_train_prob = rs_2.predict_proba(X_train_tfidf)[:, 1]
print("Train Results:")
show_results(y_train, y_train_pred, y_train_prob)

# Same procedure on the held-out test split.
%time y_test_pred = rs_2.predict(X_test_tfidf)
%time y_test_prob = rs_2.predict_proba(X_test_tfidf)[:, 1]
print("Test Results:")
show_results(y_test, y_test_pred, y_test_prob)

CPU times: user 48.9 ms, sys: 16 ms, total: 65 ms
Wall time: 37.3 ms
CPU times: user 55.6 ms, sys: 18.7 ms, total: 74.3 ms
Wall time: 37.2 ms
Train Results:
              precision    recall  f1-score   support

           0      0.937     0.959     0.948     12462
           1      0.948     0.920     0.934     10059

    accuracy                          0.942     22521
   macro avg      0.942     0.939     0.941     22521
weighted avg      0.942     0.942     0.941     22521

[[11954   508]
 [  808  9251]]
Accuracy: 0.941565649837929
AUC_ROC: 0.9865576639792805
f1 score: 0.9335957210616611
False Postive Rate: 0.04076392232386455

CPU times: user 146 ms, sys: 91.1 ms, total: 237 ms
Wall time: 229 ms
CPU times: user 27.8 ms, sys: 9.99 ms, total: 37.8 ms
Wall time: 19.4 ms
Test Results:
              precision    recall  f1-score   support

           0      0.926     0.941     0.933      6138
           1      0.926     0.906     0.916      4955

    accuracy                          

### Training best model for Top 15 Features + TF-IDF weighted GloVe vectors

In [None]:
clf_3 = LogisticRegression(random_state=1)
rs_3 = RandomizedSearchCV(estimator=clf_3, 
                      param_distributions = param_test, 
                      n_iter = 50,
                      refit = True,
                      random_state = 1,
                      n_jobs = -1
                      )

%time rs_3.fit(X_train_tfidf_glove, y_train)
print('Best score reached: {} with params: {} '.format(rs_3.best_score_, rs_3.best_params_))

CPU times: user 33.2 s, sys: 2.86 s, total: 36 s
Wall time: 51min 23s
Best score reached: 0.9013365551991768 with params: {'solver': 'newton-cg', 'max_iter': 30000, 'class_weight': None, 'C': 10} 


In [None]:
# Evaluate the tuned TF-IDF GloVe model on both splits.
# predict() gives hard labels; column 1 of predict_proba() is used as the score for ROC AUC.
# NOTE: y_train_pred / y_test_pred (and the *_prob names) are overwritten by every evaluation cell.
%time y_train_pred = rs_3.predict(X_train_tfidf_glove)
%time y_train_prob = rs_3.predict_proba(X_train_tfidf_glove)[:, 1]
print("Train Results:")
show_results(y_train, y_train_pred, y_train_prob)

# Same procedure on the held-out test split.
%time y_test_pred = rs_3.predict(X_test_tfidf_glove)
%time y_test_prob = rs_3.predict_proba(X_test_tfidf_glove)[:, 1]
print("Test Results:")
show_results(y_test, y_test_pred, y_test_prob)

CPU times: user 21.5 ms, sys: 6.95 ms, total: 28.5 ms
Wall time: 25.4 ms
CPU times: user 20.3 ms, sys: 7.77 ms, total: 28.1 ms
Wall time: 14.3 ms
Train Results:
              precision    recall  f1-score   support

           0      0.910     0.925     0.917     12462
           1      0.905     0.886     0.896     10059

    accuracy                          0.908     22521
   macro avg      0.908     0.906     0.907     22521
weighted avg      0.908     0.908     0.908     22521

[[11529   933]
 [ 1142  8917]]
Accuracy: 0.9078637715909595
AUC_ROC: 0.9655210394126427
f1 score: 0.8957757797980814
False Postive Rate: 0.07486759749638902

CPU times: user 53.8 ms, sys: 11.2 ms, total: 65 ms
Wall time: 60.5 ms
CPU times: user 9.88 ms, sys: 5.17 ms, total: 15.1 ms
Wall time: 7.54 ms
Test Results:
              precision    recall  f1-score   support

           0      0.903     0.925     0.914      6138
           1      0.904     0.877     0.890      4955

    accuracy                    

### Training best model for Top 15 Features + TF-IDF weighted FastText

In [None]:
clf_4 = LogisticRegression(random_state=1)
rs_4 = RandomizedSearchCV(estimator=clf_4, 
                      param_distributions = param_test,  
                      n_iter = 50,
                      refit = True,
                      random_state = 1,
                      n_jobs = -1
                      )

%time rs_4.fit(X_train_tfidf_cc, y_train)
print('Best score reached: {} with params: {} '.format(rs_4.best_score_, rs_4.best_params_))

CPU times: user 1min 21s, sys: 917 ms, total: 1min 22s
Wall time: 41min 19s
Best score reached: 0.9166113239088993 with params: {'solver': 'sag', 'max_iter': 30000, 'class_weight': None, 'C': 10} 


In [None]:
# Evaluate the tuned TF-IDF FastText model on both splits.
# predict() gives hard labels; column 1 of predict_proba() is used as the score for ROC AUC.
# NOTE: y_train_pred / y_test_pred (and the *_prob names) are overwritten by every evaluation cell.
%time y_train_pred = rs_4.predict(X_train_tfidf_cc)
%time y_train_prob = rs_4.predict_proba(X_train_tfidf_cc)[:, 1]
print("Train Results:")
show_results(y_train, y_train_pred, y_train_prob)

# Same procedure on the held-out test split.
%time y_test_pred = rs_4.predict(X_test_tfidf_cc)
%time y_test_prob = rs_4.predict_proba(X_test_tfidf_cc)[:, 1]
print("Test Results:")
show_results(y_test, y_test_pred, y_test_prob)

CPU times: user 21.1 ms, sys: 0 ns, total: 21.1 ms
Wall time: 15.1 ms
CPU times: user 28 ms, sys: 0 ns, total: 28 ms
Wall time: 15 ms
Train Results:
              precision    recall  f1-score   support

           0      0.923     0.941     0.932     12462
           1      0.926     0.902     0.914     10059

    accuracy                          0.924     22521
   macro avg      0.924     0.922     0.923     22521
weighted avg      0.924     0.924     0.924     22521

[[11732   730]
 [  984  9075]]
Accuracy: 0.9238932551840504
AUC_ROC: 0.9749692589679805
f1 score: 0.9137132501006847
False Postive Rate: 0.058578077355159686

CPU times: user 86.8 ms, sys: 22.1 ms, total: 109 ms
Wall time: 104 ms
CPU times: user 11.3 ms, sys: 4.85 ms, total: 16.2 ms
Wall time: 8.11 ms
Test Results:
              precision    recall  f1-score   support

           0      0.919     0.936     0.927      6138
           1      0.919     0.897     0.908      4955

    accuracy                          0.919

### Training best model for Top 15 Features + TF-IDF InferSent

In [14]:
clf_5 = LogisticRegression(random_state=1)
rs_5 = RandomizedSearchCV(estimator=clf_5, 
                      param_distributions = param_test,  
                      n_iter = 50,
                      refit = True,
                      random_state = 1,
                      n_jobs = -1
                      )

%time rs_5.fit(X_train_tfidf_inf, y_train_tfidf_inf)
print('Best score reached: {} with params: {} '.format(rs_5.best_score_, rs_5.best_params_))



CPU times: user 1min 14s, sys: 6.56 s, total: 1min 20s
Wall time: 1h 45min 17s
Best score reached: 0.7094479391111895 with params: {'solver': 'newton-cg', 'max_iter': 30000, 'class_weight': None, 'C': 1} 


In [15]:
# Evaluate the tuned InferSent model on both splits, scoring against the
# InferSent-specific labels (y_train_tfidf_inf / y_test_tfidf_inf).
# predict() gives hard labels; column 1 of predict_proba() is used as the score for ROC AUC.
%time y_train_pred = rs_5.predict(X_train_tfidf_inf)
%time y_train_prob = rs_5.predict_proba(X_train_tfidf_inf)[:, 1]
print("Train Results:")
show_results(y_train_tfidf_inf, y_train_pred, y_train_prob)

# Same procedure on the held-out test split.
%time y_test_pred = rs_5.predict(X_test_tfidf_inf)
%time y_test_prob = rs_5.predict_proba(X_test_tfidf_inf)[:, 1]
print("Test Results:")
show_results(y_test_tfidf_inf, y_test_pred, y_test_prob)

CPU times: user 278 ms, sys: 17 ms, total: 295 ms
Wall time: 239 ms
CPU times: user 277 ms, sys: 116 ms, total: 393 ms
Wall time: 243 ms
Train Results:
              precision    recall  f1-score   support

           0      0.766     0.923     0.837     10560
           1      0.831     0.574     0.679      6979

    accuracy                          0.784     17539
   macro avg      0.799     0.749     0.758     17539
weighted avg      0.792     0.784     0.775     17539

[[9746  814]
 [2970 4009]]
Accuracy: 0.78425223786989
AUC_ROC: 0.8641809085264451
f1 score: 0.6793763768852736
False Postive Rate: 0.07708333333333334

CPU times: user 318 ms, sys: 14.8 ms, total: 333 ms
Wall time: 304 ms
CPU times: user 161 ms, sys: 112 ms, total: 273 ms
Wall time: 137 ms
Test Results:
              precision    recall  f1-score   support

           0      0.714     0.872     0.785      5191
           1      0.707     0.470     0.565      3422

    accuracy                          0.712      861