In [None]:
import pandas as pd
import numpy as np
import time

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, f1_score

In [None]:
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


## Reading in data sources

###**Spam** data sets

In [None]:
## Full features
X_train_full = pd.read_pickle('/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/Full Text SPAM/X_train_fSP.pkl')
X_test_full = pd.read_pickle('/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/Full Text SPAM/X_test_fSP.pkl')

## Vanilla features
X_train_v = X_train_full.drop(['text', 'cleaned_text', 'cleaned_text_full'], axis=1)
X_test_v = X_test_full.drop(['text', 'cleaned_text', 'cleaned_text_full'], axis=1)
scaler = MinMaxScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train_v), index = X_train_v.index, columns = X_train_v.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test_v), index = X_test_v.index, columns = X_test_v.columns)

## target label
y_train = pd.read_pickle('/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/Full Text SPAM/y_train_fSP.pkl')
y_test = pd.read_pickle('/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/Full Text SPAM/y_test_fSP.pkl')

## TF-IDF with top 15 features
X_train_tfidf = pd.read_pickle('/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/TFIDF SPAM/X_train_tfSP.pkl')
X_test_tfidf = pd.read_pickle('/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/TFIDF SPAM/X_test_tfSP.pkl')

## TF-IDF GloVe with top 15 features
X_train_tfidf_glove = pd.read_pickle('/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/TFIDF Glove SPAM/X_train_tfglSP.pkl')
X_test_tfidf_glove = pd.read_pickle('/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/TFIDF Glove SPAM/X_test_tfglSP.pkl')

## TF-IDF FastText (cc) with top 15 features
X_train_tfidf_cc = pd.read_pickle('/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/TFIDF CC SPAM/X_train_tfccSP.pkl')
X_test_tfidf_cc = pd.read_pickle('/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/TFIDF CC SPAM/X_test_tfccSP.pkl')

## infersent
X_train_infersent = pd.read_pickle('/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/InferSent SPAM/X_train_infSP.pkl')
X_test_infersent = pd.read_pickle('/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/InferSent SPAM/X_test_infSP.pkl')
y_train_infersent = pd.read_pickle('/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/InferSent SPAM/y_train_infSP.pkl')
y_test_infersent = pd.read_pickle('/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/InferSent SPAM/y_test_infSP.pkl')

##XgBoost

###hyperparameter tuning

In [None]:
def show_results(y_actual, y_pred, y_prob):
  print(classification_report(y_actual, y_pred, digits=3))
  print(confusion_matrix(y_actual, y_pred))
  print("Accuracy: " + str(accuracy_score(y_actual, y_pred)))
  print("AUC_ROC: " + str(roc_auc_score(y_actual, y_prob)))
  print("f1 score: " + str(f1_score(y_actual, y_pred)))
  tn, fp, fn, tp = confusion_matrix(y_actual, y_pred).ravel()
  fpr = fp/(fp+tn)
  print("False Postive Rate: " + str(fpr) + "\n")

In [None]:
param_test = {'learning_rate': [0.01, 0.1],
                'max_depth': [3, 5, 7, 10],
                'min_child_weight': [1, 3, 5],
                'subsample': sp_uniform(loc=0.2, scale=0.8), 
                'colsample_bytree': sp_uniform(loc=0.2, scale=0.8), 
                'n_estimators' :  [int(x) for x in np.linspace(start = 20, stop = 1000, num = 10)],
                'objective': ['binary:logistic', 'binary:logitraw']
              }

###training best model for scaled Vanilla features

In [None]:
clf_1 = XGBClassifier(random_state=1, silent=True)
rs_1 = RandomizedSearchCV(estimator=clf_1, 
                          param_distributions = param_test, 
                          refit = True,
                          random_state = 1,
                          n_jobs = -1
                          )

%time rs_1.fit(X_train_scaled, y_train)
print('Best score reached: {} with params: {} '.format(rs_1.best_score_, rs_1.best_params_))

CPU times: user 23.8 s, sys: 421 ms, total: 24.2 s
Wall time: 7min 36s
Best score reached: 0.8714978719125976 with params: {'colsample_bytree': 0.6469518627566013, 'learning_rate': 0.1, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 673, 'objective': 'binary:logitraw', 'subsample': 0.7727764127746914} 


In [None]:
y_train_pred = rs_1.predict(X_train_scaled)
y_train_prob = rs_1.predict_proba(X_train_scaled)[:, 1]
print("Train Results:")
show_results(y_train, y_train_pred, y_train_prob)

y_test_pred = rs_1.predict(X_test_scaled)
y_test_prob = rs_1.predict_proba(X_test_scaled)[:, 1]
print("Test Results:")
show_results(y_test, y_test_pred, y_test_prob)

Train Results:
              precision    recall  f1-score   support

           0      0.956     0.999     0.977     12462
           1      0.998     0.944     0.970     10059

    accuracy                          0.974     22521
   macro avg      0.977     0.971     0.974     22521
weighted avg      0.975     0.974     0.974     22521

[[12445    17]
 [  566  9493]]
Accuracy: 0.9741130500421828
AUC_ROC: 0.9989013783530326
f1 score: 0.9702079820123665
False Postive Rate: 0.001364147006900979

Test Results:
              precision    recall  f1-score   support

           0      0.862     0.937     0.898      6138
           1      0.912     0.815     0.861      4955

    accuracy                          0.882     11093
   macro avg      0.887     0.876     0.879     11093
weighted avg      0.885     0.882     0.881     11093

[[5749  389]
 [ 917 4038]]
Accuracy: 0.8822680969981069
AUC_ROC: 0.9516957932569404
f1 score: 0.8607972713707098
False Postive Rate: 0.06337569240795048



### training best model for Top 15 Features + TF-IDF

In [None]:
clf_2 = XGBClassifier(random_state=1)
rs_2 = RandomizedSearchCV(estimator=clf_2, 
                          param_distributions = param_test, 
                          refit = True,
                          random_state = 1,
                          n_jobs = -1
                          )

%time rs_2.fit(X_train_tfidf, y_train)
print('Best score reached: {} with params: {} '.format(rs_2.best_score_, rs_2.best_params_))

Wall time: 2h 16min 41s
Best score reached: 0.9595932189022263 with params: {'colsample_bytree': 0.6469518627566013, 'learning_rate': 0.1, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 673, 'objective': 'binary:logitraw', 'subsample': 0.7727764127746914} 


In [None]:
y_train_pred = rs_2.predict(X_train_tfidf)
y_train_prob = rs_2.predict_proba(X_train_tfidf)[:, 1]
print("Train Results:")
show_results(y_train, y_train_pred, y_train_prob)

y_test_pred = rs_2.predict(X_test_tfidf)
y_test_prob = rs_2.predict_proba(X_test_tfidf)[:, 1]
print("Test Results:")
show_results(y_test, y_test_pred, y_test_prob)

Train Results:
              precision    recall  f1-score   support

           0      0.983     0.998     0.991     12462
           1      0.998     0.979     0.988     10059

    accuracy                          0.989     22521
   macro avg      0.990     0.988     0.989     22521
weighted avg      0.990     0.989     0.989     22521

[[12439    23]
 [  215  9844]]
Accuracy: 0.9894320856089872
AUC_ROC: 0.9994888247926546
f1 score: 0.9880558064839907
False Postive Rate: 0.001845610656395442

Test Results:
              precision    recall  f1-score   support

           0      0.956     0.983     0.969      6138
           1      0.978     0.944     0.961      4955

    accuracy                          0.965     11093
   macro avg      0.967     0.963     0.965     11093
weighted avg      0.966     0.965     0.965     11093

[[6032  106]
 [ 278 4677]]
Accuracy: 0.965383575227621
AUC_ROC: 0.9946034019436578
f1 score: 0.9605668515095502
False Postive Rate: 0.01726946888237211



### training best model for Top 15 Features + TF-IDF weighted GloVe vectors

In [None]:
clf_3 = XGBClassifier(random_state=1)
rs_3 = RandomizedSearchCV(estimator=clf_3, 
                          param_distributions = param_test, 
                          refit = True,
                          random_state = 1,
                          n_jobs = -1
                          )

%time rs_3.fit(X_train_tfidf_glove, y_train)
print('Best score reached: {} with params: {} '.format(rs_3.best_score_, rs_3.best_params_))

Wall time: 1h 3min 10s
Best score reached: 0.9557303706361395 with params: {'colsample_bytree': 0.6469518627566013, 'learning_rate': 0.1, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 673, 'objective': 'binary:logitraw', 'subsample': 0.7727764127746914} 


In [None]:
y_train_pred = rs_3.predict(X_train_tfidf_glove)
y_train_prob = rs_3.predict_proba(X_train_tfidf_glove)[:, 1]
print("Train Results:")
show_results(y_train, y_train_pred, y_train_prob)

y_test_pred = rs_3.predict(X_test_tfidf_glove)
y_test_prob = rs_3.predict_proba(X_test_tfidf_glove)[:, 1]
print("Test Results:")
show_results(y_test, y_test_pred, y_test_prob)

Train Results:
              precision    recall  f1-score   support

           0      1.000     1.000     1.000     12462
           1      1.000     1.000     1.000     10059

    accuracy                          1.000     22521
   macro avg      1.000     1.000     1.000     22521
weighted avg      1.000     1.000     1.000     22521

[[12462     0]
 [    0 10059]]
Accuracy: 1.0
AUC_ROC: 1.0
f1 score: 1.0
False Postive Rate: 0.0

Test Results:
              precision    recall  f1-score   support

           0      0.950     0.976     0.963      6138
           1      0.969     0.937     0.953      4955

    accuracy                          0.959     11093
   macro avg      0.960     0.957     0.958     11093
weighted avg      0.959     0.959     0.959     11093

[[5991  147]
 [ 312 4643]]
Accuracy: 0.9586225547642657
AUC_ROC: 0.9933270072555903
f1 score: 0.9528989225243715
False Postive Rate: 0.023949169110459433



### training best model for Top 15 Features + TF-IDF weighted FastText

In [None]:
clf_4 = XGBClassifier(random_state=1, silent=True)
rs_4 = RandomizedSearchCV(estimator=clf_4, 
                          param_distributions = param_test, 
                          refit = True,
                          random_state = 1,
                          n_jobs = -1
                          )

%time rs_4.fit(X_train_tfidf_cc, y_train)
print('Best score reached: {} with params: {} '.format(rs_4.best_score_, rs_4.best_params_))

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Wall time: 1h 15s
Best score reached: 0.9615471362981334 with params: {'colsample_bytree': 0.5172645818368209, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 673, 'objective': 'binary:logitraw', 'subsample': 0.6310533872026856} 


In [None]:
y_train_pred = rs_4.predict(X_train_tfidf_cc)
y_train_prob = rs_4.predict_proba(X_train_tfidf_cc)[:, 1]
print("Train Results:")
show_results(y_train, y_train_pred, y_train_prob)

y_test_pred = rs_4.predict(X_test_tfidf_cc)
y_test_prob = rs_4.predict_proba(X_test_tfidf_cc)[:, 1]
print("Test Results:")
show_results(y_test, y_test_pred, y_test_prob)

Train Results:
              precision    recall  f1-score   support

           0      1.000     1.000     1.000     12462
           1      1.000     1.000     1.000     10059

    accuracy                          1.000     22521
   macro avg      1.000     1.000     1.000     22521
weighted avg      1.000     1.000     1.000     22521

[[12462     0]
 [    0 10059]]
Accuracy: 1.0
AUC_ROC: 1.0
f1 score: 1.0
False Postive Rate: 0.0

Test Results:
              precision    recall  f1-score   support

           0      0.955     0.983     0.969      6138
           1      0.978     0.943     0.960      4955

    accuracy                          0.965     11093
   macro avg      0.967     0.963     0.964     11093
weighted avg      0.965     0.965     0.965     11093

[[6031  107]
 [ 281 4674]]
Accuracy: 0.9650229874695754
AUC_ROC: 0.9952029490569902
f1 score: 0.9601479046836483
False Postive Rate: 0.017432388400130335



### InferSent

In [None]:
clf = XGBClassifier(random_state=1)
# rs = RandomizedSearchCV(estimator=clf, 
#                           param_distributions = param_test, 
#                           refit = True,
#                           random_state = 1,
#                           n_jobs = -1
#                           )

%time clf.fit(X_train_infersent, y_train_infersent)
# print('Best score reached: {} with params: {} '.format(rs.best_score_, rs.best_params_))

CPU times: user 5min 37s, sys: 996 ms, total: 5min 38s
Wall time: 5min 38s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=1,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [None]:
y_train_pred = clf.predict(X_train_infersent)
y_train_prob = clf.predict_proba(X_train_infersent)[:, 1]
print("Train Results:")
show_results(y_train_infersent, y_train_pred, y_train_prob)

y_test_pred = clf.predict(X_test_infersent)
y_test_prob = clf.predict_proba(X_test_infersent)[:, 1]
print("Test Results:")
show_results(y_test_infersent, y_test_pred, y_test_prob)

Train Results:
              precision    recall  f1-score   support

           0      0.844     0.936     0.887     10560
           1      0.884     0.737     0.804      6979

    accuracy                          0.857     17539
   macro avg      0.864     0.837     0.846     17539
weighted avg      0.860     0.857     0.854     17539

[[9885  675]
 [1833 5146]]
Accuracy: 0.8570043902160899
AUC_ROC: 0.9407187607193875
f1 score: 0.8040625
False Postive Rate: 0.06392045454545454

Test Results:
              precision    recall  f1-score   support

           0      0.846     0.928     0.885      5191
           1      0.872     0.743     0.802      3422

    accuracy                          0.855      8613
   macro avg      0.859     0.835     0.844      8613
weighted avg      0.856     0.855     0.852      8613

[[4818  373]
 [ 880 2542]]
Accuracy: 0.8545222338325786
AUC_ROC: 0.9317531433095607
f1 score: 0.8022723686286886
False Postive Rate: 0.07185513388557119

