In [None]:
import pandas as pd
import numpy as np
import time

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

from xgboost import XGBClassifier 
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, f1_score

In [None]:
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


## Reading in data sources

###**Scam** data sets

In [None]:
## Full features
X_train_full = pd.read_pickle('/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/Full Text SCAM/X_train_fSC.pkl')
X_test_full = pd.read_pickle('/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/Full Text SCAM/X_test_fSC.pkl')

## Vanilla features
X_train_v = X_train_full.drop(['text', 'cleaned_text', 'cleaned_text_full'], axis=1)
X_test_v = X_test_full.drop(['text', 'cleaned_text', 'cleaned_text_full'], axis=1)
scaler = MinMaxScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train_v), index = X_train_v.index, columns = X_train_v.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test_v), index = X_test_v.index, columns = X_test_v.columns)

## target label
y_train = pd.read_pickle('/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/Full Text SCAM/y_train_fSC.pkl')
y_test = pd.read_pickle('/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/Full Text SCAM/y_test_fSC.pkl')

## TF-IDF with top 15 features
X_train_tfidf = pd.read_pickle('/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/TFIDF SCAM/X_train_tfSC.pkl')
X_test_tfidf = pd.read_pickle('/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/TFIDF SCAM/X_test_tfSC.pkl')

## TF-IDF GloVe with top 15 features
X_train_tfidf_glove = pd.read_pickle('/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/TFIDF Glove SCAM/X_train_tfglSC.pkl')
X_test_tfidf_glove = pd.read_pickle('/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/TFIDF Glove SCAM/X_test_tfglSC.pkl')

## TF-IDF FastText (cc) with top 15 features
X_train_tfidf_cc = pd.read_pickle('/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/TFIDF CC SCAM/X_train_tfccSC.pkl')
X_test_tfidf_cc = pd.read_pickle('/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/TFIDF CC SCAM/X_test_tfccSC.pkl')

## infersent
X_train_infersent = pd.read_pickle('/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/InferSent SCAM/X_train_infSC.pkl')
X_test_infersent = pd.read_pickle('/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/InferSent SCAM/X_test_infSC.pkl')
y_train_infersent = pd.read_pickle('/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/InferSent SCAM/y_train_infSC.pkl')
y_test_infersent = pd.read_pickle('/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/InferSent SCAM/y_test_infSC.pkl')

##XgBoost

###hyperparameter tuning

In [None]:
def show_results(y_actual, y_pred, y_prob):
  print(classification_report(y_actual, y_pred, digits=3))
  print(confusion_matrix(y_actual, y_pred))
  print("Accuracy: " + str(accuracy_score(y_actual, y_pred)))
  print("AUC_ROC: " + str(roc_auc_score(y_actual, y_prob)))
  print("f1 score: " + str(f1_score(y_actual, y_pred)))
  tn, fp, fn, tp = confusion_matrix(y_actual, y_pred).ravel()
  fpr = fp/(fp+tn)
  print("False Postive Rate: " + str(fpr) + "\n")

In [None]:
param_test = {'learning_rate': [0.01, 0.1],
                'max_depth': [3, 5, 7, 10],
                'min_child_weight': [1, 3, 5],
                'subsample': sp_uniform(loc=0.2, scale=0.8), 
                'colsample_bytree': sp_uniform(loc=0.2, scale=0.8), 
                'n_estimators' :  [int(x) for x in np.linspace(start = 20, stop = 1000, num = 10)],
                'objective': ['binary:logistic', 'binary:logitraw'],
                'early_stopping_round': sp_randint(10, 80) 
              }

###training best model for scaled Vanilla features

In [None]:
clf_1 = XGBClassifier(random_state=1, silent=True)
rs_1 = RandomizedSearchCV(estimator=clf_1, 
                          param_distributions = param_test, 
                          refit = True,
                          random_state = 1,
                          n_jobs = -1
                          )

%time rs_1.fit(X_train_scaled, y_train)
print('Best score reached: {} with params: {} '.format(rs_1.best_score_, rs_1.best_params_))

CPU times: user 1.59 s, sys: 79.8 ms, total: 1.67 s
Wall time: 33.2 s
Best score reached: 0.9493095362167796 with params: {'colsample_bytree': 0.5368861000040417, 'early_stopping_round': 31, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 891, 'objective': 'binary:logitraw', 'subsample': 0.3136961083799656} 


In [None]:
y_train_pred = rs_1.predict(X_train_scaled)
y_train_prob = rs_1.predict_proba(X_train_scaled)[:, 1]
print("Train Results:")
show_results(y_train, y_train_pred, y_train_prob)

y_test_pred = rs_1.predict(X_test_scaled)
y_test_prob = rs_1.predict_proba(X_test_scaled)[:, 1]
print("Test Results:")
show_results(y_test, y_test_pred, y_test_prob)

Train Results:
              precision    recall  f1-score   support

           0      1.000     1.000     1.000       538
           1      1.000     1.000     1.000       981

    accuracy                          1.000      1519
   macro avg      1.000     1.000     1.000      1519
weighted avg      1.000     1.000     1.000      1519

[[538   0]
 [  0 981]]
Accuracy: 1.0
AUC_ROC: 1.0
f1 score: 1.0
False Postive Rate: 0.0

Test Results:
              precision    recall  f1-score   support

           0      0.904     0.928     0.916       265
           1      0.960     0.946     0.953       484

    accuracy                          0.940       749
   macro avg      0.932     0.937     0.935       749
weighted avg      0.940     0.940     0.940       749

[[246  19]
 [ 26 458]]
Accuracy: 0.9399198931909212
AUC_ROC: 0.9838531108685482
f1 score: 0.9531737773152965
False Postive Rate: 0.07169811320754717



###training best model for Top 15 Features + TF-IDF

In [None]:
clf_2 = XGBClassifier(random_state=1, silent=True)
rs_2 = RandomizedSearchCV(estimator=clf_2, 
                          param_distributions = param_test, 
                          refit = True,
                          random_state = 1,
                          n_jobs = -1
                          )

%time rs_2.fit(X_train_tfidf, y_train)
print('Best score reached: {} with params: {} '.format(rs_2.best_score_, rs_2.best_params_))

CPU times: user 6.78 s, sys: 138 ms, total: 6.92 s
Wall time: 3min 14s
Best score reached: 0.9802457877366683 with params: {'colsample_bytree': 0.5172645818368209, 'early_stopping_round': 16, 'learning_rate': 0.1, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 564, 'objective': 'binary:logistic', 'subsample': 0.6196385276582972} 


In [None]:
y_train_pred = rs_2.predict(X_train_tfidf)
y_train_prob = rs_2.predict_proba(X_train_tfidf)[:, 1]
print("Train Results:")
show_results(y_train, y_train_pred, y_train_prob)

y_test_pred = rs_2.predict(X_test_tfidf)
y_test_prob = rs_2.predict_proba(X_test_tfidf)[:, 1]
print("Test Results:")
show_results(y_test, y_test_pred, y_test_prob)

Train Results:
              precision    recall  f1-score   support

           0      1.000     1.000     1.000       538
           1      1.000     1.000     1.000       981

    accuracy                          1.000      1519
   macro avg      1.000     1.000     1.000      1519
weighted avg      1.000     1.000     1.000      1519

[[538   0]
 [  0 981]]
Accuracy: 1.0
AUC_ROC: 1.0
f1 score: 1.0
False Postive Rate: 0.0

Test Results:
              precision    recall  f1-score   support

           0      0.956     0.977     0.966       265
           1      0.987     0.975     0.981       484

    accuracy                          0.976       749
   macro avg      0.972     0.976     0.974       749
weighted avg      0.976     0.976     0.976       749

[[259   6]
 [ 12 472]]
Accuracy: 0.9759679572763685
AUC_ROC: 0.9961172618119445
f1 score: 0.9812889812889813
False Postive Rate: 0.022641509433962263



###training best model for Top 15 Features + TF-IDF weighted GloVe vectors

In [None]:
clf_3 = XGBClassifier(random_state=1, silent=True)
rs_3 = RandomizedSearchCV(estimator=clf_3, 
                          param_distributions = param_test, 
                          refit = True,
                          random_state = 1,
                          n_jobs = -1
                          )

%time rs_3.fit(X_train_tfidf_glove, y_train)
print('Best score reached: {} with params: {} '.format(rs_3.best_score_, rs_3.best_params_))

CPU times: user 5.33 s, sys: 114 ms, total: 5.45 s
Wall time: 2min 43s
Best score reached: 0.9868312489143651 with params: {'colsample_bytree': 0.5368861000040417, 'early_stopping_round': 31, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 891, 'objective': 'binary:logitraw', 'subsample': 0.3136961083799656} 


In [None]:
y_train_pred = rs_3.predict(X_train_tfidf_glove)
y_train_prob = rs_3.predict_proba(X_train_tfidf_glove)[:, 1]
print("Train Results:")
show_results(y_train, y_train_pred, y_train_prob)

y_test_pred = rs_3.predict(X_test_tfidf_glove)
y_test_prob = rs_3.predict_proba(X_test_tfidf_glove)[:, 1]
print("Test Results:")
show_results(y_test, y_test_pred, y_test_prob)

Train Results:
              precision    recall  f1-score   support

           0      1.000     1.000     1.000       538
           1      1.000     1.000     1.000       981

    accuracy                          1.000      1519
   macro avg      1.000     1.000     1.000      1519
weighted avg      1.000     1.000     1.000      1519

[[538   0]
 [  0 981]]
Accuracy: 1.0
AUC_ROC: 0.9999999999999999
f1 score: 1.0
False Postive Rate: 0.0

Test Results:
              precision    recall  f1-score   support

           0      0.974     0.974     0.974       265
           1      0.986     0.986     0.986       484

    accuracy                          0.981       749
   macro avg      0.980     0.980     0.980       749
weighted avg      0.981     0.981     0.981       749

[[258   7]
 [  7 477]]
Accuracy: 0.9813084112149533
AUC_ROC: 0.9979728676126618
f1 score: 0.9855371900826446
False Postive Rate: 0.026415094339622643



###training best model for Top 15 Features + TF-IDF weighted FastText

In [None]:
clf_4 = XGBClassifier(random_state=1, silent=True)
rs_4 = RandomizedSearchCV(estimator=clf_4, 
                          param_distributions = param_test, 
                          refit = True,
                          random_state = 1,
                          n_jobs = -1
                          )
%time rs_4.fit(X_train_tfidf_cc, y_train)
print('Best score reached: {} with params: {} '.format(rs_4.best_score_, rs_4.best_params_))

CPU times: user 5.01 s, sys: 103 ms, total: 5.11 s
Wall time: 2min 39s
Best score reached: 0.9934123675525448 with params: {'colsample_bytree': 0.5172645818368209, 'early_stopping_round': 16, 'learning_rate': 0.1, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 564, 'objective': 'binary:logistic', 'subsample': 0.6196385276582972} 


In [None]:
y_train_pred = rs_4.predict(X_train_tfidf_cc)
y_train_prob = rs_4.predict_proba(X_train_tfidf_cc)[:, 1]
print("Train Results:")
show_results(y_train, y_train_pred, y_train_prob)

y_test_pred = rs_4.predict(X_test_tfidf_cc)
y_test_prob = rs_4.predict_proba(X_test_tfidf_cc)[:, 1]
print("Test Results:")
show_results(y_test, y_test_pred, y_test_prob)

Train Results:
              precision    recall  f1-score   support

           0      1.000     1.000     1.000       538
           1      1.000     1.000     1.000       981

    accuracy                          1.000      1519
   macro avg      1.000     1.000     1.000      1519
weighted avg      1.000     1.000     1.000      1519

[[538   0]
 [  0 981]]
Accuracy: 1.0
AUC_ROC: 1.0
f1 score: 1.0
False Postive Rate: 0.0

Test Results:
              precision    recall  f1-score   support

           0      0.974     0.985     0.979       265
           1      0.992     0.986     0.989       484

    accuracy                          0.985       749
   macro avg      0.983     0.985     0.984       749
weighted avg      0.985     0.985     0.985       749

[[261   4]
 [  7 477]]
Accuracy: 0.9853137516688919
AUC_ROC: 0.9982067675035086
f1 score: 0.9886010362694301
False Postive Rate: 0.01509433962264151



### InferSent


In [None]:
clf = XGBClassifier(random_state=1, silent=True)
rs = RandomizedSearchCV(estimator=clf, 
                          param_distributions = param_test, 
                          refit = True,
                          random_state = 1,
                          n_jobs = -1
                          )

%time rs.fit(X_train_infersent, y_train_infersent)
print('Best score reached: {} with params: {} '.format(rs.best_score_, rs.best_params_))

CPU times: user 1min 47s, sys: 1.91 s, total: 1min 49s
Wall time: 53min 2s
Best score reached: 0.895397285641188 with params: {'colsample_bytree': 0.5172645818368209, 'early_stopping_round': 16, 'learning_rate': 0.1, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 564, 'objective': 'binary:logistic', 'subsample': 0.6196385276582972} 


In [None]:
y_train_pred = rs.predict(X_train_infersent)
y_train_prob = rs.predict_proba(X_train_infersent)[:, 1]
print("Train Results:")
show_results(y_train_infersent, y_train_pred, y_train_prob)

y_test_pred = rs.predict(X_test_infersent)
y_test_prob = rs.predict_proba(X_test_infersent)[:, 1]
print("Test Results:")
show_results(y_test_infersent, y_test_pred, y_test_prob)

Train Results:
              precision    recall  f1-score   support

           0      1.000     1.000     1.000       453
           1      1.000     1.000     1.000       981

    accuracy                          1.000      1434
   macro avg      1.000     1.000     1.000      1434
weighted avg      1.000     1.000     1.000      1434

[[453   0]
 [  0 981]]
Accuracy: 1.0
AUC_ROC: 1.0
f1 score: 1.0
False Postive Rate: 0.0

Test Results:
              precision    recall  f1-score   support

           0      0.898     0.825     0.860       223
           1      0.922     0.957     0.939       484

    accuracy                          0.915       707
   macro avg      0.910     0.891     0.899       707
weighted avg      0.915     0.915     0.914       707

[[184  39]
 [ 21 463]]
Accuracy: 0.9151343705799151
AUC_ROC: 0.9548604677018864
f1 score: 0.9391480730223123
False Postive Rate: 0.17488789237668162

