In [None]:
import pandas as pd
import numpy as np
import time

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

import lightgbm as lgbm
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, f1_score

In [None]:
# Attach Google Drive to this Colab runtime; the pickled inputs read later in
# the notebook are located under this mount point.
from google.colab import drive

drive_mount_point = "/content/gdrive"
drive.mount(drive_mount_point)

Mounted at /content/gdrive


## Reading in data sources

### **Spam** data sets

In [None]:
## Root folder holding every cleaned SPAM input on the mounted Google Drive.
## Change this single constant to point the notebook at another location.
DATA_DIR = '/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data'

## Text columns removed from the full frame to obtain the "vanilla" feature set.
TEXT_COLUMNS = ['text', 'cleaned_text', 'cleaned_text_full']


def load_spam_pickle(folder, filename, data_dir=DATA_DIR):
  """Load one pickled object from ``<data_dir>/<folder>/<filename>``.

  Args:
    folder: sub-folder of ``data_dir`` (e.g. 'TFIDF SPAM').
    filename: name of the pickle file inside that folder.
    data_dir: root data folder; defaults to ``DATA_DIR``.

  Returns:
    The object reconstructed by ``pd.read_pickle``.
  """
  # SECURITY: unpickling can execute arbitrary code; only load trusted files.
  return pd.read_pickle(f'{data_dir}/{folder}/{filename}')


## Full features
X_train_full = load_spam_pickle('Full Text SPAM', 'X_train_fSP.pkl')
X_test_full = load_spam_pickle('Full Text SPAM', 'X_test_fSP.pkl')

## Vanilla features: the full frame without its text columns, min-max scaled.
## The scaler is fitted on the training split only and then applied to test.
X_train_v = X_train_full.drop(TEXT_COLUMNS, axis=1)
X_test_v = X_test_full.drop(TEXT_COLUMNS, axis=1)
scaler = MinMaxScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train_v), index = X_train_v.index, columns = X_train_v.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test_v), index = X_test_v.index, columns = X_test_v.columns)

## target label
y_train = load_spam_pickle('Full Text SPAM', 'y_train_fSP.pkl')
y_test = load_spam_pickle('Full Text SPAM', 'y_test_fSP.pkl')

## TF-IDF with top 15 features
X_train_tfidf = load_spam_pickle('TFIDF SPAM', 'X_train_tfSP.pkl')
X_test_tfidf = load_spam_pickle('TFIDF SPAM', 'X_test_tfSP.pkl')

## TF-IDF GloVe with top 15 features
X_train_tfidf_glove = load_spam_pickle('TFIDF Glove SPAM', 'X_train_tfglSP.pkl')
X_test_tfidf_glove = load_spam_pickle('TFIDF Glove SPAM', 'X_test_tfglSP.pkl')

## TF-IDF FastText (cc) with top 15 features
X_train_tfidf_cc = load_spam_pickle('TFIDF CC SPAM', 'X_train_tfccSP.pkl')
X_test_tfidf_cc = load_spam_pickle('TFIDF CC SPAM', 'X_test_tfccSP.pkl')

## infersent (ships with its own target files, loaded separately from y_train / y_test)
X_train_infersent = load_spam_pickle('InferSent SPAM', 'X_train_infSP.pkl')
X_test_infersent = load_spam_pickle('InferSent SPAM', 'X_test_infSP.pkl')
y_train_infersent = load_spam_pickle('InferSent SPAM', 'y_train_infSP.pkl')
y_test_infersent = load_spam_pickle('InferSent SPAM', 'y_test_infSP.pkl')

## LightGBM

### Hyperparameter tuning

In [None]:
def show_results(y_actual, y_pred, y_prob):
  """Print an evaluation summary for one set of binary predictions.

  Args:
    y_actual: ground-truth labels.
    y_pred: predicted labels, compared against ``y_actual``.
    y_prob: predicted scores, used only for the ROC AUC.

  Prints, in order: the classification report (3 digits), the confusion
  matrix, accuracy, ROC AUC, F1 score and the false positive rate
  FP / (FP + TN). Returns nothing.
  """
  print(classification_report(y_actual, y_pred, digits=3))

  # The matrix is printed here and unpacked further down for the FPR.
  matrix = confusion_matrix(y_actual, y_pred)
  print(matrix)

  print("Accuracy: {}".format(accuracy_score(y_actual, y_pred)))
  print("AUC_ROC: {}".format(roc_auc_score(y_actual, y_prob)))
  print("f1 score: {}".format(f1_score(y_actual, y_pred)))

  # ravel() flattens the 2x2 matrix row by row: tn, fp, fn, tp.
  true_neg, false_pos, false_neg, true_pos = matrix.ravel()
  false_positive_rate = false_pos/(false_pos+true_neg)
  print("False Postive Rate: {}\n".format(false_positive_rate))

In [None]:
# Search space sampled by RandomizedSearchCV for the LightGBM classifiers.
#
# scipy's randint(low, high) draws integers from [low, high) and
# uniform(loc, scale) draws floats from [loc, loc + scale]; plain lists are
# sampled uniformly.
#
# Two entries of the earlier space are intentionally absent:
#   * 'early_stopping_round' - an alias of the `early_stopping_rounds` keyword
#     that the fit calls already pass, so the sampled value silently competed
#     with the explicit one (the training log reported 34 rounds, not 30).
#   * 'bagging_fraction' - an alias of 'subsample'; listing both with
#     different ranges means one of the two is discarded by LightGBM.
#
# NOTE(review): per the LightGBM docs, row subsampling only takes effect when
# bagging_freq / subsample_freq is non-zero - confirm whether 'subsample'
# should be paired with a frequency setting.
param_test = {
    # Tree shape / leaf constraints.
    'num_leaves': sp_randint(6, 80),
    'max_depth': sp_randint(-1, 20),
    'min_data_in_leaf': sp_randint(20, 100),
    'min_sum_hessian_in_leaf': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
    # Row and column subsampling.
    'subsample': sp_uniform(loc=0.2, scale=0.8),
    'feature_fraction': sp_uniform(loc=0.4, scale=0.6),
    # L1 / L2 regularisation strengths.
    'lambda_l1': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
    'lambda_l2': [0, 1e-1, 1, 5, 10, 20, 50, 100],
    # Boosting algorithm, evaluation metric and objective.
    'boosting_type': ['goss', 'gbdt', 'dart'],
    'metric': ['auc', 'l1', 'average_precision'],
    'objective': ['binary'],
}

### Training best model for scaled Vanilla features

In [None]:
# Base LightGBM classifier for the scaled vanilla features, wrapped in a
# randomized search over `param_test`. refit=True retrains the best
# configuration on all data passed to fit(); n_jobs=-1 uses every core.
clf_1 = lgbm.LGBMClassifier(silent=True, random_state=1)
rs_1 = RandomizedSearchCV(
    clf_1,
    param_test,
    n_jobs=-1,
    refit=True,
    random_state=1,
)

In [None]:
fit_params={"early_stopping_rounds":30, 
            "eval_metric" : 'auc', 
            "eval_set" : [(X_train_scaled, y_train)],
            'eval_names': ['valid'],
            'categorical_feature': 'auto',
            }
%time rs_1.fit(X_train_scaled, y_train, **fit_params)
print('Best score reached: {} with params: {} '.format(rs_1.best_score_, rs_1.best_params_))



[1]	valid's l1: 0.471296	valid's auc: 0.891815
Training until validation scores don't improve for 34 rounds.
[2]	valid's l1: 0.45361	valid's auc: 0.912962
[3]	valid's l1: 0.435139	valid's auc: 0.922822
[4]	valid's l1: 0.417811	valid's auc: 0.927525
[5]	valid's l1: 0.401031	valid's auc: 0.929198
[6]	valid's l1: 0.388025	valid's auc: 0.93139
[7]	valid's l1: 0.373551	valid's auc: 0.932609
[8]	valid's l1: 0.360321	valid's auc: 0.933232
[9]	valid's l1: 0.350048	valid's auc: 0.934732
[10]	valid's l1: 0.3406	valid's auc: 0.935588
[11]	valid's l1: 0.331171	valid's auc: 0.935976
[12]	valid's l1: 0.322409	valid's auc: 0.936047
[13]	valid's l1: 0.314956	valid's auc: 0.936154
[14]	valid's l1: 0.307721	valid's auc: 0.936898
[15]	valid's l1: 0.301158	valid's auc: 0.937028
[16]	valid's l1: 0.295295	valid's auc: 0.937623
[17]	valid's l1: 0.289787	valid's auc: 0.937839
[18]	valid's l1: 0.284993	valid's auc: 0.938474
[19]	valid's l1: 0.280673	valid's auc: 0.93892
[20]	valid's l1: 0.276421	valid's auc: 0

In [None]:
# Score both splits with the refitted best estimator, then report them.
# Column 1 of predict_proba is used as the score for the ROC AUC.
y_train_prob = rs_1.predict_proba(X_train_scaled)[:, 1]
y_train_pred = rs_1.predict(X_train_scaled)
y_test_prob = rs_1.predict_proba(X_test_scaled)[:, 1]
y_test_pred = rs_1.predict(X_test_scaled)

print("Train Results:")
show_results(y_train, y_train_pred, y_train_prob)

print("Test Results:")
show_results(y_test, y_test_pred, y_test_prob)

Train Results:
              precision    recall  f1-score   support

           0      0.893     0.932     0.912     12462
           1      0.911     0.862     0.886     10059

    accuracy                          0.901     22521
   macro avg      0.902     0.897     0.899     22521
weighted avg      0.901     0.901     0.901     22521

[[11620   842]
 [ 1391  8668]]
Accuracy: 0.9008480973313796
AUC_ROC: 0.9668434330851922
f1 score: 0.8858909499718943
False Postive Rate: 0.06756539881238967

Test Results:
              precision    recall  f1-score   support

           0      0.875     0.906     0.890      6138
           1      0.878     0.839     0.858      4955

    accuracy                          0.876     11093
   macro avg      0.877     0.873     0.874     11093
weighted avg      0.876     0.876     0.876     11093

[[5562  576]
 [ 796 4159]]
Accuracy: 0.8763183989903542
AUC_ROC: 0.9472140104866905
f1 score: 0.8584107327141384
False Postive Rate: 0.093841642228739



### Training best model for Top 15 Features + TF-IDF

In [None]:
# Base LightGBM classifier for the TF-IDF features, wrapped in a randomized
# search over `param_test`. refit=True retrains the best configuration on
# all data passed to fit(); n_jobs=-1 uses every core.
clf_2 = lgbm.LGBMClassifier(silent=True, random_state=1)
rs_2 = RandomizedSearchCV(
    clf_2,
    param_test,
    n_jobs=-1,
    refit=True,
    random_state=1,
)

In [None]:
fit_params={"early_stopping_rounds":30, 
            "eval_metric" : ['auc', 'l1', 'average_precision'], 
            "eval_set" : [(X_train_tfidf, y_train)],
            'eval_names': ['valid'],
            'categorical_feature': 'auto',
            }
%time rs_2.fit(X_train_tfidf, y_train, **fit_params)
print('Best score reached: {} with params: {} '.format(rs_2.best_score_, rs_2.best_params_))



[1]	valid's l1: 0.46907	valid's auc: 0.909908
Training until validation scores don't improve for 34 rounds.
[2]	valid's l1: 0.447259	valid's auc: 0.942038
[3]	valid's l1: 0.427371	valid's auc: 0.947796
[4]	valid's l1: 0.405712	valid's auc: 0.954968
[5]	valid's l1: 0.385863	valid's auc: 0.961044
[6]	valid's l1: 0.368825	valid's auc: 0.966309
[7]	valid's l1: 0.352401	valid's auc: 0.968916
[8]	valid's l1: 0.342214	valid's auc: 0.969711
[9]	valid's l1: 0.329228	valid's auc: 0.970305
[10]	valid's l1: 0.315424	valid's auc: 0.971877
[11]	valid's l1: 0.304533	valid's auc: 0.972526
[12]	valid's l1: 0.293076	valid's auc: 0.973198
[13]	valid's l1: 0.283654	valid's auc: 0.973734
[14]	valid's l1: 0.275495	valid's auc: 0.973807
[15]	valid's l1: 0.268601	valid's auc: 0.974037
[16]	valid's l1: 0.26114	valid's auc: 0.974501
[17]	valid's l1: 0.25408	valid's auc: 0.974712
[18]	valid's l1: 0.248719	valid's auc: 0.975247
[19]	valid's l1: 0.241178	valid's auc: 0.976346
[20]	valid's l1: 0.234857	valid's auc:

In [None]:
# Score both splits with the refitted best estimator, then report them.
# Column 1 of predict_proba is used as the score for the ROC AUC.
y_train_prob = rs_2.predict_proba(X_train_tfidf)[:, 1]
y_train_pred = rs_2.predict(X_train_tfidf)
y_test_prob = rs_2.predict_proba(X_test_tfidf)[:, 1]
y_test_pred = rs_2.predict(X_test_tfidf)

print("Train Results:")
show_results(y_train, y_train_pred, y_train_prob)

print("Test Results:")
show_results(y_test, y_test_pred, y_test_prob)

Train Results:
              precision    recall  f1-score   support

           0      0.957     0.970     0.963     12462
           1      0.962     0.946     0.954     10059

    accuracy                          0.959     22521
   macro avg      0.959     0.958     0.958     22521
weighted avg      0.959     0.959     0.959     22521

[[12083   379]
 [  546  9513]]
Accuracy: 0.9589272234803072
AUC_ROC: 0.9920320773461293
f1 score: 0.9536364092025463
False Postive Rate: 0.03041245385973359

Test Results:
              precision    recall  f1-score   support

           0      0.946     0.960     0.953      6138
           1      0.949     0.932     0.941      4955

    accuracy                          0.947     11093
   macro avg      0.948     0.946     0.947     11093
weighted avg      0.947     0.947     0.947     11093

[[5890  248]
 [ 335 4620]]
Accuracy: 0.9474443342648518
AUC_ROC: 0.9886267380684881
f1 score: 0.9406494960806271
False Postive Rate: 0.04040404040404041



### Training best model for Top 15 Features + TF-IDF weighted GloVe vectors

In [None]:
# Base LightGBM classifier for the TF-IDF weighted GloVe features, wrapped in
# a randomized search over `param_test`. refit=True retrains the best
# configuration on all data passed to fit(); n_jobs=-1 uses every core.
clf_3 = lgbm.LGBMClassifier(silent=True, random_state=1)
rs_3 = RandomizedSearchCV(
    clf_3,
    param_test,
    n_jobs=-1,
    refit=True,
    random_state=1,
)

In [None]:
fit_params={"early_stopping_rounds":30, 
            "eval_metric" : ['auc', 'l1', 'average_precision'], 
            "eval_set" : [(X_train_tfidf_glove, y_train)],
            'eval_names': ['valid'],
            'categorical_feature': 'auto',
            }
%time rs_3.fit(X_train_tfidf_glove, y_train, **fit_params)
print('Best score reached: {} with params: {} '.format(rs_3.best_score_, rs_3.best_params_))



[1]	valid's l1: 0.466298	valid's auc: 0.922959
Training until validation scores don't improve for 34 rounds.
[2]	valid's l1: 0.444775	valid's auc: 0.949514
[3]	valid's l1: 0.422044	valid's auc: 0.961373
[4]	valid's l1: 0.402508	valid's auc: 0.968439
[5]	valid's l1: 0.382746	valid's auc: 0.971453
[6]	valid's l1: 0.363559	valid's auc: 0.974338
[7]	valid's l1: 0.347591	valid's auc: 0.976327
[8]	valid's l1: 0.331039	valid's auc: 0.978342
[9]	valid's l1: 0.316494	valid's auc: 0.97906
[10]	valid's l1: 0.302493	valid's auc: 0.980886
[11]	valid's l1: 0.291114	valid's auc: 0.981527
[12]	valid's l1: 0.279733	valid's auc: 0.982644
[13]	valid's l1: 0.268268	valid's auc: 0.983541
[14]	valid's l1: 0.258443	valid's auc: 0.984396
[15]	valid's l1: 0.249253	valid's auc: 0.985124
[16]	valid's l1: 0.24026	valid's auc: 0.985815
[17]	valid's l1: 0.233145	valid's auc: 0.986379
[18]	valid's l1: 0.224779	valid's auc: 0.987022
[19]	valid's l1: 0.217305	valid's auc: 0.987651
[20]	valid's l1: 0.210428	valid's auc

In [None]:
# Score both splits with the refitted best estimator, then report them.
# Column 1 of predict_proba is used as the score for the ROC AUC.
y_train_prob = rs_3.predict_proba(X_train_tfidf_glove)[:, 1]
y_train_pred = rs_3.predict(X_train_tfidf_glove)
y_test_prob = rs_3.predict_proba(X_test_tfidf_glove)[:, 1]
y_test_pred = rs_3.predict(X_test_tfidf_glove)

print("Train Results:")
show_results(y_train, y_train_pred, y_train_prob)

print("Test Results:")
show_results(y_test, y_test_pred, y_test_prob)

Train Results:
              precision    recall  f1-score   support

           0      0.993     0.997     0.995     12462
           1      0.996     0.991     0.994     10059

    accuracy                          0.994     22521
   macro avg      0.995     0.994     0.994     22521
weighted avg      0.994     0.994     0.994     22521

[[12424    38]
 [   88  9971]]
Accuracy: 0.9944052217929932
AUC_ROC: 0.9997342512748848
f1 score: 0.993721347418776
False Postive Rate: 0.0030492697801316002

Test Results:
              precision    recall  f1-score   support

           0      0.953     0.958     0.955      6138
           1      0.947     0.941     0.944      4955

    accuracy                          0.950     11093
   macro avg      0.950     0.949     0.950     11093
weighted avg      0.950     0.950     0.950     11093

[[5879  259]
 [ 292 4663]]
Accuracy: 0.9503290363292166
AUC_ROC: 0.9902216395917773
f1 score: 0.9442138301103573
False Postive Rate: 0.042196155099380905



### Training best model for Top 15 Features + TF-IDF weighted FastText

In [None]:
# Base LightGBM classifier for the TF-IDF weighted FastText features, wrapped
# in a randomized search over `param_test`. refit=True retrains the best
# configuration on all data passed to fit(); n_jobs=-1 uses every core.
clf_4 = lgbm.LGBMClassifier(silent=True, random_state=1)
rs_4 = RandomizedSearchCV(
    clf_4,
    param_test,
    n_jobs=-1,
    refit=True,
    random_state=1,
)

In [None]:
fit_params={"early_stopping_rounds":30, 
            "eval_metric" : ['auc', 'l1', 'average_precision'], 
            "eval_set" : [(X_train_tfidf_cc, y_train)],
            'eval_names': ['valid'],
            'categorical_feature': 'auto',
            }
%time rs_4.fit(X_train_tfidf_cc, y_train, **fit_params)
print('Best score reached: {} with params: {} '.format(rs_4.best_score_, rs_4.best_params_))



[1]	valid's l1: 0.46636	valid's auc: 0.922108
Training until validation scores don't improve for 34 rounds.
[2]	valid's l1: 0.443972	valid's auc: 0.954029
[3]	valid's l1: 0.419795	valid's auc: 0.968742
[4]	valid's l1: 0.400122	valid's auc: 0.972706
[5]	valid's l1: 0.381059	valid's auc: 0.975488
[6]	valid's l1: 0.361832	valid's auc: 0.978712
[7]	valid's l1: 0.345643	valid's auc: 0.979493
[8]	valid's l1: 0.328837	valid's auc: 0.982295
[9]	valid's l1: 0.314118	valid's auc: 0.983049
[10]	valid's l1: 0.300214	valid's auc: 0.98389
[11]	valid's l1: 0.289555	valid's auc: 0.984494
[12]	valid's l1: 0.278177	valid's auc: 0.985435
[13]	valid's l1: 0.267509	valid's auc: 0.98642
[14]	valid's l1: 0.25764	valid's auc: 0.987047
[15]	valid's l1: 0.248393	valid's auc: 0.987869
[16]	valid's l1: 0.238743	valid's auc: 0.988642
[17]	valid's l1: 0.231343	valid's auc: 0.989382
[18]	valid's l1: 0.222772	valid's auc: 0.989912
[19]	valid's l1: 0.21538	valid's auc: 0.990336
[20]	valid's l1: 0.208013	valid's auc: 0

In [None]:
# Score both splits with the refitted best estimator, then report them.
# Column 1 of predict_proba is used as the score for the ROC AUC.
y_train_prob = rs_4.predict_proba(X_train_tfidf_cc)[:, 1]
y_train_pred = rs_4.predict(X_train_tfidf_cc)
y_test_prob = rs_4.predict_proba(X_test_tfidf_cc)[:, 1]
y_test_pred = rs_4.predict(X_test_tfidf_cc)

print("Train Results:")
show_results(y_train, y_train_pred, y_train_prob)

print("Test Results:")
show_results(y_test, y_test_pred, y_test_prob)

Train Results:
              precision    recall  f1-score   support

           0      0.995     0.998     0.997     12462
           1      0.997     0.994     0.996     10059

    accuracy                          0.996     22521
   macro avg      0.996     0.996     0.996     22521
weighted avg      0.996     0.996     0.996     22521

[[12436    26]
 [   59 10000]]
Accuracy: 0.9962257448603525
AUC_ROC: 0.9998700413507984
f1 score: 0.9957679860592481
False Postive Rate: 0.0020863424811426736

Test Results:
              precision    recall  f1-score   support

           0      0.958     0.970     0.964      6138
           1      0.963     0.947     0.955      4955

    accuracy                          0.960     11093
   macro avg      0.960     0.959     0.959     11093
weighted avg      0.960     0.960     0.960     11093

[[5956  182]
 [ 264 4691]]
Accuracy: 0.959794464977914
AUC_ROC: 0.9929039261466591
f1 score: 0.9546194546194546
False Postive Rate: 0.029651352231997392



### InferSent


In [None]:
clf = lgbm.LGBMClassifier(random_state=1)
# rs = RandomizedSearchCV(estimator = clf, 
#                         param_distributions = param_test, 
#                         refit = True,
#                         random_state = 1,
#                         n_jobs = -1
#                         )

# fit_params={"early_stopping_rounds":30, 
#             "eval_metric" : ['auc', 'l1', 'average_precision'], 
#             "eval_set" : [(X_train_scaled, y_train)],
#             'eval_names': ['valid'],
#             'categorical_feature': 'auto',
#             }
%time clf.fit(X_train_infersent, y_train_infersent)
# print('Best score reached: {} with params: {} '.format(rs.best_score_, rs.best_params_))

CPU times: user 4min 37s, sys: 2.03 s, total: 4min 39s
Wall time: 2min 25s


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=1, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [None]:
# Score both InferSent splits with the fitted classifier, then report them.
# Column 1 of predict_proba is used as the score for the ROC AUC.
y_train_prob = clf.predict_proba(X_train_infersent)[:, 1]
y_train_pred = clf.predict(X_train_infersent)
y_test_prob = clf.predict_proba(X_test_infersent)[:, 1]
y_test_pred = clf.predict(X_test_infersent)

print("Train Results:")
show_results(y_train_infersent, y_train_pred, y_train_prob)

print("Test Results:")
show_results(y_test_infersent, y_test_pred, y_test_prob)

Train Results:
              precision    recall  f1-score   support

           0      0.952     0.979     0.966     10560
           1      0.968     0.926     0.946      6979

    accuracy                          0.958     17539
   macro avg      0.960     0.953     0.956     17539
weighted avg      0.958     0.958     0.958     17539

[[10343   217]
 [  519  6460]]
Accuracy: 0.9580363760761731
AUC_ROC: 0.9939218358538819
f1 score: 0.9461042765084944
False Postive Rate: 0.020549242424242425

Test Results:
              precision    recall  f1-score   support

           0      0.874     0.914     0.893      5191
           1      0.860     0.800     0.829      3422

    accuracy                          0.869      8613
   macro avg      0.867     0.857     0.861      8613
weighted avg      0.868     0.869     0.868      8613

[[4745  446]
 [ 686 2736]]
Accuracy: 0.8685707651224892
AUC_ROC: 0.9399785021078495
f1 score: 0.8285887341005451
False Postive Rate: 0.08591793488730495

