In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

import lightgbm as lgbm
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, f1_score

In [None]:
# Colab-only step: mount Google Drive at /content/gdrive, the root of every
# pickle path read later in this notebook.
from google.colab import drive
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Reading in data sources

### **Scam** data sets

In [None]:
## Full features
# NOTE: pd.read_pickle unpickles arbitrary objects; only load files from a trusted source.
X_train_full = pd.read_pickle(
    '/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/Full Text SCAM/X_train_fSC.pkl'
)
X_test_full = pd.read_pickle(
    '/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/Full Text SCAM/X_test_fSC.pkl'
)

## Vanilla features: the full frames with the three text columns removed
X_train_v = X_train_full.drop(columns=['text', 'cleaned_text', 'cleaned_text_full'])
X_test_v = X_test_full.drop(columns=['text', 'cleaned_text', 'cleaned_text_full'])

# Min-max scaling is fitted on the training frame only and then re-applied to
# the test frame; index and column labels are carried over unchanged.
scaler = MinMaxScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train_v),
    columns=X_train_v.columns,
    index=X_train_v.index,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test_v),
    columns=X_test_v.columns,
    index=X_test_v.index,
)

## target label
y_train = pd.read_pickle(
    '/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/Full Text SCAM/y_train_fSC.pkl'
)
y_test = pd.read_pickle(
    '/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/Full Text SCAM/y_test_fSC.pkl'
)

## TF-IDF with top 15 features
X_train_tfidf = pd.read_pickle(
    '/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/TFIDF SCAM/X_train_tfSC.pkl'
)
X_test_tfidf = pd.read_pickle(
    '/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/TFIDF SCAM/X_test_tfSC.pkl'
)

## TF-IDF GloVe with top 15 features
X_train_tfidf_glove = pd.read_pickle(
    '/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/TFIDF Glove SCAM/X_train_tfglSC.pkl'
)
X_test_tfidf_glove = pd.read_pickle(
    '/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/TFIDF Glove SCAM/X_test_tfglSC.pkl'
)

## TF-IDF FastText (cc) with top 15 features
X_train_tfidf_cc = pd.read_pickle(
    '/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/TFIDF CC SCAM/X_train_tfccSC.pkl'
)
X_test_tfidf_cc = pd.read_pickle(
    '/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/TFIDF CC SCAM/X_test_tfccSC.pkl'
)

## infersent (ships with its own label files, loaded separately from y_train / y_test)
X_train_infersent = pd.read_pickle(
    '/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/InferSent SCAM/X_train_infSC.pkl'
)
X_test_infersent = pd.read_pickle(
    '/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/InferSent SCAM/X_test_infSC.pkl'
)
y_train_infersent = pd.read_pickle(
    '/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/InferSent SCAM/y_train_infSC.pkl'
)
y_test_infersent = pd.read_pickle(
    '/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data/InferSent SCAM/y_test_infSC.pkl'
)

## LightGBM

### Hyperparameter tuning

In [None]:
def show_results(y_actual, y_pred, y_prob):
  """Print a fixed set of binary-classification metrics for one data split.

  Args:
    y_actual: ground-truth class labels.
    y_pred: predicted class labels, aligned with ``y_actual``.
    y_prob: predicted scores for the positive class, used only for ROC AUC.

  Returns:
    None. Everything is written to stdout: the classification report, the
    confusion matrix, accuracy, ROC AUC, F1 and the false positive rate.
  """
  print(classification_report(y_actual, y_pred, digits=3))
  # Build the confusion matrix once and reuse it for both the printout and
  # the false-positive-rate calculation.
  cm = confusion_matrix(y_actual, y_pred)
  print(cm)
  print("Accuracy: " + str(accuracy_score(y_actual, y_pred)))
  print("AUC_ROC: " + str(roc_auc_score(y_actual, y_prob)))
  print("f1 score: " + str(f1_score(y_actual, y_pred)))
  # The 4-way unpack requires a 2x2 matrix, i.e. exactly two classes.
  tn, fp, _, _ = cm.ravel()
  fpr = fp/(fp+tn)
  print("False Positive Rate: " + str(fpr) + "\n")

In [None]:
# Search space sampled by RandomizedSearchCV for every LightGBM model below.
# Keys are LightGBM parameter names (or their documented aliases); values are
# either scipy distributions or explicit candidate lists.
#
# Two entries were deliberately left out of this dictionary:
#   * 'bagging_fraction' -- it is an alias of 'subsample'; sampling both gives
#     the same underlying parameter two conflicting values per candidate.
#   * 'early_stopping_round' -- the early-stopping patience is supplied once,
#     via fit_params, so sampling it here would silently override that value.
param_test ={'num_leaves': sp_randint(6, 80), 
             'min_data_in_leaf': sp_randint(20, 100), 
             'min_sum_hessian_in_leaf': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
             # NOTE(review): per the LightGBM docs, row subsampling only takes
             # effect when bagging_freq / subsample_freq > 0, which nothing in
             # this notebook sets -- confirm whether this is meant to be tuned.
             'subsample': sp_uniform(loc=0.2, scale=0.8), 
             'feature_fraction': sp_uniform(loc=0.4, scale=0.6),
             'lambda_l1': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
             'lambda_l2': [0, 1e-1, 1, 5, 10, 20, 50, 100],
             'boosting_type': ['goss', 'gbdt', 'dart'],
             # sp_randint(-1, 20) draws integers from -1 to 19 inclusive.
             'max_depth': sp_randint(-1, 20), 
             # NOTE(review): 'metric' only changes what is evaluated on the
             # eval set, and 'l1' is an absolute-error metric -- confirm that
             # sampling it as a hyper-parameter is intended.
             'metric': ['auc', 'l1', 'average_precision'],
             'objective' : ['binary']
             }

### Training best model for scaled vanilla features

In [None]:
# Seeded LightGBM classifier used as the base estimator for the scaled
# vanilla-feature search.
clf_1 = lgbm.LGBMClassifier(silent=True, random_state=1)

# Randomised search over param_test. The number of sampled candidates, the CV
# scheme and the scoring function are all left at scikit-learn's defaults;
# the best candidate is refit on the data passed to fit().
rs_1 = RandomizedSearchCV(
    clf_1,
    param_test,
    n_jobs=-1,
    random_state=1,
    refit=True,
)

In [None]:
fit_params={"early_stopping_rounds":30, 
            "eval_metric" : 'auc', 
            "eval_set" : [(X_train_scaled, y_train)],
            'eval_names': ['valid'],
            'categorical_feature': 'auto',
            }
%time rs_1.fit(X_train_scaled, y_train, **fit_params)
print('Best score reached: {} with params: {} '.format(rs_1.best_score_, rs_1.best_params_))

[1]	valid's auc: 0.939849
Training until validation scores don't improve for 67 rounds.
[2]	valid's auc: 0.949051
[3]	valid's auc: 0.959125
[4]	valid's auc: 0.969025
[5]	valid's auc: 0.970442
[6]	valid's auc: 0.971942
[7]	valid's auc: 0.97343
[8]	valid's auc: 0.975183
[9]	valid's auc: 0.978641
[10]	valid's auc: 0.979345
[11]	valid's auc: 0.979065
[12]	valid's auc: 0.980384
[13]	valid's auc: 0.979914
[14]	valid's auc: 0.980097
[15]	valid's auc: 0.979677
[16]	valid's auc: 0.979467
[17]	valid's auc: 0.981201
[18]	valid's auc: 0.981401
[19]	valid's auc: 0.981293
[20]	valid's auc: 0.981271
[21]	valid's auc: 0.981638
[22]	valid's auc: 0.981727
[23]	valid's auc: 0.982
[24]	valid's auc: 0.982856
[25]	valid's auc: 0.983048
[26]	valid's auc: 0.983209
[27]	valid's auc: 0.983425
[28]	valid's auc: 0.983376
[29]	valid's auc: 0.983819
[30]	valid's auc: 0.984482
[31]	valid's auc: 0.985475
[32]	valid's auc: 0.986055
[33]	valid's auc: 0.986771
[34]	valid's auc: 0.986936
[35]	valid's auc: 0.987339
[36]	v



In [None]:
# Training split: hard labels plus the second predict_proba column (the score
# show_results uses for ROC AUC).
y_train_pred, y_train_prob = (
    rs_1.predict(X_train_scaled),
    rs_1.predict_proba(X_train_scaled)[:, 1],
)
print("Train Results:")
show_results(y_train, y_train_pred, y_train_prob)

# Test split: same procedure on the held-out frame.
y_test_pred, y_test_prob = (
    rs_1.predict(X_test_scaled),
    rs_1.predict_proba(X_test_scaled)[:, 1],
)
print("Test Results:")
show_results(y_test, y_test_pred, y_test_prob)

Train Results:
              precision    recall  f1-score   support

           0      0.968     0.957     0.963       538
           1      0.977     0.983     0.980       981

    accuracy                          0.974      1519
   macro avg      0.972     0.970     0.971      1519
weighted avg      0.974     0.974     0.974      1519

[[515  23]
 [ 17 964]]
Accuracy: 0.9736668861092824
AUC_ROC: 0.9959680016976835
f1 score: 0.9796747967479675
False Postive Rate: 0.04275092936802974

Test Results:
              precision    recall  f1-score   support

           0      0.917     0.917     0.917       265
           1      0.955     0.955     0.955       484

    accuracy                          0.941       749
   macro avg      0.936     0.936     0.936       749
weighted avg      0.941     0.941     0.941       749

[[243  22]
 [ 22 462]]
Accuracy: 0.9412550066755674
AUC_ROC: 0.9817558085139559
f1 score: 0.9545454545454546
False Postive Rate: 0.0830188679245283



### Training best model for Top 15 Features + TF-IDF

In [None]:
# Seeded LightGBM classifier used as the base estimator for the TF-IDF
# feature search.
clf_2 = lgbm.LGBMClassifier(silent=True, random_state=1)

# Randomised search over param_test. The number of sampled candidates, the CV
# scheme and the scoring function are all left at scikit-learn's defaults;
# the best candidate is refit on the data passed to fit().
rs_2 = RandomizedSearchCV(
    clf_2,
    param_test,
    n_jobs=-1,
    random_state=1,
    refit=True,
)

In [None]:
fit_params={"early_stopping_rounds":30, 
            "eval_metric" : 'auc', 
            "eval_set" : [(X_train_tfidf, y_train)],
            'eval_names': ['valid'],
            'categorical_feature': 'auto',
            }
%time rs_2.fit(X_train_tfidf, y_train, **fit_params)
print('Best score reached: {} with params: {} '.format(rs_2.best_score_, rs_2.best_params_))



[1]	valid's auc: 0.969409
Training until validation scores don't improve for 67 rounds.
[2]	valid's auc: 0.985618
[3]	valid's auc: 0.988777
[4]	valid's auc: 0.989828
[5]	valid's auc: 0.991779
[6]	valid's auc: 0.993152
[7]	valid's auc: 0.993875
[8]	valid's auc: 0.994774
[9]	valid's auc: 0.99538
[10]	valid's auc: 0.995149
[11]	valid's auc: 0.995438
[12]	valid's auc: 0.995349
[13]	valid's auc: 0.995294
[14]	valid's auc: 0.995796
[15]	valid's auc: 0.995972
[16]	valid's auc: 0.995989
[17]	valid's auc: 0.996074
[18]	valid's auc: 0.996144
[19]	valid's auc: 0.996301
[20]	valid's auc: 0.996313
[21]	valid's auc: 0.996445
[22]	valid's auc: 0.996449
[23]	valid's auc: 0.996612
[24]	valid's auc: 0.996768
[25]	valid's auc: 0.99684
[26]	valid's auc: 0.996967
[27]	valid's auc: 0.997029
[28]	valid's auc: 0.997236
[29]	valid's auc: 0.997319
[30]	valid's auc: 0.99734
[31]	valid's auc: 0.997461
[32]	valid's auc: 0.997488
[33]	valid's auc: 0.997556
[34]	valid's auc: 0.997639
[35]	valid's auc: 0.997738
[36]	

In [None]:
# Training split: hard labels plus the second predict_proba column (the score
# show_results uses for ROC AUC).
y_train_pred, y_train_prob = (
    rs_2.predict(X_train_tfidf),
    rs_2.predict_proba(X_train_tfidf)[:, 1],
)
print("Train Results:")
show_results(y_train, y_train_pred, y_train_prob)

# Test split: same procedure on the held-out frame.
y_test_pred, y_test_prob = (
    rs_2.predict(X_test_tfidf),
    rs_2.predict_proba(X_test_tfidf)[:, 1],
)
print("Test Results:")
show_results(y_test, y_test_pred, y_test_prob)

Train Results:
              precision    recall  f1-score   support

           0      0.989     0.987     0.988       538
           1      0.993     0.994     0.993       981

    accuracy                          0.991      1519
   macro avg      0.991     0.990     0.991      1519
weighted avg      0.991     0.991     0.991      1519

[[531   7]
 [  6 975]]
Accuracy: 0.9914417379855168
AUC_ROC: 0.9997536843142382
f1 score: 0.9933774834437087
False Postive Rate: 0.013011152416356878

Test Results:
              precision    recall  f1-score   support

           0      0.974     0.974     0.974       265
           1      0.986     0.986     0.986       484

    accuracy                          0.981       749
   macro avg      0.980     0.980     0.980       749
weighted avg      0.981     0.981     0.981       749

[[258   7]
 [  7 477]]
Accuracy: 0.9813084112149533
AUC_ROC: 0.9957898019647591
f1 score: 0.9855371900826446
False Postive Rate: 0.026415094339622643



### Training best model for Top 15 Features + TF-IDF weighted GloVe vectors

In [None]:
# Seeded LightGBM classifier used as the base estimator for the TF-IDF
# weighted GloVe feature search.
clf_3 = lgbm.LGBMClassifier(silent=True, random_state=1)

# Randomised search over param_test. The number of sampled candidates, the CV
# scheme and the scoring function are all left at scikit-learn's defaults;
# the best candidate is refit on the data passed to fit().
rs_3 = RandomizedSearchCV(
    clf_3,
    param_test,
    n_jobs=-1,
    random_state=1,
    refit=True,
)

In [None]:
fit_params={"early_stopping_rounds":30, 
            "eval_metric" : 'auc', 
            "eval_set" : [(X_train_tfidf_glove, y_train)],
            'eval_names': ['valid'],
            'categorical_feature': 'auto',
            }
%time rs_3.fit(X_train_tfidf_glove, y_train, **fit_params)
print('Best score reached: {} with params: {} '.format(rs_3.best_score_, rs_3.best_params_))



[1]	valid's auc: 0.988109
Training until validation scores don't improve for 67 rounds.
[2]	valid's auc: 0.993925
[3]	valid's auc: 0.995395
[4]	valid's auc: 0.995748
[5]	valid's auc: 0.996027
[6]	valid's auc: 0.996265
[7]	valid's auc: 0.996409
[8]	valid's auc: 0.997559
[9]	valid's auc: 0.997747
[10]	valid's auc: 0.998026
[11]	valid's auc: 0.998089
[12]	valid's auc: 0.998293
[13]	valid's auc: 0.998378
[14]	valid's auc: 0.998367
[15]	valid's auc: 0.99835
[16]	valid's auc: 0.99848
[17]	valid's auc: 0.998554
[18]	valid's auc: 0.998651
[19]	valid's auc: 0.998685
[20]	valid's auc: 0.998761
[21]	valid's auc: 0.998839
[22]	valid's auc: 0.998914
[23]	valid's auc: 0.998926
[24]	valid's auc: 0.998996
[25]	valid's auc: 0.999022
[26]	valid's auc: 0.999109
[27]	valid's auc: 0.999134
[28]	valid's auc: 0.999185
[29]	valid's auc: 0.999233
[30]	valid's auc: 0.999289
[31]	valid's auc: 0.999348
[32]	valid's auc: 0.999439
[33]	valid's auc: 0.999464
[34]	valid's auc: 0.999475
[35]	valid's auc: 0.999532
[36]

In [None]:
# Training split: hard labels plus the second predict_proba column (the score
# show_results uses for ROC AUC).
y_train_pred, y_train_prob = (
    rs_3.predict(X_train_tfidf_glove),
    rs_3.predict_proba(X_train_tfidf_glove)[:, 1],
)
print("Train Results:")
show_results(y_train, y_train_pred, y_train_prob)

# Test split: same procedure on the held-out frame.
y_test_pred, y_test_prob = (
    rs_3.predict(X_test_tfidf_glove),
    rs_3.predict_proba(X_test_tfidf_glove)[:, 1],
)
print("Test Results:")
show_results(y_test, y_test_pred, y_test_prob)

Train Results:
              precision    recall  f1-score   support

           0      1.000     0.996     0.998       538
           1      0.998     1.000     0.999       981

    accuracy                          0.999      1519
   macro avg      0.999     0.998     0.999      1519
weighted avg      0.999     0.999     0.999      1519

[[536   2]
 [  0 981]]
Accuracy: 0.9986833443054641
AUC_ROC: 1.0
f1 score: 0.9989816700610997
False Postive Rate: 0.0037174721189591076

Test Results:
              precision    recall  f1-score   support

           0      0.970     0.970     0.970       265
           1      0.983     0.983     0.983       484

    accuracy                          0.979       749
   macro avg      0.977     0.977     0.977       749
weighted avg      0.979     0.979     0.979       749

[[257   8]
 [  8 476]]
Accuracy: 0.9786381842456608
AUC_ROC: 0.9977779510369562
f1 score: 0.9834710743801653
False Postive Rate: 0.03018867924528302



### Training best model for Top 15 Features + TF-IDF weighted FastText

In [None]:
# Seeded LightGBM classifier used as the base estimator for the TF-IDF
# weighted FastText feature search.
clf_4 = lgbm.LGBMClassifier(silent=True, random_state=1)

# Randomised search over param_test. The number of sampled candidates, the CV
# scheme and the scoring function are all left at scikit-learn's defaults;
# the best candidate is refit on the data passed to fit().
rs_4 = RandomizedSearchCV(
    clf_4,
    param_test,
    n_jobs=-1,
    random_state=1,
    refit=True,
)

In [None]:
fit_params={"early_stopping_rounds":30, 
            "eval_metric" : 'auc', 
            "eval_set" : [(X_train_tfidf_cc, y_train)],
            'eval_names': ['valid'],
            'categorical_feature': 'auto',
            }
%time rs_4.fit(X_train_tfidf_cc, y_train, **fit_params)
print('Best score reached: {} with params: {} '.format(rs_4.best_score_, rs_4.best_params_))



[1]	valid's auc: 0.991628
Training until validation scores don't improve for 67 rounds.
[2]	valid's auc: 0.99653
[3]	valid's auc: 0.996838
[4]	valid's auc: 0.996913
[5]	valid's auc: 0.996948
[6]	valid's auc: 0.997654
[7]	valid's auc: 0.998219
[8]	valid's auc: 0.998572
[9]	valid's auc: 0.998602
[10]	valid's auc: 0.998628
[11]	valid's auc: 0.99882
[12]	valid's auc: 0.998901
[13]	valid's auc: 0.998892
[14]	valid's auc: 0.999075
[15]	valid's auc: 0.99914
[16]	valid's auc: 0.99918
[17]	valid's auc: 0.999178
[18]	valid's auc: 0.999242
[19]	valid's auc: 0.999259
[20]	valid's auc: 0.999322
[21]	valid's auc: 0.999354
[22]	valid's auc: 0.999363
[23]	valid's auc: 0.999454
[24]	valid's auc: 0.999509
[25]	valid's auc: 0.999576
[26]	valid's auc: 0.999602
[27]	valid's auc: 0.999676
[28]	valid's auc: 0.999701
[29]	valid's auc: 0.999727
[30]	valid's auc: 0.999769
[31]	valid's auc: 0.999775
[32]	valid's auc: 0.99979
[33]	valid's auc: 0.99982
[34]	valid's auc: 0.999829
[35]	valid's auc: 0.999865
[36]	val

In [None]:
# Training split: hard labels plus the second predict_proba column (the score
# show_results uses for ROC AUC).
y_train_pred, y_train_prob = (
    rs_4.predict(X_train_tfidf_cc),
    rs_4.predict_proba(X_train_tfidf_cc)[:, 1],
)
print("Train Results:")
show_results(y_train, y_train_pred, y_train_prob)

# Test split: same procedure on the held-out frame.
y_test_pred, y_test_prob = (
    rs_4.predict(X_test_tfidf_cc),
    rs_4.predict_proba(X_test_tfidf_cc)[:, 1],
)
print("Test Results:")
show_results(y_test, y_test_pred, y_test_prob)

Train Results:
              precision    recall  f1-score   support

           0      1.000     0.998     0.999       538
           1      0.999     1.000     0.999       981

    accuracy                          0.999      1519
   macro avg      0.999     0.999     0.999      1519
weighted avg      0.999     0.999     0.999      1519

[[537   1]
 [  0 981]]
Accuracy: 0.9993416721527321
AUC_ROC: 1.0
f1 score: 0.999490575649516
False Postive Rate: 0.0018587360594795538

Test Results:
              precision    recall  f1-score   support

           0      0.974     0.974     0.974       265
           1      0.986     0.986     0.986       484

    accuracy                          0.981       749
   macro avg      0.980     0.980     0.980       749
weighted avg      0.981     0.981     0.981       749

[[258   7]
 [  7 477]]
Accuracy: 0.9813084112149533
AUC_ROC: 0.9975284578200531
f1 score: 0.9855371900826446
False Postive Rate: 0.026415094339622643



### InferSent


In [None]:
# Seeded LightGBM classifier used as the base estimator for the InferSent
# feature search.
clf = lgbm.LGBMClassifier(silent=True, random_state=1)

# Randomised search over param_test. The number of sampled candidates, the CV
# scheme and the scoring function are all left at scikit-learn's defaults;
# the best candidate is refit on the data passed to fit().
rs = RandomizedSearchCV(
    clf,
    param_test,
    n_jobs=-1,
    random_state=1,
    refit=True,
)

In [None]:
fit_params={"early_stopping_rounds":30, 
            "eval_metric" : 'auc', 
            "eval_set" : [(X_train_infersent, y_train_infersent)],
            'eval_names': ['valid'],
            'categorical_feature': 'auto',
            }
%time rs.fit(X_train_infersent, y_train_infersent, **fit_params)
print('Best score reached: {} with params: {} '.format(rs.best_score_, rs.best_params_))



[1]	valid's auc: 0.945728
Training until validation scores don't improve for 67 rounds.
[2]	valid's auc: 0.958882
[3]	valid's auc: 0.967227
[4]	valid's auc: 0.972885
[5]	valid's auc: 0.975995
[6]	valid's auc: 0.976983
[7]	valid's auc: 0.979118
[8]	valid's auc: 0.980414
[9]	valid's auc: 0.98081
[10]	valid's auc: 0.981767
[11]	valid's auc: 0.980371
[12]	valid's auc: 0.98085
[13]	valid's auc: 0.980808
[14]	valid's auc: 0.980686
[15]	valid's auc: 0.98155
[16]	valid's auc: 0.982088
[17]	valid's auc: 0.982398
[18]	valid's auc: 0.982903
[19]	valid's auc: 0.98329
[20]	valid's auc: 0.984023
[21]	valid's auc: 0.984829
[22]	valid's auc: 0.985115
[23]	valid's auc: 0.985781
[24]	valid's auc: 0.986685
[25]	valid's auc: 0.987457
[26]	valid's auc: 0.988186
[27]	valid's auc: 0.988776
[28]	valid's auc: 0.988978
[29]	valid's auc: 0.989269
[30]	valid's auc: 0.989629
[31]	valid's auc: 0.990205
[32]	valid's auc: 0.991318
[33]	valid's auc: 0.991535
[34]	valid's auc: 0.992048
[35]	valid's auc: 0.992684
[36]	v

In [None]:
# Training split: hard labels plus the second predict_proba column (the score
# show_results uses for ROC AUC). InferSent uses its own label vectors.
y_train_pred, y_train_prob = (
    rs.predict(X_train_infersent),
    rs.predict_proba(X_train_infersent)[:, 1],
)
print("Train Results:")
show_results(y_train_infersent, y_train_pred, y_train_prob)

# Test split: same procedure on the held-out frame.
y_test_pred, y_test_prob = (
    rs.predict(X_test_infersent),
    rs.predict_proba(X_test_infersent)[:, 1],
)
print("Test Results:")
show_results(y_test_infersent, y_test_pred, y_test_prob)

Train Results:
              precision    recall  f1-score   support

           0      1.000     0.989     0.994       453
           1      0.995     1.000     0.997       981

    accuracy                          0.997      1434
   macro avg      0.997     0.994     0.996      1434
weighted avg      0.997     0.997     0.997      1434

[[448   5]
 [  0 981]]
Accuracy: 0.9965132496513249
AUC_ROC: 0.9999887486976617
f1 score: 0.9974580579562786
False Postive Rate: 0.011037527593818985

Test Results:
              precision    recall  f1-score   support

           0      0.882     0.834     0.857       223
           1      0.925     0.948     0.937       484

    accuracy                          0.912       707
   macro avg      0.903     0.891     0.897       707
weighted avg      0.912     0.912     0.912       707

[[186  37]
 [ 25 459]]
Accuracy: 0.9123055162659123
AUC_ROC: 0.9511914909387391
f1 score: 0.9367346938775509
False Postive Rate: 0.16591928251121077

