In [None]:
import pandas as pd
import numpy as np
import time

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, f1_score

In [None]:
# Colab-only step: mount Google Drive at /content/gdrive, the prefix used by the
# pickle paths read further down in this notebook.
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


## Reading in data sources

### **Spam** data sets

In [None]:
## Root folder (on the mounted Google Drive) that holds every cleaned input file
## read in this cell. Change it here if the data is moved.
DATA_DIR = '/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data'


def _read_input(folder, filename):
  """Load one pickled object stored at ``DATA_DIR/<folder>/<filename>``.

  NOTE: unpickling can execute arbitrary code, so only use this on files you trust.
  """
  return pd.read_pickle('{}/{}/{}'.format(DATA_DIR, folder, filename))


## Full features
X_train_full = _read_input('Full Text SPAM', 'X_train_fSP.pkl')
X_test_full = _read_input('Full Text SPAM', 'X_test_fSP.pkl')

## Vanilla features: the full frames minus the three text columns.
X_train_v = X_train_full.drop(['text', 'cleaned_text', 'cleaned_text_full'], axis=1)
X_test_v = X_test_full.drop(['text', 'cleaned_text', 'cleaned_text_full'], axis=1)
## The scaler is fitted on the training frame only and then re-used on the test frame.
scaler = MinMaxScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train_v), index = X_train_v.index, columns = X_train_v.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test_v), index = X_test_v.index, columns = X_test_v.columns)

## target label
y_train = _read_input('Full Text SPAM', 'y_train_fSP.pkl')
y_test = _read_input('Full Text SPAM', 'y_test_fSP.pkl')

## TF-IDF with top 15 features
X_train_tfidf = _read_input('TFIDF SPAM', 'X_train_tfSP.pkl')
X_test_tfidf = _read_input('TFIDF SPAM', 'X_test_tfSP.pkl')

## TF-IDF GloVe with top 15 features
X_train_tfidf_glove = _read_input('TFIDF Glove SPAM', 'X_train_tfglSP.pkl')
X_test_tfidf_glove = _read_input('TFIDF Glove SPAM', 'X_test_tfglSP.pkl')

## TF-IDF FastText (cc) with top 15 features
X_train_tfidf_cc = _read_input('TFIDF CC SPAM', 'X_train_tfccSP.pkl')
X_test_tfidf_cc = _read_input('TFIDF CC SPAM', 'X_test_tfccSP.pkl')

## infersent (loaded with its own target pickles, kept separate from y_train / y_test)
X_train_infersent = _read_input('InferSent SPAM', 'X_train_infSP.pkl')
X_test_infersent = _read_input('InferSent SPAM', 'X_test_infSP.pkl')
y_train_infersent = _read_input('InferSent SPAM', 'y_train_infSP.pkl')
y_test_infersent = _read_input('InferSent SPAM', 'y_test_infSP.pkl')

## Random Forest Classifier


### Hyperparameter Tuning settings & evaluation function 

In [None]:
def show_results(y_actual, y_pred, y_prob):
  """Print an evaluation summary for one set of binary predictions.

  Printed in order: classification report, confusion matrix, accuracy,
  ROC AUC, F1 score and false positive rate.

  Args:
    y_actual: true labels.
    y_pred: predicted labels, scored against ``y_actual``.
    y_prob: scores handed to ``roc_auc_score`` (the cells in this notebook pass
      column 1 of ``predict_proba``).

  Returns:
    None. Everything is written to stdout.
  """
  print(classification_report(y_actual, y_pred, digits=3))
  # Build the confusion matrix once and reuse it for the printout and the FPR.
  cm = confusion_matrix(y_actual, y_pred)
  print(cm)
  print("Accuracy: " + str(accuracy_score(y_actual, y_pred)))
  print("AUC_ROC: " + str(roc_auc_score(y_actual, y_prob)))
  print("f1 score: " + str(f1_score(y_actual, y_pred)))
  # Unpacking into four cells only works for a 2x2 (two-class) matrix.
  tn, fp, fn, tp = cm.ravel()
  fpr = fp/(fp+tn)
  print("False Positive Rate: " + str(fpr) + "\n")

In [None]:
# Search space shared by every RandomizedSearchCV in this notebook.
# It spans 10 * 3 * 3 * 3 = 270 combinations; the remaining keys are pinned to one value.

# Number of trees in random forest: 10 evenly spaced values from 20 to 500, truncated to int
n_estimators = [int(x) for x in np.linspace(start = 20, stop = 500, num = 10)]
# Number of features to consider at every split.
# 'sqrt' is what 'auto' meant for RandomForestClassifier; 'auto' was deprecated in
# scikit-learn 1.1 and is rejected from 1.3 onwards, so the explicit name is used.
max_features = ['sqrt']
# Maximum number of levels in tree (None = no depth limit)
max_depth = [None, 2, 4]
# Minimum number of samples required to split a node
min_samples_split = [2, 3, 4]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 3]
# Method of selecting samples for training each tree
bootstrap = [True]
# Split quality measure
criterion = ['gini']

# Create the param grid
param_test = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'criterion' : criterion,
               'bootstrap': bootstrap}

### Training best model for scaled Vanilla features

In [None]:
clf_1 = RandomForestClassifier(random_state=1)
rs_1 = RandomizedSearchCV(estimator = clf_1, 
                            param_distributions = param_test, 
                            refit = True,
                            random_state = 1,
                            n_jobs = -1
                            )
%time rs_1.fit(X_train_scaled, y_train)
print('Best score reached: {} with params: {} '.format(rs_1.best_score_, rs_1.best_params_))

CPU times: user 5.43 s, sys: 143 ms, total: 5.57 s
Wall time: 2min 45s
Best score reached: 0.8714534275119613 with params: {'n_estimators': 126, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': None, 'criterion': 'gini', 'bootstrap': True} 


In [None]:
# Training split: hard labels plus column 1 of predict_proba as the score for ROC AUC.
y_train_pred, y_train_prob = (
    rs_1.predict(X_train_scaled),
    rs_1.predict_proba(X_train_scaled)[:, 1],
)
print("Train Results:")
show_results(y_train, y_train_pred, y_train_prob)

# Held-out split, evaluated the same way.
y_test_pred, y_test_prob = (
    rs_1.predict(X_test_scaled),
    rs_1.predict_proba(X_test_scaled)[:, 1],
)
print("Test Results:")
show_results(y_test, y_test_pred, y_test_prob)

Train Results:
              precision    recall  f1-score   support

           0      0.997     0.999     0.998     12462
           1      0.999     0.996     0.998     10059

    accuracy                          0.998     22521
   macro avg      0.998     0.998     0.998     22521
weighted avg      0.998     0.998     0.998     22521

[[12455     7]
 [   41 10018]]
Accuracy: 0.9978686559211403
AUC_ROC: 0.9999334451531343
f1 score: 0.9976100378410675
False Postive Rate: 0.0005617075910768737

Test Results:
              precision    recall  f1-score   support

           0      0.884     0.915     0.899      6138
           1      0.890     0.851     0.870      4955

    accuracy                          0.887     11093
   macro avg      0.887     0.883     0.885     11093
weighted avg      0.887     0.887     0.886     11093

[[5619  519]
 [ 739 4216]]
Accuracy: 0.8865951500946543
AUC_ROC: 0.9525352151113031
f1 score: 0.8701754385964913
False Postive Rate: 0.08455522971652003



### Training best model for Top 15 Features + TF-IDF

In [None]:
clf_2 = RandomForestClassifier(random_state=1)
rs_2 = RandomizedSearchCV(estimator = clf_2, 
                      param_distributions = param_test, 
                      refit = True,
                      random_state = 1,
                      n_jobs = -1
                      )

%time rs_2.fit(X_train_tfidf, y_train)
print('Best score reached: {} with params: {} '.format(rs_2.best_score_, rs_2.best_params_))



CPU times: user 23.3 s, sys: 468 ms, total: 23.7 s
Wall time: 10min 23s
Best score reached: 0.9571510735062482 with params: {'n_estimators': 126, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': None, 'criterion': 'gini', 'bootstrap': True} 


In [None]:
# Training split: hard labels plus column 1 of predict_proba as the score for ROC AUC.
y_train_pred, y_train_prob = (
    rs_2.predict(X_train_tfidf),
    rs_2.predict_proba(X_train_tfidf)[:, 1],
)
print("Train Results:")
show_results(y_train, y_train_pred, y_train_prob)

# Held-out split, evaluated the same way.
y_test_pred, y_test_prob = (
    rs_2.predict(X_test_tfidf),
    rs_2.predict_proba(X_test_tfidf)[:, 1],
)
print("Test Results:")
show_results(y_test, y_test_pred, y_test_prob)

Train Results:
              precision    recall  f1-score   support

           0      1.000     1.000     1.000     12462
           1      1.000     1.000     1.000     10059

    accuracy                          1.000     22521
   macro avg      1.000     1.000     1.000     22521
weighted avg      1.000     1.000     1.000     22521

[[12462     0]
 [    2 10057]]
Accuracy: 0.9999111939967141
AUC_ROC: 0.9999999960113359
f1 score: 0.9999005766553987
False Postive Rate: 0.0

Test Results:
              precision    recall  f1-score   support

           0      0.962     0.969     0.966      6138
           1      0.961     0.952     0.957      4955

    accuracy                          0.962     11093
   macro avg      0.962     0.961     0.961     11093
weighted avg      0.962     0.962     0.962     11093

[[5949  189]
 [ 236 4719]]
Accuracy: 0.9616875507076534
AUC_ROC: 0.9924488529709714
f1 score: 0.9569096623745309
False Postive Rate: 0.030791788856304986



### Training best model for Top 15 Features + TF-IDF weighted GloVe vectors

In [None]:
clf_3 = RandomForestClassifier(random_state=1)
rs_3 = RandomizedSearchCV(estimator=clf_3, 
                      param_distributions = param_test, 
                      refit = True,
                      random_state = 1,
                      n_jobs = -1
                      )

%time rs_3.fit(X_train_tfidf_glove, y_train)
print('Best score reached: {} with params: {} '.format(rs_3.best_score_, rs_3.best_params_))

CPU times: user 2min 46s, sys: 865 ms, total: 2min 47s
Wall time: 26min 59s
Best score reached: 0.9332181333943141 with params: {'n_estimators': 446, 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': None, 'criterion': 'gini', 'bootstrap': True} 


In [None]:
# Training split: hard labels plus column 1 of predict_proba as the score for ROC AUC.
y_train_pred, y_train_prob = (
    rs_3.predict(X_train_tfidf_glove),
    rs_3.predict_proba(X_train_tfidf_glove)[:, 1],
)
print("Train Results:")
show_results(y_train, y_train_pred, y_train_prob)

# Held-out split, evaluated the same way.
y_test_pred, y_test_prob = (
    rs_3.predict(X_test_tfidf_glove),
    rs_3.predict_proba(X_test_tfidf_glove)[:, 1],
)
print("Test Results:")
show_results(y_test, y_test_pred, y_test_prob)

Train Results:
              precision    recall  f1-score   support

           0      1.000     1.000     1.000     12462
           1      1.000     1.000     1.000     10059

    accuracy                          1.000     22521
   macro avg      1.000     1.000     1.000     22521
weighted avg      1.000     1.000     1.000     22521

[[12460     2]
 [    4 10055]]
Accuracy: 0.9997335819901425
AUC_ROC: 0.9999987874461556
f1 score: 0.9997017299661961
False Postive Rate: 0.00016048788316482104

Test Results:
              precision    recall  f1-score   support

           0      0.927     0.967     0.947      6138
           1      0.957     0.906     0.931      4955

    accuracy                          0.940     11093
   macro avg      0.942     0.937     0.939     11093
weighted avg      0.941     0.940     0.940     11093

[[5935  203]
 [ 464 4491]]
Accuracy: 0.9398719913458938
AUC_ROC: 0.9862349447405272
f1 score: 0.9308736656648358
False Postive Rate: 0.03307266210492017



### Training best model for Top 15 Features + TF-IDF weighted FastText

In [None]:
clf_4 = RandomForestClassifier(random_state=1)
rs_4 = RandomizedSearchCV(estimator=clf_4, 
                      param_distributions = param_test, 
                      refit = True,
                      random_state = 1,
                      n_jobs = -1
                      )

%time rs_4.fit(X_train_tfidf_cc, y_train)
print('Best score reached: {} with params: {} '.format(rs_4.best_score_, rs_4.best_params_))

CPU times: user 3min 8s, sys: 630 ms, total: 3min 8s
Wall time: 27min 17s
Best score reached: 0.9384576048322073 with params: {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': None, 'criterion': 'gini', 'bootstrap': True} 


In [None]:
# Training split: hard labels plus column 1 of predict_proba as the score for ROC AUC.
y_train_pred, y_train_prob = (
    rs_4.predict(X_train_tfidf_cc),
    rs_4.predict_proba(X_train_tfidf_cc)[:, 1],
)
print("Train Results:")
show_results(y_train, y_train_pred, y_train_prob)

# Held-out split, evaluated the same way.
y_test_pred, y_test_prob = (
    rs_4.predict(X_test_tfidf_cc),
    rs_4.predict_proba(X_test_tfidf_cc)[:, 1],
)
print("Test Results:")
show_results(y_test, y_test_pred, y_test_prob)

Train Results:
              precision    recall  f1-score   support

           0      1.000     1.000     1.000     12462
           1      1.000     1.000     1.000     10059

    accuracy                          1.000     22521
   macro avg      1.000     1.000     1.000     22521
weighted avg      1.000     1.000     1.000     22521

[[12460     2]
 [    5 10054]]
Accuracy: 0.9996891789884996
AUC_ROC: 0.9999988991287465
f1 score: 0.9996520009942828
False Postive Rate: 0.00016048788316482104

Test Results:
              precision    recall  f1-score   support

           0      0.926     0.974     0.949      6138
           1      0.966     0.903     0.934      4955

    accuracy                          0.943     11093
   macro avg      0.946     0.939     0.942     11093
weighted avg      0.944     0.943     0.942     11093

[[5980  158]
 [ 479 4476]]
Accuracy: 0.942576399531236
AUC_ROC: 0.9885850135744344
f1 score: 0.93356971529878
False Postive Rate: 0.025741283805799934



### InferSent

In [None]:
clf = RandomForestClassifier(random_state=1)
# rs = RandomizedSearchCV(estimator = clf, 
#                             param_distributions = param_test, 
#                             refit = True,
#                             random_state = 1,
#                             n_jobs = -1
#                             )
%time clf.fit(X_train_infersent, y_train_infersent)
# print('Best score reached: {} with params: {} '.format(rs.best_score_, rs.best_params_))

CPU times: user 2min 7s, sys: 183 ms, total: 2min 7s
Wall time: 2min 7s


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

In [None]:
# Training split: hard labels plus column 1 of predict_proba as the score for ROC AUC.
# The InferSent label vectors are used here, not y_train / y_test.
y_train_pred, y_train_prob = (
    clf.predict(X_train_infersent),
    clf.predict_proba(X_train_infersent)[:, 1],
)
print("Train Results:")
show_results(y_train_infersent, y_train_pred, y_train_prob)

# Held-out split, evaluated the same way.
y_test_pred, y_test_prob = (
    clf.predict(X_test_infersent),
    clf.predict_proba(X_test_infersent)[:, 1],
)
print("Test Results:")
show_results(y_test_infersent, y_test_pred, y_test_prob)

Train Results:
              precision    recall  f1-score   support

           0      1.000     1.000     1.000     10560
           1      1.000     1.000     1.000      6979

    accuracy                          1.000     17539
   macro avg      1.000     1.000     1.000     17539
weighted avg      1.000     1.000     1.000     17539

[[10560     0]
 [    0  6979]]
Accuracy: 1.0
AUC_ROC: 1.0
f1 score: 1.0
False Postive Rate: 0.0

Test Results:
              precision    recall  f1-score   support

           0      0.686     0.929     0.789      5191
           1      0.767     0.355     0.485      3422

    accuracy                          0.701      8613
   macro avg      0.726     0.642     0.637      8613
weighted avg      0.718     0.701     0.668      8613

[[4822  369]
 [2208 1214]]
Accuracy: 0.700801114594218
AUC_ROC: 0.7738267835543715
f1 score: 0.48511488511488515
False Postive Rate: 0.07108456944712002

