In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, f1_score

In [None]:
# Mount Google Drive inside the Colab VM; the pickles read later in this notebook live under /content/gdrive.
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


## Reading in data sources

### **Scam** data sets

In [None]:
## Root folder (on the mounted Drive) that holds every cleaned SCAM input set.
## Kept in one place so the notebook can be pointed at another location with a single edit.
DATA_DIR = '/content/gdrive/My Drive/BT4222/Codes/Cleaned Input Data'

## NOTE(review): pd.read_pickle can execute arbitrary code while unpickling -- only load files you trust.

## Full features
X_train_full = pd.read_pickle(f'{DATA_DIR}/Full Text SCAM/X_train_fSC.pkl')
X_test_full = pd.read_pickle(f'{DATA_DIR}/Full Text SCAM/X_test_fSC.pkl')

## Vanilla features: the full feature set minus the raw / cleaned text columns
TEXT_COLUMNS = ['text', 'cleaned_text', 'cleaned_text_full']
X_train_v = X_train_full.drop(columns=TEXT_COLUMNS)
X_test_v = X_test_full.drop(columns=TEXT_COLUMNS)

## Min-max scaling: the scaler is fitted on the training split only and then reused
## unchanged on the test split; index and column labels are restored afterwards.
scaler = MinMaxScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train_v), index=X_train_v.index, columns=X_train_v.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test_v), index=X_test_v.index, columns=X_test_v.columns)

## target label
y_train = pd.read_pickle(f'{DATA_DIR}/Full Text SCAM/y_train_fSC.pkl')
y_test = pd.read_pickle(f'{DATA_DIR}/Full Text SCAM/y_test_fSC.pkl')

## TF-IDF with top 15 features
X_train_tfidf = pd.read_pickle(f'{DATA_DIR}/TFIDF SCAM/X_train_tfSC.pkl')
X_test_tfidf = pd.read_pickle(f'{DATA_DIR}/TFIDF SCAM/X_test_tfSC.pkl')

## TF-IDF GloVe with top 15 features
X_train_tfidf_glove = pd.read_pickle(f'{DATA_DIR}/TFIDF Glove SCAM/X_train_tfglSC.pkl')
X_test_tfidf_glove = pd.read_pickle(f'{DATA_DIR}/TFIDF Glove SCAM/X_test_tfglSC.pkl')

## TF-IDF FastText (cc) with top 15 features
X_train_tfidf_cc = pd.read_pickle(f'{DATA_DIR}/TFIDF CC SCAM/X_train_tfccSC.pkl')
X_test_tfidf_cc = pd.read_pickle(f'{DATA_DIR}/TFIDF CC SCAM/X_test_tfccSC.pkl')

## InferSent: loaded together with its own label files, which the InferSent model uses
## instead of y_train / y_test.
X_train_infersent = pd.read_pickle(f'{DATA_DIR}/InferSent SCAM/X_train_infSC.pkl')
X_test_infersent = pd.read_pickle(f'{DATA_DIR}/InferSent SCAM/X_test_infSC.pkl')
y_train_infersent = pd.read_pickle(f'{DATA_DIR}/InferSent SCAM/y_train_infSC.pkl')
y_test_infersent = pd.read_pickle(f'{DATA_DIR}/InferSent SCAM/y_test_infSC.pkl')

## Random Forest Classifier


### Hyperparameter Tuning

In [None]:
def show_results(y_actual, y_pred, y_prob):
  """Print a fixed set of binary-classification metrics for one data split.

  Args:
    y_actual: true labels.
    y_pred: predicted labels for the same rows.
    y_prob: scores handed to ``roc_auc_score``; callers in this notebook pass
      column 1 of ``predict_proba``.

  Prints, in order: the classification report (3 decimals), the confusion
  matrix, accuracy, ROC AUC, F1 score and the false positive rate.
  Returns nothing.
  """
  print(classification_report(y_actual, y_pred, digits=3))
  # Build the confusion matrix once and reuse it for both the printout and the FPR.
  cm = confusion_matrix(y_actual, y_pred)
  print(cm)
  print("Accuracy: " + str(accuracy_score(y_actual, y_pred)))
  print("AUC_ROC: " + str(roc_auc_score(y_actual, y_prob)))
  print("f1 score: " + str(f1_score(y_actual, y_pred)))
  # Unpacking four cells assumes a 2x2 (two-class) matrix; ravel() yields tn, fp, fn, tp.
  tn, fp, fn, tp = cm.ravel()
  fpr = fp/(fp+tn)
  print("False Positive Rate: " + str(fpr) + "\n")

In [None]:
# Number of trees in random forest: 10 evenly spaced values from 20 to 500, truncated to int
n_estimators = [int(x) for x in np.linspace(start = 20, stop = 500, num = 10)]
# Number of features to consider at every split.
# 'sqrt' is what the former 'auto' option meant for RandomForestClassifier; 'auto' was
# deprecated in scikit-learn 1.1 and removed in 1.3, where it is rejected as an invalid value.
max_features = ['sqrt']
# Maximum number of levels in tree (None = no depth limit)
max_depth = [None, 2, 4]
# Minimum number of samples required to split a node
min_samples_split = [2, 3, 4]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 3]
# Method of selecting samples for training each tree
bootstrap = [True]
# Split-quality measure
criterion = ['gini']

# Create the param grid sampled by every RandomizedSearchCV in this notebook
param_test = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'criterion' : criterion,
               'bootstrap': bootstrap}

### Training best model for scaled vanilla features

In [None]:
clf_1 = RandomForestClassifier(random_state=1)
rs_1 = RandomizedSearchCV(estimator = clf_1, 
                            param_distributions = param_test, 
                            refit = True,
                            random_state = 1,
                            n_jobs = -1
                            )
%time rs_1.fit(X_train_scaled, y_train)
print('Best score reached: {} with params: {} '.format(rs_1.best_score_, rs_1.best_params_))

CPU times: user 825 ms, sys: 42.2 ms, total: 867 ms
Wall time: 24.5 s
Best score reached: 0.9335092061837763 with params: {'n_estimators': 126, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': None, 'criterion': 'gini', 'bootstrap': True} 


In [None]:
# Evaluate the refitted best model on the data it was trained on.
# Column 1 of predict_proba is the score for the second class in classes_
# (class 1 for the 0/1 labels shown in the printed report).
y_train_pred, y_train_prob = (
    rs_1.predict(X_train_scaled),
    rs_1.predict_proba(X_train_scaled)[:, 1],
)
print("Train Results:")
show_results(y_train, y_train_pred, y_train_prob)

# Same evaluation on the held-out test split.
y_test_pred, y_test_prob = (
    rs_1.predict(X_test_scaled),
    rs_1.predict_proba(X_test_scaled)[:, 1],
)
print("Test Results:")
show_results(y_test, y_test_pred, y_test_prob)

Train Results:
              precision    recall  f1-score   support

           0      1.000     1.000     1.000       538
           1      1.000     1.000     1.000       981

    accuracy                          1.000      1519
   macro avg      1.000     1.000     1.000      1519
weighted avg      1.000     1.000     1.000      1519

[[538   0]
 [  0 981]]
Accuracy: 1.0
AUC_ROC: 1.0
f1 score: 1.0
False Postive Rate: 0.0

Test Results:
              precision    recall  f1-score   support

           0      0.924     0.917     0.920       265
           1      0.955     0.959     0.957       484

    accuracy                          0.944       749
   macro avg      0.939     0.938     0.939       749
weighted avg      0.944     0.944     0.944       749

[[243  22]
 [ 20 464]]
Accuracy: 0.9439252336448598
AUC_ROC: 0.9817012318727585
f1 score: 0.9567010309278351
False Postive Rate: 0.0830188679245283



### Training best model for Top 15 Features + TF-IDF

In [None]:
clf_2 = RandomForestClassifier(random_state=1)
rs_2 = RandomizedSearchCV(estimator = clf_2, 
                      param_distributions = param_test, 
                      refit = True,
                      random_state = 1,
                      n_jobs = -1
                      )

%time rs_2.fit(X_train_tfidf, y_train)
print('Best score reached: {} with params: {} '.format(rs_2.best_score_, rs_2.best_params_))

CPU times: user 1.31 s, sys: 56.1 ms, total: 1.37 s
Wall time: 28.7 s
Best score reached: 0.9828773666840368 with params: {'n_estimators': 126, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': None, 'criterion': 'gini', 'bootstrap': True} 


In [None]:
# Evaluate the refitted best TF-IDF model on the data it was trained on.
# Column 1 of predict_proba is the score for the second class in classes_.
y_train_pred, y_train_prob = (
    rs_2.predict(X_train_tfidf),
    rs_2.predict_proba(X_train_tfidf)[:, 1],
)
print("Train Results:")
show_results(y_train, y_train_pred, y_train_prob)

# Same evaluation on the held-out test split.
y_test_pred, y_test_prob = (
    rs_2.predict(X_test_tfidf),
    rs_2.predict_proba(X_test_tfidf)[:, 1],
)
print("Test Results:")
show_results(y_test, y_test_pred, y_test_prob)

Train Results:
              precision    recall  f1-score   support

           0      1.000     1.000     1.000       538
           1      1.000     1.000     1.000       981

    accuracy                          1.000      1519
   macro avg      1.000     1.000     1.000      1519
weighted avg      1.000     1.000     1.000      1519

[[538   0]
 [  0 981]]
Accuracy: 1.0
AUC_ROC: 1.0
f1 score: 1.0
False Postive Rate: 0.0

Test Results:
              precision    recall  f1-score   support

           0      0.959     0.981     0.970       265
           1      0.990     0.977     0.983       484

    accuracy                          0.979       749
   macro avg      0.974     0.979     0.977       749
weighted avg      0.979     0.979     0.979       749

[[260   5]
 [ 11 473]]
Accuracy: 0.9786381842456608
AUC_ROC: 0.9973803212225169
f1 score: 0.9833679833679835
False Postive Rate: 0.018867924528301886



### Training best model for Top 15 Features + TF-IDF weighted GloVe vectors

In [None]:
clf_3 = RandomForestClassifier(random_state=1)
rs_3 = RandomizedSearchCV(estimator=clf_3, 
                      param_distributions = param_test, 
                      refit = True,
                      random_state = 1,
                      n_jobs = -1
                      )

%time rs_3.fit(X_train_tfidf_glove, y_train)
print('Best score reached: {} with params: {} '.format(rs_3.best_score_, rs_3.best_params_))

CPU times: user 1.82 s, sys: 67.9 ms, total: 1.89 s
Wall time: 1min 8s
Best score reached: 0.9815615772103528 with params: {'n_estimators': 73, 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': None, 'criterion': 'gini', 'bootstrap': True} 


In [None]:
# Evaluate the refitted best GloVe model on the data it was trained on.
# Column 1 of predict_proba is the score for the second class in classes_.
y_train_pred, y_train_prob = (
    rs_3.predict(X_train_tfidf_glove),
    rs_3.predict_proba(X_train_tfidf_glove)[:, 1],
)
print("Train Results:")
show_results(y_train, y_train_pred, y_train_prob)

# Same evaluation on the held-out test split.
y_test_pred, y_test_prob = (
    rs_3.predict(X_test_tfidf_glove),
    rs_3.predict_proba(X_test_tfidf_glove)[:, 1],
)
print("Test Results:")
show_results(y_test, y_test_pred, y_test_prob)

Train Results:
              precision    recall  f1-score   support

           0      1.000     1.000     1.000       538
           1      1.000     1.000     1.000       981

    accuracy                          1.000      1519
   macro avg      1.000     1.000     1.000      1519
weighted avg      1.000     1.000     1.000      1519

[[538   0]
 [  0 981]]
Accuracy: 1.0
AUC_ROC: 1.0
f1 score: 1.0
False Postive Rate: 0.0

Test Results:
              precision    recall  f1-score   support

           0      0.963     0.974     0.968       265
           1      0.985     0.979     0.982       484

    accuracy                          0.977       749
   macro avg      0.974     0.976     0.975       749
weighted avg      0.977     0.977     0.977       749

[[258   7]
 [ 10 474]]
Accuracy: 0.9773030707610146
AUC_ROC: 0.997282862934664
f1 score: 0.9823834196891191
False Postive Rate: 0.026415094339622643



### Training best model for Top 15 Features + TF-IDF weighted FastText

In [None]:
clf_4 = RandomForestClassifier(random_state=1)
rs_4 = RandomizedSearchCV(estimator=clf_4, 
                      param_distributions = param_test, 
                      refit = True,
                      random_state = 1,
                      n_jobs = -1
                      )

%time rs_4.fit(X_train_tfidf_cc, y_train)
print('Best score reached: {} with params: {} '.format(rs_4.best_score_, rs_4.best_params_))

CPU times: user 7.01 s, sys: 55.7 ms, total: 7.06 s
Wall time: 1min 12s
Best score reached: 0.9848553934340802 with params: {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': None, 'criterion': 'gini', 'bootstrap': True} 


In [None]:
# Evaluate the refitted best FastText model on the data it was trained on.
# Column 1 of predict_proba is the score for the second class in classes_.
y_train_pred, y_train_prob = (
    rs_4.predict(X_train_tfidf_cc),
    rs_4.predict_proba(X_train_tfidf_cc)[:, 1],
)
print("Train Results:")
show_results(y_train, y_train_pred, y_train_prob)

# Same evaluation on the held-out test split.
y_test_pred, y_test_prob = (
    rs_4.predict(X_test_tfidf_cc),
    rs_4.predict_proba(X_test_tfidf_cc)[:, 1],
)
print("Test Results:")
show_results(y_test, y_test_pred, y_test_prob)

Train Results:
              precision    recall  f1-score   support

           0      1.000     1.000     1.000       538
           1      1.000     1.000     1.000       981

    accuracy                          1.000      1519
   macro avg      1.000     1.000     1.000      1519
weighted avg      1.000     1.000     1.000      1519

[[538   0]
 [  0 981]]
Accuracy: 1.0
AUC_ROC: 1.0
f1 score: 1.0
False Postive Rate: 0.0

Test Results:
              precision    recall  f1-score   support

           0      0.959     0.981     0.970       265
           1      0.990     0.977     0.983       484

    accuracy                          0.979       749
   macro avg      0.974     0.979     0.977       749
weighted avg      0.979     0.979     0.979       749

[[260   5]
 [ 11 473]]
Accuracy: 0.9786381842456608
AUC_ROC: 0.9977467643848433
f1 score: 0.9833679833679835
False Postive Rate: 0.018867924528301886



### InferSent


In [None]:
clf = RandomForestClassifier(random_state=1)
rs = RandomizedSearchCV(estimator = clf, 
                            param_distributions = param_test, 
                            refit = True,
                            random_state = 1,
                            n_jobs = -1
                            )
%time rs.fit(X_train_infersent, y_train_infersent)
print('Best score reached: {} with params: {} '.format(rs.best_score_, rs.best_params_))

CPU times: user 30.6 s, sys: 209 ms, total: 30.8 s
Wall time: 4min 58s
Best score reached: 0.7573231646402379 with params: {'n_estimators': 446, 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': None, 'criterion': 'gini', 'bootstrap': True} 


In [None]:
# Evaluate the refitted best InferSent model on the data it was trained on,
# scored against the InferSent-specific labels.
# Column 1 of predict_proba is the score for the second class in classes_.
y_train_pred, y_train_prob = (
    rs.predict(X_train_infersent),
    rs.predict_proba(X_train_infersent)[:, 1],
)
print("Train Results:")
show_results(y_train_infersent, y_train_pred, y_train_prob)

# Same evaluation on the held-out InferSent test split.
y_test_pred, y_test_prob = (
    rs.predict(X_test_infersent),
    rs.predict_proba(X_test_infersent)[:, 1],
)
print("Test Results:")
show_results(y_test_infersent, y_test_pred, y_test_prob)

Train Results:
              precision    recall  f1-score   support

           0      1.000     0.991     0.996       453
           1      0.996     1.000     0.998       981

    accuracy                          0.997      1434
   macro avg      0.998     0.996     0.997      1434
weighted avg      0.997     0.997     0.997      1434

[[449   4]
 [  0 981]]
Accuracy: 0.99721059972106
AUC_ROC: 1.0
f1 score: 0.9979654120040692
False Postive Rate: 0.008830022075055188

Test Results:
              precision    recall  f1-score   support

           0      0.732     0.269     0.393       223
           1      0.739     0.955     0.833       484

    accuracy                          0.738       707
   macro avg      0.735     0.612     0.613       707
weighted avg      0.737     0.738     0.694       707

[[ 60 163]
 [ 22 462]]
Accuracy: 0.7383309759547383
AUC_ROC: 0.854514879739095
f1 score: 0.8331830477908025
False Postive Rate: 0.7309417040358744

