# **Setup**

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from matplotlib import pyplot as plt

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn import metrics

from sklearn.metrics import plot_roc_curve
from sklearn.decomposition import PCA
from matplotlib.pyplot import figure
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedShuffleSplit

import pandas as pd
import numpy as np

In [3]:
# Load the preprocessed e-mail feature table (path is relative to the notebook's working directory)
df = pd.read_csv('preprocessed_spam_ham_phishing.csv')

**Remove phishing emails, only consider ham and spam:**

In [4]:
# Rows labelled 2 are the phishing class; keep every other row
keep_rows = df['label'].ne(2)
df = df[keep_rows]
print(df.shape)

(75419, 95)


In [5]:
# Class balance of the labels that remain after the filter
df['label'].value_counts()

1    50199
0    25220
Name: label, dtype: int64

**Reduce feature set:**

In [6]:
# Columns kept for modelling: the reduced feature set plus the target column ('label', last entry)
feature_list = [
    'time_zone',
    'str_content-type_texthtml',
    'str_to_chevron',
    'domain_match_from_return-path',
    'missing_importance',
    'missing_x-mailer',
    'date_comp_date_received',
    'str_return-path_bounce',
    'missing_user-agent',
    'length_from',
    'missing_thread-index',
    'missing_mime-version',
    'domain_val_message-id',
    'str_from_question',
    'str_from_chevron',
    'domain_match_message-id_from',
    'missing_domainkey-signature',
    'missing_x-mailing-list',
    'domain_match_message-id_return-path',
    'missing_content-disposition',
    'missing_x-mailman-version',
    'domain_match_to_from',
    'missing_list-unsubscribe',
    'domain_match_errors-to_from',
    'span_time',
    'domain_match_message-id_reply-to',
    'content-length',
    'lines',
    'day_of_week',
    'missing_precedence',
    'domain_match_errors-to_message-id',
    'missing_reply-to',
    'domain_match_sender_from',
    'missing_mailing-list',
    'received_str_forged',
    'str_precedence_list',
    'domain_match_to_received',
    'missing_x-spam-status',
    'missing_content-type',
    'content-encoding-val',
    'domain_match_errors-to_reply-to',
    'missing_received-spf',
    'missing_references',
    'domain_match_to_message-id',
    'missing_x-original-to',
    'label',
]

# Restrict the frame to exactly those columns, in the listed order
df = df.loc[:, feature_list]

In [7]:
# Predictors are every column except the target; the target is the 'label' column
df_X = df.drop(columns='label')
df_Y = df['label']

In [8]:
# Sanity check: (rows, predictor columns) once the label column has been dropped
df_X.shape

(75419, 45)

In [9]:
# Remember the predictor column names; scaling returns a bare array, so they are needed to rebuild the frame
features_list = df_X.columns

**Apply a standard scaler to the full data set:**

In [10]:
from sklearn.preprocessing import StandardScaler

# NOTE(review): the scaler is fitted on the complete data set *before* the train/test split, so
# test-set means/variances influence the training features. The tuning pipelines below scale again
# on their own; any estimator used outside a pipeline relies on this global scaling. Consider
# fitting the scaler on the training split only.
scaler = StandardScaler()

# fit_transform() is fit() followed by transform() on the same data
scaled_values = scaler.fit_transform(df_X)

# Rebuild the frame with the original column names AND the original row index. Without index=...
# the frame would get a fresh 0..n-1 index while df_Y keeps the labels it had after the earlier
# row filter, so any index-aligned operation between the two would silently mismatch rows.
df_X = pd.DataFrame(scaled_values, columns=features_list, index=df_X.index)

**Split the data into training and test sets (25% train, 75% test; the training share is kept small because of hardware and time limitations):**

In [11]:
from sklearn.model_selection import train_test_split

# test_size=0.75 holds out 75% of the rows for testing and leaves 25% for training;
# random_state pins the shuffle so the same split is produced on every run.
# NOTE(review): the split is not stratified; consider stratify=df_Y given the class imbalance — confirm.
X_train, X_test, y_train, y_test = train_test_split(df_X, df_Y, test_size=0.75, random_state=42)

In [12]:
# Sanity check: size of the training split
X_train.shape

(18854, 45)

# **Hyperparameter Tuning and Testing:**

**Random Forest:**

In [16]:
%%time

pipe = Pipeline([("scale", StandardScaler()),
                ("rf", RandomForestClassifier())
                ])

param_grid_list = {'rf__n_estimators': [100, 150],
                  'rf__criterion': ['entropy', 'gini'],
                  'rf__min_samples_split': [2, 3],
                  'rf__min_samples_leaf': [1, 2],
                  'rf__max_features': ['auto', 'sqrt', 'log2']}

grid = GridSearchCV(pipe, param_grid=param_grid_list, cv=10, n_jobs=-1, verbose=10)

# Find the best hyperparameters (using 10 fold CV with the hold out fold being the validation set)
grid.fit(X_train, y_train)

# Check the hyperparameter results
rf_df = pd.DataFrame(grid.cv_results_)
print(grid.best_params_, '\n')
print(grid.best_estimator_)

# Get the best performing model
best_model = grid.best_estimator_

# Train the best model on the full training data
best_model.fit(X_train, y_train)

# Test the best performing model on the test set
predictions = best_model.predict(X_test)

# Get the evaluation metrics
print('Accuracy:', accuracy_score(y_test, predictions)*100)
print('F1 Score:', f1_score(y_test, predictions)*100)
print('Recall:', recall_score(y_test, predictions)*100)
print('Precision:', precision_score(y_test, predictions)*100)
print('ROC AUC:', roc_auc_score(y_test, predictions)*100)
print('Confusion Matrix:', confusion_matrix(y_test, predictions))

rf_df[rf_df['rank_test_score'] <= 5].head(5)

Fitting 10 folds for each of 48 candidates, totalling 480 fits
{'rf__criterion': 'gini', 'rf__max_features': 'auto', 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 2, 'rf__n_estimators': 150} 

Pipeline(steps=[('scale', StandardScaler()),
                ('rf', RandomForestClassifier(n_estimators=150))])
Accuracy: 99.68708565367277
F1 Score: 99.76523642151336
Recall: 99.84601906177821
Precision: 99.68458439355385
ROC AUC: 99.60816139881165
Confusion Matrix: [[18779   119]
 [   58 37609]]
Wall time: 2min 9s


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_rf__criterion,param_rf__max_features,param_rf__min_samples_leaf,param_rf__min_samples_split,param_rf__n_estimators,params,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
1,3.124561,0.134633,0.104022,0.011885,entropy,auto,1,2,150,"{'rf__criterion': 'entropy', 'rf__max_features...",...,0.997349,0.996817,0.999469,0.994164,0.998408,0.997347,0.995225,0.996818,0.001643,2
9,2.869559,0.361402,0.078291,0.011829,entropy,sqrt,1,2,150,"{'rf__criterion': 'entropy', 'rf__max_features...",...,0.997349,0.996286,0.999469,0.994164,0.997878,0.996817,0.995756,0.996712,0.001691,3
11,2.259803,0.067694,0.075399,0.015237,entropy,sqrt,1,3,150,"{'rf__criterion': 'entropy', 'rf__max_features...",...,0.996819,0.996817,0.999469,0.994164,0.997878,0.996817,0.995756,0.996659,0.00161,4
25,2.18779,0.013332,0.067021,0.001466,gini,auto,1,2,150,"{'rf__criterion': 'gini', 'rf__max_features': ...",...,0.996819,0.996817,0.998939,0.994695,0.998408,0.997347,0.995756,0.996924,0.001419,1
40,1.382523,0.014878,0.047826,0.000724,gini,log2,1,2,100,"{'rf__criterion': 'gini', 'rf__max_features': ...",...,0.997879,0.996286,0.999469,0.993103,0.998939,0.996817,0.995756,0.996659,0.001898,5


**MLP:**

In [17]:
%%time

pipe = Pipeline([("scale", StandardScaler()),
                ("mlp", MLPClassifier())
                ])

param_grid_list = {'mlp__hidden_layer_sizes': [(20,), (20,20), (40,), (40,40)],
                   'mlp__activation': ['tanh', 'relu'],
                   'mlp__learning_rate': ['constant', 'adaptive'],
                   'mlp__solver': ['adam', 'sgd'],
                   'mlp__alpha': [0.0001, 0.001, 0.01]}

grid = GridSearchCV(pipe, param_grid=param_grid_list, cv=10, n_jobs=-1, verbose=10)

# Find the best hyperparameters (using 10 fold CV with the hold out fold being the validation set)
grid.fit(X_train, y_train)

# Check the hyperparameter results
mlp_df = pd.DataFrame(grid.cv_results_)
print(grid.best_params_, '\n')
print(grid.best_estimator_)

# Get the best performing model
best_model = grid.best_estimator_

# Train the best model on the full training data
best_model.fit(X_train, y_train)

# Test the best performing model on the test set
predictions = best_model.predict(X_test)

# Get the evaluation metrics
print('Accuracy:', accuracy_score(y_test, predictions)*100)
print('F1 Score:', f1_score(y_test, predictions)*100)
print('Recall:', recall_score(y_test, predictions)*100)
print('Precision:', precision_score(y_test, predictions)*100)
print('ROC AUC:', roc_auc_score(y_test, predictions)*100)
print('Confusion Matrix:', confusion_matrix(y_test, predictions))

mlp_df[mlp_df['rank_test_score'] <= 5].head(5)

Fitting 10 folds for each of 96 candidates, totalling 960 fits
{'mlp__activation': 'tanh', 'mlp__alpha': 0.001, 'mlp__hidden_layer_sizes': (40,), 'mlp__learning_rate': 'adaptive', 'mlp__solver': 'adam'} 

Pipeline(steps=[('scale', StandardScaler()),
                ('mlp',
                 MLPClassifier(activation='tanh', alpha=0.001,
                               hidden_layer_sizes=(40,),
                               learning_rate='adaptive'))])
Accuracy: 99.56510209493503
F1 Score: 99.67341086506293
Recall: 99.66017999840709
Precision: 99.68664524523993
ROC AUC: 99.51788764974859
Confusion Matrix: [[18780   118]
 [  128 37539]]
Wall time: 29min 42s


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_mlp__activation,param_mlp__alpha,param_mlp__hidden_layer_sizes,param_mlp__learning_rate,param_mlp__solver,params,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
10,11.634529,1.227771,0.007679,0.000639,tanh,0.0001,"(40,)",adaptive,adam,"{'mlp__activation': 'tanh', 'mlp__alpha': 0.00...",...,0.995228,0.996817,0.997347,0.993634,0.998939,0.995756,0.995225,0.995651,0.002092,5
16,10.140037,1.263275,0.005984,0.000446,tanh,0.001,"(20,)",constant,adam,"{'mlp__activation': 'tanh', 'mlp__alpha': 0.00...",...,0.996288,0.995756,0.997347,0.995756,0.997347,0.996286,0.993634,0.99581,0.00122,4
26,10.776736,0.986898,0.007183,0.000397,tanh,0.001,"(40,)",adaptive,adam,"{'mlp__activation': 'tanh', 'mlp__alpha': 0.00...",...,0.996288,0.996817,0.998939,0.994164,0.997347,0.995225,0.996286,0.995969,0.001541,1
42,11.083675,1.37157,0.00748,0.000498,tanh,0.01,"(40,)",adaptive,adam,"{'mlp__activation': 'tanh', 'mlp__alpha': 0.01...",...,0.996819,0.996817,0.997347,0.994164,0.996817,0.996817,0.995225,0.995863,0.001569,2
78,10.014183,1.535262,0.007579,0.000914,relu,0.001,"(40, 40)",adaptive,adam,"{'mlp__activation': 'relu', 'mlp__alpha': 0.00...",...,0.996819,0.996286,0.995756,0.994164,0.997347,0.997347,0.996286,0.99581,0.001287,3


**Logistic Regression:**

In [18]:
%%time

pipe = Pipeline([("scale", StandardScaler()),
                ("lr", LogisticRegression())
                ])

param_grid_list = {'lr__max_iter': [500],
                  'lr__solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
                  'lr__fit_intercept': [True, False],
                  'lr__tol': [0.0001, 0.001],
                  'lr__penalty': ['l1', 'l2', 'elasticnet'],
                  'lr__C': [0.1, 1, 10]}

grid = GridSearchCV(pipe, param_grid=param_grid_list, cv=10, n_jobs=-1, verbose=10)

# Find the best hyperparameters (using 10 fold CV with the hold out fold being the validation set)
grid.fit(X_train, y_train)

# Check the hyperparameter results
lr_df = pd.DataFrame(grid.cv_results_)
print(grid.best_params_, '\n')
print(grid.best_estimator_)

# Get the best performing model
best_model = grid.best_estimator_

# Train the best model on the full training data
best_model.fit(X_train, y_train)

# Test the best performing model on the test set
predictions = best_model.predict(X_test)

# Get the evaluation metrics
print('Accuracy:', accuracy_score(y_test, predictions)*100)
print('F1 Score:', f1_score(y_test, predictions)*100)
print('Recall:', recall_score(y_test, predictions)*100)
print('Precision:', precision_score(y_test, predictions)*100)
print('ROC AUC:', roc_auc_score(y_test, predictions)*100)
print('Confusion Matrix:', confusion_matrix(y_test, predictions))

lr_df[lr_df['rank_test_score'] <= 5].head(5)

Fitting 10 folds for each of 144 candidates, totalling 1440 fits


 0.96027397 0.96032702 0.96106956 0.96106956 0.96106956 0.96106956
 0.96106956 0.96101654 0.96106956 0.96101654        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
 0.960274   0.96038004 0.96069832 0.96069832 0.96069832 0.96069832
 0.96069832 0.9606453  0.96069832 0.96069832        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
 0.96287308 0.9629261  0.96287302 0.96287302 0.96287302 0.96287302
 0.96287302 0.96287302 0.96287302 0.96287302        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
 0.96287308 0.9629261  0.96297909 0.96297909 0.96297909 0.96297909
 0.96297909 0.96297909 0.96297909 0.96297909        nan        nan
        nan        nan        nan        nan        nan       

{'lr__C': 10, 'lr__fit_intercept': False, 'lr__max_iter': 500, 'lr__penalty': 'l1', 'lr__solver': 'saga', 'lr__tol': 0.001} 

Pipeline(steps=[('scale', StandardScaler()),
                ('lr',
                 LogisticRegression(C=10, fit_intercept=False, max_iter=500,
                                    penalty='l1', solver='saga', tol=0.001))])
Accuracy: 96.6268894192522
F1 Score: 97.48165355577846
Recall: 98.03807045955345
Precision: 96.93151691734258
ROC AUC: 95.92611534407453
Confusion Matrix: [[17729  1169]
 [  739 36928]]
Wall time: 2min 55s


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_lr__C,param_lr__fit_intercept,param_lr__max_iter,param_lr__penalty,param_lr__solver,param_lr__tol,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
102,9.581773,2.758161,0.00369,0.000639,10,True,500,l1,saga,0.0001,...,0.960233,0.964987,0.975597,0.963395,0.95756,0.964456,0.961273,0.962979,0.004857,3
103,1.924592,0.598678,0.00369,0.000457,10,True,500,l1,saga,0.001,...,0.960233,0.964987,0.975597,0.963395,0.95756,0.964456,0.961273,0.962979,0.004857,3
104,0.780396,0.125293,0.003392,0.000489,10,True,500,l2,newton-cg,0.0001,...,0.960233,0.964987,0.975597,0.963395,0.95756,0.964456,0.961273,0.962979,0.004857,3
105,0.823314,0.103674,0.003989,0.000446,10,True,500,l2,newton-cg,0.001,...,0.960233,0.964987,0.975597,0.963395,0.95756,0.964456,0.961273,0.962979,0.004857,3
106,0.63249,0.119054,0.004289,0.000457,10,True,500,l2,lbfgs,0.0001,...,0.960233,0.964987,0.975597,0.963395,0.95756,0.964456,0.961273,0.962979,0.004857,3


**SVM:**

In [19]:
%%time

pipe = Pipeline([("scale", StandardScaler()),
                ("svc", SVC())
                ])

param_grid_list = {'svc__C': [0.1, 1, 10],
                  'svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                  'svc__degree': [3, 4, 5],
                  'svc__tol': [0.001, 0.0001, 0.01]}

grid = GridSearchCV(pipe, param_grid=param_grid_list, cv=10, n_jobs=-1, verbose=10)

# Find the best hyperparameters (using 10 fold CV with the hold out fold being the validation set)
grid.fit(X_train, y_train)

# Check the hyperparameter results
svm_df = pd.DataFrame(grid.cv_results_)
print(grid.best_params_, '\n')
print(grid.best_estimator_)

# Get the best performing model
best_model = grid.best_estimator_

# Train the best model on the full training data
best_model.fit(X_train, y_train)

# Test the best performing model on the test set
predictions = best_model.predict(X_test)

# Get the evaluation metrics
print('Accuracy:', accuracy_score(y_test, predictions)*100)
print('F1 Score:', f1_score(y_test, predictions)*100)
print('Recall:', recall_score(y_test, predictions)*100)
print('Precision:', precision_score(y_test, predictions)*100)
print('ROC AUC:', roc_auc_score(y_test, predictions)*100)
print('Confusion Matrix:', confusion_matrix(y_test, predictions))

svm_df[svm_df['rank_test_score'] <= 5].head(5)

Fitting 10 folds for each of 108 candidates, totalling 1080 fits
{'svc__C': 10, 'svc__degree': 3, 'svc__kernel': 'rbf', 'svc__tol': 0.001} 

Pipeline(steps=[('scale', StandardScaler()), ('svc', SVC(C=10))])
Accuracy: 99.48024396711747
F1 Score: 99.60949952183614
Recall: 99.54867656038442
Precision: 99.67039685282157
ROC AUC: 99.44626123500224
Confusion Matrix: [[18774   124]
 [  170 37497]]
Wall time: 36min 5s


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_svc__C,param_svc__degree,param_svc__kernel,param_svc__tol,params,split0_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
78,1.91698,0.031876,0.311781,0.015337,10,3,rbf,0.001,"{'svc__C': 10, 'svc__degree': 3, 'svc__kernel'...",0.994698,...,0.994698,0.993103,0.996817,0.994164,0.994695,0.994695,0.992042,0.99459,0.00138,1
79,1.928704,0.025686,0.305337,0.01457,10,3,rbf,0.0001,"{'svc__C': 10, 'svc__degree': 3, 'svc__kernel'...",0.994698,...,0.994698,0.993103,0.996817,0.994164,0.994695,0.994695,0.992042,0.99459,0.00138,1
80,1.849701,0.024869,0.292236,0.01013,10,3,rbf,0.01,"{'svc__C': 10, 'svc__degree': 3, 'svc__kernel'...",0.994698,...,0.994698,0.993103,0.996817,0.994164,0.994695,0.994695,0.992042,0.99459,0.00138,1
90,1.921064,0.032349,0.302699,0.01265,10,4,rbf,0.001,"{'svc__C': 10, 'svc__degree': 4, 'svc__kernel'...",0.994698,...,0.994698,0.993103,0.996817,0.994164,0.994695,0.994695,0.992042,0.99459,0.00138,1
91,1.895848,0.035626,0.29646,0.012194,10,4,rbf,0.0001,"{'svc__C': 10, 'svc__degree': 4, 'svc__kernel'...",0.994698,...,0.994698,0.993103,0.996817,0.994164,0.994695,0.994695,0.992042,0.99459,0.00138,1


**Decision Tree:**

In [20]:
%%time

pipe = Pipeline([("scale", StandardScaler()),
                ("dt", DecisionTreeClassifier())
                ])

param_grid_list = {'dt__criterion': ['entropy', 'gini'],
                  'dt__min_samples_split': [2, 3, 4],
                  'dt__min_samples_leaf': [1, 2, 3],
                  'dt__ccp_alpha': [0, 0.005, 0.01, 0.025, 0.05, 0.1]}

grid = GridSearchCV(pipe, param_grid=param_grid_list, cv=10, n_jobs=-1, verbose=10)

# Find the best hyperparameters (using 10 fold CV with the hold out fold being the validation set)
grid.fit(X_train, y_train)

# Check the hyperparameter results
dt_df = pd.DataFrame(grid.cv_results_)
print(grid.best_params_, '\n')
print(grid.best_estimator_)

# Get the best performing model
best_model = grid.best_estimator_

# Train the best model on the full training data
best_model.fit(X_train, y_train)

# Test the best performing model on the test set
predictions = best_model.predict(X_test)

# Get the evaluation metrics
print('Accuracy:', accuracy_score(y_test, predictions)*100)
print('F1 Score:', f1_score(y_test, predictions)*100)
print('Recall:', recall_score(y_test, predictions)*100)
print('Precision:', precision_score(y_test, predictions)*100)
print('ROC AUC:', roc_auc_score(y_test, predictions)*100)
print('Confusion Matrix:', confusion_matrix(y_test, predictions))

dt_df[dt_df['rank_test_score'] <= 5].head(5)

Fitting 10 folds for each of 108 candidates, totalling 1080 fits
{'dt__ccp_alpha': 0, 'dt__criterion': 'entropy', 'dt__min_samples_leaf': 2, 'dt__min_samples_split': 3} 

Pipeline(steps=[('scale', StandardScaler()),
                ('dt',
                 DecisionTreeClassifier(ccp_alpha=0, criterion='entropy',
                                        min_samples_leaf=2,
                                        min_samples_split=3))])
Accuracy: 99.34234950941395
F1 Score: 99.50524019790392
Recall: 99.31239546552686
Precision: 99.69883531888809
ROC AUC: 99.35722429642097
Confusion Matrix: [[18785   113]
 [  259 37408]]
Wall time: 18.5 s


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_dt__ccp_alpha,param_dt__criterion,param_dt__min_samples_leaf,param_dt__min_samples_split,params,split0_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.142619,0.015636,0.004489,0.001428,0,entropy,1,2,"{'dt__ccp_alpha': 0, 'dt__criterion': 'entropy...",0.992577,...,0.995228,0.994164,0.997878,0.993103,0.993634,0.993103,0.993634,0.994378,0.001504,3
1,0.126761,0.011419,0.00419,0.001073,0,entropy,1,3,"{'dt__ccp_alpha': 0, 'dt__criterion': 'entropy...",0.993107,...,0.994698,0.994695,0.997347,0.992042,0.995756,0.992042,0.993103,0.994484,0.001779,2
2,0.115989,0.00559,0.004687,0.000638,0,entropy,1,4,"{'dt__ccp_alpha': 0, 'dt__criterion': 'entropy...",0.993107,...,0.994698,0.993634,0.997347,0.993103,0.994695,0.991512,0.993634,0.994272,0.001552,5
4,0.112799,0.004813,0.004289,0.000458,0,entropy,2,3,"{'dt__ccp_alpha': 0, 'dt__criterion': 'entropy...",0.993637,...,0.994168,0.995756,0.996286,0.994695,0.994695,0.991512,0.992573,0.99459,0.00164,1
5,0.11978,0.006188,0.004489,0.000498,0,entropy,2,4,"{'dt__ccp_alpha': 0, 'dt__criterion': 'entropy...",0.993107,...,0.994168,0.994164,0.996817,0.994164,0.995225,0.990981,0.993103,0.994325,0.00161,4


**Naive Bayes (Gaussian):**

In [21]:
%%time

pipe = Pipeline([("scale", StandardScaler()),
                ("gnb", GaussianNB())
                ])

param_grid_list = {'gnb__var_smoothing': [1E-9, 1E-10, 1E-8]}

grid = GridSearchCV(pipe, param_grid=param_grid_list, cv=10, n_jobs=-1, verbose=10)

# Find the best hyperparameters (using 10 fold CV with the hold out fold being the validation set)
grid.fit(X_train, y_train)

# Check the hyperparameter results
nb_df = pd.DataFrame(grid.cv_results_)
print(grid.best_params_, '\n')
print(grid.best_estimator_)

# Get the best performing model
best_model = grid.best_estimator_

# Train the best model on the full training data
best_model.fit(X_train, y_train)

# Test the best performing model on the test set
predictions = best_model.predict(X_test)

# Get the evaluation metrics
print('Accuracy:', accuracy_score(y_test, predictions)*100)
print('F1 Score:', f1_score(y_test, predictions)*100)
print('Recall:', recall_score(y_test, predictions)*100)
print('Precision:', precision_score(y_test, predictions)*100)
print('ROC AUC:', roc_auc_score(y_test, predictions)*100)
print('Confusion Matrix:', confusion_matrix(y_test, predictions))

nb_df[nb_df['rank_test_score'] <= 5].head(5)

Fitting 10 folds for each of 3 candidates, totalling 30 fits
{'gnb__var_smoothing': 1e-10} 

Pipeline(steps=[('scale', StandardScaler()),
                ('gnb', GaussianNB(var_smoothing=1e-10))])
Accuracy: 92.15769468752762
F1 Score: 94.24180274669644
Recall: 96.3734834205007
Precision: 92.20238246425033
ROC AUC: 90.06418905917616
Confusion Matrix: [[15828  3070]
 [ 1366 36301]]
Wall time: 860 ms


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_gnb__var_smoothing,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.10143,0.007739,0.006782,0.000399,1e-09,{'gnb__var_smoothing': 1e-09},0.909862,0.930541,0.916225,0.919936,0.92679,0.931034,0.920424,0.903448,0.933687,0.915119,0.920707,0.009361,2
1,0.10442,0.007235,0.006683,0.000898,1e-10,{'gnb__var_smoothing': 1e-10},0.909862,0.932662,0.916755,0.919936,0.927321,0.930504,0.919894,0.903448,0.935279,0.915119,0.921078,0.009783,1
2,0.09335,0.005357,0.005486,0.001742,1e-08,{'gnb__var_smoothing': 1e-08},0.909862,0.930011,0.915695,0.919936,0.927851,0.929973,0.919894,0.903448,0.932095,0.91565,0.920442,0.009055,3


**AdaBoost:**

In [22]:
%%time

pipe = Pipeline([("scale", StandardScaler()),
                ("ab", AdaBoostClassifier())
                ])

param_grid_list = {'ab__n_estimators': [50, 100, 150, 200],
                  'ab__learning_rate': [0.95, 1, 1.05, 1.25, 1.5, 1.75, 2],
                  'ab__algorithm': ['SAMME', 'SAMME.R']}

grid = GridSearchCV(pipe, param_grid=param_grid_list, cv=10, n_jobs=-1, verbose=10)

# Find the best hyperparameters (using 10 fold CV with the hold out fold being the validation set)
grid.fit(X_train, y_train)

# Check the hyperparameter results
ab_df = pd.DataFrame(grid.cv_results_)
print(grid.best_params_, '\n')
print(grid.best_estimator_)

# Get the best performing model
best_model = grid.best_estimator_

# Train the best model on the full training data
best_model.fit(X_train, y_train)

# Test the best performing model on the test set
predictions = best_model.predict(X_test)

# Get the evaluation metrics
print('Accuracy:', accuracy_score(y_test, predictions)*100)
print('F1 Score:', f1_score(y_test, predictions)*100)
print('Recall:', recall_score(y_test, predictions)*100)
print('Precision:', precision_score(y_test, predictions)*100)
print('ROC AUC:', roc_auc_score(y_test, predictions)*100)
print('Confusion Matrix:', confusion_matrix(y_test, predictions))

ab_df[ab_df['rank_test_score'] <= 5].head(5)

Fitting 10 folds for each of 56 candidates, totalling 560 fits
{'ab__algorithm': 'SAMME.R', 'ab__learning_rate': 1.5, 'ab__n_estimators': 150} 

Pipeline(steps=[('scale', StandardScaler()),
                ('ab',
                 AdaBoostClassifier(learning_rate=1.5, n_estimators=150))])
Accuracy: 96.03465040219217
F1 Score: 97.05515511967127
Recall: 98.12833514747658
Precision: 96.0051948051948
ROC AUC: 94.99495390033368
Confusion Matrix: [[17360  1538]
 [  705 36962]]
Wall time: 3min 41s


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_ab__algorithm,param_ab__learning_rate,param_ab__n_estimators,params,split0_test_score,split1_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
22,3.327472,0.034215,0.079639,0.003137,SAMME,1.75,150,"{'ab__algorithm': 'SAMME', 'ab__learning_rate'...",0.956522,0.959703,...,0.945917,0.954377,0.9687,0.958621,0.954907,0.962865,0.954377,0.957092,0.005715,4
46,3.669588,0.017137,0.115292,0.002194,SAMME.R,1.5,150,"{'ab__algorithm': 'SAMME.R', 'ab__learning_rat...",0.954931,0.958643,...,0.949099,0.95756,0.967109,0.959682,0.955438,0.959682,0.954907,0.957304,0.004396,1
47,4.895139,0.026019,0.155993,0.005719,SAMME.R,1.5,200,"{'ab__algorithm': 'SAMME.R', 'ab__learning_rat...",0.95228,0.958112,...,0.947508,0.959151,0.967109,0.959151,0.952785,0.960212,0.957029,0.95688,0.005053,5
50,3.664235,0.014736,0.116688,0.002274,SAMME.R,1.75,150,"{'ab__algorithm': 'SAMME.R', 'ab__learning_rat...",0.950159,0.957582,...,0.947508,0.960212,0.967639,0.960743,0.954907,0.95756,0.961804,0.957251,0.005556,2
51,4.865057,0.024427,0.159374,0.011291,SAMME.R,1.75,200,"{'ab__algorithm': 'SAMME.R', 'ab__learning_rat...",0.95281,0.959703,...,0.95281,0.954907,0.96817,0.956499,0.953846,0.961273,0.955438,0.957145,0.004509,3


**GradientBoostingClassifier:**

In [23]:
%%time

pipe = Pipeline([("scale", StandardScaler()),
                ("gbc", GradientBoostingClassifier())
                ])

param_grid_list = {'gbc__max_features': ['auto', 'sqrt', 'log2'],
                   'gbc__learning_rate': [0.05, 0.1, 0.2, 0.25, 0.30, 0.35, 0.40, 0.5, 0.6, 0.7, 0.9],
                   'gbc__n_estimators': [100, 200]
                  }

grid = GridSearchCV(pipe, param_grid=param_grid_list, cv=10, n_jobs=-1, verbose=10)

# Find the best hyperparameters (using 10 fold CV with the hold out fold being the validation set)
grid.fit(X_train, y_train)

# Check the hyperparameter results
gb_df = pd.DataFrame(grid.cv_results_)
print(grid.best_params_, '\n')
print(grid.best_estimator_)

# Get the best performing model
best_model = grid.best_estimator_

# Train the best model on the full training data
best_model.fit(X_train, y_train)

# Test the best performing model on the test set
predictions = best_model.predict(X_test)

# Get the evaluation metrics
print('Accuracy:', accuracy_score(y_test, predictions)*100)
print('F1 Score:', f1_score(y_test, predictions)*100)
print('Recall:', recall_score(y_test, predictions)*100)
print('Precision:', precision_score(y_test, predictions)*100)
print('ROC AUC:', roc_auc_score(y_test, predictions)*100)
print('Confusion Matrix:', confusion_matrix(y_test, predictions))

gb_df[gb_df['rank_test_score'] <= 5].head(5)

Fitting 10 folds for each of 66 candidates, totalling 660 fits
{'gbc__learning_rate': 0.5, 'gbc__max_features': 'auto', 'gbc__n_estimators': 200} 

Pipeline(steps=[('scale', StandardScaler()),
                ('gbc',
                 GradientBoostingClassifier(learning_rate=0.5,
                                            max_features='auto',
                                            n_estimators=200))])
Accuracy: 99.46079731282596
F1 Score: 99.59532433759239
Recall: 99.64159609206999
Precision: 99.54909553869821
ROC AUC: 99.37101500021004
Confusion Matrix: [[18728   170]
 [  135 37532]]
Wall time: 3min 39s


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_gbc__learning_rate,param_gbc__max_features,param_gbc__n_estimators,params,split0_test_score,split1_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
37,6.871876,0.133228,0.012766,0.000747,0.4,auto,200,"{'gbc__learning_rate': 0.4, 'gbc__max_features...",0.991516,0.997349,...,0.996819,0.994695,0.998939,0.993634,0.993634,0.993634,0.993634,0.994855,0.002096,3
43,6.689923,0.052856,0.012467,0.000499,0.5,auto,200,"{'gbc__learning_rate': 0.5, 'gbc__max_features...",0.992577,0.998409,...,0.996819,0.995225,0.998939,0.994164,0.992573,0.995225,0.992573,0.995067,0.002226,1
49,6.697878,0.049405,0.012617,0.00055,0.6,auto,200,"{'gbc__learning_rate': 0.6, 'gbc__max_features...",0.992047,0.996288,...,0.993107,0.994695,0.997347,0.995756,0.996817,0.994695,0.993634,0.994908,0.001594,2
51,1.681824,0.011717,0.013664,0.001265,0.6,sqrt,200,"{'gbc__learning_rate': 0.6, 'gbc__max_features...",0.990986,0.997349,...,0.995228,0.993634,0.996817,0.993634,0.996817,0.994164,0.993103,0.994643,0.001874,5
53,1.567908,0.025413,0.012766,0.000399,0.6,log2,200,"{'gbc__learning_rate': 0.6, 'gbc__max_features...",0.989926,0.997349,...,0.993637,0.994695,0.998408,0.994695,0.995756,0.995225,0.993103,0.994696,0.002212,4


**KNN:**

In [12]:
%%time

pipe = Pipeline([("scale", StandardScaler()),
                ("knn", KNeighborsClassifier())
                ])

param_grid_list = {'knn__n_neighbors': [1, 10, 20],
                  'knn__weights': ['uniform', 'distance'],
                  'knn__p': [1, 2],
                  'knn__algorithm': ['auto'],
                  'knn__leaf_size': [15, 30, 45]}

grid = GridSearchCV(pipe, param_grid=param_grid_list, cv=10, n_jobs=-1, verbose=10)

# Find the best hyperparameters (using 10 fold CV with the hold out fold being the validation set)
grid.fit(X_train, y_train)

# Check the hyperparameter results
knn_df = pd.DataFrame(grid.cv_results_)
print(grid.best_params_, '\n')
print(grid.best_estimator_)

# Get the best performing model
best_model = grid.best_estimator_

# Train the best model on the full training data
best_model.fit(X_train, y_train)

# Test the best performing model on the test set
predictions = best_model.predict(X_test)

# Get the evaluation metrics
print('Accuracy:', accuracy_score(y_test, predictions)*100)
print('F1 Score:', f1_score(y_test, predictions)*100)
print('Recall:', recall_score(y_test, predictions)*100)
print('Precision:', precision_score(y_test, predictions)*100)
print('ROC AUC:', roc_auc_score(y_test, predictions)*100)
print('Confusion Matrix:', confusion_matrix(y_test, predictions))

knn_df[knn_df['rank_test_score'] <= 5].head(5)

Fitting 10 folds for each of 36 candidates, totalling 360 fits
{'knn__algorithm': 'auto', 'knn__leaf_size': 15, 'knn__n_neighbors': 1, 'knn__p': 1, 'knn__weights': 'uniform'} 

Pipeline(steps=[('scale', StandardScaler()),
                ('knn',
                 KNeighborsClassifier(leaf_size=15, n_neighbors=1, p=1))])
Accuracy: 99.61106691416954
F1 Score: 99.70805244439728
Recall: 99.73717046751798
Precision: 99.67895141818568
ROC AUC: 99.54844553643652
Confusion Matrix: [[18777   121]
 [   99 37568]]
Wall time: 2min 51s


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_knn__algorithm,param_knn__leaf_size,param_knn__n_neighbors,param_knn__p,param_knn__weights,params,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.063031,0.00425,3.936335,0.058645,auto,15,1,1,uniform,"{'knn__algorithm': 'auto', 'knn__leaf_size': 1...",...,0.995758,0.996817,0.998939,0.996286,0.996817,0.996817,0.995225,0.996128,0.001626,1
1,0.068716,0.007362,4.194745,0.508753,auto,15,1,1,distance,"{'knn__algorithm': 'auto', 'knn__leaf_size': 1...",...,0.995758,0.996817,0.998939,0.996286,0.996817,0.996817,0.995225,0.996128,0.001626,1
12,0.06333,0.020701,3.491888,0.088241,auto,30,1,1,uniform,"{'knn__algorithm': 'auto', 'knn__leaf_size': 3...",...,0.995758,0.996817,0.998939,0.996286,0.996817,0.996817,0.995225,0.996128,0.001626,1
13,0.052559,0.004113,3.949128,0.694975,auto,30,1,1,distance,"{'knn__algorithm': 'auto', 'knn__leaf_size': 3...",...,0.995758,0.996817,0.998939,0.996286,0.996817,0.996817,0.995225,0.996128,0.001626,1
24,0.064697,0.020816,3.513008,0.090767,auto,45,1,1,uniform,"{'knn__algorithm': 'auto', 'knn__leaf_size': 4...",...,0.995758,0.996817,0.998939,0.996286,0.996817,0.996817,0.995225,0.996128,0.001626,1


# **Stacked Testing:**

In [13]:
from sklearn.ensemble import StackingClassifier

# Each candidate base learner is declared exactly once, as (name, factory).
# A factory (rather than a shared instance) is used so that every stacked
# ensemble below receives its own freshly constructed estimator objects.
# RandomForest uses max_features='sqrt': for classifiers this is what the
# older 'auto' alias meant, and 'auto' is no longer accepted by recent
# scikit-learn releases.
learner_factories = [
    ('rf', lambda: RandomForestClassifier(criterion='entropy', max_features='sqrt', min_samples_leaf=1, min_samples_split=3, n_estimators=100)),
    ('mlp', lambda: MLPClassifier(max_iter=500, activation='relu', alpha=0.001, hidden_layer_sizes=(20,), learning_rate='adaptive', solver='adam')),
    ('knn', lambda: KNeighborsClassifier(algorithm='auto', leaf_size=15, n_neighbors=20, p=1, weights='distance')),
    ('svm', lambda: SVC(C=10, kernel='rbf', tol=0.001)),
]

# Build every 3-of-4 combination by leaving one learner out at a time.
# The omission order reproduces the evaluation order of the four sets:
#   set1 = rf + mlp + knn, set2 = rf + mlp + svm,
#   set3 = rf + knn + svm, set4 = mlp + knn + svm
base_learners = [
    [(name, build()) for name, build in learner_factories if name != omitted]
    for omitted in ('svm', 'knn', 'mlp', 'rf')
]

# Keep the individual set names available for any cell that refers to them.
base_learners_set1, base_learners_set2, base_learners_set3, base_learners_set4 = base_learners

for base_learner_group in base_learners:

    # Label the block of metrics so each result can be traced to its ensemble.
    print('Base learners:', ', '.join(name for name, estimator in base_learner_group))

    meta_learner = LogisticRegression()

    clf = StackingClassifier(estimators=base_learner_group, final_estimator=meta_learner)

    # Train the stacked model on the full training data
    clf.fit(X_train, y_train)

    predictions = clf.predict(X_test)

    # Get the evaluation metrics (reported as percentages)
    print('Accuracy:', accuracy_score(y_test, predictions)*100)
    print('F1 Score:', f1_score(y_test, predictions)*100)
    print('Recall:', recall_score(y_test, predictions)*100)
    print('Precision:', precision_score(y_test, predictions)*100)
    # NOTE(review): roc_auc_score receives hard class labels here, so the value
    # reflects a single operating point rather than the full ROC curve. Passing
    # clf.predict_proba(X_test)[:, 1] would give the conventional AUC. Left
    # unchanged so the figure stays comparable with the other model cells - confirm.
    print('ROC AUC:', roc_auc_score(y_test, predictions)*100)
    print('Confusion Matrix:', confusion_matrix(y_test, predictions))
    print('-----------------------------------------\n')

Accuracy: 99.7065323079643
F1 Score: 99.77981164610691
Recall: 99.85398359306555
Precision: 99.70574980780957
ROC AUC: 99.6333099254353
Confusion Matrix: [[18787   111]
 [   55 37612]]
-----------------------------------------

Accuracy: 99.6800141430213
F1 Score: 99.75980997120374
Recall: 99.79026734276688
Precision: 99.72937118599098
ROC AUC: 99.62526384388846
Confusion Matrix: [[18796   102]
 [   79 37588]]
-----------------------------------------

Accuracy: 99.71713957394148
F1 Score: 99.7877252102847
Recall: 99.84070937425332
Precision: 99.73479725249955
ROC AUC: 99.65577642487668
Confusion Matrix: [[18798   100]
 [   60 37607]]
-----------------------------------------

Accuracy: 99.63051356846107
F1 Score: 99.72260933041343
Recall: 99.73717046751798
Precision: 99.70805244439727
ROC AUC: 99.57754914528401
Confusion Matrix: [[18788   110]
 [   99 37568]]
-----------------------------------------

