# **Setup**

In [44]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from matplotlib import pyplot as plt

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn import metrics

from sklearn.metrics import plot_roc_curve
from sklearn.decomposition import PCA
from matplotlib.pyplot import figure
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedShuffleSplit

import pandas as pd
import numpy as np

In [45]:
df = pd.read_csv('preprocessed_spam_ham_phishing.csv')

**Remove spam emails, only consider ham and phishing:**

In [46]:
df = df[df['label'] != 1]
print(df.shape)

(26508, 95)


In [47]:
df['label'].value_counts()

0    25220
2     1288
Name: label, dtype: int64

**Randomly Sample 1288 Ham emails to create a balanced dataset:**

In [48]:
df_ham = df[df['label'] == 0].sample(1288)
df_phish = df[df['label'] == 2]

In [49]:
df_phish = df_phish.assign(label=1)

In [50]:
df_new = df_ham.append(df_phish, ignore_index=True)
df_new

Unnamed: 0,hops,missing_subject,missing_to,missing_content-type,missing_mime-version,missing_x-mailer,missing_content-transfer-encoding,missing_x-mimeole,missing_x-priority,missing_list-id,...,domain_match_errors-to_reply-to,domain_match_sender_from,domain_match_references_reply-to,domain_match_references_in-reply-to,domain_match_references_to,domain_match_from_reply-to,domain_match_to_from,domain_match_to_message-id,domain_match_to_received,label
0,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
1,2,0,0,0,0,1,0,1,1,0,...,1,1,0,0,0,1,1,1,0,0
2,2,0,0,0,0,1,0,1,1,0,...,1,0,0,1,0,0,0,0,0,0
3,1,0,0,0,0,1,0,1,1,0,...,0,0,1,1,0,0,0,0,0,0
4,1,0,0,0,0,1,0,1,1,0,...,0,0,0,1,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2571,0,0,0,0,0,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,1
2572,1,0,0,0,0,1,1,1,1,1,...,0,0,0,0,0,0,1,1,0,1
2573,0,0,0,0,0,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,1
2574,1,0,0,0,0,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,1


In [51]:
df_new = df_new.sample(frac=1)
df = df_new.reset_index(drop=True)
df

Unnamed: 0,hops,missing_subject,missing_to,missing_content-type,missing_mime-version,missing_x-mailer,missing_content-transfer-encoding,missing_x-mimeole,missing_x-priority,missing_list-id,...,domain_match_errors-to_reply-to,domain_match_sender_from,domain_match_references_reply-to,domain_match_references_in-reply-to,domain_match_references_to,domain_match_from_reply-to,domain_match_to_from,domain_match_to_message-id,domain_match_to_received,label
0,1,0,0,1,1,1,1,1,1,0,...,1,1,0,0,0,1,1,1,0,0
1,0,0,0,0,0,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,1
2,1,0,0,0,0,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,1,1,1,1,1,1,...,0,0,0,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2571,0,0,0,0,0,1,1,1,1,1,...,0,0,0,1,0,0,1,0,0,1
2572,2,0,0,0,0,1,0,1,1,0,...,0,0,0,1,1,0,0,0,0,0
2573,0,0,0,0,0,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,1
2574,1,0,0,0,0,1,0,1,1,0,...,0,0,0,1,1,0,0,0,0,0


**Reduce feature set:**

The only features that are kept are domain matching features, as these should generalize across very different email datasets without issue.

In [52]:
feature_list = [
'domain_match_from_return-path',
'domain_match_message-id_from',
'domain_match_message-id_return-path',
'domain_match_to_from',
'domain_match_errors-to_from',
'domain_match_message-id_reply-to',
'domain_match_errors-to_message-id',
'domain_match_sender_from',
'domain_match_to_received',
'domain_match_errors-to_reply-to',
'domain_match_to_message-id',
'label']

df = df[feature_list]

In [53]:
df_Y = df['label']
df_X = df.drop('label', axis=1)

In [54]:
df_X.shape

(2576, 11)

In [55]:
features_list = df_X.columns

**Apply a standard scaler to the full data set:**

In [56]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scaler.fit(df_X)
df_X = scaler.transform(df_X)
df_X = pd.DataFrame(df_X, columns=features_list)

**Breaking the data into a test and training set (20% test, 80% train)**

In [57]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df_X, df_Y, test_size=0.20, random_state=42)

In [58]:
X_train.shape

(2060, 11)

# **Hyperparameter Tuning and Testing:**

**Random Forest:**

In [59]:
%%time

pipe = Pipeline([("scale", StandardScaler()),
                ("rf", RandomForestClassifier())
                ])

param_grid_list = {'rf__n_estimators': [100, 150],
                  'rf__criterion': ['entropy', 'gini'],
                  'rf__min_samples_split': [2, 3],
                  'rf__min_samples_leaf': [1, 2],
                  'rf__max_features': ['auto', 'sqrt', 'log2']}

grid = GridSearchCV(pipe, param_grid=param_grid_list, cv=10, n_jobs=-1, verbose=10)

# Find the best hyperparameters (using 10 fold CV with the hold out fold being the validation set)
grid.fit(X_train, y_train)

# Check the hyperparameter results
rf_df = pd.DataFrame(grid.cv_results_)
print(grid.best_params_, '\n')
print(grid.best_estimator_)

# Get the best performing model
best_model = grid.best_estimator_

# Train the best model on the full training data
best_model.fit(X_train, y_train)

# Test the best performing model on the test set
predictions = best_model.predict(X_test)

# Get the evaluation metrics
print('Accuracy:', accuracy_score(y_test, predictions)*100)
print('F1 Score:', f1_score(y_test, predictions)*100)
print('Recall:', recall_score(y_test, predictions)*100)
print('Precision:', precision_score(y_test, predictions)*100)
print('ROC AUC:', roc_auc_score(y_test, predictions)*100)
print('Confusion Matrix:', confusion_matrix(y_test, predictions))

rf_df[rf_df['rank_test_score'] <= 5].head(5)

Fitting 10 folds for each of 48 candidates, totalling 480 fits
{'rf__criterion': 'entropy', 'rf__max_features': 'auto', 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 2, 'rf__n_estimators': 100} 

Pipeline(steps=[('scale', StandardScaler()),
                ('rf', RandomForestClassifier(criterion='entropy'))])
Accuracy: 97.09302325581395
F1 Score: 97.10982658959537
Recall: 95.81749049429658
Precision: 98.4375
ROC AUC: 97.1182314131562
Confusion Matrix: [[249   4]
 [ 11 252]]
Wall time: 25.9 s


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_rf__criterion,param_rf__max_features,param_rf__min_samples_leaf,param_rf__min_samples_split,param_rf__n_estimators,params,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.402732,0.023047,0.02992,0.003122,entropy,auto,1,2,100,"{'rf__criterion': 'entropy', 'rf__max_features...",...,0.966019,0.936893,0.941748,0.956311,0.946602,0.961165,0.970874,0.953883,0.01194,1
1,0.597729,0.043302,0.042985,0.005685,entropy,auto,1,2,150,"{'rf__criterion': 'entropy', 'rf__max_features...",...,0.966019,0.936893,0.941748,0.956311,0.946602,0.961165,0.970874,0.953883,0.01194,1
2,0.36672,0.008261,0.029521,0.001739,entropy,auto,1,3,100,"{'rf__criterion': 'entropy', 'rf__max_features...",...,0.966019,0.936893,0.941748,0.956311,0.946602,0.961165,0.970874,0.953883,0.01194,1
3,0.530681,0.008937,0.039494,0.001353,entropy,auto,1,3,150,"{'rf__criterion': 'entropy', 'rf__max_features...",...,0.966019,0.936893,0.941748,0.956311,0.946602,0.961165,0.970874,0.953883,0.01194,1
8,0.381294,0.009017,0.029721,0.003534,entropy,sqrt,1,2,100,"{'rf__criterion': 'entropy', 'rf__max_features...",...,0.966019,0.936893,0.941748,0.956311,0.946602,0.961165,0.970874,0.953883,0.01194,1


**MLP:**

In [60]:
%%time

pipe = Pipeline([("scale", StandardScaler()),
                ("mlp", MLPClassifier())
                ])

param_grid_list = {'mlp__hidden_layer_sizes': [(20,), (20,20), (40,), (40,40)],
                   'mlp__activation': ['tanh', 'relu'],
                   'mlp__learning_rate': ['constant', 'adaptive'],
                   'mlp__solver': ['adam', 'sgd'],
                   'mlp__alpha': [0.0001, 0.001, 0.01]}

grid = GridSearchCV(pipe, param_grid=param_grid_list, cv=10, n_jobs=-1, verbose=10)

# Find the best hyperparameters (using 10 fold CV with the hold out fold being the validation set)
grid.fit(X_train, y_train)

# Check the hyperparameter results
mlp_df = pd.DataFrame(grid.cv_results_)
print(grid.best_params_, '\n')
print(grid.best_estimator_)

# Get the best performing model
best_model = grid.best_estimator_

# Train the best model on the full training data
best_model.fit(X_train, y_train)

# Test the best performing model on the test set
predictions = best_model.predict(X_test)

# Get the evaluation metrics
print('Accuracy:', accuracy_score(y_test, predictions)*100)
print('F1 Score:', f1_score(y_test, predictions)*100)
print('Recall:', recall_score(y_test, predictions)*100)
print('Precision:', precision_score(y_test, predictions)*100)
print('ROC AUC:', roc_auc_score(y_test, predictions)*100)
print('Confusion Matrix:', confusion_matrix(y_test, predictions))

mlp_df[mlp_df['rank_test_score'] <= 5].head(5)

Fitting 10 folds for each of 96 candidates, totalling 960 fits
{'mlp__activation': 'tanh', 'mlp__alpha': 0.0001, 'mlp__hidden_layer_sizes': (20,), 'mlp__learning_rate': 'constant', 'mlp__solver': 'adam'} 

Pipeline(steps=[('scale', StandardScaler()),
                ('mlp',
                 MLPClassifier(activation='tanh', hidden_layer_sizes=(20,)))])
Accuracy: 97.09302325581395
F1 Score: 97.10982658959537
Recall: 95.81749049429658
Precision: 98.4375
ROC AUC: 97.1182314131562
Confusion Matrix: [[249   4]
 [ 11 252]]
Wall time: 2min 37s


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_mlp__activation,param_mlp__alpha,param_mlp__hidden_layer_sizes,param_mlp__learning_rate,param_mlp__solver,params,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,1.20573,0.180466,0.003391,0.001902,tanh,0.0001,"(20,)",constant,adam,"{'mlp__activation': 'tanh', 'mlp__alpha': 0.00...",...,0.966019,0.936893,0.941748,0.956311,0.946602,0.961165,0.970874,0.953883,0.01194,1
2,0.989812,0.08285,0.002992,0.000446,tanh,0.0001,"(20,)",adaptive,adam,"{'mlp__activation': 'tanh', 'mlp__alpha': 0.00...",...,0.966019,0.936893,0.941748,0.956311,0.946602,0.961165,0.970874,0.953883,0.01194,1
4,1.393986,0.280146,0.003491,0.000499,tanh,0.0001,"(20, 20)",constant,adam,"{'mlp__activation': 'tanh', 'mlp__alpha': 0.00...",...,0.966019,0.936893,0.941748,0.956311,0.946602,0.961165,0.970874,0.953883,0.01194,1
6,1.450422,0.299983,0.003391,0.000798,tanh,0.0001,"(20, 20)",adaptive,adam,"{'mlp__activation': 'tanh', 'mlp__alpha': 0.00...",...,0.966019,0.936893,0.941748,0.956311,0.946602,0.961165,0.970874,0.953883,0.01194,1
8,1.277862,0.214146,0.004189,0.00359,tanh,0.0001,"(40,)",constant,adam,"{'mlp__activation': 'tanh', 'mlp__alpha': 0.00...",...,0.966019,0.936893,0.941748,0.956311,0.946602,0.961165,0.970874,0.953883,0.01194,1


**Logistic Regression:**

In [61]:
%%time

pipe = Pipeline([("scale", StandardScaler()),
                ("lr", LogisticRegression())
                ])

param_grid_list = {'lr__max_iter': [500],
                  'lr__solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
                  'lr__fit_intercept': [True, False],
                  'lr__tol': [0.0001, 0.001],
                  'lr__penalty': ['l1', 'l2', 'elasticnet'],
                  'lr__C': [0.1, 1, 10]}

grid = GridSearchCV(pipe, param_grid=param_grid_list, cv=10, n_jobs=-1, verbose=10)

# Find the best hyperparameters (using 10 fold CV with the hold out fold being the validation set)
grid.fit(X_train, y_train)

# Check the hyperparameter results
lr_df = pd.DataFrame(grid.cv_results_)
print(grid.best_params_, '\n')
print(grid.best_estimator_)

# Get the best performing model
best_model = grid.best_estimator_

# Train the best model on the full training data
best_model.fit(X_train, y_train)

# Test the best performing model on the test set
predictions = best_model.predict(X_test)

# Get the evaluation metrics
print('Accuracy:', accuracy_score(y_test, predictions)*100)
print('F1 Score:', f1_score(y_test, predictions)*100)
print('Recall:', recall_score(y_test, predictions)*100)
print('Precision:', precision_score(y_test, predictions)*100)
print('ROC AUC:', roc_auc_score(y_test, predictions)*100)
print('Confusion Matrix:', confusion_matrix(y_test, predictions))

lr_df[lr_df['rank_test_score'] <= 5].head(5)

Fitting 10 folds for each of 144 candidates, totalling 1440 fits
{'lr__C': 1, 'lr__fit_intercept': False, 'lr__max_iter': 500, 'lr__penalty': 'l1', 'lr__solver': 'saga', 'lr__tol': 0.0001} 

Pipeline(steps=[('scale', StandardScaler()),
                ('lr',
                 LogisticRegression(C=1, fit_intercept=False, max_iter=500,
                                    penalty='l1', solver='saga'))])
Accuracy: 97.09302325581395
F1 Score: 97.10982658959537
Recall: 95.81749049429658
Precision: 98.4375
ROC AUC: 97.1182314131562
Confusion Matrix: [[249   4]
 [ 11 252]]
Wall time: 4.91 s


 0.95194175 0.95291262 0.95291262 0.95291262 0.95291262 0.95291262
 0.95291262 0.95291262 0.95291262 0.95291262        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
 0.95145631 0.95242718 0.95291262 0.95291262 0.95291262 0.95291262
 0.95291262 0.95291262 0.95291262 0.95291262        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
 0.95291262 0.95291262 0.95291262 0.95291262 0.95291262 0.95291262
 0.95291262 0.95291262 0.95291262 0.95291262        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
 0.9538835  0.9538835  0.95291262 0.95291262 0.95291262 0.95291262
 0.95291262 0.95291262 0.95291262 0.95291262        nan        nan
        nan        nan        nan        nan        nan       

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_lr__C,param_lr__fit_intercept,param_lr__max_iter,param_lr__penalty,param_lr__solver,param_lr__tol,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
78,0.029222,0.004846,0.002294,0.000457,1,False,500,l1,saga,0.0001,...,0.966019,0.936893,0.941748,0.956311,0.946602,0.961165,0.970874,0.953883,0.01194,1
79,0.019148,0.001773,0.002593,0.000489,1,False,500,l1,saga,0.001,...,0.966019,0.936893,0.941748,0.956311,0.946602,0.961165,0.970874,0.953883,0.01194,1
126,0.062832,0.01285,0.002294,0.000457,10,False,500,l1,saga,0.0001,...,0.966019,0.936893,0.941748,0.956311,0.946602,0.961165,0.970874,0.953883,0.01194,1
127,0.025133,0.003332,0.002194,0.000399,10,False,500,l1,saga,0.001,...,0.966019,0.936893,0.941748,0.956311,0.946602,0.961165,0.970874,0.953883,0.01194,1
128,0.020944,0.001411,0.002493,0.000669,10,False,500,l2,newton-cg,0.0001,...,0.966019,0.936893,0.941748,0.956311,0.946602,0.961165,0.970874,0.953883,0.01194,1


**SVM:**

In [62]:
%%time

pipe = Pipeline([("scale", StandardScaler()),
                ("svc", SVC())
                ])

param_grid_list = {'svc__C': [0.1, 1, 10],
                  'svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                  'svc__degree': [3, 4, 5],
                  'svc__tol': [0.001, 0.0001, 0.01]}

grid = GridSearchCV(pipe, param_grid=param_grid_list, cv=10, n_jobs=-1, verbose=10)

# Find the best hyperparameters (using 10 fold CV with the hold out fold being the validation set)
grid.fit(X_train, y_train)

# Check the hyperparameter results
svm_df = pd.DataFrame(grid.cv_results_)
print(grid.best_params_, '\n')
print(grid.best_estimator_)

# Get the best performing model
best_model = grid.best_estimator_

# Train the best model on the full training data
best_model.fit(X_train, y_train)

# Test the best performing model on the test set
predictions = best_model.predict(X_test)

# Get the evaluation metrics
print('Accuracy:', accuracy_score(y_test, predictions)*100)
print('F1 Score:', f1_score(y_test, predictions)*100)
print('Recall:', recall_score(y_test, predictions)*100)
print('Precision:', precision_score(y_test, predictions)*100)
print('ROC AUC:', roc_auc_score(y_test, predictions)*100)
print('Confusion Matrix:', confusion_matrix(y_test, predictions))

svm_df[svm_df['rank_test_score'] <= 5].head(5)

Fitting 10 folds for each of 108 candidates, totalling 1080 fits
{'svc__C': 0.1, 'svc__degree': 3, 'svc__kernel': 'linear', 'svc__tol': 0.001} 

Pipeline(steps=[('scale', StandardScaler()),
                ('svc', SVC(C=0.1, kernel='linear'))])
Accuracy: 97.09302325581395
F1 Score: 97.10982658959537
Recall: 95.81749049429658
Precision: 98.4375
ROC AUC: 97.1182314131562
Confusion Matrix: [[249   4]
 [ 11 252]]
Wall time: 8.33 s


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_svc__C,param_svc__degree,param_svc__kernel,param_svc__tol,params,split0_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.035006,0.003636,0.004488,0.000499,0.1,3,linear,0.001,"{'svc__C': 0.1, 'svc__degree': 3, 'svc__kernel...",0.936893,...,0.966019,0.936893,0.941748,0.956311,0.946602,0.961165,0.970874,0.953883,0.01194,1
1,0.030817,0.00225,0.004887,0.000941,0.1,3,linear,0.0001,"{'svc__C': 0.1, 'svc__degree': 3, 'svc__kernel...",0.936893,...,0.966019,0.936893,0.941748,0.956311,0.946602,0.961165,0.970874,0.953883,0.01194,1
2,0.030518,0.002102,0.004289,0.000779,0.1,3,linear,0.01,"{'svc__C': 0.1, 'svc__degree': 3, 'svc__kernel...",0.936893,...,0.966019,0.936893,0.941748,0.956311,0.946602,0.961165,0.970874,0.953883,0.01194,1
12,0.02992,0.002484,0.004488,0.000499,0.1,4,linear,0.001,"{'svc__C': 0.1, 'svc__degree': 4, 'svc__kernel...",0.936893,...,0.966019,0.936893,0.941748,0.956311,0.946602,0.961165,0.970874,0.953883,0.01194,1
13,0.030917,0.001944,0.004388,0.000489,0.1,4,linear,0.0001,"{'svc__C': 0.1, 'svc__degree': 4, 'svc__kernel...",0.936893,...,0.966019,0.936893,0.941748,0.956311,0.946602,0.961165,0.970874,0.953883,0.01194,1


**Decision Tree:**

In [63]:
%%time

pipe = Pipeline([("scale", StandardScaler()),
                ("dt", DecisionTreeClassifier())
                ])

param_grid_list = {'dt__criterion': ['entropy', 'gini'],
                  'dt__min_samples_split': [2, 3, 4],
                  'dt__min_samples_leaf': [1, 2, 3],
                  'dt__ccp_alpha': [0, 0.005, 0.01, 0.025, 0.05, 0.1]}

grid = GridSearchCV(pipe, param_grid=param_grid_list, cv=10, n_jobs=-1, verbose=10)

# Find the best hyperparameters (using 10 fold CV with the hold out fold being the validation set)
grid.fit(X_train, y_train)

# Check the hyperparameter results
dt_df = pd.DataFrame(grid.cv_results_)
print(grid.best_params_, '\n')
print(grid.best_estimator_)

# Get the best performing model
best_model = grid.best_estimator_

# Train the best model on the full training data
best_model.fit(X_train, y_train)

# Test the best performing model on the test set
predictions = best_model.predict(X_test)

# Get the evaluation metrics
print('Accuracy:', accuracy_score(y_test, predictions)*100)
print('F1 Score:', f1_score(y_test, predictions)*100)
print('Recall:', recall_score(y_test, predictions)*100)
print('Precision:', precision_score(y_test, predictions)*100)
print('ROC AUC:', roc_auc_score(y_test, predictions)*100)
print('Confusion Matrix:', confusion_matrix(y_test, predictions))

dt_df[dt_df['rank_test_score'] <= 5].head(5)

Fitting 10 folds for each of 108 candidates, totalling 1080 fits
{'dt__ccp_alpha': 0, 'dt__criterion': 'entropy', 'dt__min_samples_leaf': 1, 'dt__min_samples_split': 2} 

Pipeline(steps=[('scale', StandardScaler()),
                ('dt',
                 DecisionTreeClassifier(ccp_alpha=0, criterion='entropy'))])
Accuracy: 97.09302325581395
F1 Score: 97.10982658959537
Recall: 95.81749049429658
Precision: 98.4375
ROC AUC: 97.1182314131562
Confusion Matrix: [[249   4]
 [ 11 252]]
Wall time: 1.98 s


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_dt__ccp_alpha,param_dt__criterion,param_dt__min_samples_leaf,param_dt__min_samples_split,params,split0_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.011925,0.004128,0.003294,0.000676,0,entropy,1,2,"{'dt__ccp_alpha': 0, 'dt__criterion': 'entropy...",0.936893,...,0.966019,0.936893,0.941748,0.956311,0.946602,0.961165,0.970874,0.953883,0.01194,1
1,0.009574,0.001493,0.003391,0.000662,0,entropy,1,3,"{'dt__ccp_alpha': 0, 'dt__criterion': 'entropy...",0.936893,...,0.966019,0.936893,0.941748,0.956311,0.946602,0.961165,0.970874,0.953883,0.01194,1
2,0.008577,0.001017,0.002992,0.000446,0,entropy,1,4,"{'dt__ccp_alpha': 0, 'dt__criterion': 'entropy...",0.936893,...,0.966019,0.936893,0.941748,0.956311,0.946602,0.961165,0.970874,0.953883,0.01194,1
9,0.007679,0.000897,0.002793,0.00087,0,gini,1,2,"{'dt__ccp_alpha': 0, 'dt__criterion': 'gini', ...",0.936893,...,0.966019,0.936893,0.941748,0.956311,0.946602,0.961165,0.970874,0.953883,0.01194,1
10,0.008278,0.001002,0.002693,0.000638,0,gini,1,3,"{'dt__ccp_alpha': 0, 'dt__criterion': 'gini', ...",0.936893,...,0.966019,0.936893,0.941748,0.956311,0.946602,0.961165,0.970874,0.953883,0.01194,1


**Naive Bayes (Gaussian):**

In [64]:
%%time

pipe = Pipeline([("scale", StandardScaler()),
                ("gnb", GaussianNB())
                ])

param_grid_list = {'gnb__var_smoothing': [1E-9, 1E-10, 1E-8]}

grid = GridSearchCV(pipe, param_grid=param_grid_list, cv=10, n_jobs=-1, verbose=10)

# Find the best hyperparameters (using 10 fold CV with the hold out fold being the validation set)
grid.fit(X_train, y_train)

# Check the hyperparameter results
nb_df = pd.DataFrame(grid.cv_results_)
print(grid.best_params_, '\n')
print(grid.best_estimator_)

# Get the best performing model
best_model = grid.best_estimator_

# Train the best model on the full training data
best_model.fit(X_train, y_train)

# Test the best performing model on the test set
predictions = best_model.predict(X_test)

# Get the evaluation metrics
print('Accuracy:', accuracy_score(y_test, predictions)*100)
print('F1 Score:', f1_score(y_test, predictions)*100)
print('Recall:', recall_score(y_test, predictions)*100)
print('Precision:', precision_score(y_test, predictions)*100)
print('ROC AUC:', roc_auc_score(y_test, predictions)*100)
print('Confusion Matrix:', confusion_matrix(y_test, predictions))

nb_df[nb_df['rank_test_score'] <= 5].head(5)

Fitting 10 folds for each of 3 candidates, totalling 30 fits
{'gnb__var_smoothing': 1e-09} 

Pipeline(steps=[('scale', StandardScaler()), ('gnb', GaussianNB())])
Accuracy: 75.96899224806202
F1 Score: 80.80495356037152
Recall: 99.23954372623575
Precision: 68.1462140992167
ROC AUC: 75.50909992635899
Confusion Matrix: [[131 122]
 [  2 261]]
Wall time: 106 ms


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_gnb__var_smoothing,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.008876,0.001133,0.00379,0.002034,1e-09,{'gnb__var_smoothing': 1e-09},0.718447,0.728155,0.752427,0.703883,0.757282,0.713592,0.73301,0.694175,0.699029,0.728155,0.722816,0.020185,1
1,0.008178,0.001246,0.003192,0.000399,1e-10,{'gnb__var_smoothing': 1e-10},0.718447,0.728155,0.752427,0.703883,0.757282,0.713592,0.73301,0.694175,0.699029,0.728155,0.722816,0.020185,1
2,0.007283,0.000897,0.002194,0.00087,1e-08,{'gnb__var_smoothing': 1e-08},0.718447,0.728155,0.752427,0.703883,0.757282,0.713592,0.73301,0.694175,0.699029,0.728155,0.722816,0.020185,1


**AdaBoost:**

In [65]:
%%time

pipe = Pipeline([("scale", StandardScaler()),
                ("ab", AdaBoostClassifier())
                ])

param_grid_list = {'ab__n_estimators': [50, 100, 150, 200],
                  'ab__learning_rate': [0.95, 1, 1.05, 1.25, 1.5, 1.75, 2],
                  'ab__algorithm': ['SAMME', 'SAMME.R']}

grid = GridSearchCV(pipe, param_grid=param_grid_list, cv=10, n_jobs=-1, verbose=10)

# Find the best hyperparameters (using 10 fold CV with the hold out fold being the validation set)
grid.fit(X_train, y_train)

# Check the hyperparameter results
ab_df = pd.DataFrame(grid.cv_results_)
print(grid.best_params_, '\n')
print(grid.best_estimator_)

# Get the best performing model
best_model = grid.best_estimator_

# Train the best model on the full training data
best_model.fit(X_train, y_train)

# Test the best performing model on the test set
predictions = best_model.predict(X_test)

# Get the evaluation metrics
print('Accuracy:', accuracy_score(y_test, predictions)*100)
print('F1 Score:', f1_score(y_test, predictions)*100)
print('Recall:', recall_score(y_test, predictions)*100)
print('Precision:', precision_score(y_test, predictions)*100)
print('ROC AUC:', roc_auc_score(y_test, predictions)*100)
print('Confusion Matrix:', confusion_matrix(y_test, predictions))

ab_df[ab_df['rank_test_score'] <= 5].head(5)

Fitting 10 folds for each of 56 candidates, totalling 560 fits
{'ab__algorithm': 'SAMME', 'ab__learning_rate': 0.95, 'ab__n_estimators': 50} 

Pipeline(steps=[('scale', StandardScaler()),
                ('ab',
                 AdaBoostClassifier(algorithm='SAMME', learning_rate=0.95))])
Accuracy: 96.51162790697676
F1 Score: 96.55172413793103
Recall: 95.81749049429658
Precision: 97.2972972972973
ROC AUC: 96.52534603766212
Confusion Matrix: [[246   7]
 [ 11 252]]
Wall time: 26.3 s


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_ab__algorithm,param_ab__learning_rate,param_ab__n_estimators,params,split0_test_score,split1_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.141421,0.017238,0.011071,0.001297,SAMME,0.95,50,"{'ab__algorithm': 'SAMME', 'ab__learning_rate'...",0.932039,0.946602,...,0.961165,0.927184,0.941748,0.932039,0.932039,0.956311,0.970874,0.945146,0.013909,1
1,0.253522,0.014438,0.017254,0.001414,SAMME,0.95,100,"{'ab__algorithm': 'SAMME', 'ab__learning_rate'...",0.932039,0.946602,...,0.961165,0.927184,0.941748,0.932039,0.932039,0.956311,0.970874,0.945146,0.013909,1
2,0.393049,0.030183,0.026429,0.005375,SAMME,0.95,150,"{'ab__algorithm': 'SAMME', 'ab__learning_rate'...",0.932039,0.946602,...,0.961165,0.927184,0.941748,0.932039,0.932039,0.956311,0.970874,0.945146,0.013909,1
3,0.52551,0.024921,0.032513,0.003737,SAMME,0.95,200,"{'ab__algorithm': 'SAMME', 'ab__learning_rate'...",0.932039,0.946602,...,0.961165,0.927184,0.941748,0.932039,0.932039,0.956311,0.970874,0.945146,0.013909,1
4,0.143325,0.020553,0.010724,0.001501,SAMME,1.0,50,"{'ab__algorithm': 'SAMME', 'ab__learning_rate'...",0.932039,0.946602,...,0.961165,0.927184,0.941748,0.932039,0.932039,0.956311,0.970874,0.945146,0.013909,1


**GradientBoostingClassifier:**

In [66]:
%%time

pipe = Pipeline([("scale", StandardScaler()),
                ("gbc", GradientBoostingClassifier())
                ])

param_grid_list = {'gbc__max_features': ['auto', 'sqrt', 'log2'],
                   'gbc__learning_rate': [0.05, 0.1, 0.2, 0.25, 0.30, 0.35, 0.40, 0.5, 0.6, 0.7, 0.9],
                   'gbc__n_estimators': [100, 200]
                  }

grid = GridSearchCV(pipe, param_grid=param_grid_list, cv=10, n_jobs=-1, verbose=10)

# Find the best hyperparameters (using 10 fold CV with the hold out fold being the validation set)
grid.fit(X_train, y_train)

# Check the hyperparameter results
gb_df = pd.DataFrame(grid.cv_results_)
print(grid.best_params_, '\n')
print(grid.best_estimator_)

# Get the best performing model
best_model = grid.best_estimator_

# Train the best model on the full training data
best_model.fit(X_train, y_train)

# Test the best performing model on the test set
predictions = best_model.predict(X_test)

# Get the evaluation metrics
print('Accuracy:', accuracy_score(y_test, predictions)*100)
print('F1 Score:', f1_score(y_test, predictions)*100)
print('Recall:', recall_score(y_test, predictions)*100)
print('Precision:', precision_score(y_test, predictions)*100)
print('ROC AUC:', roc_auc_score(y_test, predictions)*100)
print('Confusion Matrix:', confusion_matrix(y_test, predictions))

gb_df[gb_df['rank_test_score'] <= 5].head(5)

Fitting 10 folds for each of 66 candidates, totalling 660 fits
{'gbc__learning_rate': 0.05, 'gbc__max_features': 'auto', 'gbc__n_estimators': 200} 

Pipeline(steps=[('scale', StandardScaler()),
                ('gbc',
                 GradientBoostingClassifier(learning_rate=0.05,
                                            max_features='auto',
                                            n_estimators=200))])
Accuracy: 97.09302325581395
F1 Score: 97.10982658959537
Recall: 95.81749049429658
Precision: 98.4375
ROC AUC: 97.1182314131562
Confusion Matrix: [[249   4]
 [ 11 252]]
Wall time: 19.8 s


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_gbc__learning_rate,param_gbc__max_features,param_gbc__n_estimators,params,split0_test_score,split1_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
1,0.348276,0.015002,0.004089,0.000299,0.05,auto,200,"{'gbc__learning_rate': 0.05, 'gbc__max_feature...",0.936893,0.956311,...,0.966019,0.936893,0.941748,0.956311,0.946602,0.961165,0.970874,0.953883,0.01194,1
3,0.283695,0.017854,0.004388,0.000489,0.05,sqrt,200,"{'gbc__learning_rate': 0.05, 'gbc__max_feature...",0.936893,0.956311,...,0.966019,0.936893,0.941748,0.956311,0.946602,0.961165,0.970874,0.953883,0.01194,1
5,0.28055,0.023355,0.003889,0.000537,0.05,log2,200,"{'gbc__learning_rate': 0.05, 'gbc__max_feature...",0.936893,0.956311,...,0.966019,0.936893,0.941748,0.956311,0.946602,0.961165,0.970874,0.953883,0.01194,1
6,0.197671,0.02578,0.003491,0.000499,0.1,auto,100,"{'gbc__learning_rate': 0.1, 'gbc__max_features...",0.936893,0.956311,...,0.966019,0.936893,0.941748,0.956311,0.946602,0.961165,0.970874,0.953883,0.01194,1
7,0.376899,0.021774,0.004488,0.000804,0.1,auto,200,"{'gbc__learning_rate': 0.1, 'gbc__max_features...",0.936893,0.956311,...,0.966019,0.936893,0.941748,0.956311,0.946602,0.961165,0.970874,0.953883,0.01194,1


**KNN:**

In [67]:
%%time

pipe = Pipeline([("scale", StandardScaler()),
                ("knn", KNeighborsClassifier())
                ])

param_grid_list = {'knn__n_neighbors': [1, 10, 20],
                  'knn__weights': ['uniform', 'distance'],
                  'knn__p': [1, 2],
                  'knn__algorithm': ['auto'],
                  'knn__leaf_size': [15, 30, 45]}

grid = GridSearchCV(pipe, param_grid=param_grid_list, cv=10, n_jobs=-1, verbose=10)

# Find the best hyperparameters (using 10 fold CV with the hold out fold being the validation set)
grid.fit(X_train, y_train)

# Check the hyperparameter results
knn_df = pd.DataFrame(grid.cv_results_)
print(grid.best_params_, '\n')
print(grid.best_estimator_)

# Get the best performing model
best_model = grid.best_estimator_

# Train the best model on the full training data
best_model.fit(X_train, y_train)

# Test the best performing model on the test set
predictions = best_model.predict(X_test)

# Get the evaluation metrics
print('Accuracy:', accuracy_score(y_test, predictions)*100)
print('F1 Score:', f1_score(y_test, predictions)*100)
print('Recall:', recall_score(y_test, predictions)*100)
print('Precision:', precision_score(y_test, predictions)*100)
print('ROC AUC:', roc_auc_score(y_test, predictions)*100)
print('Confusion Matrix:', confusion_matrix(y_test, predictions))

knn_df[knn_df['rank_test_score'] <= 5].head(5)

Fitting 10 folds for each of 36 candidates, totalling 360 fits
{'knn__algorithm': 'auto', 'knn__leaf_size': 15, 'knn__n_neighbors': 10, 'knn__p': 1, 'knn__weights': 'distance'} 

Pipeline(steps=[('scale', StandardScaler()),
                ('knn',
                 KNeighborsClassifier(leaf_size=15, n_neighbors=10, p=1,
                                      weights='distance'))])
Accuracy: 97.09302325581395
F1 Score: 97.10982658959537
Recall: 95.81749049429658
Precision: 98.4375
ROC AUC: 97.1182314131562
Confusion Matrix: [[249   4]
 [ 11 252]]
Wall time: 1.8 s


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_knn__algorithm,param_knn__leaf_size,param_knn__n_neighbors,param_knn__p,param_knn__weights,params,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
5,0.017054,0.000828,0.014562,0.000662,auto,15,10,1,distance,"{'knn__algorithm': 'auto', 'knn__leaf_size': 1...",...,0.966019,0.936893,0.941748,0.951456,0.946602,0.961165,0.970874,0.953398,0.01193,1
7,0.017752,0.001246,0.013963,0.000446,auto,15,10,2,distance,"{'knn__algorithm': 'auto', 'knn__leaf_size': 1...",...,0.966019,0.936893,0.941748,0.951456,0.946602,0.961165,0.970874,0.953398,0.01193,1
9,0.018051,0.001574,0.01516,0.000598,auto,15,20,1,distance,"{'knn__algorithm': 'auto', 'knn__leaf_size': 1...",...,0.966019,0.936893,0.941748,0.951456,0.946602,0.961165,0.970874,0.953398,0.01193,1
11,0.017453,0.000499,0.01496,0.001093,auto,15,20,2,distance,"{'knn__algorithm': 'auto', 'knn__leaf_size': 1...",...,0.966019,0.936893,0.941748,0.951456,0.946602,0.961165,0.970874,0.953398,0.01193,1
17,0.016456,0.000669,0.013564,0.001277,auto,30,10,1,distance,"{'knn__algorithm': 'auto', 'knn__leaf_size': 3...",...,0.966019,0.936893,0.941748,0.951456,0.946602,0.961165,0.970874,0.953398,0.01193,1
