# **Setup**

In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from matplotlib import pyplot as plt

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn import metrics

from sklearn.metrics import plot_roc_curve
from sklearn.decomposition import PCA
from matplotlib.pyplot import figure
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedShuffleSplit

import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('preprocessed_spam_ham_phishing.csv')

In [3]:
df.columns

Index(['hops', 'missing_subject', 'missing_to', 'missing_content-type',
       'missing_mime-version', 'missing_x-mailer',
       'missing_content-transfer-encoding', 'missing_x-mimeole',
       'missing_x-priority', 'missing_list-id', 'missing_lines',
       'missing_x-virus-scanned', 'missing_status', 'missing_content-length',
       'missing_precedence', 'missing_delivered-to',
       'missing_list-unsubscribe', 'missing_list-subscribe',
       'missing_list-post', 'missing_list-help', 'missing_x-msmail-priority',
       'missing_x-spam-status', 'missing_sender', 'missing_errors-to',
       'missing_x-beenthere', 'missing_list-archive', 'missing_reply-to',
       'missing_x-mailman-version', 'missing_x-miltered', 'missing_x-uuid',
       'missing_x-virus-status', 'missing_x-spam-level',
       'missing_x-spam-checker-version', 'missing_references',
       'missing_in-reply-to', 'missing_user-agent', 'missing_thread-index',
       'missing_cc', 'missing_received-spf', 'missing_x-orig

**Remove spam emails, only consider ham and phishing:**

In [4]:
df = df[df['label'] != 1]
print(df.shape)

(26508, 95)


In [5]:
df['label'].value_counts()

0    25220
2     1288
Name: label, dtype: int64

**Randomly Sample 1288 Ham emails to create a balanced dataset:**

In [6]:
df_ham = df[df['label'] == 0].sample(1288)
df_phish = df[df['label'] == 2]

In [7]:
df_phish = df_phish.assign(label=1)

In [8]:
df_new = df_ham.append(df_phish, ignore_index=True)
df_new

Unnamed: 0,hops,missing_subject,missing_to,missing_content-type,missing_mime-version,missing_x-mailer,missing_content-transfer-encoding,missing_x-mimeole,missing_x-priority,missing_list-id,...,domain_match_errors-to_reply-to,domain_match_sender_from,domain_match_references_reply-to,domain_match_references_in-reply-to,domain_match_references_to,domain_match_from_reply-to,domain_match_to_from,domain_match_to_message-id,domain_match_to_received,label
0,1,0,0,0,0,1,0,1,1,0,...,0,0,1,1,0,0,0,0,0,0
1,1,0,0,0,1,1,0,1,1,0,...,1,1,0,0,0,1,1,1,0,0
2,1,0,0,0,0,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,1,1,0,1,1,0,...,1,1,0,0,0,1,1,1,0,0
4,2,0,0,0,0,1,0,1,1,0,...,0,1,0,1,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2571,0,0,0,0,0,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,1
2572,1,0,0,0,0,1,1,1,1,1,...,0,0,0,0,0,0,1,1,0,1
2573,0,0,0,0,0,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,1
2574,1,0,0,0,0,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,1


In [9]:
df_new = df_new.sample(frac=1)
df = df_new.reset_index(drop=True)
df

Unnamed: 0,hops,missing_subject,missing_to,missing_content-type,missing_mime-version,missing_x-mailer,missing_content-transfer-encoding,missing_x-mimeole,missing_x-priority,missing_list-id,...,domain_match_errors-to_reply-to,domain_match_sender_from,domain_match_references_reply-to,domain_match_references_in-reply-to,domain_match_references_to,domain_match_from_reply-to,domain_match_to_from,domain_match_to_message-id,domain_match_to_received,label
0,1,0,0,0,0,1,0,1,1,0,...,0,0,0,1,1,0,0,0,0,0
1,0,0,0,0,0,1,1,1,1,1,...,0,0,0,0,0,0,1,0,0,1
2,0,0,0,0,0,1,1,1,1,1,...,0,0,0,0,0,0,1,0,0,1
3,2,0,0,0,0,0,0,1,1,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2571,2,0,0,1,1,1,1,1,1,0,...,1,1,0,0,0,1,1,0,0,0
2572,1,0,0,0,1,1,0,1,1,0,...,1,1,0,0,0,1,1,1,0,0
2573,0,0,1,0,0,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,1
2574,1,0,0,0,0,0,0,1,1,1,...,0,0,0,0,0,1,0,0,0,1


**Reduce feature set:**

The only features that are kept are domain matching features, as these should generalize across very different email datasets without issue.

In [10]:
feature_list = [
'domain_match_from_return-path',
'domain_match_message-id_from',
'domain_match_message-id_return-path',
'domain_match_to_from',
'domain_match_errors-to_from',
'domain_match_message-id_reply-to',
'domain_match_errors-to_message-id',
'domain_match_sender_from',
'domain_match_to_received',
'domain_match_errors-to_reply-to',
'domain_match_to_message-id',
'label']

feature_list = ['domain_val_message-id',
       'domain_match_message-id_from', 'domain_match_from_return-path',
       'domain_match_message-id_return-path', 'domain_match_message-id_sender',
       'domain_match_message-id_reply-to', 'domain_match_return-path_reply-to',
       'domain_match_reply-to_to', 'domain_match_to_in-reply-to',
       'domain_match_errors-to_message-id', 'domain_match_errors-to_from',
       'domain_match_errors-to_sender', 'domain_match_errors-to_reply-to',
       'domain_match_sender_from', 'domain_match_references_reply-to',
       'domain_match_references_in-reply-to', 'domain_match_references_to',
       'domain_match_from_reply-to', 'domain_match_to_from',
       'domain_match_to_message-id', 'domain_match_to_received', 'label']

df = df[feature_list]

In [11]:
df_Y = df['label']
df_X = df.drop('label', axis=1)

In [12]:
df_X.shape

(2576, 21)

In [13]:
features_list = df_X.columns

**Apply a standard scaler to the full data set:**

In [14]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scaler.fit(df_X)
df_X = scaler.transform(df_X)
df_X = pd.DataFrame(df_X, columns=features_list)

**Breaking the data into a test and training set (20% test, 80% train)**

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df_X, df_Y, test_size=0.20, random_state=42)

In [16]:
X_train.shape

(2060, 21)

In [26]:
X_test.shape

(516, 21)

# **Hyperparameter Tuning and Testing:**

**Random Forest:**

In [17]:
%%time

pipe = Pipeline([("scale", StandardScaler()),
                ("rf", RandomForestClassifier())
                ])

param_grid_list = {'rf__n_estimators': [100, 150],
                  'rf__criterion': ['entropy', 'gini'],
                  'rf__min_samples_split': [2, 3],
                  'rf__min_samples_leaf': [1, 2],
                  'rf__max_features': ['auto', 'sqrt', 'log2']}

grid = GridSearchCV(pipe, param_grid=param_grid_list, cv=10, n_jobs=-1, verbose=10)

# Find the best hyperparameters (using 10 fold CV with the hold out fold being the validation set)
grid.fit(X_train, y_train)

# Check the hyperparameter results
rf_df = pd.DataFrame(grid.cv_results_)
print(grid.best_params_, '\n')
print(grid.best_estimator_)

# Get the best performing model
best_model = grid.best_estimator_

# Train the best model on the full training data
best_model.fit(X_train, y_train)

# Test the best performing model on the test set
predictions = best_model.predict(X_test)

# Get the evaluation metrics
print('Accuracy:', accuracy_score(y_test, predictions)*100)
print('F1 Score:', f1_score(y_test, predictions)*100)
print('Recall:', recall_score(y_test, predictions)*100)
print('Precision:', precision_score(y_test, predictions)*100)
print('ROC AUC:', roc_auc_score(y_test, predictions)*100)
print('Confusion Matrix:', confusion_matrix(y_test, predictions))

rf_df[rf_df['rank_test_score'] <= 5].head(5)

Fitting 10 folds for each of 48 candidates, totalling 480 fits
{'rf__criterion': 'entropy', 'rf__max_features': 'auto', 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 3, 'rf__n_estimators': 100} 

Pipeline(steps=[('scale', StandardScaler()),
                ('rf',
                 RandomForestClassifier(criterion='entropy',
                                        min_samples_split=3))])
Accuracy: 97.67441860465115
F1 Score: 97.52066115702479
Recall: 99.15966386554622
Precision: 95.9349593495935
ROC AUC: 97.78127078169398
Confusion Matrix: [[268  10]
 [  2 236]]
Wall time: 25.8 s


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_rf__criterion,param_rf__max_features,param_rf__min_samples_leaf,param_rf__min_samples_split,param_rf__n_estimators,params,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
2,0.267684,0.009121,0.019049,0.000537,entropy,auto,1,3,100,"{'rf__criterion': 'entropy', 'rf__max_features...",...,0.966019,0.966019,0.966019,0.985437,0.980583,0.970874,0.980583,0.974272,0.008697,1
11,0.382776,0.00821,0.02643,0.001624,entropy,sqrt,1,3,150,"{'rf__criterion': 'entropy', 'rf__max_features...",...,0.966019,0.966019,0.966019,0.985437,0.980583,0.970874,0.980583,0.974272,0.008697,1
19,0.477125,0.022752,0.031516,0.002491,entropy,log2,1,3,150,"{'rf__criterion': 'entropy', 'rf__max_features...",...,0.966019,0.966019,0.966019,0.985437,0.980583,0.970874,0.980583,0.974272,0.008697,1
24,0.344479,0.023661,0.021842,0.00137,gini,auto,1,2,100,"{'rf__criterion': 'gini', 'rf__max_features': ...",...,0.966019,0.966019,0.966019,0.985437,0.980583,0.970874,0.980583,0.974272,0.008697,1
25,0.498907,0.041224,0.030619,0.002565,gini,auto,1,2,150,"{'rf__criterion': 'gini', 'rf__max_features': ...",...,0.966019,0.966019,0.966019,0.985437,0.980583,0.970874,0.980583,0.974272,0.008697,1


**MLP:**

In [18]:
%%time

pipe = Pipeline([("scale", StandardScaler()),
                ("mlp", MLPClassifier())
                ])

param_grid_list = {'mlp__hidden_layer_sizes': [(20,), (20,20), (40,), (40,40)],
                   'mlp__activation': ['tanh', 'relu'],
                   'mlp__learning_rate': ['constant', 'adaptive'],
                   'mlp__solver': ['adam', 'sgd'],
                   'mlp__alpha': [0.0001, 0.001, 0.01]}

grid = GridSearchCV(pipe, param_grid=param_grid_list, cv=10, n_jobs=-1, verbose=10)

# Find the best hyperparameters (using 10 fold CV with the hold out fold being the validation set)
grid.fit(X_train, y_train)

# Check the hyperparameter results
mlp_df = pd.DataFrame(grid.cv_results_)
print(grid.best_params_, '\n')
print(grid.best_estimator_)

# Get the best performing model
best_model = grid.best_estimator_

# Train the best model on the full training data
best_model.fit(X_train, y_train)

# Test the best performing model on the test set
predictions = best_model.predict(X_test)

# Get the evaluation metrics
print('Accuracy:', accuracy_score(y_test, predictions)*100)
print('F1 Score:', f1_score(y_test, predictions)*100)
print('Recall:', recall_score(y_test, predictions)*100)
print('Precision:', precision_score(y_test, predictions)*100)
print('ROC AUC:', roc_auc_score(y_test, predictions)*100)
print('Confusion Matrix:', confusion_matrix(y_test, predictions))

mlp_df[mlp_df['rank_test_score'] <= 5].head(5)

Fitting 10 folds for each of 96 candidates, totalling 960 fits
{'mlp__activation': 'relu', 'mlp__alpha': 0.001, 'mlp__hidden_layer_sizes': (20,), 'mlp__learning_rate': 'adaptive', 'mlp__solver': 'adam'} 

Pipeline(steps=[('scale', StandardScaler()),
                ('mlp',
                 MLPClassifier(alpha=0.001, hidden_layer_sizes=(20,),
                               learning_rate='adaptive'))])
Accuracy: 97.86821705426357
F1 Score: 97.72256728778468
Recall: 99.15966386554622
Precision: 96.3265306122449
ROC AUC: 97.96112689680189
Confusion Matrix: [[269   9]
 [  2 236]]
Wall time: 3min 16s


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_mlp__activation,param_mlp__alpha,param_mlp__hidden_layer_sizes,param_mlp__learning_rate,param_mlp__solver,params,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
2,1.224298,0.098342,0.002892,0.000299,tanh,0.0001,"(20,)",adaptive,adam,"{'mlp__activation': 'tanh', 'mlp__alpha': 0.00...",...,0.966019,0.966019,0.966019,0.985437,0.985437,0.970874,0.980583,0.975243,0.008805,2
12,1.653705,0.236727,0.005115,0.003364,tanh,0.0001,"(40, 40)",constant,adam,"{'mlp__activation': 'tanh', 'mlp__alpha': 0.00...",...,0.966019,0.966019,0.966019,0.985437,0.985437,0.970874,0.980583,0.975243,0.008805,2
28,1.654091,0.175986,0.004088,0.001511,tanh,0.001,"(40, 40)",constant,adam,"{'mlp__activation': 'tanh', 'mlp__alpha': 0.00...",...,0.966019,0.966019,0.966019,0.985437,0.985437,0.970874,0.980583,0.975243,0.008805,2
48,1.254272,0.140579,0.003491,0.000669,relu,0.0001,"(20,)",constant,adam,"{'mlp__activation': 'relu', 'mlp__alpha': 0.00...",...,0.966019,0.966019,0.966019,0.985437,0.985437,0.970874,0.980583,0.975243,0.008805,2
50,1.252115,0.118412,0.00369,0.000638,relu,0.0001,"(20,)",adaptive,adam,"{'mlp__activation': 'relu', 'mlp__alpha': 0.00...",...,0.966019,0.966019,0.966019,0.985437,0.985437,0.970874,0.980583,0.975243,0.008805,2


**Logistic Regression:**

In [19]:
%%time

pipe = Pipeline([("scale", StandardScaler()),
                ("lr", LogisticRegression())
                ])

param_grid_list = {'lr__max_iter': [500],
                  'lr__solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
                  'lr__fit_intercept': [True, False],
                  'lr__tol': [0.0001, 0.001],
                  'lr__penalty': ['l1', 'l2', 'elasticnet'],
                  'lr__C': [0.1, 1, 10]}

grid = GridSearchCV(pipe, param_grid=param_grid_list, cv=10, n_jobs=-1, verbose=10)

# Find the best hyperparameters (using 10 fold CV with the hold out fold being the validation set)
grid.fit(X_train, y_train)

# Check the hyperparameter results
lr_df = pd.DataFrame(grid.cv_results_)
print(grid.best_params_, '\n')
print(grid.best_estimator_)

# Get the best performing model
best_model = grid.best_estimator_

# Train the best model on the full training data
best_model.fit(X_train, y_train)

# Test the best performing model on the test set
predictions = best_model.predict(X_test)

# Get the evaluation metrics
print('Accuracy:', accuracy_score(y_test, predictions)*100)
print('F1 Score:', f1_score(y_test, predictions)*100)
print('Recall:', recall_score(y_test, predictions)*100)
print('Precision:', precision_score(y_test, predictions)*100)
print('ROC AUC:', roc_auc_score(y_test, predictions)*100)
print('Confusion Matrix:', confusion_matrix(y_test, predictions))

lr_df[lr_df['rank_test_score'] <= 5].head(5)

Fitting 10 folds for each of 144 candidates, totalling 1440 fits
{'lr__C': 10, 'lr__fit_intercept': True, 'lr__max_iter': 500, 'lr__penalty': 'l2', 'lr__solver': 'newton-cg', 'lr__tol': 0.0001} 

Pipeline(steps=[('scale', StandardScaler()),
                ('lr',
                 LogisticRegression(C=10, max_iter=500, solver='newton-cg'))])
Accuracy: 97.48062015503875
F1 Score: 97.31958762886597
Recall: 99.15966386554622
Precision: 95.54655870445345
ROC AUC: 97.60141466658607
Confusion Matrix: [[267  11]
 [  2 236]]
Wall time: 9.29 s


 0.96990291 0.96990291 0.96893204 0.96893204 0.96893204 0.96893204
 0.96893204 0.96893204 0.96893204 0.96893204        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
 0.96990291 0.96990291 0.96796117 0.96796117 0.96796117 0.96796117
 0.96796117 0.96796117 0.96796117 0.96796117        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
 0.96990291 0.96990291 0.97087379 0.97087379 0.97087379 0.97087379
 0.97087379 0.97184466 0.97184466 0.97184466        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
 0.96990291 0.96990291 0.96990291 0.96990291 0.96990291 0.96990291
 0.96990291 0.96990291 0.96990291 0.96990291        nan        nan
        nan        nan        nan        nan        nan       

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_lr__C,param_lr__fit_intercept,param_lr__max_iter,param_lr__penalty,param_lr__solver,param_lr__tol,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
104,0.031117,0.001323,0.002493,0.0004987481,10,True,500,l2,newton-cg,0.0001,...,0.966019,0.966019,0.961165,0.980583,0.980583,0.970874,0.975728,0.972816,0.008464,1
105,0.030219,0.001184,0.002693,0.0004570956,10,True,500,l2,newton-cg,0.001,...,0.966019,0.966019,0.961165,0.980583,0.980583,0.970874,0.975728,0.972816,0.008464,1
106,0.035904,0.001892,0.002593,0.0004884807,10,True,500,l2,lbfgs,0.0001,...,0.966019,0.966019,0.961165,0.980583,0.980583,0.970874,0.975728,0.972816,0.008464,1
107,0.032613,0.003398,0.002094,0.0002992552,10,True,500,l2,lbfgs,0.001,...,0.966019,0.966019,0.961165,0.980583,0.980583,0.970874,0.975728,0.972816,0.008464,1
108,0.245944,0.010753,0.001995,3.693565e-07,10,True,500,l2,sag,0.0001,...,0.966019,0.966019,0.961165,0.980583,0.980583,0.970874,0.975728,0.972816,0.008464,1


**SVM:**

In [20]:
%%time

pipe = Pipeline([("scale", StandardScaler()),
                ("svc", SVC())
                ])

param_grid_list = {'svc__C': [0.1, 1, 10],
                  'svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                  'svc__degree': [3, 4, 5],
                  'svc__tol': [0.001, 0.0001, 0.01]}

grid = GridSearchCV(pipe, param_grid=param_grid_list, cv=10, n_jobs=-1, verbose=10)

# Find the best hyperparameters (using 10 fold CV with the hold out fold being the validation set)
grid.fit(X_train, y_train)

# Check the hyperparameter results
svm_df = pd.DataFrame(grid.cv_results_)
print(grid.best_params_, '\n')
print(grid.best_estimator_)

# Get the best performing model
best_model = grid.best_estimator_

# Train the best model on the full training data
best_model.fit(X_train, y_train)

# Test the best performing model on the test set
predictions = best_model.predict(X_test)

# Get the evaluation metrics
print('Accuracy:', accuracy_score(y_test, predictions)*100)
print('F1 Score:', f1_score(y_test, predictions)*100)
print('Recall:', recall_score(y_test, predictions)*100)
print('Precision:', precision_score(y_test, predictions)*100)
print('ROC AUC:', roc_auc_score(y_test, predictions)*100)
print('Confusion Matrix:', confusion_matrix(y_test, predictions))

svm_df[svm_df['rank_test_score'] <= 5].head(5)

Fitting 10 folds for each of 108 candidates, totalling 1080 fits
{'svc__C': 10, 'svc__degree': 3, 'svc__kernel': 'rbf', 'svc__tol': 0.001} 

Pipeline(steps=[('scale', StandardScaler()), ('svc', SVC(C=10))])
Accuracy: 97.86821705426357
F1 Score: 97.72256728778468
Recall: 99.15966386554622
Precision: 96.3265306122449
ROC AUC: 97.96112689680189
Confusion Matrix: [[269   9]
 [  2 236]]
Wall time: 8.87 s


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_svc__C,param_svc__degree,param_svc__kernel,param_svc__tol,params,split0_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
78,0.028424,0.000804,0.008178,0.000399,10,3,rbf,0.001,"{'svc__C': 10, 'svc__degree': 3, 'svc__kernel'...",0.970874,...,0.966019,0.966019,0.966019,0.985437,0.985437,0.970874,0.975728,0.975243,0.008533,1
79,0.029421,0.000804,0.009375,0.002608,10,3,rbf,0.0001,"{'svc__C': 10, 'svc__degree': 3, 'svc__kernel'...",0.970874,...,0.966019,0.966019,0.966019,0.985437,0.985437,0.970874,0.975728,0.975243,0.008533,1
80,0.028424,0.00363,0.008079,0.000537,10,3,rbf,0.01,"{'svc__C': 10, 'svc__degree': 3, 'svc__kernel'...",0.970874,...,0.966019,0.966019,0.966019,0.985437,0.985437,0.970874,0.975728,0.975243,0.008533,1
90,0.027825,0.000829,0.008478,0.00092,10,4,rbf,0.001,"{'svc__C': 10, 'svc__degree': 4, 'svc__kernel'...",0.970874,...,0.966019,0.966019,0.966019,0.985437,0.985437,0.970874,0.975728,0.975243,0.008533,1
91,0.030617,0.001672,0.008079,0.000829,10,4,rbf,0.0001,"{'svc__C': 10, 'svc__degree': 4, 'svc__kernel'...",0.970874,...,0.966019,0.966019,0.966019,0.985437,0.985437,0.970874,0.975728,0.975243,0.008533,1


**Decision Tree:**

In [21]:
%%time

pipe = Pipeline([("scale", StandardScaler()),
                ("dt", DecisionTreeClassifier())
                ])

param_grid_list = {'dt__criterion': ['entropy', 'gini'],
                  'dt__min_samples_split': [2, 3, 4],
                  'dt__min_samples_leaf': [1, 2, 3],
                  'dt__ccp_alpha': [0, 0.005, 0.01, 0.025, 0.05, 0.1]}

grid = GridSearchCV(pipe, param_grid=param_grid_list, cv=10, n_jobs=-1, verbose=10)

# Find the best hyperparameters (using 10 fold CV with the hold out fold being the validation set)
grid.fit(X_train, y_train)

# Check the hyperparameter results
dt_df = pd.DataFrame(grid.cv_results_)
print(grid.best_params_, '\n')
print(grid.best_estimator_)

# Get the best performing model
best_model = grid.best_estimator_

# Train the best model on the full training data
best_model.fit(X_train, y_train)

# Test the best performing model on the test set
predictions = best_model.predict(X_test)

# Get the evaluation metrics
print('Accuracy:', accuracy_score(y_test, predictions)*100)
print('F1 Score:', f1_score(y_test, predictions)*100)
print('Recall:', recall_score(y_test, predictions)*100)
print('Precision:', precision_score(y_test, predictions)*100)
print('ROC AUC:', roc_auc_score(y_test, predictions)*100)
print('Confusion Matrix:', confusion_matrix(y_test, predictions))

dt_df[dt_df['rank_test_score'] <= 5].head(5)

Fitting 10 folds for each of 108 candidates, totalling 1080 fits
{'dt__ccp_alpha': 0, 'dt__criterion': 'entropy', 'dt__min_samples_leaf': 1, 'dt__min_samples_split': 2} 

Pipeline(steps=[('scale', StandardScaler()),
                ('dt',
                 DecisionTreeClassifier(ccp_alpha=0, criterion='entropy'))])
Accuracy: 97.48062015503875
F1 Score: 97.31958762886597
Recall: 99.15966386554622
Precision: 95.54655870445345
ROC AUC: 97.60141466658607
Confusion Matrix: [[267  11]
 [  2 236]]
Wall time: 1.93 s


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_dt__ccp_alpha,param_dt__criterion,param_dt__min_samples_leaf,param_dt__min_samples_split,params,split0_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.010372,0.001738,0.00359,0.000798,0,entropy,1,2,"{'dt__ccp_alpha': 0, 'dt__criterion': 'entropy...",0.970874,...,0.956311,0.966019,0.966019,0.985437,0.980583,0.970874,0.975728,0.973301,0.009525,1
1,0.011269,0.001097,0.002792,0.000598,0,entropy,1,3,"{'dt__ccp_alpha': 0, 'dt__criterion': 'entropy...",0.970874,...,0.941748,0.966019,0.966019,0.985437,0.980583,0.970874,0.975728,0.971845,0.012621,3
2,0.009474,0.000804,0.002992,0.000446,0,entropy,1,4,"{'dt__ccp_alpha': 0, 'dt__criterion': 'entropy...",0.970874,...,0.941748,0.966019,0.966019,0.985437,0.980583,0.970874,0.975728,0.971845,0.012621,3
9,0.009076,0.000299,0.002693,0.000457,0,gini,1,2,"{'dt__ccp_alpha': 0, 'dt__criterion': 'gini', ...",0.970874,...,0.956311,0.966019,0.966019,0.985437,0.980583,0.970874,0.975728,0.972816,0.009757,2
10,0.009075,0.001041,0.002693,0.000638,0,gini,1,3,"{'dt__ccp_alpha': 0, 'dt__criterion': 'gini', ...",0.970874,...,0.941748,0.966019,0.966019,0.985437,0.980583,0.970874,0.975728,0.971359,0.012742,5


**Naive Bayes (Gaussian):**

In [22]:
%%time

pipe = Pipeline([("scale", StandardScaler()),
                ("gnb", GaussianNB())
                ])

param_grid_list = {'gnb__var_smoothing': [1E-9, 1E-10, 1E-8]}

grid = GridSearchCV(pipe, param_grid=param_grid_list, cv=10, n_jobs=-1, verbose=10)

# Find the best hyperparameters (using 10 fold CV with the hold out fold being the validation set)
grid.fit(X_train, y_train)

# Check the hyperparameter results
nb_df = pd.DataFrame(grid.cv_results_)
print(grid.best_params_, '\n')
print(grid.best_estimator_)

# Get the best performing model
best_model = grid.best_estimator_

# Train the best model on the full training data
best_model.fit(X_train, y_train)

# Test the best performing model on the test set
predictions = best_model.predict(X_test)

# Get the evaluation metrics
print('Accuracy:', accuracy_score(y_test, predictions)*100)
print('F1 Score:', f1_score(y_test, predictions)*100)
print('Recall:', recall_score(y_test, predictions)*100)
print('Precision:', precision_score(y_test, predictions)*100)
print('ROC AUC:', roc_auc_score(y_test, predictions)*100)
print('Confusion Matrix:', confusion_matrix(y_test, predictions))

nb_df[nb_df['rank_test_score'] <= 5].head(5)

Fitting 10 folds for each of 3 candidates, totalling 30 fits
{'gnb__var_smoothing': 1e-10} 

Pipeline(steps=[('scale', StandardScaler()),
                ('gnb', GaussianNB(var_smoothing=1e-10))])
Accuracy: 90.31007751937985
F1 Score: 90.49429657794677
Recall: 100.0
Precision: 82.63888888888889
ROC AUC: 91.00719424460432
Confusion Matrix: [[228  50]
 [  0 238]]
Wall time: 119 ms


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_gnb__var_smoothing,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.01137,0.003128,0.00379,0.001245,1e-09,{'gnb__var_smoothing': 1e-09},0.868932,0.878641,0.883495,0.849515,0.898058,0.873786,0.868932,0.893204,0.912621,0.902913,0.88301,0.017961,2
1,0.01117,0.001164,0.00389,0.001509,1e-10,{'gnb__var_smoothing': 1e-10},0.868932,0.878641,0.883495,0.849515,0.898058,0.873786,0.868932,0.907767,0.917476,0.902913,0.884951,0.020021,1
2,0.009873,0.002294,0.002992,0.000631,1e-08,{'gnb__var_smoothing': 1e-08},0.868932,0.873786,0.883495,0.84466,0.898058,0.878641,0.864078,0.893204,0.917476,0.902913,0.882524,0.020109,3


**AdaBoost:**

In [23]:
%%time

pipe = Pipeline([("scale", StandardScaler()),
                ("ab", AdaBoostClassifier())
                ])

param_grid_list = {'ab__n_estimators': [50, 100, 150, 200],
                  'ab__learning_rate': [0.95, 1, 1.05, 1.25, 1.5, 1.75, 2],
                  'ab__algorithm': ['SAMME', 'SAMME.R']}

grid = GridSearchCV(pipe, param_grid=param_grid_list, cv=10, n_jobs=-1, verbose=10)

# Find the best hyperparameters (using 10 fold CV with the hold out fold being the validation set)
grid.fit(X_train, y_train)

# Check the hyperparameter results
ab_df = pd.DataFrame(grid.cv_results_)
print(grid.best_params_, '\n')
print(grid.best_estimator_)

# Get the best performing model
best_model = grid.best_estimator_

# Train the best model on the full training data
best_model.fit(X_train, y_train)

# Test the best performing model on the test set
predictions = best_model.predict(X_test)

# Get the evaluation metrics
print('Accuracy:', accuracy_score(y_test, predictions)*100)
print('F1 Score:', f1_score(y_test, predictions)*100)
print('Recall:', recall_score(y_test, predictions)*100)
print('Precision:', precision_score(y_test, predictions)*100)
print('ROC AUC:', roc_auc_score(y_test, predictions)*100)
print('Confusion Matrix:', confusion_matrix(y_test, predictions))

ab_df[ab_df['rank_test_score'] <= 5].head(5)

Fitting 10 folds for each of 56 candidates, totalling 560 fits
{'ab__algorithm': 'SAMME.R', 'ab__learning_rate': 1, 'ab__n_estimators': 100} 

Pipeline(steps=[('scale', StandardScaler()),
                ('ab', AdaBoostClassifier(learning_rate=1, n_estimators=100))])
Accuracy: 97.67441860465115
F1 Score: 97.52066115702479
Recall: 99.15966386554622
Precision: 95.9349593495935
ROC AUC: 97.78127078169398
Confusion Matrix: [[268  10]
 [  2 236]]
Wall time: 29 s


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_ab__algorithm,param_ab__learning_rate,param_ab__n_estimators,params,split0_test_score,split1_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
3,0.536565,0.015295,0.031915,0.002523,SAMME,0.95,200,"{'ab__algorithm': 'SAMME', 'ab__learning_rate'...",0.970874,0.985437,...,0.956311,0.966019,0.961165,0.975728,0.966019,0.970874,0.975728,0.969417,0.007842,4
11,0.5236,0.004967,0.031615,0.000779,SAMME,1.05,200,"{'ab__algorithm': 'SAMME', 'ab__learning_rate'...",0.970874,0.985437,...,0.961165,0.966019,0.961165,0.975728,0.966019,0.966019,0.975728,0.969417,0.007217,4
29,0.315914,0.021721,0.02873,0.005474,SAMME.R,0.95,100,"{'ab__algorithm': 'SAMME.R', 'ab__learning_rat...",0.966019,0.980583,...,0.951456,0.966019,0.961165,0.975728,0.970874,0.970874,0.970874,0.969417,0.008422,4
30,0.506499,0.037865,0.041389,0.005768,SAMME.R,0.95,150,"{'ab__algorithm': 'SAMME.R', 'ab__learning_rat...",0.966019,0.980583,...,0.951456,0.966019,0.961165,0.980583,0.970874,0.966019,0.970874,0.969417,0.008964,4
31,0.659536,0.035394,0.050166,0.00232,SAMME.R,0.95,200,"{'ab__algorithm': 'SAMME.R', 'ab__learning_rat...",0.966019,0.980583,...,0.951456,0.966019,0.961165,0.980583,0.970874,0.966019,0.970874,0.969417,0.008964,4


**GradientBoostingClassifier:**

In [24]:
%%time

pipe = Pipeline([("scale", StandardScaler()),
                ("gbc", GradientBoostingClassifier())
                ])

param_grid_list = {'gbc__max_features': ['auto', 'sqrt', 'log2'],
                   'gbc__learning_rate': [0.05, 0.1, 0.2, 0.25, 0.30, 0.35, 0.40, 0.5, 0.6, 0.7, 0.9],
                   'gbc__n_estimators': [100, 200]
                  }

grid = GridSearchCV(pipe, param_grid=param_grid_list, cv=10, n_jobs=-1, verbose=10)

# Find the best hyperparameters (using 10 fold CV with the hold out fold being the validation set)
grid.fit(X_train, y_train)

# Check the hyperparameter results
gb_df = pd.DataFrame(grid.cv_results_)
print(grid.best_params_, '\n')
print(grid.best_estimator_)

# Get the best performing model
best_model = grid.best_estimator_

# Train the best model on the full training data
best_model.fit(X_train, y_train)

# Test the best performing model on the test set
predictions = best_model.predict(X_test)

# Get the evaluation metrics
print('Accuracy:', accuracy_score(y_test, predictions)*100)
print('F1 Score:', f1_score(y_test, predictions)*100)
print('Recall:', recall_score(y_test, predictions)*100)
print('Precision:', precision_score(y_test, predictions)*100)
print('ROC AUC:', roc_auc_score(y_test, predictions)*100)
print('Confusion Matrix:', confusion_matrix(y_test, predictions))

gb_df[gb_df['rank_test_score'] <= 5].head(5)

Fitting 10 folds for each of 66 candidates, totalling 660 fits
{'gbc__learning_rate': 0.9, 'gbc__max_features': 'sqrt', 'gbc__n_estimators': 100} 

Pipeline(steps=[('scale', StandardScaler()),
                ('gbc',
                 GradientBoostingClassifier(learning_rate=0.9,
                                            max_features='sqrt'))])
Accuracy: 97.86821705426357
F1 Score: 97.72256728778468
Recall: 99.15966386554622
Precision: 96.3265306122449
ROC AUC: 97.96112689680189
Confusion Matrix: [[269   9]
 [  2 236]]
Wall time: 24.1 s


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_gbc__learning_rate,param_gbc__max_features,param_gbc__n_estimators,params,split0_test_score,split1_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
9,0.306485,0.025946,0.005086,0.001217,0.1,sqrt,200,"{'gbc__learning_rate': 0.1, 'gbc__max_features...",0.970874,0.990291,...,0.966019,0.966019,0.966019,0.985437,0.980583,0.970874,0.980583,0.974757,0.008352,2
16,0.184108,0.017797,0.004389,0.000798,0.2,log2,100,"{'gbc__learning_rate': 0.2, 'gbc__max_features...",0.970874,0.990291,...,0.966019,0.966019,0.966019,0.985437,0.985437,0.970874,0.980583,0.974757,0.009159,2
26,0.185604,0.013136,0.004787,0.001466,0.3,sqrt,100,"{'gbc__learning_rate': 0.3, 'gbc__max_features...",0.970874,0.990291,...,0.966019,0.966019,0.966019,0.985437,0.980583,0.970874,0.980583,0.974757,0.008352,2
39,0.316746,0.025938,0.004089,0.000299,0.4,sqrt,200,"{'gbc__learning_rate': 0.4, 'gbc__max_features...",0.970874,0.990291,...,0.966019,0.966019,0.966019,0.985437,0.980583,0.970874,0.980583,0.974757,0.008352,2
40,0.145411,0.004522,0.003691,0.000639,0.4,log2,100,"{'gbc__learning_rate': 0.4, 'gbc__max_features...",0.970874,0.990291,...,0.966019,0.966019,0.966019,0.985437,0.980583,0.970874,0.980583,0.974757,0.008352,2


**KNN:**

In [25]:
%%time

pipe = Pipeline([("scale", StandardScaler()),
                ("knn", KNeighborsClassifier())
                ])

param_grid_list = {'knn__n_neighbors': [1, 10, 20],
                  'knn__weights': ['uniform', 'distance'],
                  'knn__p': [1, 2],
                  'knn__algorithm': ['auto'],
                  'knn__leaf_size': [15, 30, 45]}

grid = GridSearchCV(pipe, param_grid=param_grid_list, cv=10, n_jobs=-1, verbose=10)

# Find the best hyperparameters (using 10 fold CV with the hold out fold being the validation set)
grid.fit(X_train, y_train)

# Check the hyperparameter results
knn_df = pd.DataFrame(grid.cv_results_)
print(grid.best_params_, '\n')
print(grid.best_estimator_)

# Get the best performing model
best_model = grid.best_estimator_

# Train the best model on the full training data
best_model.fit(X_train, y_train)

# Test the best performing model on the test set
predictions = best_model.predict(X_test)

# Get the evaluation metrics
print('Accuracy:', accuracy_score(y_test, predictions)*100)
print('F1 Score:', f1_score(y_test, predictions)*100)
print('Recall:', recall_score(y_test, predictions)*100)
print('Precision:', precision_score(y_test, predictions)*100)
print('ROC AUC:', roc_auc_score(y_test, predictions)*100)
print('Confusion Matrix:', confusion_matrix(y_test, predictions))

knn_df[knn_df['rank_test_score'] <= 5].head(5)

Fitting 10 folds for each of 36 candidates, totalling 360 fits
{'knn__algorithm': 'auto', 'knn__leaf_size': 15, 'knn__n_neighbors': 20, 'knn__p': 1, 'knn__weights': 'distance'} 

Pipeline(steps=[('scale', StandardScaler()),
                ('knn',
                 KNeighborsClassifier(leaf_size=15, n_neighbors=20, p=1,
                                      weights='distance'))])
Accuracy: 97.67441860465115
F1 Score: 97.52066115702479
Recall: 99.15966386554622
Precision: 95.9349593495935
ROC AUC: 97.78127078169398
Confusion Matrix: [[268  10]
 [  2 236]]
Wall time: 1.72 s


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_knn__algorithm,param_knn__leaf_size,param_knn__n_neighbors,param_knn__p,param_knn__weights,params,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
5,0.007381,0.000662,0.018849,0.001218,auto,15,10,1,distance,"{'knn__algorithm': 'auto', 'knn__leaf_size': 1...",...,0.956311,0.966019,0.966019,0.961165,0.985437,0.951456,0.975728,0.968447,0.014441,4
9,0.008678,0.001672,0.020345,0.001621,auto,15,20,1,distance,"{'knn__algorithm': 'auto', 'knn__leaf_size': 1...",...,0.956311,0.966019,0.961165,0.961165,0.985437,0.970874,0.956311,0.969903,0.011239,1
11,0.00748,0.00092,0.014862,0.001575,auto,15,20,2,distance,"{'knn__algorithm': 'auto', 'knn__leaf_size': 1...",...,0.956311,0.966019,0.961165,0.961165,0.985437,0.970874,0.956311,0.968447,0.010008,4
17,0.009827,0.002717,0.020845,0.00181,auto,30,10,1,distance,"{'knn__algorithm': 'auto', 'knn__leaf_size': 3...",...,0.956311,0.966019,0.966019,0.961165,0.985437,0.951456,0.975728,0.968447,0.014441,4
21,0.010523,0.00142,0.023437,0.00363,auto,30,20,1,distance,"{'knn__algorithm': 'auto', 'knn__leaf_size': 3...",...,0.956311,0.966019,0.961165,0.961165,0.985437,0.970874,0.956311,0.969903,0.011239,1
