# **Setup**

In [73]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from matplotlib import pyplot as plt

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn import metrics

from sklearn.metrics import plot_roc_curve
from sklearn.decomposition import PCA
from matplotlib.pyplot import figure
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler

import pandas as pd
import numpy as np

In [74]:
# Load the preprocessed dataset. The path is relative, so it is resolved
# against the notebook's working directory.
csv_path = 'preprocessed_spam_ham_phishing.csv'
df = pd.read_csv(csv_path)

**Remove features that are specific to the old ham/spam data set:**

In [75]:
# Snapshot of every column name in the loaded frame, as a plain Python list.
features_list = df.columns.tolist()

In [76]:
# Columns to discard: three fixed names plus every column whose name
# contains the substring 'missing'.
remove_list = ['time_zone', 'lines', 'domain_val_message-id']
remove_list += [col for col in features_list if 'missing' in col]

# Everything else is kept, in the original column order.
final_features = [col for col in features_list if col not in remove_list]
print(final_features)

# Restrict the working frame to the retained columns.
df = df[final_features]

['hops', 'content-encoding-val', 'received_str_forged', 'str_content-encoding_empty', 'str_from_question', 'str_from_chevron', 'str_to_chevron', 'str_to_empty', 'str_message-ID_dollar', 'str_return-path_bounce', 'str_content-type_texthtml', 'str_precedence_list', 'length_from', 'num_recipients_to', 'num_recipients_cc', 'number_replies', 'x-priority', 'content-length', 'day_of_week', 'date_comp_date_received', 'span_time', 'conseq_num_received_is_one', 'conseq_received_good', 'conseq_received_bad', 'conseq_received_date', 'email_match_from_reply-to', 'domain_match_message-id_from', 'domain_match_from_return-path', 'domain_match_message-id_return-path', 'domain_match_message-id_sender', 'domain_match_message-id_reply-to', 'domain_match_return-path_reply-to', 'domain_match_reply-to_to', 'domain_match_to_in-reply-to', 'domain_match_errors-to_message-id', 'domain_match_errors-to_from', 'domain_match_errors-to_sender', 'domain_match_errors-to_reply-to', 'domain_match_sender_from', 'domain_ma

**Split off the phishing emails (label 2); only ham and phishing are used below, so spam is excluded when the ham subset is selected:**

In [77]:
# Rows labelled 2 are moved into df_phish; df keeps every other row.
is_phish = df['label'] == 2
df_phish = df[is_phish]
df = df[~is_phish]

# Report the shape of the remainder first, then the phishing subset.
print(df.shape, df_phish.shape, sep='\n')

(75423, 47)
(1288, 47)


In [78]:
# Relabel every phishing row as class 1. assign() returns a new DataFrame
# instead of writing into df_phish in place; df_phish was produced by
# filtering df, and item assignment on such a frame is what pandas may flag
# with SettingWithCopyWarning.
df_phish = df_phish.assign(label=1)

In [79]:
# Split the phishing frame into its feature columns and its label column.
df_X_phish = df_phish.drop(columns=['label'])
df_Y_phish = df_phish['label']

In [80]:
# Sanity check: labels first, then features, one shape per line.
print(df_Y_phish.shape, df_X_phish.shape, sep='\n')

(1288,)
(1288, 46)


**Creating a dataset of half ham emails and half phishing emails:**

In [81]:
# Keep only the rows whose label is 0 (the ham class used below).
df_ham = df.loc[df['label'].eq(0)]
print(df_ham.shape)

(25220, 47)


In [82]:
# Draw, without replacement, exactly as many ham rows as there are phishing
# rows so the two classes are balanced. The size is taken from df_phish
# rather than hard-coded, and the fixed random_state makes the draw
# repeatable across kernel restarts.
df_ham_subset = df_ham.sample(n=len(df_phish), replace=False, random_state=42)
print(df_ham_subset.shape)

(1288, 47)


In [83]:
# Split the sampled ham rows into feature columns and the label column.
df_ham_X = df_ham_subset.drop('label', axis=1)
df_ham_Y = df_ham_subset['label']

# Labels first, then features, one shape per line.
print(df_ham_Y.shape, df_ham_X.shape, sep='\n')

(1288,)
(1288, 46)


In [84]:
# Stack the ham rows on top of the phishing rows and renumber the index
# 0..n-1. pd.concat replaces DataFrame.append / Series.append, which are
# deprecated and no longer exist in pandas 2.x.
df_combined_X = pd.concat([df_ham_X, df_X_phish], ignore_index=True)
df_combined_Y = pd.concat([df_ham_Y, df_Y_phish], ignore_index=True)
print(df_combined_X.shape)
print(df_combined_Y.shape)

(2576, 46)
(2576,)


In [85]:
# Shuffle the rows: draw one random ordering of the index labels and apply
# that same ordering to features and labels so they stay aligned.
idx = np.random.permutation(df_combined_X.index)
df_combined_X, df_combined_Y = df_combined_X.reindex(idx), df_combined_Y.reindex(idx)

In [86]:
# Preview the first five shuffled feature rows (head() defaults to 5).
df_combined_X.head()

Unnamed: 0,hops,content-encoding-val,received_str_forged,str_content-encoding_empty,str_from_question,str_from_chevron,str_to_chevron,str_to_empty,str_message-ID_dollar,str_return-path_bounce,...,domain_match_errors-to_sender,domain_match_errors-to_reply-to,domain_match_sender_from,domain_match_references_reply-to,domain_match_references_in-reply-to,domain_match_references_to,domain_match_from_reply-to,domain_match_to_from,domain_match_to_message-id,domain_match_to_received
2202,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
499,1,0,0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
280,2,1,0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,...,1,0,0,0,0,1,1,0,0,0
1316,1,1,0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2033,1,1,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [87]:
# Preview the matching first five labels (head() defaults to 5).
df_combined_Y.head()

2202    1.0
499     0.0
280     0.0
1316    1.0
2033    1.0
Name: label, dtype: float64

# **Testing:**

**Supervised anomaly detection (classification problem, using PCA):**

In [88]:
%%time

from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

lw = 2
graph = None
classifiers = [LogisticRegression(solver='lbfgs', fit_intercept=False, tol=0.0001, penalty='l2', C=1, max_iter=500), 
               SVC(C=10, kernel='rbf', tol=0.01), 
               GradientBoostingClassifier(learning_rate=0.4, n_estimators=200, max_features='log2'),
               MLPClassifier(hidden_layer_sizes=(40,40), activation='relu', learning_rate='constant', alpha=0.001, solver='adam'), 
               GaussianNB(), 
               BernoulliNB(), 
               RandomForestClassifier(n_estimators=150, criterion='gini', min_samples_split=3, min_samples_leaf=1, max_features='log2'), 
               DecisionTreeClassifier(criterion='entropy', min_samples_split=2, min_samples_leaf=1, ccp_alpha=0,), 
               KNeighborsClassifier(algorithm='kd_tree', weights='uniform', p=1, n_neighbors=1, leaf_size=15),
               AdaBoostClassifier(n_estimators=200, learning_rate=0.95, algorithm='SAMME.R')]

for c in classifiers:
    
    pipe = Pipeline([("scale", StandardScaler()),
                    ("reduce_dims", PCA(n_components=40)),
                    (c.__class__.__name__, c)
                    ])

    print("\n---------------------------------------------------")
    print("Current classifier: ", c.__class__.__name__, "\n")

    scoring = ['accuracy', 'f1', 'recall', 'precision', 'roc_auc']
    cv = StratifiedKFold(n_splits=10, shuffle=True)
    scores = cross_validate(pipe, df_combined_X, df_combined_Y, scoring=scoring, cv=cv, return_train_score=True)
    
    for key, val in scores.items():
        print(key, "Average:", "{:.4f}".format(np.average(val)), "+-", "{:.4f}".format(np.std(val)))


---------------------------------------------------
Current classifier:  LogisticRegression 

fit_time Average: 0.0344 +- 0.0030
score_time Average: 0.0086 +- 0.0005
test_accuracy Average: 0.9984 +- 0.0019
train_accuracy Average: 0.9996 +- 0.0001
test_f1 Average: 0.9985 +- 0.0019
train_f1 Average: 0.9996 +- 0.0001
test_recall Average: 1.0000 +- 0.0000
train_recall Average: 1.0000 +- 0.0000
test_precision Average: 0.9969 +- 0.0038
train_precision Average: 0.9992 +- 0.0003
test_roc_auc Average: 1.0000 +- 0.0000
train_roc_auc Average: 1.0000 +- 0.0000

---------------------------------------------------
Current classifier:  SVC 

fit_time Average: 0.0435 +- 0.0018
score_time Average: 0.0180 +- 0.0013
test_accuracy Average: 0.9996 +- 0.0012
train_accuracy Average: 1.0000 +- 0.0000
test_f1 Average: 0.9996 +- 0.0012
train_f1 Average: 1.0000 +- 0.0000
test_recall Average: 1.0000 +- 0.0000
train_recall Average: 1.0000 +- 0.0000
test_precision Average: 0.9992 +- 0.0023
train_precision Average:

In [None]:
# NOTE(review): disabled experiment. The code below is wrapped in a bare
# triple-quoted string, so none of it runs.
# As written it would, for each classifier, fit `c` directly on each training
# fold of `df_X`/`df_Y` (the `pipe` it builds is never fitted or used, so no
# scaling or PCA is applied) and score predictions for `df_X_phish` against
# `df_Y_phish`; the per-fold test split is computed but not scored.
# `df_X` and `df_Y` are not defined in the cells visible here -- confirm where
# they come from before re-enabling this.
'''
for c in classifiers:
    
    pipe = Pipeline([("scale", StandardScaler()),
                    ("reduce_dims", PCA(n_components=40)),
                    (c.__class__.__name__, c)
                    ])

    print("\n---------------------------------------------------")
    print("Current classifier: ", c.__class__.__name__, "\n")
    
    scores = {'accuracy': [], 'f1': [], 'recall': [], 'precision': []}
    

    kf = StratifiedKFold(n_splits=10, shuffle=True)

    for train_index, test_index in kf.split(df_X, df_Y):
        X_train, X_test = df_X.iloc[train_index], df_X.iloc[test_index]
        y_train, y_test = df_Y.iloc[train_index], df_Y.iloc[test_index]
    
        c.fit(X_train, y_train)
        predictions = c.predict(df_X_phish)
        
        scores['accuracy'].append(accuracy_score(df_Y_phish, predictions))
        scores['f1'].append(f1_score(df_Y_phish, predictions))
        scores['recall'].append(recall_score(df_Y_phish, predictions))
        scores['precision'].append(precision_score(df_Y_phish, predictions))

    for key, val in scores.items():
        print(key, "Average:", "{:.4f}".format(np.average(val)), "+-", "{:.4f}".format(np.std(val)))
'''

**Supervised Stacked Classifier:**

In [None]:
# Stacking ensemble: an MLP and a random forest are the base learners, and a
# logistic regression combines their outputs.
base_learners = [('mlp', MLPClassifier(hidden_layer_sizes=(40,40), activation='relu', learning_rate='constant', alpha=0.001, solver='adam')),
                 # Step name matches the estimator type (a random forest).
                 ('rf', RandomForestClassifier(n_estimators=150, criterion='gini', min_samples_split=3, min_samples_leaf=1, max_features='log2'))]
meta_classifier = LogisticRegression()

stacked_classifier = StackingClassifier(estimators=base_learners,
                                        final_estimator=meta_classifier)

# Same scale -> PCA front end as the single-classifier comparison.
pipe = Pipeline([("scale", StandardScaler()),
                ("reduce_dims", PCA(n_components=40)),
                ('stacked', stacked_classifier)
                ])

scoring = ['accuracy', 'f1', 'recall', 'precision', 'roc_auc']
cv = StratifiedKFold(n_splits=10)

# Evaluate on the balanced ham/phishing set built above. The previous
# df_X / df_Y names are not defined anywhere in the visible notebook, so the
# cell would raise NameError on a fresh kernel.
# NOTE(review): confirm df_combined_X / df_combined_Y is the intended dataset.
scores = cross_validate(pipe, df_combined_X, df_combined_Y, scoring=scoring, cv=cv, return_train_score=True, verbose=10, n_jobs=-1)

# One line per returned entry: mean and standard deviation over the folds.
for key, val in scores.items():
    print(key, "Average:", "{:.4f}".format(np.average(val)), "+-", "{:.4f}".format(np.std(val)))