# **Setup**

In [45]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from matplotlib import pyplot as plt
from sklearn.svm import OneClassSVM

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.inspection import permutation_importance
from matplotlib import pyplot as plt

from sklearn.metrics import plot_roc_curve
from sklearn.decomposition import PCA
from matplotlib.pyplot import figure
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedShuffleSplit

import pandas as pd
import numpy as np

In [46]:
df = pd.read_csv('preprocessed_spam_ham_phishing.csv')

**Feature reduction:**

The only features that are kept are domain matching features, as these should generalize across very different email datasets without issue.

In [47]:
feature_list = [
'domain_match_from_return-path',
'domain_match_message-id_from',
'domain_match_message-id_return-path',
'domain_match_to_from',
'domain_match_errors-to_from',
'domain_match_message-id_reply-to',
'domain_match_errors-to_message-id',
'domain_match_sender_from',
'domain_match_to_received',
'domain_match_errors-to_reply-to',
'domain_match_to_message-id',
'label']

df = df[feature_list]

**Remove spam emails, only consider ham and phishing:**

In [48]:
df = df[df['label'] != 1]
print(df.shape)

(26508, 12)


In [49]:
df['label'].value_counts()

0    25220
2     1288
Name: label, dtype: int64

In [50]:
df_ham = df[df['label'] == 0]
df_phish = df[df['label'] == 2]
print(df_ham.shape)
print(df_phish.shape)

(25220, 12)
(1288, 12)


In [51]:
df_phish = df_phish.assign(label=1)

In [52]:
print(df_phish.shape)
print(df_ham.shape)
print(df_phish['label'].value_counts())

(1288, 12)
(25220, 12)
1    1288
Name: label, dtype: int64


In [53]:
df_ham_Y = df_ham['label']
df_ham_X = df_ham.drop('label', axis=1)

**Take 1288 of the ham emails to be used for testing, the rest for training:**

In [54]:
from sklearn.model_selection import train_test_split

X_train, X_test_ham, y_train, y_test_ham = train_test_split(df_ham_X, df_ham_Y, test_size=1288, random_state=42)

In [55]:
print(X_train.shape)
print(X_test_ham.shape)
print(y_train.shape)
print(y_test_ham.shape)

(23932, 11)
(1288, 11)
(23932,)
(1288,)


**Create the test set, which is 1288 ham and 1288 phishing emails:**

In [56]:
df_phish_Y = df_phish['label']
df_phish_X = df_phish.drop('label', axis=1)

In [57]:
test_X = X_test_ham.append(df_phish_X, ignore_index=True)
test_Y = y_test_ham.append(df_phish_Y, ignore_index=True)

In [58]:
print(test_X.shape)
print(test_Y.shape)

(2576, 11)
(2576,)


**OC-SVM predicts either 1 or -1, so need to adjust labels:**

In [59]:
test_Y = pd.DataFrame(test_Y, columns=['label'])
test_Y.loc[test_Y['label'] == 1, 'label'] = -1
test_Y.loc[test_Y['label'] == 0, 'label'] = 1

In [60]:
test_Y.value_counts()

label
 1       1288
-1       1288
dtype: int64

**Apply a standard scaler to the full data set:**

In [61]:
features_list = test_X.columns

In [62]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scaler.fit(test_X)
test_X = scaler.transform(test_X)
test_X = pd.DataFrame(test_X, columns=features_list)

scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_train = pd.DataFrame(X_train, columns=features_list)

# **Testing:**

**OC-SVM:**

In [63]:
%%time

ocsvm = OneClassSVM(kernel='poly', degree=6, nu=0.2)
ocsvm.fit(X_train)

# Test the model on the test set
predictions = ocsvm.predict(test_X)

# Get the evaluation metrics
print('Accuracy:', accuracy_score(test_Y, predictions)*100)
print('F1 Score:', f1_score(test_Y, predictions)*100)
print('Recall:', recall_score(test_Y, predictions)*100)
print('Precision:', precision_score(test_Y, predictions)*100)
print('ROC AUC:', roc_auc_score(test_Y, predictions)*100)
print('Confusion Matrix:', confusion_matrix(test_Y, predictions))

Accuracy: 87.22826086956522
F1 Score: 86.3768115942029
Recall: 80.97826086956522
Precision: 92.54658385093167
ROC AUC: 87.22826086956523
Confusion Matrix: [[1204   84]
 [ 245 1043]]
Wall time: 10.2 s
