# **Setup**

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from matplotlib import pyplot as plt

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn import metrics

from sklearn.metrics import plot_roc_curve
from sklearn.decomposition import PCA
from matplotlib.pyplot import figure
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler

import pandas as pd
import numpy as np

In [3]:
# Load the preprocessed e-mail feature table. The path is relative, so the CSV
# must sit in the notebook's working directory. Later cells expect a 'label'
# column in this frame.
df = pd.read_csv('preprocessed_spam_ham_phishing.csv')

**Remove phishing emails, only consider ham and spam:**

In [4]:
# Discard every row labelled 2 (the phishing class, per the heading above) so
# that only the two remaining classes are left for binary classification.
df = df.loc[df['label'].ne(2)]
print(df.shape)

(75419, 95)


In [5]:
# Separate the predictors from the target: df_X holds every column except
# 'label', df_Y holds the 'label' column itself.
df_X = df.drop(columns='label')
df_Y = df['label']

In [6]:
# Column index of the feature frame (all columns of df except 'label').
# NOTE(review): not referenced by any cell visible in this part of the notebook.
feature_list = df_X.columns

# **Testing:**

**Supervised anomaly detection (classification problem, using PCA):**

In [None]:
%%time

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score

lw = 2
graph = None
classifiers = [LogisticRegression(solver='lbfgs', fit_intercept=False, tol=0.0001, penalty='l2', C=1, max_iter=500), 
               SVC(C=10, kernel='rbf', tol=0.01), 
               GradientBoostingClassifier(learning_rate=0.4, n_estimators=200, max_features='log2'),
               MLPClassifier(hidden_layer_sizes=(40,40), activation='relu', learning_rate='constant', alpha=0.001, solver='adam'), 
               GaussianNB(), 
               BernoulliNB(), 
               RandomForestClassifier(n_estimators=150, criterion='gini', min_samples_split=3, min_samples_leaf=1, max_features='log2'), 
               DecisionTreeClassifier(criterion='entropy', min_samples_split=2, min_samples_leaf=1, ccp_alpha=0,), 
               KNeighborsClassifier(algorithm='kd_tree', weights='uniform', p=1, n_neighbors=1, leaf_size=15),
               AdaBoostClassifier(n_estimators=200, learning_rate=0.95, algorithm='SAMME.R')]


for c in classifiers:
    
    pipe = Pipeline([("scale", StandardScaler()),
                    ("reduce_dims", PCA(n_components=40)),
                    (c.__class__.__name__, c)
                    ])

    print("\n---------------------------------------------------")
    print("Current classifier: ", c.__class__.__name__, "\n")

    scoring = ['accuracy', 'f1', 'recall', 'precision', 'roc_auc']
    cv = StratifiedKFold(n_splits=10, shuffle=True)
    scores = cross_validate(pipe, df_X, df_Y, scoring=scoring, cv=cv, return_train_score=True, verbose=10)
    
    for key, val in scores.items():
        print(key, "Average:", "{:.4f}".format(np.average(val)), "+-", "{:.4f}".format(np.std(val)))


---------------------------------------------------
Current classifier:  LogisticRegression 

[CV] START .....................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END  accuracy: (train=0.959, test=0.943) f1: (train=0.969, test=0.958) precision: (train=0.964, test=0.953) recall: (train=0.974, test=0.962) roc_auc: (train=0.993, test=0.988) total time=   1.8s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.1s remaining:    0.0s


[CV] END  accuracy: (train=0.958, test=0.951) f1: (train=0.968, test=0.963) precision: (train=0.963, test=0.947) recall: (train=0.974, test=0.981) roc_auc: (train=0.994, test=0.988) total time=   1.7s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    4.2s remaining:    0.0s


[CV] END  accuracy: (train=0.958, test=0.958) f1: (train=0.969, test=0.969) precision: (train=0.964, test=0.963) recall: (train=0.974, test=0.975) roc_auc: (train=0.993, test=0.992) total time=   1.8s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    6.3s remaining:    0.0s


[CV] END  accuracy: (train=0.958, test=0.956) f1: (train=0.968, test=0.967) precision: (train=0.963, test=0.968) recall: (train=0.974, test=0.966) roc_auc: (train=0.993, test=0.992) total time=   1.7s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    8.5s remaining:    0.0s


[CV] END  accuracy: (train=0.959, test=0.944) f1: (train=0.969, test=0.957) precision: (train=0.964, test=0.960) recall: (train=0.974, test=0.955) roc_auc: (train=0.993, test=0.990) total time=   1.8s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   10.7s remaining:    0.0s


[CV] END  accuracy: (train=0.958, test=0.954) f1: (train=0.968, test=0.965) precision: (train=0.963, test=0.969) recall: (train=0.973, test=0.961) roc_auc: (train=0.993, test=0.993) total time=   1.7s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   12.8s remaining:    0.0s


[CV] END  accuracy: (train=0.958, test=0.953) f1: (train=0.968, test=0.965) precision: (train=0.963, test=0.966) recall: (train=0.973, test=0.964) roc_auc: (train=0.993, test=0.994) total time=   1.8s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   15.0s remaining:    0.0s


[CV] END  accuracy: (train=0.957, test=0.963) f1: (train=0.968, test=0.972) precision: (train=0.963, test=0.963) recall: (train=0.972, test=0.982) roc_auc: (train=0.993, test=0.996) total time=   1.9s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   17.3s remaining:    0.0s


**Supervised Stacked Classifier:**

In [None]:
# Evaluate a stacking ensemble (MLP + random forest, combined by a logistic
# regression) behind the same scale -> PCA(40) preprocessing, using 10-fold
# stratified cross-validation on (df_X, df_Y).

# Imported here so this cell does not depend on the long-running benchmark
# cell having been executed first.
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate

# Seed for every stochastic component in this cell (reproducible re-runs).
SEED = 42

# The second learner is a random forest; it was previously registered under
# the misleading name 'knn', which would mislabel it in named_estimators_ and
# in any parameter path such as 'stacked__<name>__n_estimators'.
base_learners = [('mlp', MLPClassifier(hidden_layer_sizes=(40,40), activation='relu', learning_rate='constant', alpha=0.001, solver='adam', random_state=SEED)),
                 ('rf', RandomForestClassifier(n_estimators=150, criterion='gini', min_samples_split=3, min_samples_leaf=1, max_features='log2', random_state=SEED))]
meta_classifier = LogisticRegression()

stacked_classifier = StackingClassifier(estimators=base_learners,
                                        final_estimator=meta_classifier)

# Preprocessing is part of the pipeline so it is fitted per training fold.
# PCA is seeded because its solver may be a randomized one.
pipe = Pipeline([("scale", StandardScaler()),
                ("reduce_dims", PCA(n_components=40, random_state=SEED)),
                ('stacked', stacked_classifier)
                ])

scoring = ['accuracy', 'f1', 'recall', 'precision', 'roc_auc']
# Shuffled folds, as in the per-classifier benchmark, so the stacked scores are
# measured under the same protocol; seeded so the split is repeatable.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)
scores = cross_validate(pipe, df_X, df_Y, scoring=scoring, cv=cv, return_train_score=True, verbose=10, n_jobs=-1)

# scores maps each timing/metric key to one value per fold.
for key, val in scores.items():
    print(key, "Average:", "{:.4f}".format(np.average(val)), "+-", "{:.4f}".format(np.std(val)))