# Models for Anomaly Detection
### 1. One-class SVM
### 2. Isolation Forest
### 3. Local Outlier Factor
### 4. AutoEncoder

### Training on Local Machine

In [8]:
import os

# Remember the directory the kernel is in when this cell runs, so that
# to_original_working_directory() can switch back to it later.
# NOTE(review): re-running this cell after the working directory has been changed
# records the new directory instead of the original one.
original_working_directory_path = os.getcwd()
print("The original working directory is {0}".format(os.getcwd()))

def to_original_working_directory():
    """Switch the process back to the directory recorded at notebook start.

    Reads the notebook-level ``original_working_directory_path``, makes it the
    current working directory and prints a confirmation line.
    """
    destination = original_working_directory_path
    os.chdir(destination)
    print(f"Changed to original working directory {destination}")

The original working directory is /Users/jankreischer/Library/Mobile Documents/com~apple~CloudDocs/Master-Thesis/Code/prototypes/prototype_02


In [9]:
def to_root_working_directory():
    """Change into the directory two levels above the original working directory.

    The target is derived from the notebook-level
    ``original_working_directory_path`` (not from the current directory), so
    calling this repeatedly always lands in the same place. Prints the
    resulting working directory.
    """
    two_levels_up = os.path.join(original_working_directory_path, "../..")
    os.chdir(two_levels_up)
    current_directory = os.getcwd()
    print(f"Changed to root working directory {current_directory}")

In [10]:
# Work from the project root for the rest of the notebook.
# NOTE(review): presumably required so that the `src.*` imports further down resolve — confirm.
to_root_working_directory()

Changed to root working directory /Users/jankreischer/Library/Mobile Documents/com~apple~CloudDocs/Master-Thesis/Code


### Functions

In [144]:
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

def evaluate(model, data_dict, tablefmt='latex_raw'):
    """Print a per-behavior accuracy table plus global metrics for a fitted detector.

    For every entry of ``data_dict`` the model's predictions are compared with
    the labels stored in the last column, and the accuracy is reported per
    behavior. Precision, recall and F1 are only computed once over all
    behaviors combined (row ``GLOBAL``), with ``1`` treated as the positive
    class; the per-behavior rows carry the LaTeX macro ``\\notCalculated``
    instead.

    Relies on ``np``, ``tabulate`` and the scikit-learn metric functions being
    available in the notebook namespace when the function is called.

    Args:
        model: Fitted estimator exposing ``predict``. Its predictions are
            compared directly with the labels, so both must use the same
            encoding (the metric calls below assume -1 / 1).
        data_dict: Mapping from a behavior object (anything with a ``name``
            attribute) to a 2-D array whose last column holds the label and
            whose remaining columns hold the features.
        tablefmt: Table format name forwarded to ``tabulate``.

    Returns:
        None. The table is printed to stdout.
    """
    labels = [-1, 1]
    pos_label = 1

    results = []
    # Seed with empty float arrays so the combined vectors keep the same dtype
    # and an empty ``data_dict`` still yields (empty) arrays.
    y_true_parts = [np.empty([0])]
    y_pred_parts = [np.empty([0])]

    for behavior, data in data_dict.items():
        y_true = data[:, -1].astype(int)
        y_pred = model.predict(data[:, :-1].astype(np.float32))
        y_true_parts.append(y_true)
        y_pred_parts.append(y_pred)

        accuracy = accuracy_score(y_true, y_pred)
        n_samples = len(y_true)
        # Backslashes are doubled so the LaTeX escapes (\_ and \%) are emitted
        # literally instead of relying on invalid Python escape sequences.
        results.append([
            behavior.name.replace("_", "\\_"),
            f'{(100 * accuracy):.2f}\\%',
            '\\notCalculated',
            '\\notCalculated',
            '\\notCalculated',
            str(n_samples),
        ])

    y_true_total = np.concatenate(y_true_parts)
    y_pred_total = np.concatenate(y_pred_parts)

    accuracy = accuracy_score(y_true_total, y_pred_total)
    precision = precision_score(y_true_total, y_pred_total, average='binary', labels=labels, pos_label=pos_label, zero_division=1)
    recall = recall_score(y_true_total, y_pred_total, average='binary', labels=labels, pos_label=pos_label, zero_division=1)
    f1 = f1_score(y_true_total, y_pred_total, average='binary', labels=labels, pos_label=pos_label, zero_division=1)
    n_samples = len(y_true_total)
    results.append([
        "GLOBAL",
        f'{(100 * accuracy):.2f}\\%',
        f'{(100 * precision):.2f}\\%',
        f'{(100 * recall):.2f}\\%',
        f'{(100 * f1):.2f}\\%',
        n_samples,
    ])
    print(tabulate(results, headers=["Behavior", "Accuracy", "Precision", "Recall", "F1-Score", "\\#Samples"], tablefmt=tablefmt))

In [71]:
import pandas as pd

def convert_grid_search_result(grid_search_result, display_latex=True):
    """Turn a fitted grid search into a ranking table of its candidates.

    Args:
        grid_search_result: Object exposing ``cv_results_`` with the keys
            ``"mean_test_score"`` and ``"params"`` (as a fitted
            ``GridSearchCV`` does).
        display_latex: Unused; kept so existing calls keep working.

    Returns:
        A DataFrame with one row per candidate: the mean validation score in
        ``mean_validation_accuracy`` followed by one column per hyperparameter,
        sorted best-first and indexed by rank starting at 1.
    """
    cv_results = grid_search_result.cv_results_
    score_column = pd.DataFrame(cv_results["mean_test_score"], columns=["mean_validation_accuracy"])
    parameter_columns = pd.DataFrame(cv_results["params"])

    ranking = pd.concat([score_column, parameter_columns], axis=1)
    ranking = ranking.sort_values(by=['mean_validation_accuracy'], ascending=False)
    # Replace the candidate index with a 1-based rank.
    ranking.index = np.arange(1, len(ranking) + 1)
    return ranking

In [163]:
import pandas as pd

def display_grid_search_result(df, n_items=10):
    """Print the first ``n_items`` rows of ``df`` as a LaTeX table.

    Special characters are escaped and the row labels are set in bold.
    """
    top_rows = df.head(n_items)
    latex_table = top_rows.to_latex(escape=True, bold_rows=True)
    print(latex_table)

### Data Preparation

In [56]:
from src.data_provider import DataProvider

# Load the train/test data from the project's DataProvider; the third return value is not used here.
# NOTE(review): going by the keyword names, the features are presumably min-max scaled with a
# scaler fitted on normal-behavior samples only — confirm in DataProvider.
train_data, test_data, _ = DataProvider.get_scaled_train_test_split(pi=3, scaling_minmax=True, scale_normal_only=True)

In [22]:
import numpy as np

def get_training_datasets(training_data_dict, normal_label, abnormal_label):
    """Assemble the two relabeled training sets used throughout the notebook.

    The last column of every array in ``training_data_dict`` is overwritten
    IN PLACE with ``normal_label`` (for ``Behavior.NORMAL``) or
    ``abnormal_label`` (for every other behavior). Afterwards a fixed number
    of leading rows per behavior is stacked into two arrays:

    * first array:  up to 7000 normal rows and up to 1000 rows per other behavior
    * second array: up to 8000 normal rows and up to 286 rows per other behavior

    Args:
        training_data_dict: Mapping from behavior to a 2-D array with 47
            columns, the last one being the label.
        normal_label: Label written for normal samples.
        abnormal_label: Label written for all other samples.

    Returns:
        Tuple ``(training_data_50, training_data_80)`` of stacked arrays.
    """
    balanced = np.empty([0, 47])
    skewed = np.empty([0, 47])

    for behavior, samples in training_data_dict.items():
        if behavior == Behavior.NORMAL:
            label, take_50, take_80 = normal_label, 7000, 8000
        else:
            label, take_50, take_80 = abnormal_label, 1000, 286

        # Relabels the caller's array as a whole, not just the rows taken below.
        samples[:, -1] = label
        balanced = np.concatenate([balanced, samples[:take_50, :]], axis=0)
        skewed = np.concatenate([skewed, samples[:take_80, :]], axis=0)

    return balanced, skewed

In [28]:
import numpy as np

def get_test_datasets(test_data, normal_label, abnormal_label):
    """Build the relabeled test sets, per behavior and stacked.

    For ``Behavior.NORMAL`` the first 2800 rows are kept, for every other
    behavior the first 400. The last column of each kept slice is set to
    ``normal_label`` or ``abnormal_label`` respectively.

    The slices are copied before relabeling, so ``test_data`` is left
    untouched and the arrays returned by separate calls never share memory
    (important when different label encodings are requested).

    Args:
        test_data: Mapping from behavior to a 2-D array with 47 columns, the
            last one being the label.
        normal_label: Label written for normal samples.
        abnormal_label: Label written for all other samples.

    Returns:
        Tuple ``(test_data_dict, test_data_flat)``: the relabeled slice per
        behavior, and all slices stacked row-wise in iteration order.
    """
    test_data_dict = {}
    # The empty seed fixes the column count and keeps the result defined for
    # an empty ``test_data``.
    stacked_parts = [np.zeros([0, 47])]

    for behavior, behavior_data in test_data.items():
        if behavior == Behavior.NORMAL:
            subset = behavior_data[:2800].copy()
            subset[:, -1] = normal_label
        else:
            subset = behavior_data[:400].copy()
            subset[:, -1] = abnormal_label

        test_data_dict[behavior] = subset
        stacked_parts.append(subset)

    test_data_flat = np.vstack(stacked_parts)
    return test_data_dict, test_data_flat

## 1. One-class SVM

In [124]:
# Build the two training sets for the One-class SVM, labeled 1 = normal and -1 = anomalous.
svm_training_data_50, svm_training_data_80 = get_training_datasets(train_data, 1, -1)

# Features are all columns but the last; the last column is the integer label.
svm_training_x_50  = svm_training_data_50[:,:-1]
svm_training_y_50  = svm_training_data_50[:,-1].astype(int)

svm_training_x_80 = svm_training_data_80[:,:-1]
svm_training_y_80 = svm_training_data_80[:,-1].astype(int)

In [125]:
# Test data for the One-class SVM: one array per behavior plus all of them stacked into one array.
svm_test_data_dict, svm_test_data_flat = get_test_datasets(test_data, 1, -1)

In [142]:
from sklearn.svm import OneClassSVM
from sklearn.model_selection import GridSearchCV

# 5-fold grid search for the One-class SVM on the 50:50 training set, scored by accuracy.
# NOTE(review): `param_grid` is not defined anywhere before this cell in the notebook. The log
# below shows kernel/shrinking/gamma/degree candidates, so the SVM grid must have come from a
# cell that is no longer present — restore it so the notebook survives Restart & Run All.
svm_grid_search_50 = GridSearchCV(estimator=OneClassSVM(nu=0.5, verbose=False), 
                           param_grid=param_grid,
                           scoring='accuracy',
                           n_jobs=1,
                           cv=5,
                           verbose=3,
                           refit=True)

# Expected log header: Fitting 5 folds for each of 26 candidates, totalling 130 fits
svm_grid_search_result_50 = svm_grid_search_50.fit(svm_training_x_50, svm_training_y_50)

Fitting 5 folds for each of 26 candidates, totalling 130 fits
[CV 1/5] END .....kernel=linear, shrinking=True;, score=0.297 total time=   5.4s
[CV 2/5] END .....kernel=linear, shrinking=True;, score=0.280 total time=   5.2s
[CV 3/5] END .....kernel=linear, shrinking=True;, score=0.177 total time=   5.3s
[CV 4/5] END .....kernel=linear, shrinking=True;, score=0.511 total time=   5.0s
[CV 5/5] END .....kernel=linear, shrinking=True;, score=0.325 total time=   5.2s
[CV 1/5] END ....kernel=linear, shrinking=False;, score=0.297 total time=   5.1s
[CV 2/5] END ....kernel=linear, shrinking=False;, score=0.280 total time=   5.6s
[CV 3/5] END ....kernel=linear, shrinking=False;, score=0.177 total time=   4.9s
[CV 4/5] END ....kernel=linear, shrinking=False;, score=0.511 total time=   4.6s
[CV 5/5] END ....kernel=linear, shrinking=False;, score=0.325 total time=   4.7s
[CV 1/5] END gamma=scale, kernel=rbf, shrinking=True;, score=0.639 total time=   9.9s
[CV 2/5] END gamma=scale, kernel=rbf, shri

In [145]:
# Display the ten best hyperparameter configurations for the 50:50 dataset as a LaTeX table
svm_grid_search_result_table_50 = convert_grid_search_result(svm_grid_search_result_50)
display_grid_search_result(svm_grid_search_result_table_50)

\begin{tabular}{lrlllr}
\toprule
{} &  mean\_validation\_accuracy & kernel &  shrinking &  gamma &  degree \\
\midrule
\textbf{1 } &                  0.698786 &    rbf &       True &   auto &     NaN \\
\textbf{2 } &                  0.698786 &    rbf &      False &   auto &     NaN \\
\textbf{3 } &                  0.626143 &    rbf &       True &  scale &     NaN \\
\textbf{4 } &                  0.626143 &    rbf &      False &  scale &     NaN \\
\textbf{5 } &                  0.332071 &   poly &      False &  scale &     5.0 \\
\textbf{6 } &                  0.332071 &   poly &       True &  scale &     5.0 \\
\textbf{7 } &                  0.331357 &   poly &       True &  scale &     4.0 \\
\textbf{8 } &                  0.331357 &   poly &      False &  scale &     4.0 \\
\textbf{9 } &                  0.328500 &   poly &       True &  scale &     3.0 \\
\textbf{10} &                  0.328500 &   poly &      False &  scale &     3.0 \\
\bottomrule
\end{tabular}



In [147]:
# Testing the best hyperparameter configuration on the 50:50 dataset.
# kernel and gamma are taken from the top row of the grid-search table; the model is refit
# on the features only (no labels are passed to fit) and then scored per behavior.
best_svm_50 = OneClassSVM(nu=0.50, kernel = 'rbf', gamma = 'auto')
best_svm_50.fit(svm_training_x_50)
evaluate(best_svm_50, svm_test_data_dict, 'latex_raw')

\begin{tabular}{lllllr}
\hline
 Behavior                 & Accuracy   & Precision      & Recall         & F1-Score       &   \#Samples \\
\hline
 NORMAL                   & 55.75\%    & \notCalculated & \notCalculated & \notCalculated &        2800 \\
 RANSOMWARE\_POC          & 100.00\%   & \notCalculated & \notCalculated & \notCalculated &         400 \\
 ROOTKIT\_BDVL            & 100.00\%   & \notCalculated & \notCalculated & \notCalculated &         400 \\
 ROOTKIT\_BEURK           & 50.75\%    & \notCalculated & \notCalculated & \notCalculated &         400 \\
 CNC\_THETICK             & 53.75\%    & \notCalculated & \notCalculated & \notCalculated &         400 \\
 CNC\_BACKDOOR\_JAKORITAR & 49.50\%    & \notCalculated & \notCalculated & \notCalculated &         400 \\
 CNC\_OPT1                & 52.75\%    & \notCalculated & \notCalculated & \notCalculated &         400 \\
 CNC\_OPT2                & 100.00\%   & \notCalculated & \notCalculated & \notCalculated &         400 \\

In [128]:
from sklearn.svm import OneClassSVM
from sklearn.model_selection import GridSearchCV

# 5-fold grid search for the One-class SVM on the 80:20 training set, scored by accuracy;
# train-fold scores are recorded as well (return_train_score=True).
# NOTE(review): `param_grid` is not defined anywhere before this cell in the notebook — the
# SVM grid must have come from a cell that is no longer present.
svm_grid_search_80 = GridSearchCV(estimator=OneClassSVM(nu=0.2, verbose=False), 
                           param_grid=param_grid,
                           scoring='accuracy',
                           n_jobs=1,
                           return_train_score=True,
                           cv=5,
                           verbose=3,
                           refit=True)

# Expected log header: Fitting 5 folds for each of 26 candidates, totalling 130 fits
svm_grid_search_result_80 = svm_grid_search_80.fit(svm_training_x_80, svm_training_y_80)

Fitting 5 folds for each of 26 candidates, totalling 130 fits
[CV 1/5] END kernel=linear, shrinking=True;, score=(train=0.606, test=0.787) total time=   1.0s
[CV 2/5] END kernel=linear, shrinking=True;, score=(train=0.602, test=0.766) total time=   1.0s
[CV 3/5] END kernel=linear, shrinking=True;, score=(train=0.602, test=0.766) total time=   1.0s
[CV 4/5] END kernel=linear, shrinking=True;, score=(train=0.603, test=0.746) total time=   1.0s
[CV 5/5] END kernel=linear, shrinking=True;, score=(train=0.800, test=0.099) total time=   1.0s
[CV 1/5] END kernel=linear, shrinking=False;, score=(train=0.606, test=0.787) total time=   0.9s
[CV 2/5] END kernel=linear, shrinking=False;, score=(train=0.602, test=0.766) total time=   0.9s
[CV 3/5] END kernel=linear, shrinking=False;, score=(train=0.602, test=0.766) total time=   0.9s
[CV 4/5] END kernel=linear, shrinking=False;, score=(train=0.603, test=0.746) total time=   0.9s
[CV 5/5] END kernel=linear, shrinking=False;, score=(train=0.800, test

In [148]:
# Display the ten best hyperparameter configurations for the 80:20 dataset as a LaTeX table
svm_grid_search_result_table_80 = convert_grid_search_result(svm_grid_search_result_80)
display_grid_search_result(svm_grid_search_result_table_80)

\begin{tabular}{lrlllr}
\toprule
{} &  mean\_validation\_accuracy &   kernel &  shrinking &  gamma &  degree \\
\midrule
\textbf{1 } &                  0.836218 &      rbf &       True &   auto &     NaN \\
\textbf{2 } &                  0.836218 &      rbf &      False &   auto &     NaN \\
\textbf{3 } &                  0.805023 &      rbf &       True &  scale &     NaN \\
\textbf{4 } &                  0.804823 &      rbf &      False &  scale &     NaN \\
\textbf{5 } &                  0.633645 &  sigmoid &       True &  scale &     NaN \\
\textbf{6 } &                  0.633645 &  sigmoid &      False &  scale &     NaN \\
\textbf{7 } &                  0.632845 &   linear &       True &    NaN &     NaN \\
\textbf{8 } &                  0.632845 &  sigmoid &       True &   auto &     NaN \\
\textbf{9 } &                  0.632845 &  sigmoid &      False &   auto &     NaN \\
\textbf{10} &                  0.632845 &   linear &      False &    NaN &     NaN \\
\bottomrule
\end{ta

In [150]:
# Testing the best hyperparameter configuration on the 80:20 dataset.
# kernel and gamma are taken from the top row of the grid-search table; the model is refit
# on the features only (no labels are passed to fit) and then scored per behavior.
best_svm_80 = OneClassSVM(nu=0.20, kernel = 'rbf', gamma = 'auto')
best_svm_80.fit(svm_training_x_80)
evaluate(best_svm_80, svm_test_data_dict)

\begin{tabular}{lllllr}
\hline
 Behavior                 & Accuracy   & Precision      & Recall         & F1-Score       &   \#Samples \\
\hline
 NORMAL                   & 76.18\%    & \notCalculated & \notCalculated & \notCalculated &        2800 \\
 RANSOMWARE\_POC          & 100.00\%   & \notCalculated & \notCalculated & \notCalculated &         400 \\
 ROOTKIT\_BDVL            & 100.00\%   & \notCalculated & \notCalculated & \notCalculated &         400 \\
 ROOTKIT\_BEURK           & 25.50\%    & \notCalculated & \notCalculated & \notCalculated &         400 \\
 CNC\_THETICK             & 27.00\%    & \notCalculated & \notCalculated & \notCalculated &         400 \\
 CNC\_BACKDOOR\_JAKORITAR & 24.00\%    & \notCalculated & \notCalculated & \notCalculated &         400 \\
 CNC\_OPT1                & 37.25\%    & \notCalculated & \notCalculated & \notCalculated &         400 \\
 CNC\_OPT2                & 100.00\%   & \notCalculated & \notCalculated & \notCalculated &         400 \\

## 2. Isolation Forest

In [177]:
# Build the two training sets for the Isolation Forest, labeled 1 = normal and -1 = anomalous.
isf_training_data_50, isf_training_data_80 = get_training_datasets(train_data, 1, -1)

# Features are all columns but the last; the last column is the integer label.
isf_training_x_50 = isf_training_data_50[:,:-1]
isf_training_y_50 = isf_training_data_50[:,-1].astype(int)

isf_training_x_80 = isf_training_data_80[:,:-1]
isf_training_y_80 = isf_training_data_80[:,-1].astype(int)

In [178]:
# Test data for the Isolation Forest: one array per behavior plus all of them stacked into one array.
isf_test_data_dict, isf_test_data_flat = get_test_datasets(test_data, 1, -1)

In [179]:
# Hyperparameter grid for the Isolation Forest: 10 tree counts x 1 sampling mode x 10 feature
# fractions = 100 candidates.
# NOTE(review): the name `param_grid` is shared by all model sections, so re-running a grid
# search from another section after this cell would silently use this grid.
param_grid = {
    'n_estimators': list(range(10,101,10)), 
    'max_samples': ["auto"], 
    'max_features': np.linspace(0.1, 1.0, 10), 
}

In [180]:
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import GridSearchCV

# 5-fold grid search for the Isolation Forest on the 50:50 training set, scored by accuracy.
# refit=False: no final model is fitted here; the chosen configuration is refit by hand in a
# later cell.
isolation_forest_50 = IsolationForest(contamination=0.5, random_state=42)

isf_grid_search_50 = GridSearchCV(isolation_forest_50, 
                                                 param_grid,
                                                 scoring='accuracy', 
                                                 refit=False,
                                                 cv=5, 
                                                 return_train_score=True,
                                                 verbose=3,
                                                 n_jobs=1)

# Expected log header: Fitting 5 folds for each of 100 candidates, totalling 500 fits
isf_grid_search_result_50 = isf_grid_search_50.fit(isf_training_x_50, isf_training_y_50)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV 1/5] END max_features=0.1, max_samples=auto, n_estimators=10;, score=(train=0.618, test=0.636) total time=   0.1s
[CV 2/5] END max_features=0.1, max_samples=auto, n_estimators=10;, score=(train=0.619, test=0.675) total time=   0.1s
[CV 3/5] END max_features=0.1, max_samples=auto, n_estimators=10;, score=(train=0.608, test=0.779) total time=   0.1s
[CV 4/5] END max_features=0.1, max_samples=auto, n_estimators=10;, score=(train=0.693, test=0.492) total time=   0.1s
[CV 5/5] END max_features=0.1, max_samples=auto, n_estimators=10;, score=(train=0.626, test=0.787) total time=   0.1s
[CV 1/5] END max_features=0.1, max_samples=auto, n_estimators=20;, score=(train=0.636, test=0.682) total time=   0.1s
[CV 2/5] END max_features=0.1, max_samples=auto, n_estimators=20;, score=(train=0.621, test=0.663) total time=   0.1s
[CV 3/5] END max_features=0.1, max_samples=auto, n_estimators=20;, score=(train=0.617, test=0.791) total time= 

In [181]:
# Display the ten best hyperparameter configurations for the 50:50 dataset as a LaTeX table
isf_grid_search_result_table_50 = convert_grid_search_result(isf_grid_search_result_50)
display_grid_search_result(isf_grid_search_result_table_50)

\begin{tabular}{lrrlr}
\toprule
{} &  mean\_validation\_accuracy &  max\_features & max\_samples &  n\_estimators \\
\midrule
\textbf{1 } &                  0.722143 &           0.7 &        auto &            50 \\
\textbf{2 } &                  0.721357 &           0.7 &        auto &            70 \\
\textbf{3 } &                  0.720000 &           0.7 &        auto &            60 \\
\textbf{4 } &                  0.715286 &           0.7 &        auto &           100 \\
\textbf{5 } &                  0.715000 &           0.7 &        auto &            30 \\
\textbf{6 } &                  0.713786 &           0.7 &        auto &            10 \\
\textbf{7 } &                  0.713500 &           0.7 &        auto &            80 \\
\textbf{8 } &                  0.711143 &           0.7 &        auto &            90 \\
\textbf{9 } &                  0.709571 &           1.0 &        auto &            30 \\
\textbf{10} &                  0.709214 &           0.6 &        auto &  

In [182]:
# Refit an Isolation Forest on the 50:50 training set and report the test metrics per behavior.
# NOTE(review): the grid-search table ranks max_features=0.7 / n_estimators=50 first, but this
# cell uses n_estimators=60 (ranked third) — confirm which value is intended.
# NOTE(review): the labels passed to fit() are presumably ignored by the estimator — see the
# scikit-learn documentation.
isolation_forest_50 = IsolationForest(contamination=0.5, random_state=42, max_features=0.7, max_samples='auto', n_estimators=60)
isolation_forest_50.fit(isf_training_x_50, isf_training_y_50);
evaluate(isolation_forest_50, isf_test_data_dict)

\begin{tabular}{lllllr}
\hline
 Behavior                 & Accuracy   & Precision      & Recall         & F1-Score       &   \#Samples \\
\hline
 NORMAL                   & 56.82\%    & \notCalculated & \notCalculated & \notCalculated &        2800 \\
 RANSOMWARE\_POC          & 100.00\%   & \notCalculated & \notCalculated & \notCalculated &         400 \\
 ROOTKIT\_BDVL            & 90.50\%    & \notCalculated & \notCalculated & \notCalculated &         400 \\
 ROOTKIT\_BEURK           & 50.00\%    & \notCalculated & \notCalculated & \notCalculated &         400 \\
 CNC\_THETICK             & 60.00\%    & \notCalculated & \notCalculated & \notCalculated &         400 \\
 CNC\_BACKDOOR\_JAKORITAR & 46.50\%    & \notCalculated & \notCalculated & \notCalculated &         400 \\
 CNC\_OPT1                & 60.50\%    & \notCalculated & \notCalculated & \notCalculated &         400 \\
 CNC\_OPT2                & 100.00\%   & \notCalculated & \notCalculated & \notCalculated &         400 \\

In [183]:
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import GridSearchCV

# 5-fold grid search for the Isolation Forest on the 80:20 training set, scored by accuracy.
# refit=False: no final model is fitted here; the chosen configuration is refit by hand in a
# later cell.
isolation_forest_80 = IsolationForest(contamination=0.2, random_state=42)

isf_grid_search_80 = GridSearchCV(isolation_forest_80, 
                                                 param_grid,
                                                 scoring='accuracy', 
                                                 refit=False,
                                                 cv=5, 
                                                 return_train_score=True,
                                                 verbose=3,
                                                 n_jobs=1)

# Expected log header: Fitting 5 folds for each of 100 candidates, totalling 500 fits
isf_grid_search_result_80 = isf_grid_search_80.fit(isf_training_x_80, isf_training_y_80)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV 1/5] END max_features=0.1, max_samples=auto, n_estimators=10;, score=(train=0.741, test=0.884) total time=   0.1s
[CV 2/5] END max_features=0.1, max_samples=auto, n_estimators=10;, score=(train=0.764, test=0.862) total time=   0.1s
[CV 3/5] END max_features=0.1, max_samples=auto, n_estimators=10;, score=(train=0.765, test=0.879) total time=   0.1s
[CV 4/5] END max_features=0.1, max_samples=auto, n_estimators=10;, score=(train=0.763, test=0.845) total time=   0.1s
[CV 5/5] END max_features=0.1, max_samples=auto, n_estimators=10;, score=(train=0.800, test=0.472) total time=   0.1s
[CV 1/5] END max_features=0.1, max_samples=auto, n_estimators=20;, score=(train=0.757, test=0.905) total time=   0.1s
[CV 2/5] END max_features=0.1, max_samples=auto, n_estimators=20;, score=(train=0.773, test=0.867) total time=   0.1s
[CV 3/5] END max_features=0.1, max_samples=auto, n_estimators=20;, score=(train=0.772, test=0.886) total time= 

In [184]:
# Display the ten best hyperparameter configurations for the 80:20 dataset as a LaTeX table
isf_grid_search_result_table_80 = convert_grid_search_result(isf_grid_search_result_80)
display_grid_search_result(isf_grid_search_result_table_80)

\begin{tabular}{lrrlr}
\toprule
{} &  mean\_validation\_accuracy &  max\_features & max\_samples &  n\_estimators \\
\midrule
\textbf{1 } &                  0.839621 &           0.3 &        auto &            10 \\
\textbf{2 } &                  0.835620 &           1.0 &        auto &            20 \\
\textbf{3 } &                  0.829421 &           0.5 &        auto &            80 \\
\textbf{4 } &                  0.829121 &           0.5 &        auto &            50 \\
\textbf{5 } &                  0.827521 &           0.5 &        auto &            70 \\
\textbf{6 } &                  0.827521 &           0.5 &        auto &            60 \\
\textbf{7 } &                  0.826822 &           0.5 &        auto &            90 \\
\textbf{8 } &                  0.826821 &           0.5 &        auto &            40 \\
\textbf{9 } &                  0.826722 &           0.5 &        auto &           100 \\
\textbf{10} &                  0.826322 &           0.3 &        auto &  

In [195]:
# Refit an Isolation Forest on the 80:20 training set and report the test metrics per behavior.
# NOTE(review): the grid-search table ranks max_features=0.3 / n_estimators=10 first, but this
# cell uses max_features=0.6 — confirm which value is intended.
# NOTE(review): the labels passed to fit() are presumably ignored by the estimator — see the
# scikit-learn documentation.
isolation_forest_80 = IsolationForest(contamination=0.2, random_state=42, max_features=0.6, max_samples='auto', n_estimators=10)
isolation_forest_80.fit(isf_training_x_80, isf_training_y_80);
evaluate(isolation_forest_80, isf_test_data_dict)

\begin{tabular}{lllllr}
\hline
 Behavior                 & Accuracy   & Precision      & Recall         & F1-Score       &   \#Samples \\
\hline
 NORMAL                   & 79.71\%    & \notCalculated & \notCalculated & \notCalculated &        2800 \\
 RANSOMWARE\_POC          & 100.00\%   & \notCalculated & \notCalculated & \notCalculated &         400 \\
 ROOTKIT\_BDVL            & 77.75\%    & \notCalculated & \notCalculated & \notCalculated &         400 \\
 ROOTKIT\_BEURK           & 25.50\%    & \notCalculated & \notCalculated & \notCalculated &         400 \\
 CNC\_THETICK             & 31.00\%    & \notCalculated & \notCalculated & \notCalculated &         400 \\
 CNC\_BACKDOOR\_JAKORITAR & 23.75\%    & \notCalculated & \notCalculated & \notCalculated &         400 \\
 CNC\_OPT1                & 24.50\%    & \notCalculated & \notCalculated & \notCalculated &         400 \\
 CNC\_OPT2                & 89.75\%    & \notCalculated & \notCalculated & \notCalculated &         400 \\

## 3. Local Outlier Factor

In [62]:
# Build the two training sets for the Local Outlier Factor, labeled 1 = normal and -1 = anomalous.
lof_training_data_50, lof_training_data_80 = get_training_datasets(train_data, 1, -1)

# Features are all columns but the last; the last column is the integer label.
lof_training_x_50 = lof_training_data_50[:,:-1]
lof_training_y_50 = lof_training_data_50[:,-1].astype(int)

lof_training_x_80 = lof_training_data_80[:,:-1]
lof_training_y_80 = lof_training_data_80[:,-1].astype(int)

In [63]:
# Test data for the Local Outlier Factor: one array per behavior plus all of them stacked into one array.
lof_test_data_dict, lof_test_data_flat = get_test_datasets(test_data, 1, -1)

In [83]:
# Hyperparameter grid for the Local Outlier Factor: 5 neighborhood sizes x 4 search
# algorithms = 20 candidates; metric, p and novelty are fixed to a single value each.
# NOTE(review): this overwrites the `param_grid` of the previous section, so re-running an
# earlier grid search after this cell would silently use this grid.
param_grid = {
    'n_neighbors': [10, 15, 20, 25, 30],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'metric': ['minkowski'],
    'p': [2], 
    'novelty': [True]
}

In [84]:
from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import GridSearchCV

# 5-fold grid search for the Local Outlier Factor on the 50:50 training set, scored by accuracy.
# refit=False: no final model is fitted here; the chosen configuration is refit by hand in a
# later cell.
local_outlier_factor_50 = LocalOutlierFactor(contamination=0.5, novelty=True)
lof_grid_search_50 = GridSearchCV(local_outlier_factor_50, 
                                                 param_grid,
                                                 scoring='accuracy', 
                                                 refit=False,
                                                 cv=5, 
                                                 return_train_score=True,
                                                 verbose=3,
                                                 n_jobs=1)
# Expected log header: Fitting 5 folds for each of 20 candidates, totalling 100 fits
lof_grid_search_result_50 = lof_grid_search_50.fit(lof_training_x_50, lof_training_y_50)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END algorithm=auto, metric=minkowski, n_neighbors=10, novelty=True, p=2;, score=(train=0.535, test=0.470) total time=   0.3s
[CV 2/5] END algorithm=auto, metric=minkowski, n_neighbors=10, novelty=True, p=2;, score=(train=0.529, test=0.452) total time=   0.2s
[CV 3/5] END algorithm=auto, metric=minkowski, n_neighbors=10, novelty=True, p=2;, score=(train=0.544, test=0.665) total time=   0.3s
[CV 4/5] END algorithm=auto, metric=minkowski, n_neighbors=10, novelty=True, p=2;, score=(train=0.533, test=0.778) total time=   0.3s
[CV 5/5] END algorithm=auto, metric=minkowski, n_neighbors=10, novelty=True, p=2;, score=(train=0.570, test=0.878) total time=   0.2s
[CV 1/5] END algorithm=auto, metric=minkowski, n_neighbors=15, novelty=True, p=2;, score=(train=0.549, test=0.487) total time=   0.3s
[CV 2/5] END algorithm=auto, metric=minkowski, n_neighbors=15, novelty=True, p=2;, score=(train=0.547, test=0.475) total time=   0.3s


In [155]:
# Display the ten best hyperparameter configurations for the 50:50 dataset as a LaTeX table
lof_grid_search_result_table_50 = convert_grid_search_result(lof_grid_search_result_50)
display_grid_search_result(lof_grid_search_result_table_50)

\begin{tabular}{lrllrlr}
\toprule
{} &  mean\_validation\_accuracy &  algorithm &     metric &  n\_neighbors &  novelty &  p \\
\midrule
\textbf{1 } &                  0.685214 &      brute &  minkowski &           30 &     True &  2 \\
\textbf{2 } &                  0.685214 &       auto &  minkowski &           30 &     True &  2 \\
\textbf{3 } &                  0.685214 &  ball\_tree &  minkowski &           30 &     True &  2 \\
\textbf{4 } &                  0.685214 &    kd\_tree &  minkowski &           30 &     True &  2 \\
\textbf{5 } &                  0.676214 &       auto &  minkowski &           25 &     True &  2 \\
\textbf{6 } &                  0.676214 &      brute &  minkowski &           25 &     True &  2 \\
\textbf{7 } &                  0.676214 &  ball\_tree &  minkowski &           25 &     True &  2 \\
\textbf{8 } &                  0.676214 &    kd\_tree &  minkowski &           25 &     True &  2 \\
\textbf{9 } &                  0.670000 &       auto &  min

In [156]:
# Testing the best hyperparameter configuration on the 50:50 dataset (top row of the
# grid-search table: brute / minkowski / 30 neighbors / p=2).
# NOTE(review): the labels passed to fit() are presumably ignored by the estimator — see the
# scikit-learn documentation.
best_lof_50 = LocalOutlierFactor(contamination=0.5, novelty=True, algorithm='brute', metric='minkowski', p=2.0, n_neighbors=30)
best_lof_50.fit(lof_training_x_50, lof_training_y_50)
evaluate(best_lof_50, lof_test_data_dict)

\begin{tabular}{lllllr}
\hline
 Behavior                 & Accuracy   & Precision      & Recall         & F1-Score       &   \#Samples \\
\hline
 NORMAL                   & 49.32\%    & \notCalculated & \notCalculated & \notCalculated &        2800 \\
 RANSOMWARE\_POC          & 50.75\%    & \notCalculated & \notCalculated & \notCalculated &         400 \\
 ROOTKIT\_BDVL            & 72.25\%    & \notCalculated & \notCalculated & \notCalculated &         400 \\
 ROOTKIT\_BEURK           & 61.75\%    & \notCalculated & \notCalculated & \notCalculated &         400 \\
 CNC\_THETICK             & 81.25\%    & \notCalculated & \notCalculated & \notCalculated &         400 \\
 CNC\_BACKDOOR\_JAKORITAR & 62.75\%    & \notCalculated & \notCalculated & \notCalculated &         400 \\
 CNC\_OPT1                & 78.00\%    & \notCalculated & \notCalculated & \notCalculated &         400 \\
 CNC\_OPT2                & 68.75\%    & \notCalculated & \notCalculated & \notCalculated &         400 \\

In [105]:
from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import GridSearchCV


# 5-fold grid search for the Local Outlier Factor on the 80:20 training set, scored by accuracy.
# refit=False: no final model is fitted here; the chosen configuration is refit by hand in a
# later cell.
local_outlier_factor_80 = LocalOutlierFactor(contamination=0.2, novelty=True)
lof_grid_search_80 = GridSearchCV(local_outlier_factor_80, 
                                                 param_grid,
                                                 scoring='accuracy', 
                                                 refit=False,
                                                 cv=5, 
                                                 return_train_score=True,
                                                 verbose=3,
                                                 n_jobs=1)

# Expected log header: Fitting 5 folds for each of 20 candidates, totalling 100 fits
lof_grid_search_result_80 = lof_grid_search_80.fit(lof_training_x_80, lof_training_y_80)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END algorithm=auto, metric=minkowski, n_neighbors=10, novelty=True, p=2;, score=(train=0.723, test=0.696) total time=   0.3s
[CV 2/5] END algorithm=auto, metric=minkowski, n_neighbors=10, novelty=True, p=2;, score=(train=0.711, test=0.808) total time=   0.1s
[CV 3/5] END algorithm=auto, metric=minkowski, n_neighbors=10, novelty=True, p=2;, score=(train=0.708, test=0.810) total time=   0.1s
[CV 4/5] END algorithm=auto, metric=minkowski, n_neighbors=10, novelty=True, p=2;, score=(train=0.717, test=0.795) total time=   0.1s
[CV 5/5] END algorithm=auto, metric=minkowski, n_neighbors=10, novelty=True, p=2;, score=(train=0.841, test=0.823) total time=   0.2s
[CV 1/5] END algorithm=auto, metric=minkowski, n_neighbors=15, novelty=True, p=2;, score=(train=0.729, test=0.692) total time=   0.2s
[CV 2/5] END algorithm=auto, metric=minkowski, n_neighbors=15, novelty=True, p=2;, score=(train=0.714, test=0.817) total time=   0.2s


In [157]:
# Display the ten best hyperparameter configurations for the 80:20 dataset as a LaTeX table
lof_grid_search_result_table_80 = convert_grid_search_result(lof_grid_search_result_80)
display_grid_search_result(lof_grid_search_result_table_80)

\begin{tabular}{lrllrlr}
\toprule
{} &  mean\_validation\_accuracy &  algorithm &     metric &  n\_neighbors &  novelty &  p \\
\midrule
\textbf{1 } &                  0.805346 &      brute &  minkowski &           30 &     True &  2 \\
\textbf{2 } &                  0.805346 &       auto &  minkowski &           30 &     True &  2 \\
\textbf{3 } &                  0.805346 &  ball\_tree &  minkowski &           30 &     True &  2 \\
\textbf{4 } &                  0.805346 &    kd\_tree &  minkowski &           30 &     True &  2 \\
\textbf{5 } &                  0.801047 &       auto &  minkowski &           25 &     True &  2 \\
\textbf{6 } &                  0.801047 &      brute &  minkowski &           25 &     True &  2 \\
\textbf{7 } &                  0.801047 &  ball\_tree &  minkowski &           25 &     True &  2 \\
\textbf{8 } &                  0.801047 &    kd\_tree &  minkowski &           25 &     True &  2 \\
\textbf{9 } &                  0.792548 &       auto &  min

In [158]:
# Refit a Local Outlier Factor on the 80:20 training set and report the test metrics per behavior.
# NOTE(review): the grid-search table ranks n_neighbors=30 first (all four algorithms tie), but
# this cell uses n_neighbors=20 — confirm which value is intended.
# NOTE(review): the labels passed to fit() are presumably ignored by the estimator — see the
# scikit-learn documentation.
best_lof_80 = LocalOutlierFactor(contamination=0.2, novelty=True, algorithm='auto', metric='minkowski', n_neighbors=20)
best_lof_80.fit(lof_training_x_80, lof_training_y_80)
evaluate(best_lof_80, lof_test_data_dict)

\begin{tabular}{lllllr}
\hline
 Behavior                 & Accuracy   & Precision      & Recall         & F1-Score       &   \#Samples \\
\hline
 NORMAL                   & 73.61\%    & \notCalculated & \notCalculated & \notCalculated &        2800 \\
 RANSOMWARE\_POC          & 41.00\%    & \notCalculated & \notCalculated & \notCalculated &         400 \\
 ROOTKIT\_BDVL            & 67.75\%    & \notCalculated & \notCalculated & \notCalculated &         400 \\
 ROOTKIT\_BEURK           & 36.50\%    & \notCalculated & \notCalculated & \notCalculated &         400 \\
 CNC\_THETICK             & 72.25\%    & \notCalculated & \notCalculated & \notCalculated &         400 \\
 CNC\_BACKDOOR\_JAKORITAR & 37.75\%    & \notCalculated & \notCalculated & \notCalculated &         400 \\
 CNC\_OPT1                & 54.50\%    & \notCalculated & \notCalculated & \notCalculated &         400 \\
 CNC\_OPT2                & 47.25\%    & \notCalculated & \notCalculated & \notCalculated &         400 \\

## 4. AutoEncoder