# 3. Local Outlier Factor for Anomaly Detection

In [1]:
# Guard flag: the following cell checks it so that the working-directory
# change is applied only once; re-running THIS cell resets the guard.
executed_yet = False

In [2]:
import os

# Climb two levels up from the notebook's folder, but only on the first
# execution: the flag is flipped before moving so a re-run of this cell
# does not climb any further.
if not executed_yet:
    executed_yet = True
    original_working_directory_path = os.getcwd()
    two_levels_up = os.path.join(original_working_directory_path, os.pardir, os.pardir)
    os.chdir(two_levels_up)
    root_working_directory_path = os.getcwd()

# Report both locations so the reader can verify the relative paths used later.
print(f'Original working directory: {original_working_directory_path}')
print(f'Current working directory: {root_working_directory_path}')

Original working directory: /Users/jankreischer/Library/Mobile Documents/com~apple~CloudDocs/Master-Thesis/Code/prototypes/prototype_03
Current working directory: /Users/jankreischer/Library/Mobile Documents/com~apple~CloudDocs/Master-Thesis/Code


In [3]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
# Standard Dependencies
import sys
import os
import numpy as np
from time import time

In [5]:
# Global Dependencies
from src.functions import calculate_balance_metrics
from src.custom_types import Behavior, MTDTechnique,  Execution, Evaluation, actions, mitigated_by, normal_afterstates
from src.data_provider import DataProvider
from src.autoencoder import AutoEncoder, RMSELoss
from src.functions import convert_grid_search_result, display_grid_search_result

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
#from prototypes.prototype_02.agent import Agent
#from prototypes.prototype_02.client import Client
#from prototypes.prototype_02.server import Server
#from prototypes.prototype_02.experiment import Experiment

In [6]:
import pandas as pd
# Load both sample files (paths are relative to the current working directory)
# and report how many rows each one contributes.
decision_states_dataset = pd.read_csv('prototypes/prototype_03/dataset-02_decision-state-samples.csv')
print(len(decision_states_dataset))
after_states_dataset = pd.read_csv('prototypes/prototype_03/dataset-02_after-state-samples.csv')
print(len(after_states_dataset))
# ignore_index=True gives the stacked frame a unique 0..n-1 index. Without it
# both halves keep their own row labels, so every label appears twice and any
# later index-aligned assignment or .loc lookup becomes ambiguous.
dataset = pd.concat([decision_states_dataset, after_states_dataset], axis=0, ignore_index=True)

17332
60549


In [7]:
# Show the distinct values of the "mtd" column in the decision-state samples.
decision_states_dataset["mtd"].unique()

array(['None'], dtype=object)

In [8]:
# 3 Status Features
time_status_columns = ['time', 'timestamp', 'seconds']
# Drop only the columns that are still present. This keeps the cell re-runnable
# without a bare `except:` (which would also hide unrelated errors) and still
# removes the remaining columns when only some of them are missing.
remaining_time_columns = [column for column in time_status_columns if column in dataset.columns]
if remaining_time_columns:
    dataset.drop(columns=remaining_time_columns, inplace=True)
else:
    print("All time status features are removed from the dataset")
assert len(dataset.columns) == 99, (
    f"expected 99 columns after removing the time status features, found {len(dataset.columns)}"
)

In [9]:
from fast_ml.feature_selection import get_constant_features

# Names of the features reported as (quasi-)constant by fast_ml for threshold=0.99.
constant_features = set(get_constant_features(dataset, threshold=0.99, dropna=False)['Var'])
print(constant_features)
# Pass an ordered list (not a set) of the columns that actually exist, so the
# drop cannot fail on a missing label and no bare `except:` is needed to make
# the cell re-runnable.
removable_constant_features = [column for column in dataset.columns if column in constant_features]
if removable_constant_features:
    dataset.drop(columns=removable_constant_features, inplace=True)
else:
    print("All constant features are removed from the dataset")
print(dataset.shape)

{'cpuHardIrq', 'connectivity', 'alarmtimer:alarmtimer_start', 'cpuNice', 'dma_fence:dma_fence_init', 'alarmtimer:alarmtimer_fired', 'udp:udp_fail_queue_rcv_skb', 'cachefiles:cachefiles_create', 'clk:clk_set_rate', 'cachefiles:cachefiles_mark_active', 'tasksStopped', 'cachefiles:cachefiles_lookup'}
(77881, 87)


In [10]:
# (behavior, mtd) label pairs, spelled exactly as they appear in the dataset's
# "behavior" and "mtd" columns; `is_normal` labels a sample as normal when its
# pair is listed here.
normal_afterstate_strings = [
    (f"Behavior.{behavior_name}", f"MTDTechnique.{mtd_name}")
    for behavior_name, mtd_name in (
        ("ROOTKIT_BDVL", "ROOTKIT_SANITIZER"),
        ("ROOTKIT_BEURK", "ROOTKIT_SANITIZER"),
        ("RANSOMWARE_POC", "RANSOMWARE_DIRTRAP"),
        ("RANSOMWARE_POC", "RANSOMWARE_FILE_EXT_HIDE"),
        ("CNC_BACKDOOR_JAKORITAR", "CNC_IP_SHUFFLE"),
        ("CNC_THETICK", "CNC_IP_SHUFFLE"),
        ("CNC_OPT1", "CNC_IP_SHUFFLE"),
        ("CNC_OPT2", "CNC_IP_SHUFFLE"),
    )
]

In [11]:
# Show the distinct values of the "behavior" column in the combined dataset.
dataset["behavior"].unique()

array(['Behavior.NORMAL', 'Behavior.RANSOMWARE_POC',
       'Behavior.ROOTKIT_BDVL', 'Behavior.CNC_BACKDOOR_JAKORITAR',
       'Behavior.ROOTKIT_BEURK', 'Behavior.CNC_THETICK',
       'Behavior.CNC_OPT1', 'Behavior.CNC_OPT2'], dtype=object)

In [13]:
# Label values used throughout the notebook: +1 = normal, -1 = abnormal.
normal_label, abnormal_label = 1, -1


def is_normal(sample):
    """Return the normal/abnormal label for one dataset row.

    Args:
        sample: Object exposing ``behavior`` and ``mtd`` attributes
            (e.g. a DataFrame row).

    Returns:
        ``normal_label`` if the behavior is ``"Behavior.NORMAL"`` or the
        (behavior, mtd) string pair is listed in ``normal_afterstate_strings``;
        ``abnormal_label`` otherwise.
    """
    behavior, mtd = sample.behavior, sample.mtd
    if behavior == "Behavior.NORMAL":
        return normal_label
    if (str(behavior), str(mtd)) in normal_afterstate_strings:
        return normal_label
    return abnormal_label

In [14]:
# Label every row via is_normal and store the result in a new 'is_normal' column.
dataset['is_normal'] = dataset.apply(is_normal, axis=1)

In [15]:
# StandardScaler is imported as well: the `standard_scaling = True` branch
# below would otherwise fail with a NameError.
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Scaling
label_columns = ["behavior", "mtd", "is_normal"]
fit_normal_behavior_only = True
standard_scaling = False
if standard_scaling:
    scaler = StandardScaler()
else:
    scaler = MinMaxScaler()

print(f"Using {scaler}")

# Pick the feature columns by name rather than by position ([:, :-3]), so the
# scaling does not depend on the label columns being the last three.
feature_columns = dataset.columns.drop(label_columns)

# Optionally fit the scaler on samples labeled normal only; the transform
# below is applied to every sample either way.
if fit_normal_behavior_only:
    df = dataset[dataset['is_normal'] == normal_label]
else:
    df = dataset

scaler.fit(df[feature_columns].to_numpy())

scaled_dataset = pd.DataFrame(
    scaler.transform(dataset[feature_columns].to_numpy()),
    columns=feature_columns,
    index=dataset.index,
)
# Copy the label columns positionally (row order is unchanged), which stays
# correct even if the index contains duplicate labels.
scaled_dataset["behavior"] = dataset["behavior"].to_numpy()
scaled_dataset["mtd"] = dataset["mtd"].to_numpy()
scaled_dataset["is_normal"] = dataset["is_normal"].to_numpy()

Using MinMaxScaler()


In [16]:
from sklearn.model_selection import train_test_split

# Fixed seed so both halves contain the same rows on every run; without it
# each kernel restart produces a different split and different downstream
# numbers.
split_random_state = 42
rl_dataset, ad_dataset = train_test_split(
    scaled_dataset,
    train_size=0.5,
    shuffle=True,
    random_state=split_random_state,
)
print(len(rl_dataset))
print(len(ad_dataset))

38940
38941


In [18]:
# For every (behavior, mtd) combination, including mtd == "None", print which
# labels occur and how many samples there are, then print the grand total.
n_total = 0
mtd_options = ["None", *MTDTechnique]
for behavior in Behavior:
    behavior_mask = scaled_dataset['behavior'] == str(behavior)
    for mtd in mtd_options:
        mtd_mask = scaled_dataset['mtd'] == str(mtd)
        behavior_samples = scaled_dataset.loc[behavior_mask & mtd_mask]
        n_total += len(behavior_samples)
        print(f"{behavior}, {mtd} : labeled {behavior_samples['is_normal'].unique()} ({len(behavior_samples)} samples)")
print(f"Contains a total of {n_total} samples.")

Behavior.NORMAL, None : labeled [1] (4178 samples)
Behavior.NORMAL, MTDTechnique.CNC_IP_SHUFFLE : labeled [1] (2031 samples)
Behavior.NORMAL, MTDTechnique.ROOTKIT_SANITIZER : labeled [1] (1971 samples)
Behavior.NORMAL, MTDTechnique.RANSOMWARE_DIRTRAP : labeled [1] (2084 samples)
Behavior.NORMAL, MTDTechnique.RANSOMWARE_FILE_EXT_HIDE : labeled [1] (1971 samples)
Behavior.ROOTKIT_BDVL, None : labeled [-1] (1658 samples)
Behavior.ROOTKIT_BDVL, MTDTechnique.CNC_IP_SHUFFLE : labeled [-1] (657 samples)
Behavior.ROOTKIT_BDVL, MTDTechnique.ROOTKIT_SANITIZER : labeled [1] (1995 samples)
Behavior.ROOTKIT_BDVL, MTDTechnique.RANSOMWARE_DIRTRAP : labeled [-1] (1392 samples)
Behavior.ROOTKIT_BDVL, MTDTechnique.RANSOMWARE_FILE_EXT_HIDE : labeled [-1] (624 samples)
Behavior.ROOTKIT_BEURK, None : labeled [-1] (2012 samples)
Behavior.ROOTKIT_BEURK, MTDTechnique.CNC_IP_SHUFFLE : labeled [-1] (1975 samples)
Behavior.ROOTKIT_BEURK, MTDTechnique.ROOTKIT_SANITIZER : labeled [1] (2081 samples)
Behavior.ROOTKI

In [20]:
# Count normal and abnormal samples in ad_dataset and check that the two
# labels account for every row.
ad_labels = ad_dataset["is_normal"]
n_normal_ad_dataset = int((ad_labels == normal_label).sum())
n_abnormal_ad_dataset = int((ad_labels == abnormal_label).sum())
n_total = len(ad_dataset)

print(n_normal_ad_dataset, n_abnormal_ad_dataset, n_total, sep="\n")
assert n_normal_ad_dataset + n_abnormal_ad_dataset == n_total

# Fraction of abnormal samples in ad_dataset.
nu = n_abnormal_ad_dataset / n_total
print(nu)

14242
24699
38941
0.6342672247759431


### Functions

In [226]:
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

def evaluate(model, data_dict, tablefmt='latex_raw'):
    """Print a per-behavior and a global classification report for `model`.

    Args:
        model: Object whose ``predict`` returns one label per input row.
        data_dict: Mapping from a behavior key (must expose ``.name``) to a 2-D
            array whose last column holds the true label and whose remaining
            columns are the features passed to ``model.predict``.
        tablefmt: Table format forwarded to ``tabulate``.

    Returns:
        None. The table is printed. Per-behavior rows report accuracy only;
        precision, recall and F1 (positive label 1) are computed once over all
        samples and shown in the final "GLOBAL" row.
    """
    # Imported locally: the visible part of the notebook never imports
    # tabulate, so the final print would otherwise raise a NameError.
    from tabulate import tabulate

    def as_percentage(value):
        # Doubled backslash: emits the literal LaTeX "\%" without relying on
        # an invalid escape sequence.
        return f'{(100 * value):.2f}\\%'

    results = []
    labels = [-1, 1]
    pos_label = 1

    y_true_total = np.empty([0])
    y_pred_total = np.empty([0])
    for behavior, data in data_dict.items():
        y_true = data[:, -1].astype(int)
        y_true_total = np.concatenate((y_true_total, y_true))

        y_pred = model.predict(data[:, :-1].astype(np.float32))
        y_pred_total = np.concatenate((y_pred_total, y_pred))

        accuracy = accuracy_score(y_true, y_pred)

        n_samples = len(y_true)
        results.append([
            behavior.name.replace("_", "\\_"),  # escape underscores for LaTeX
            as_percentage(accuracy),
            '\\notCalculated',
            '\\notCalculated',
            '\\notCalculated',
            str(n_samples),
        ])

    metric_options = dict(average='binary', labels=labels, pos_label=pos_label, zero_division=1)
    accuracy = accuracy_score(y_true_total, y_pred_total)
    precision = precision_score(y_true_total, y_pred_total, **metric_options)
    recall = recall_score(y_true_total, y_pred_total, **metric_options)
    f1 = f1_score(y_true_total, y_pred_total, **metric_options)
    n_samples = len(y_true_total)
    results.append([
        "GLOBAL",
        as_percentage(accuracy),
        as_percentage(precision),
        as_percentage(recall),
        as_percentage(f1),
        n_samples,
    ])
    print(tabulate(
        results,
        headers=["Behavior", "Accuracy", "Precision", "Recall", "F1-Score", "\\#Samples"],
        tablefmt=tablefmt,
    ))

In [50]:
# Features of ad_dataset as a float32 matrix and its is_normal labels as an
# int32 vector.
svm_feature_frame = ad_dataset.drop(columns=["behavior", "mtd", "is_normal"])
svm_training_x = svm_feature_frame.to_numpy().astype(np.float32)
svm_training_y = ad_dataset["is_normal"].to_numpy().astype(np.int32)



## 3. Local Outlier Factor

In [21]:
# Features of ad_dataset as a float32 matrix and its is_normal labels as an
# int32 vector.
lof_feature_frame = ad_dataset.drop(columns=["behavior", "mtd", "is_normal"])
lof_training_x = lof_feature_frame.to_numpy().astype(np.float32)
lof_training_y = ad_dataset["is_normal"].to_numpy().astype(np.int32)

In [None]:
# NOTE(review): neither `get_test_datasets` nor `test_data` is defined or
# imported in the visible part of this notebook — confirm where they come from
# before running this cell on a fresh kernel.
# The arguments 1 and -1 presumably are the normal / abnormal labels — TODO confirm.
lof_test_data_dict, lof_test_data_flat = get_test_datasets(test_data, 1, -1)

In [22]:
# Search space for the LOF grid search: 5 neighborhood sizes x 4 neighbor
# search algorithms; metric, p and novelty each have a single value.
param_grid = dict(
    n_neighbors=list(range(10, 31, 5)),
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
    metric=['minkowski'],
    p=[2],
    novelty=[True],
)

In [24]:
from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import GridSearchCV

# Exhaustive search over param_grid, scored by accuracy against the is_normal
# labels: 2 folds for each of the 20 candidates, 40 fits in total.
# refit=False: no final estimator is refit on the full data afterwards.
local_outlier_factor = LocalOutlierFactor(contamination=0.5, novelty=True)
grid_search_options = dict(
    scoring='accuracy',
    refit=False,
    cv=2,
    return_train_score=True,
    verbose=3,
    n_jobs=1,
)
lof_grid_search = GridSearchCV(local_outlier_factor, param_grid, **grid_search_options)
lof_grid_search_result = lof_grid_search.fit(lof_training_x, lof_training_y)

Fitting 2 folds for each of 20 candidates, totalling 40 fits
[CV 1/2] END algorithm=auto, metric=minkowski, n_neighbors=10, novelty=True, p=2;, score=(train=0.482, test=0.502) total time=   1.1s
[CV 2/2] END algorithm=auto, metric=minkowski, n_neighbors=10, novelty=True, p=2;, score=(train=0.488, test=0.499) total time=   0.9s
[CV 1/2] END algorithm=auto, metric=minkowski, n_neighbors=15, novelty=True, p=2;, score=(train=0.486, test=0.501) total time=   1.0s
[CV 2/2] END algorithm=auto, metric=minkowski, n_neighbors=15, novelty=True, p=2;, score=(train=0.496, test=0.502) total time=   1.0s
[CV 1/2] END algorithm=auto, metric=minkowski, n_neighbors=20, novelty=True, p=2;, score=(train=0.490, test=0.498) total time=   1.0s
[CV 2/2] END algorithm=auto, metric=minkowski, n_neighbors=20, novelty=True, p=2;, score=(train=0.494, test=0.499) total time=   1.0s
[CV 1/2] END algorithm=auto, metric=minkowski, n_neighbors=25, novelty=True, p=2;, score=(train=0.490, test=0.493) total time=   1.1s
[

In [None]:
# Convert the grid-search result with the project helper from src.functions
# and hand the converted table to the matching display helper.
lof_grid_search_result_table = convert_grid_search_result(lof_grid_search_result)
display_grid_search_result(lof_grid_search_result_table)

In [None]:
# Fit one LOF with a fixed configuration and print its evaluation table.
# NOTE(review): these hyperparameters are presumably taken from the grid-search
# table above — confirm.
best_lof_params = dict(
    contamination=0.5,
    novelty=True,
    algorithm='brute',
    metric='minkowski',
    p=2.0,
    n_neighbors=30,
)
best_lof = LocalOutlierFactor(**best_lof_params)
best_lof.fit(lof_training_x, lof_training_y)
evaluate(best_lof, lof_test_data_dict)