# 2. Isolation Forest for State Anomaly Detection

In [11]:
# One-shot flag for the working-directory setup: initialised to False here and
# set to True by the setup cell once it has run.
executed_yet = False

In [3]:
import os

# Move the kernel two levels above the directory the notebook was started in.
# The start directory is captured only once per kernel and the target is always
# derived from it, so re-running this cell (in any order, any number of times)
# never climbs further up and never depends on another cell having run first.
if "original_working_directory_path" not in globals():
    original_working_directory_path = os.getcwd()

os.chdir(os.path.join(original_working_directory_path, "../.."))
root_working_directory_path = os.getcwd()
executed_yet = True  # kept so code that checks this flag keeps working

print(f'Original working directory: {original_working_directory_path}')
print(f'Current working directory: {root_working_directory_path}')

Original working directory: /Users/jankreischer/Library/Mobile Documents/com~apple~CloudDocs/Master-Thesis/Code/prototypes/prototype_03
Current working directory: /Users/jankreischer/Library/Mobile Documents/com~apple~CloudDocs/Master-Thesis/Code


In [60]:
from src.functions import convert_grid_search_result, display_grid_search_result

In [37]:
import pandas as pd
# Load the decision-state and after-state samples (paths are relative to the
# current working directory) and report how many rows each file contributes.
decision_states_dataset = pd.read_csv('prototypes/prototype_03/dataset-02_decision-state-samples.csv')
print(len(decision_states_dataset))
after_states_dataset = pd.read_csv('prototypes/prototype_03/dataset-02_after-state-samples.csv')
print(len(after_states_dataset))
# ignore_index=True gives the stacked frame a fresh 0..n-1 index; without it both
# inputs keep their own row labels and the combined index contains duplicates,
# which makes any later label-based alignment ambiguous.
dataset = pd.concat([decision_states_dataset, after_states_dataset], axis=0, ignore_index=True)

17332
60549


In [38]:
# 3 Status Features
time_status_columns = ['time', 'timestamp', 'seconds']
# errors="ignore" skips columns that are already gone, so the cell is safe to
# re-run, while any other failure still surfaces instead of being swallowed by
# a bare `except:`.
dataset = dataset.drop(columns=time_status_columns, errors="ignore")
assert len(dataset.columns) == 99, (
    f"expected 99 columns after removing the time status features, got {len(dataset.columns)}"
)

In [39]:
from fast_ml.feature_selection import get_constant_features

# Names of the columns that get_constant_features reports (its 'Var' column)
# for threshold=0.99 with NaNs kept (dropna=False).
constant_features = set(get_constant_features(dataset, threshold=0.99, dropna=False)['Var'])
print(constant_features)
# errors="ignore" tolerates columns that were already removed on an earlier run;
# unlike the previous bare `except:`, unrelated errors are no longer hidden.
dataset = dataset.drop(columns=sorted(constant_features), errors="ignore")
print(dataset.shape)

{'cachefiles:cachefiles_lookup', 'connectivity', 'tasksStopped', 'cachefiles:cachefiles_create', 'cachefiles:cachefiles_mark_active', 'cpuHardIrq', 'alarmtimer:alarmtimer_start', 'dma_fence:dma_fence_init', 'clk:clk_set_rate', 'udp:udp_fail_queue_rcv_skb', 'cpuNice', 'alarmtimer:alarmtimer_fired'}
(77881, 87)


In [40]:
# (behavior, MTD technique) string pairs; a sample whose pair appears in this
# list is labelled as normal by is_normal. Grouped by the shared element of
# each pair, order unchanged.
normal_afterstate_strings = [
    # pairs sharing MTDTechnique.ROOTKIT_SANITIZER
    *(
        (behavior, "MTDTechnique.ROOTKIT_SANITIZER")
        for behavior in ("Behavior.ROOTKIT_BDVL", "Behavior.ROOTKIT_BEURK")
    ),
    # pairs sharing Behavior.RANSOMWARE_POC
    *(
        ("Behavior.RANSOMWARE_POC", technique)
        for technique in (
            "MTDTechnique.RANSOMWARE_DIRTRAP",
            "MTDTechnique.RANSOMWARE_FILE_EXT_HIDE",
        )
    ),
    # pairs sharing MTDTechnique.CNC_IP_SHUFFLE
    *(
        (behavior, "MTDTechnique.CNC_IP_SHUFFLE")
        for behavior in (
            "Behavior.CNC_BACKDOOR_JAKORITAR",
            "Behavior.CNC_THETICK",
            "Behavior.CNC_OPT1",
            "Behavior.CNC_OPT2",
        )
    ),
]

In [41]:
# Show the distinct values of the "behavior" column in the combined dataset.
dataset["behavior"].unique()

array(['Behavior.NORMAL', 'Behavior.RANSOMWARE_POC',
       'Behavior.ROOTKIT_BDVL', 'Behavior.CNC_BACKDOOR_JAKORITAR',
       'Behavior.ROOTKIT_BEURK', 'Behavior.CNC_THETICK',
       'Behavior.CNC_OPT1', 'Behavior.CNC_OPT2'], dtype=object)

In [51]:
normal_label = 1
abnormal_label = -1


def is_normal(sample):
    """Return the normal/abnormal label for one sample.

    Args:
        sample: Object exposing `behavior` and `mtd` attributes (e.g. a
            DataFrame row passed by `DataFrame.apply(..., axis=1)`).

    Returns:
        `normal_label` when the behavior is "Behavior.NORMAL" or when the
        (behavior, mtd) pair, compared as strings, is listed in
        `normal_afterstate_strings`; `abnormal_label` otherwise.
    """
    # Read both attributes up front so a sample missing either one fails immediately.
    behavior, mtd = sample.behavior, sample.mtd

    if behavior == "Behavior.NORMAL":
        return normal_label

    state_pair = (str(behavior), str(mtd))
    return normal_label if state_pair in normal_afterstate_strings else abnormal_label

In [52]:
# Label every row with is_normal: normal_label for normal states, abnormal_label otherwise.
dataset['is_normal'] = dataset.apply(is_normal, axis=1)

In [53]:
# StandardScaler is imported as well so that switching `standard_scaling` to
# True does not fail with a NameError.
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Scaling
fit_normal_behavior_only = True
standard_scaling = False
if standard_scaling:
    scaler = StandardScaler()
else:
    scaler = MinMaxScaler()

print(f"Using {scaler}")

# Rows the scaler is fitted on: only the samples labelled normal, or everything.
if fit_normal_behavior_only:
    df = dataset[dataset['is_normal'] == normal_label]
else:
    df = dataset

# Select the feature columns by name instead of slicing off "the last three
# columns" positionally, so the result does not depend on the column order.
non_feature_columns = ["behavior", "mtd", "is_normal"]
feature_columns = dataset.columns.drop(non_feature_columns)

scaler.fit(df[feature_columns].to_numpy())

# The scaler fitted above is applied to every row, whichever subset it was fitted on.
scaled_dataset = pd.DataFrame(
    scaler.transform(dataset[feature_columns].to_numpy()),
    columns=feature_columns,
    index=dataset.index,
)
# Copy the non-feature columns back positionally (row order is unchanged), which
# avoids any label-based alignment between the two frames.
scaled_dataset["behavior"] = dataset["behavior"].to_numpy()
scaled_dataset["mtd"] = dataset["mtd"].to_numpy()
scaled_dataset["is_normal"] = dataset["is_normal"].to_numpy()

Using MinMaxScaler()


In [54]:
from sklearn.model_selection import train_test_split

# Split the scaled samples into two halves: rl_dataset and ad_dataset.
# A fixed random_state makes the shuffled split (and every number derived from
# it further down) reproducible across kernel restarts.
rl_dataset, ad_dataset = train_test_split(
    scaled_dataset,
    train_size=0.5,
    shuffle=True,
    random_state=42,
)
print(len(rl_dataset))
print(len(ad_dataset))

38940
38941


In [55]:
# Count normal and abnormal rows in ad_dataset via boolean masks.
normal_mask = ad_dataset["is_normal"] == normal_label
abnormal_mask = ad_dataset["is_normal"] == abnormal_label

n_normal_ad_dataset = int(normal_mask.sum())
n_abnormal_ad_dataset = int(abnormal_mask.sum())
n_total = ad_dataset.shape[0]

for count in (n_normal_ad_dataset, n_abnormal_ad_dataset, n_total):
    print(count)

# Every row must carry exactly one of the two labels.
assert n_normal_ad_dataset + n_abnormal_ad_dataset == n_total

# Fraction of rows labelled abnormal.
nu = n_abnormal_ad_dataset / n_total
contamination_percent = round(nu*100, 2)
print(f"Contamination level of {contamination_percent}%")

14481
24460
38941
Contamination level of 62.81%


In [56]:
import numpy as np

# Feature matrix: every ad_dataset column except the three non-feature columns, as float32.
isf_feature_frame = ad_dataset.drop(columns=["behavior", "mtd", "is_normal"])
isf_training_x = isf_feature_frame.to_numpy().astype(np.float32)

# Label vector: the is_normal column as int32.
isf_training_y = ad_dataset.loc[:, "is_normal"].to_numpy().astype(np.int32)

In [48]:
# NOTE(review): disabled call. get_test_datasets is not defined in this notebook
# (the recorded run raised NameError), and test_data is not defined in the
# visible cells either; confirm where both should come from before re-enabling.
#isf_test_data_dict, isf_test_data_flat = get_test_datasets(test_data, 1, -1)

NameError: name 'get_test_datasets' is not defined

In [57]:
# Hyperparameter grid for the isolation forest search:
# 10 values of n_estimators x 1 value of max_samples x 10 values of max_features.
param_grid = dict(
    n_estimators=[10 * step for step in range(1, 11)],  # 10, 20, ..., 100
    max_samples=["auto"],
    max_features=np.linspace(0.1, 1.0, 10),  # 10 evenly spaced values from 0.1 to 1.0
)

In [58]:
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import GridSearchCV

# Base estimator for the search; contamination is fixed at 0.5 and the seed at 42.
isolation_forest = IsolationForest(random_state=42, contamination=0.5)

# Exhaustive search over param_grid, scored by accuracy against the 1 / -1
# labels in isf_training_y. refit=False: no final model is refitted on the
# full data after the search.
isf_grid_search = GridSearchCV(
    estimator=isolation_forest,
    param_grid=param_grid,
    scoring='accuracy',
    refit=False,
    cv=2,
    return_train_score=True,
    verbose=3,
    n_jobs=1,
)

# With cv=2 and the 100-candidate grid this runs 2 folds per candidate, 200 fits in total.
isf_grid_search_result = isf_grid_search.fit(isf_training_x, isf_training_y)

Fitting 2 folds for each of 100 candidates, totalling 200 fits
[CV 1/2] END max_features=0.1, max_samples=auto, n_estimators=10;, score=(train=0.529, test=0.520) total time=   0.2s
[CV 2/2] END max_features=0.1, max_samples=auto, n_estimators=10;, score=(train=0.551, test=0.549) total time=   0.2s
[CV 1/2] END max_features=0.1, max_samples=auto, n_estimators=20;, score=(train=0.534, test=0.530) total time=   0.3s
[CV 2/2] END max_features=0.1, max_samples=auto, n_estimators=20;, score=(train=0.548, test=0.551) total time=   0.3s
[CV 1/2] END max_features=0.1, max_samples=auto, n_estimators=30;, score=(train=0.537, test=0.531) total time=   0.4s
[CV 2/2] END max_features=0.1, max_samples=auto, n_estimators=30;, score=(train=0.554, test=0.557) total time=   0.4s
[CV 1/2] END max_features=0.1, max_samples=auto, n_estimators=40;, score=(train=0.529, test=0.523) total time=   0.5s
[CV 2/2] END max_features=0.1, max_samples=auto, n_estimators=40;, score=(train=0.546, test=0.547) total time= 

In [61]:
# Display the best hyperparameter configuration for the 50:50 dataset
isf_grid_search_result_table = convert_grid_search_result(isf_grid_search_result)
display_grid_search_result(isf_grid_search_result_table)

\begin{tabular}{lrrlr}
\toprule
{} &  mean\_validation\_accuracy &  max\_features & max\_samples &  n\_estimators \\
\midrule
\textbf{1 } &                  0.573791 &           0.5 &        auto &            10 \\
\textbf{2 } &                  0.557690 &           0.1 &        auto &           100 \\
\textbf{3 } &                  0.557561 &           0.5 &        auto &            30 \\
\textbf{4 } &                  0.555276 &           0.5 &        auto &            60 \\
\textbf{5 } &                  0.554711 &           1.0 &        auto &           100 \\
\textbf{6 } &                  0.554583 &           0.4 &        auto &            30 \\
\textbf{7 } &                  0.554274 &           0.3 &        auto &            90 \\
\textbf{8 } &                  0.554121 &           0.1 &        auto &            90 \\
\textbf{9 } &                  0.553787 &           0.5 &        auto &           100 \\
\textbf{10} &                  0.553786 &           0.5 &        auto &  

  print(df.head(n_items).to_latex(escape=True, bold_rows=True))


In [None]:
# NOTE(review): disabled final-fit / evaluation sketch. It relies on `evaluate`
# and `isf_test_data_dict`, neither of which is defined in the visible cells;
# confirm their source before re-enabling.
#isolation_forest = IsolationForest(contamination=0.5, random_state=42, max_features=0.7, max_samples='auto', n_estimators=60)
#isolation_forest.fit(isf_training_x, isf_training_y);
#evaluate(isolation_forest, isf_test_data_dict)