# Prototype 02 > Experiment 01

In [3]:
# Mount your google drive in google colab
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

ModuleNotFoundError: No module named 'google.colab'

In [None]:
# Go to base directory
%cd /content/drive/MyDrive/University/Master-Thesis/Code

In [1]:
executed_yet = False

In [2]:
import os

if not executed_yet:
    executed_yet = True
    original_working_directory_path = os.getcwd()
    os.chdir(os.path.join(original_working_directory_path, "../.."))
    root_working_directory_path =  os.getcwd()
    
print(f'Original working directory: {original_working_directory_path}')
print(f'Current working directory: {root_working_directory_path}')

Original working directory: /Users/jankreischer/Library/Mobile Documents/com~apple~CloudDocs/Master-Thesis/Code/prototypes/prototype_03
Current working directory: /Users/jankreischer/Library/Mobile Documents/com~apple~CloudDocs/Master-Thesis/Code


## --- Dependencies ---

In [3]:
# Standard Dependencies
import sys
import os
import numpy as np
from time import time

In [4]:
# Global Dependencies
from src.functions import calculate_balance_metrics
from src.custom_types import Behavior, MTDTechnique, actions, mitigated_by, normal_afterstates
from src.data_provider import DataProvider
from src.enums import Execution, Evaluation
from src.evaluation_utils import plot_learning, seed_random, get_pretrained_agent, evaluate_agent, evaluate_agent_on_afterstates
from src.autoencoder_utils import evaluate_ae_on_afterstates, evaluate_ae_on_no_mtd_behavior, pretrain_ae_model, \
    evaluate_all_ds_as_ae_models, pretrain_all_ds_as_ae_models

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
#from prototypes.prototype_02.agent import Agent
#from prototypes.prototype_02.client import Client
#from prototypes.prototype_02.server import Server
#from prototypes.prototype_02.experiment import Experiment

In [5]:
import pandas as pd
decision_states_dataset = pd.read_csv('prototypes/prototype_03/dataset-02_decision-state-samples.csv')
print(len(decision_states_dataset))
after_states_dataset = pd.read_csv('prototypes/prototype_03/dataset-02_after-state-samples.csv')
print(len(after_states_dataset))
dataset = pd.concat([decision_states_dataset, after_states_dataset], axis=0)

17332
60549


In [6]:
decision_states_dataset["mtd"].unique()

array(['None'], dtype=object)

In [7]:
# 3 Status Features
time_status_columns = ['time', 'timestamp', 'seconds']
try:
    dataset.drop(time_status_columns, inplace=True, axis=1)
except:
    print("All time status features are removed from the dataset")
assert len(dataset.columns) == 99

In [8]:
from fast_ml.feature_selection import get_constant_features

constant_features = set(get_constant_features(dataset, threshold=0.99, dropna=False)['Var'])
print(constant_features)
try:
    dataset.drop(constant_features, inplace=True, axis=1)
except:
    print("All constant features are removed from the dataset")
print(dataset.shape)

{'cachefiles:cachefiles_lookup', 'cachefiles:cachefiles_mark_active', 'dma_fence:dma_fence_init', 'alarmtimer:alarmtimer_fired', 'cachefiles:cachefiles_create', 'tasksStopped', 'connectivity', 'clk:clk_set_rate', 'cpuHardIrq', 'udp:udp_fail_queue_rcv_skb', 'alarmtimer:alarmtimer_start', 'cpuNice'}
(77881, 87)


In [9]:
normal_afterstate_strings = [
    ("Behavior.ROOTKIT_BDVL", "MTDTechnique.ROOTKIT_SANITIZER"),
    ("Behavior.ROOTKIT_BEURK", "MTDTechnique.ROOTKIT_SANITIZER"),
    ("Behavior.RANSOMWARE_POC", "MTDTechnique.RANSOMWARE_DIRTRAP"),
    ("Behavior.RANSOMWARE_POC", "MTDTechnique.RANSOMWARE_FILE_EXT_HIDE"),
    ("Behavior.CNC_BACKDOOR_JAKORITAR", "MTDTechnique.CNC_IP_SHUFFLE"),
    ("Behavior.CNC_THETICK", "MTDTechnique.CNC_IP_SHUFFLE"),
    ("Behavior.CNC_OPT1", "MTDTechnique.CNC_IP_SHUFFLE"),
    ("Behavior.CNC_OPT2", "MTDTechnique.CNC_IP_SHUFFLE"),
]

In [10]:
dataset["behavior"].unique()

array(['Behavior.NORMAL', 'Behavior.RANSOMWARE_POC',
       'Behavior.ROOTKIT_BDVL', 'Behavior.CNC_BACKDOOR_JAKORITAR',
       'Behavior.ROOTKIT_BEURK', 'Behavior.CNC_THETICK',
       'Behavior.CNC_OPT1', 'Behavior.CNC_OPT2'], dtype=object)

In [11]:
def is_normal(sample):  
    behavior = sample.behavior 
    mtd = sample.mtd
    if behavior == "Behavior.NORMAL":
        label = 0
    elif (str(behavior), str(mtd)) in normal_afterstate_strings:
        #print(f"normal afterstate for {behavior} and {mtd}")
        label = 0
    else:
        label = 1
    return label

In [12]:
# Add a label if a state should be considered normal or not
dataset['is_normal'] = dataset.apply(lambda sample: is_normal(sample), axis=1)

In [14]:
from sklearn.model_selection import train_test_split

rl_dataset, ae_dataset = train_test_split(dataset, train_size=0.5, shuffle=True)
print(len(rl_dataset))
print(len(ae_dataset))

38940
38941


In [18]:
for behavior in Behavior:
    for mtd in ["None"] + list(MTDTechnique):
        behavior_samples = rl_dataset.loc[(rl_dataset['behavior'] == str(behavior)) & (rl_dataset['mtd'] == str(mtd))]
        print(f"{behavior}, {mtd} : labeled {behavior_samples['is_normal'].unique()} ({len(behavior_samples)} samples)")

Behavior.NORMAL, None : labeled [0] (2057 samples)
Behavior.NORMAL, MTDTechnique.CNC_IP_SHUFFLE : labeled [0] (973 samples)
Behavior.NORMAL, MTDTechnique.ROOTKIT_SANITIZER : labeled [0] (995 samples)
Behavior.NORMAL, MTDTechnique.RANSOMWARE_DIRTRAP : labeled [0] (1061 samples)
Behavior.NORMAL, MTDTechnique.RANSOMWARE_FILE_EXT_HIDE : labeled [0] (970 samples)
Behavior.ROOTKIT_BDVL, None : labeled [1] (785 samples)
Behavior.ROOTKIT_BDVL, MTDTechnique.CNC_IP_SHUFFLE : labeled [1] (327 samples)
Behavior.ROOTKIT_BDVL, MTDTechnique.ROOTKIT_SANITIZER : labeled [0] (993 samples)
Behavior.ROOTKIT_BDVL, MTDTechnique.RANSOMWARE_DIRTRAP : labeled [1] (662 samples)
Behavior.ROOTKIT_BDVL, MTDTechnique.RANSOMWARE_FILE_EXT_HIDE : labeled [1] (284 samples)
Behavior.ROOTKIT_BEURK, None : labeled [1] (1048 samples)
Behavior.ROOTKIT_BEURK, MTDTechnique.CNC_IP_SHUFFLE : labeled [1] (974 samples)
Behavior.ROOTKIT_BEURK, MTDTechnique.ROOTKIT_SANITIZER : labeled [0] (1075 samples)
Behavior.ROOTKIT_BEURK, MTDT

In [19]:
ae_normal_x = ae_dataset.loc[ae_dataset["is_normal"] == 0].drop(["behavior", "mtd", "is_normal"],  axis=1, inplace=False).to_numpy().astype(np.float32) 
print(ae_normal_x.shape)

(14433, 85)


In [20]:
threshold = int(0.6666*len(ae_normal_x))
ae_train_x = ae_normal_x[:threshold]
ae_valid_x = ae_normal_x[threshold:]
print(ae_train_x.shape)
print(ae_valid_x.shape)

(9621, 85)
(4812, 85)


In [30]:
from torch import nn
import torch
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report, accuracy_score
from tabulate import tabulate


class RMSELoss(nn.Module):
    def __init__(self, eps=1e-6):
        super().__init__()
        self.mse = torch.nn.MSELoss()
        self.eps = eps
        
    def forward(self, yhat, y):
        loss = torch.sqrt(self.mse(yhat,y) + self.eps)
        return loss
    

class AutoEncoder(torch.nn.Module):
    

    def __init__(self, X_valid, evaluation_data, n_stds=[1], n_hidden_1=64, n_hidden_2=32, activation_function=nn.GELU(), batch_size: int = 64, verbose=False):

        super().__init__()
        
        validation_dataset = torch.utils.data.TensorDataset(
            torch.from_numpy(X_valid).type(torch.float),

        )
        self.validation_data_loader = torch.utils.data.DataLoader(validation_dataset, batch_size=1, shuffle=True, drop_last=True)

        self.y_test = evaluation_data[["is_normal"]].to_numpy().astype(np.float32) 
        self.X_test = evaluation_data.drop(["behavior", "mtd", "is_normal"],  axis=1, inplace=False).to_numpy().astype(np.float32) 
        
        self.evaluation_data = evaluation_data
        self.n_stds = n_stds
        
        n_features = X_valid.shape[1]
        
        self.model = nn.Sequential(
            nn.Linear(n_features, n_hidden_1),
            nn.BatchNorm1d(n_hidden_1),
            activation_function,
            nn.Linear(n_hidden_1, n_hidden_2),
            activation_function,
            #nn.Linear(32, 16),
            #activation_function,
            #nn.Linear(16, 32),
            #activation_function,
            nn.Linear(n_hidden_2, n_hidden_1),
            nn.BatchNorm1d(n_hidden_1),
            activation_function,
            nn.Linear(n_hidden_1, n_features),
            activation_function
        )
        self.threshold = None
        self.loss_mean = None
        self.loss_standard_deviation = None
        
        self.verbose = verbose

        
    def forward(self, X):
        return self.model(X)
    
    
    def pretrain(self, X_train, optimizer=torch.optim.SGD, loss_function=torch.nn.MSELoss(reduction='mean'), num_epochs: int = 15, batch_size=64, verbose=False):
        
        training_dataset = torch.utils.data.TensorDataset(
            torch.from_numpy(X_train).type(torch.float),
        )
        training_data_loader = torch.utils.data.DataLoader(training_dataset, batch_size=batch_size, shuffle=True, drop_last=True)

        epoch_losses = []
        #for e in tqdm(range(num_epochs), unit="epoch", leave=False):
        for e in range(num_epochs):
            self.train()
            current_losses = []
            for batch_index, (inputs,) in enumerate(training_data_loader):
                optimizer.zero_grad()
                outputs = self.forward(inputs)
                loss = loss_function(inputs, outputs)
                loss.backward()
                optimizer.step()
                current_losses.append(loss.item())
            
            epoch_losses.append(np.average(current_losses))
            if verbose:
                print(f'Training Loss in epoch {e + 1}: {epoch_losses[e]}')
            
        self.analyze_loss()

    '''
    This function uses normal data samles 
    after training the autoencoder to determine
    values that can be considered normal
    for the reconstruction loss based on normal samples
    '''
    def analyze_loss(self):
        losses = []
        
        self.eval() 
        with torch.no_grad():
            loss_function = torch.nn.MSELoss(reduction='sum')
            for batch_index, (inputs,) in enumerate(self.validation_data_loader):
                outputs = self.forward(inputs)
                loss = loss_function(inputs, outputs)
                losses.append(loss.item())
        
        losses = np.array(losses)

        self.loss_mean = losses.mean()
        self.loss_standard_deviation = losses.std()

        
    def predict(self, x, n_std = 1):
        test_data = torch.utils.data.TensorDataset(
            torch.from_numpy(x).type(torch.float32)
        )
        test_data_loader = torch.utils.data.DataLoader(test_data, batch_size=1, shuffle=False)

        all_predictions = torch.tensor([])  # .cuda()

        self.eval()
        with torch.no_grad():
            ae_loss = torch.nn.MSELoss(reduction="sum")
            for idx, (batch_x,) in enumerate(test_data_loader):
                model_predictions = self.forward(batch_x)
                model_predictions = ae_loss(model_predictions, batch_x).unsqueeze(0)  # unsqueeze as batch_size set to 1
                all_predictions = torch.cat((all_predictions, model_predictions))

        threshold = self.loss_mean + n_std * self.loss_standard_deviation
        all_predictions = (all_predictions > threshold).type(torch.long)
        return all_predictions.flatten()
    
    
    def predict_deviation(self, x):
        test_data = torch.utils.data.TensorDataset(
            torch.from_numpy(x).type(torch.float32)
        )
        test_data_loader = torch.utils.data.DataLoader(test_data, batch_size=1, shuffle=False)

        prediction_errors = torch.tensor([])
        loss_function = torch.nn.MSELoss(reduction="sum")
        
        self.eval()
        with torch.no_grad():
            
            for batch_index, (inputs,) in enumerate(test_data_loader):
                prediction = self.forward(inputs)
                prediction_error = loss_function(inputs, prediction).unsqueeze(0)  # unsqueeze as batch_size set to 1
                prediction_errors = torch.cat((prediction_errors, prediction_error))

        return prediction_errors
    
    
    def score(self):
        n_std, accuracy = self.accuracy_score(None, None)
        if self.verbose:
            print(f">> Highest validation accuracy achieved {accuracy:.2f} with n_std={n_std} <<")
            self.evaluate(n_std)
        return accuracy
    
    
    def accuracy_score(self, X, y):
        #if not self.threshold:
        #loss_mean, loss_standard_deviation = self.analyze_loss(X)
        #n_stds = np.arange(0.1, 3, 0.1)
        if self.loss_mean == None or self.loss_standard_deviation == None:
              #print("accuracy_score_optimized > accurcy_loss()")
              self.analyze_loss()
    
        best_accuracy = 0
        best_n_std = 0
        #accuracies = []
        y_dev = self.predict_deviation((self.X_test).astype(np.float32))
        for n_std in self.n_stds:
            y_true = self.y_test
            threshold = self.loss_mean + n_std * self.loss_standard_deviation
            y_pred = (y_dev > threshold).type(torch.long).detach().cpu().numpy()
            
            accuracy = accuracy_score(y_true, y_pred)
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_n_std = n_std
            #if self.verbose:
            #    print(f"n_std {n_std:.2f} -> accuracy: {accuracy}")

        return best_n_std, best_accuracy
    
    def evaluate(self, n_std=1, tablefmt='pipe'):
        results = []
        headers=["Behavior", "Type", "After MTD", "Accuracy", "\#Samples"]
        
        y_true_total = np.empty([0])
        y_pred_total = np.empty([0])
        
        for behavior in Behavior:
            for mtd in ["None"] + list(MTDTechnique):
                behavior_samples = self.evaluation_data.loc[(self.evaluation_data['behavior'] == str(behavior)) & (self.evaluation_data['mtd'] == str(mtd))]
                
                y_true= behavior_samples[["is_normal"]].to_numpy().flatten().astype(np.float32) 
                #print(y_true.shape)
                #print(y_true_total.shape)
                y_true_total = np.concatenate((y_true_total, y_true))
                
                X_test = behavior_samples.drop(["behavior", "mtd", "is_normal"],  axis=1, inplace=False).to_numpy().astype(np.float32) 
                
                y_pred = self.predict(X_test, n_std=n_std)
                print(f"{behavior}, {mtd} : Predicted {len(y_pred)} for {len(X_test)} given samples")
                y_pred_total = np.concatenate((y_pred_total, y_pred))
                
                accuracy = accuracy_score(y_true, y_pred)
            
                n_samples = len(y_true)
                
                if mtd == 'None':
                    state_type = "Decision"
                else:
                    state_type = "After"
                results.append([behavior.name.replace('_', '\_'), state_type, str(mtd), f'{(100 * accuracy):.2f}\%', str(n_samples)])
                
        print(tabulate(results, headers=headers, tablefmt=tablefmt))

In [33]:
autoencoder = AutoEncoder(ae_valid_x, rl_dataset, n_hidden_1=64, n_hidden_2=32)

In [None]:
autoencoder.pretrain(ae_train_x, optimizer=torch.optim.Adam(autoencoder.parameters(), lr=1e-4,  weight_decay=0.01), loss_function=RMSELoss(), num_epochs=100, batch_size=64, verbose=True)

Training Loss in epoch 1: 275886.25354166667
Training Loss in epoch 2: 275870.97541666665
Training Loss in epoch 3: 275890.76895833336
Training Loss in epoch 4: 275889.43083333335
Training Loss in epoch 5: 275877.80645833333
Training Loss in epoch 6: 275899.6375
Training Loss in epoch 7: 275874.7054166667
Training Loss in epoch 8: 275880.12
Training Loss in epoch 9: 275894.1039583333
Training Loss in epoch 10: 275890.2972916667
Training Loss in epoch 11: 275896.2233333333
Training Loss in epoch 12: 275879.7058333333
Training Loss in epoch 13: 275877.7495833333
Training Loss in epoch 14: 275892.105625
Training Loss in epoch 15: 275898.4889583333
Training Loss in epoch 16: 275885.1820833333
Training Loss in epoch 17: 275884.78770833334
Training Loss in epoch 18: 275889.54270833335
Training Loss in epoch 19: 275881.138125
Training Loss in epoch 20: 275880.411875
Training Loss in epoch 21: 275892.218125
Training Loss in epoch 22: 275875.866875
Training Loss in epoch 23: 275861.18145833333


In [25]:
autoencoder.evaluate(n_std=2, tablefmt='latex_raw')

Behavior.NORMAL, None : Predicted 2057 for 2057 given samples
Behavior.NORMAL, MTDTechnique.CNC_IP_SHUFFLE : Predicted 973 for 973 given samples
Behavior.NORMAL, MTDTechnique.ROOTKIT_SANITIZER : Predicted 995 for 995 given samples
Behavior.NORMAL, MTDTechnique.RANSOMWARE_DIRTRAP : Predicted 1061 for 1061 given samples
Behavior.NORMAL, MTDTechnique.RANSOMWARE_FILE_EXT_HIDE : Predicted 970 for 970 given samples
Behavior.ROOTKIT_BDVL, None : Predicted 785 for 785 given samples
Behavior.ROOTKIT_BDVL, MTDTechnique.CNC_IP_SHUFFLE : Predicted 327 for 327 given samples
Behavior.ROOTKIT_BDVL, MTDTechnique.ROOTKIT_SANITIZER : Predicted 993 for 993 given samples
Behavior.ROOTKIT_BDVL, MTDTechnique.RANSOMWARE_DIRTRAP : Predicted 662 for 662 given samples
Behavior.ROOTKIT_BDVL, MTDTechnique.RANSOMWARE_FILE_EXT_HIDE : Predicted 284 for 284 given samples
Behavior.ROOTKIT_BEURK, None : Predicted 1048 for 1048 given samples
Behavior.ROOTKIT_BEURK, MTDTechnique.CNC_IP_SHUFFLE : Predicted 974 for 974 giv

In [7]:
import numpy as np
import torch 
#from src.autoencoder_utils import initial_autoencoder_architecture
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report, accuracy_score
from tabulate import tabulate

#class AutoEncoder(torch.nn.Module):
class AutoEncoder():
    
    def __init__(self, train_x: np.ndarray,
                 valid_x: np.ndarray,
                 batch_size: int = 64, batch_size_valid=1):
        #super().__init__()
            
        data_train = torch.utils.data.TensorDataset(
            torch.from_numpy(train_x).type(torch.float),
            #torch.from_numpy(train_y).type(torch.float)
        )
        self.data_loader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, shuffle=True, drop_last=True)

        data_valid = torch.utils.data.TensorDataset(
            torch.from_numpy(valid_x).type(torch.float),
            #torch.from_numpy(valid_y).type(torch.float)
        )
        self.validation_data_loader = torch.utils.data.DataLoader(data_valid, batch_size=batch_size_valid, shuffle=True)
        self.validation_losses = []

        n_features = train_x.shape[1]
        print(f"n_features: {n_features}")
        self.model = initial_autoencoder_architecture(n_features)
        print(self.model)
        self.threshold = np.nan
        
        self.threshold = None
        self.loss_mean = None
        self.loss_standard_deviation = None

    def get_model(self):
        return self.model
    
    def forward(self, X):
        return self.model(X)

    def train(self, optimizer=torch.optim.SGD, loss_function=torch.nn.MSELoss(reduction='sum'), num_epochs: int = 15):
        epoch_losses = []
        # for e in tqdm(range(num_epochs), unit="epoch", leave=False):
        for e in range(num_epochs):
            self.model.train()
            current_losses = []
            for batch_idx, (x,) in enumerate(self.data_loader):
                x = x  # x.cuda()
                optimizer.zero_grad()
                model_out = self.model(x)
                loss = loss_function(model_out, x)
                loss.backward()
                optimizer.step()
                current_losses.append(loss.item())
            epoch_losses.append(sum(current_losses) / len(current_losses))
            # print(f'Training Loss in epoch {e + 1}: {epoch_losses[e]}')
        
        self.analyze_loss()

    '''
    This function uses normal data samles 
    after training the autoencoder to determine
    values that can be considered normal
    for the reconstruction loss based on normal samples
    '''
    def analyze_loss(self):
        losses = []

        self.model.eval() 
        with torch.no_grad():
            loss_function = torch.nn.MSELoss(reduction='sum')
            for batch_index, (inputs,) in enumerate(self.validation_data_loader):
                outputs = self.forward(inputs)
                loss = loss_function(inputs, outputs)
                losses.append(loss.item())

        losses = np.array(losses)

        self.loss_mean = losses.mean()
        self.loss_standard_deviation = losses.std()
    
    '''
    def determine_threshold(self, n_std=1) -> float:
        mses = []
        self.model.eval()
        with torch.no_grad():
            loss_function = torch.nn.MSELoss(reduction='sum')
            for batch_idx, (x,) in enumerate(self.validation_data_loader):
                x = x  # x.cuda()
                model_out = self.model(x)
                loss = loss_function(model_out, x)
                mses.append(loss.item())
        mses = np.array(mses)
        self.loss_mean = mses.mean()
        self.loss_standard_deviation = mses.std()
        self.threshold = mses.mean() + n_std * mses.std()
        return self.threshold
    '''

    def predict(self, x, n_std=1):
        test_data = torch.utils.data.TensorDataset(
            torch.from_numpy(x).type(torch.float)
        )
        data_loader = torch.utils.data.DataLoader(test_data, batch_size=1, shuffle=False)

        all_predictions = torch.tensor([])  # .cuda()

        self.model.eval()
        with torch.no_grad():
            ae_loss = torch.nn.MSELoss(reduction="sum")
            for idx, (batch_x,) in enumerate(data_loader):
                batch_x = batch_x  # .cuda()
                model_predictions = self.model(batch_x)

                model_predictions = ae_loss(model_predictions, batch_x).unsqueeze(0)  # unsqueeze as batch_size set to 1
                all_predictions = torch.cat((all_predictions, model_predictions))

        threshold = self.loss_mean + n_std * self.loss_standard_deviation
        # all_predictions = all_predictions.tolist()
        all_predictions = (all_predictions > threshold).type(torch.long)
        return all_predictions.flatten()
    
    def evaluate(self, decision_state_test_data, after_state_test_data, n_std=1, tablefmt='pipe'):
        results = []
        labels= [0,1]
        pos_label = 1
        
        y_true_total = np.empty([0])
        y_pred_total = np.empty([0])
        for behavior, data in decision_state_test_data.items():
            y_true = np.array([0 if behavior == Behavior.NORMAL else 1] * len(data)).astype(int)
            y_true_total = np.concatenate((y_true_total, y_true))

            y_pred = self.predict(data[:, :-1].astype(np.float32), n_std=n_std)
            y_pred_total = np.concatenate((y_pred_total, y_pred))

            accuracy = accuracy_score(y_true, y_pred)

            n_samples = len(y_true)
            results.append(["Decisionstate","None", behavior.name.replace("_", "\_"), f'{(100 * accuracy):.2f}\%', str(n_samples)])

        for (b, m), samples in after_state_test_data.items():
            if b == behavior == Behavior.NORMAL or (b, m) in normal_afterstates:
                true_label = 0
            else:
                true_label = 1
             
            y_true = np.array([true_label] * len(samples)).astype(int) 
            y_true_total = np.concatenate((y_true_total, y_true))
            
            y_pred = self.predict(samples[:, :-2].astype(np.float32), n_std=n_std)
            y_pred_total = np.concatenate((y_pred_total, y_pred))
                
            accuracy = accuracy_score(y_true, y_pred)
            
            n_samples = len(y_true)
            results.append(["Afterstate", m.name.replace("_", "\_"), b.name.replace("_", "\_"), f'{(100 * accuracy):.2f}\%', str(n_samples)])
            
            #print(f"{b} {m}: {value.shape}")
            
        accuracy = accuracy_score(y_true_total, y_pred_total)
        precision = precision_score(y_true_total, y_pred_total, average='binary', labels=labels, pos_label=pos_label, zero_division=1)
        recall = recall_score(y_true_total, y_pred_total, average='binary', labels=labels, pos_label=pos_label, zero_division=1)
        f1 = f1_score(y_true_total, y_pred_total, average='binary', labels=labels, pos_label=pos_label, zero_division=1)
        n_samples = len(y_true_total)
        results.append(["", "", "GLOBAL", f'{(100 * accuracy):.2f}\%', n_samples])
        print("-----------")
        print(tabulate(results, headers=["State", "After MTD", "Behavior", "Accuracy", "\#Samples"], tablefmt=tablefmt))
        
        

In [9]:
def pretrain_ae_model(ae_data, split=0.8, lr=1e-4, momentum=0.9, num_epochs=100, n_std=2.5):
    idx = int(len(ae_data) * split)
    train_ae_x = ae_data[:idx,:].astype(np.float32)
    valid_ae_x = ae_data[idx:,:].astype(np.float32)
    print(f"size train: {train_ae_x.shape}, size valid: {valid_ae_x.shape}")

    print("---Training AE---")
    autoencoder = AutoEncoder(train_x=train_ae_x, valid_x=valid_ae_x)
    autoencoder.train(optimizer=torch.optim.Adam(autoencoder.model.parameters(), lr=lr,  weight_decay=0.01), num_epochs=num_epochs)
    return autoencoder, train_ae_x, valid_ae_x

In [10]:
def pretrain_autoencoder(decision_states_data, ae_train_dict, dir="experiments/experiment_03/trained_models", n_std=1):
    #for key, value in ae_train_dict.items():
    #    print(f"{key}:{len(value)}")
    #"""pretrains autoencoder models on 1. decision state normal,
    #2. on each normal-mtd combination,
    #3. on both decision and normal-mtd combination data"""
    #ae, all_train, all_valid = pretrain_ae_model(decision_states_data)
    #for i, mtd in enumerate(ae_train_dict):
    #    path = f"{dir}/ae_model_{mtd.value}.pth"
    #    ae, train_data, valid_data = pretrain_ae_model(ae_train_dict[mtd][:, :-1], n_std=n_std)
    #    all_train = np.vstack((all_train, train_data))
    #    all_valid = np.vstack((all_valid, valid_data))
    #    # for all afterstate model
    #    if i == 0:
    #        all_as_train, all_as_valid = train_data, valid_data
    #    else:
    #        all_as_train = np.vstack((all_as_train, train_data))
    #        all_as_valid = np.vstack((all_as_valid, valid_data))
    #
    #all_as_data = np.vstack((all_as_train, all_as_valid))
    #all_as_data = np.hstack((all_as_data, np.ones((len(all_as_data), 1))))
    #print("all as data: ", len(all_as_data))
    #pretrain_ae_model(all_as_data, n_std=n_std, num_epochs=100, lr=1e-4)

    #all_data = np.vstack((all_train, all_valid))
    #all_data = np.hstack((all_data, np.ones((len(all_data), 1))))
    #print("all ds/as data: ", len(all_data))
    #print(f"all_data(type): {type(all_data)}")
    decision_states_data_x = decision_states_data[:,:-1]
    n_features = decision_states_data_x.shape[1]
    combined_training_data = np.empty([0, n_features])
    for key, value in ae_train_dict.items():
        combined_training_data = np.vstack([combined_training_data, value[:,:-2]])
    return pretrain_ae_model(combined_training_data, n_std=n_std, num_epochs=100, lr=1e-4)

In [11]:
def split_as_data_for_ae_and_rl(train_data, s=0.3):
    ae_dict = {}
    for mtd in MTDTechnique:
        normal_mtd_train = train_data[(Behavior.NORMAL, mtd)]
        train_data[(Behavior.NORMAL, mtd)] = normal_mtd_train[:int(s * len(normal_mtd_train))]
        ae_dict[mtd] = normal_mtd_train[int(s * len(normal_mtd_train)):]
    return ae_dict, train_data

In [12]:
# read in all preprocessed data for a simulated, supervised environment to sample from
# dtrain, dtest, atrain, atest = DataProvider.get_reduced_dimensions_with_pca_ds_as(DIMS,
#                                                                                   dir=f"{experiment_base_dir}/")
decision_states_training_data_dict, decision_states_test_data_dict, after_states_training_data_dict, after_states_test_data_dict, scaler = DataProvider.get_scaled_scaled_train_test_split_with_afterstates(
    scaling_minmax=True, scale_normal_only=True)

# get splits for RL & AD of normal data
ae_decision_states_training_data, rl_decision_states_training_data_dict = DataProvider.split_ds_data_for_ae_and_rl(decision_states_training_data_dict)
ae_decision_states_training_data = np.vstack((ae_decision_states_training_data, ae_decision_states_training_data)) # upsampling to have equal contribution with afterstates
dims = len(ae_decision_states_training_data[0, :-1])
ae_after_states_training_data_dict, rl_after_states_training_data_dict = DataProvider.split_as_data_for_ae_and_rl(after_states_training_data_dict)

#print(f"dtrain.shape: {len(dtrain)}; dtrain.type: {type(dtrain)}")
#print(f"dtest.shape: {len(dtest)}; dtest.type: {type(dtest)}")
#print(f"atrain.shape: {len(atrain)}; atrain.type: {type(atrain)}")
#print(f"atest.shape: {len(atest)}; atest.type: {type(atest)}")
#print("---")
#print(f"ae_ds_train.shape: {len(ae_ds_train)}; ae_ds_train.type: {type(ae_ds_train)}")
#print(f"dtrain_rl .shape: {len(dtrain_rl)}; dtrain_rl .type: {type(dtrain_rl)}")
#print(f"ae_as_train.shape: {len(ae_as_train)}; ae_as_train.type: {type(ae_as_train)}")
#print(f"atrain_rl.shape: {len(atrain_rl)}; atrain_rl.type: {type(atrain_rl)}")

# MODEL trained on all ds and as normal data assumes the least -> MOST REALISTIC
#autoencoder = pretrain_autoencoder(ae_ds_train, ae_as_train, n_std=2.5)
#evaluate_all_ds_as_ae_models(autoencoder, dtrain_rl, atrain_rl, dims=dims, dir=dir)

getting Behavior.NORMAL
getting Behavior.RANSOMWARE_POC
getting Behavior.ROOTKIT_BDVL
getting Behavior.CNC_BACKDOOR_JAKORITAR
getting Behavior.ROOTKIT_BEURK
getting Behavior.CNC_THETICK
getting Behavior.CNC_OPT1
getting Behavior.CNC_OPT2
(16924, 87)
(60197, 88)


InvalidIndexError: (slice(None, None, None), slice(None, -1, None))

In [20]:
print(ae_decision_states_training_data.shape)

(2716, 87)


In [18]:
# 86 features
n_total = 0
for key, value in decision_states_test_data_dict.items():
    n_total+=value.shape[0]
    print(f"{key}: {value.shape}")
print(n_total)

Behavior.NORMAL: (825, 87)
Behavior.RANSOMWARE_POC: (351, 87)
Behavior.ROOTKIT_BDVL: (321, 87)
Behavior.CNC_BACKDOOR_JAKORITAR: (393, 87)
Behavior.ROOTKIT_BEURK: (392, 87)
Behavior.CNC_THETICK: (291, 87)
Behavior.CNC_OPT1: (406, 87)
Behavior.CNC_OPT2: (405, 87)
3384


In [17]:
# 86 features
n_total = 0
for key, value in decision_states_training_data_dict.items():
    n_total+=value.shape[0]
    print(f"{key}: {value.shape}")
print(n_total)

Behavior.NORMAL: (582, 87)
Behavior.RANSOMWARE_POC: (905, 87)
Behavior.ROOTKIT_BDVL: (799, 87)
Behavior.CNC_BACKDOOR_JAKORITAR: (1025, 87)
Behavior.ROOTKIT_BEURK: (951, 87)
Behavior.CNC_THETICK: (730, 87)
Behavior.CNC_OPT1: (1061, 87)
Behavior.CNC_OPT2: (1075, 87)
7128


In [16]:
# 86 features
n_total = 0
for key, value in rl_decision_states_training_data_dict.items():
    n_total+=value.shape[0]
    print(f"{key}: {value.shape}")
print(n_total)

Behavior.NORMAL: (582, 87)
Behavior.RANSOMWARE_POC: (905, 87)
Behavior.ROOTKIT_BDVL: (799, 87)
Behavior.CNC_BACKDOOR_JAKORITAR: (1025, 87)
Behavior.ROOTKIT_BEURK: (951, 87)
Behavior.CNC_THETICK: (730, 87)
Behavior.CNC_OPT1: (1061, 87)
Behavior.CNC_OPT2: (1075, 87)
7128


In [20]:
for key, value in rl_decision_states_training_data_dict.items():
    print(f"{key}: {value.shape}")

Behavior.NORMAL: (581, 87)
Behavior.RANSOMWARE_POC: (921, 87)
Behavior.ROOTKIT_BDVL: (802, 87)
Behavior.CNC_BACKDOOR_JAKORITAR: (1014, 87)
Behavior.ROOTKIT_BEURK: (929, 87)
Behavior.CNC_THETICK: (716, 87)
Behavior.CNC_OPT1: (1067, 87)
Behavior.CNC_OPT2: (1075, 87)


In [None]:
for key, value in rl_after_states_training_data_dict.items():
    print(f"{key}: {value[:,-2:]}")

In [23]:
print(ae_decision_states_training_data.shape)
print(len(ae_after_states_training_data_dict))

(2716, 87)
4


In [26]:
autoencoder, _, _ = pretrain_autoencoder(ae_decision_states_training_data, ae_after_states_training_data_dict, n_std=2.5)

size train: (2256, 86), size valid: (565, 86)
---Training AE---
n_features: 86
Sequential(
  (0): Linear(in_features=86, out_features=64, bias=True)
  (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): GELU(approximate='none')
  (3): Linear(in_features=64, out_features=32, bias=True)
  (4): GELU(approximate='none')
  (5): Linear(in_features=32, out_features=16, bias=True)
  (6): GELU(approximate='none')
  (7): Linear(in_features=16, out_features=8, bias=True)
  (8): GELU(approximate='none')
  (9): Linear(in_features=8, out_features=16, bias=True)
  (10): GELU(approximate='none')
  (11): Linear(in_features=16, out_features=32, bias=True)
  (12): GELU(approximate='none')
  (13): Linear(in_features=32, out_features=64, bias=True)
  (14): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (15): GELU(approximate='none')
  (16): Linear(in_features=64, out_features=86, bias=True)
  (17): GELU(approximate='none')
)


In [27]:
autoencoder.evaluate(rl_decision_states_training_data_dict, rl_after_states_training_data_dict, n_std=3, tablefmt='pipe')

-----------
| State         | After MTD                   | Behavior                 | Accuracy   |   \#Samples |
|:--------------|:----------------------------|:-------------------------|:-----------|------------:|
| Decisionstate | None                        | NORMAL                   | 99.14\%    |         582 |
| Decisionstate | None                        | RANSOMWARE\_POC          | 100.00\%   |         905 |
| Decisionstate | None                        | ROOTKIT\_BDVL            | 100.00\%   |         799 |
| Decisionstate | None                        | CNC\_BACKDOOR\_JAKORITAR | 40.68\%    |        1025 |
| Decisionstate | None                        | ROOTKIT\_BEURK           | 1.16\%     |         951 |
| Decisionstate | None                        | CNC\_THETICK             | 100.00\%   |         730 |
| Decisionstate | None                        | CNC\_OPT1                | 99.81\%    |        1061 |
| Decisionstate | None                        | CNC\_OPT2             