# Investigating the State Anomaly Detection

In [1]:
import os
import sys
import torch
import numpy as np
import pandas as pd
from time import time
from scipy import stats
from tabulate import tabulate

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report, accuracy_score

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
from enum import Enum

class Behavior(Enum):
    """Device behaviors that appear in the dataset.

    ``NORMAL`` is the only benign state; every other member names one malware
    behavior.  Member order is significant because the evaluation code
    iterates over the enum to build its per-behavior report.
    """

    # Benign baseline
    NORMAL = "normal"

    # Rootkits
    ROOTKIT_BDVL = "bdvl"
    ROOTKIT_BEURK = "beurk"

    # Command-and-control / backdoor / data-leak behaviors
    CNC_BACKDOOR_JAKORITAR = "backdoor_jakoritar"
    CNC_THETICK = "the_tick"
    CNC_OPT1 = "data_leak_1"
    CNC_OPT2 = "data_leak_2"

    # Ransomware
    RANSOMWARE_POC = "ransomware_poc"


In [2]:
# Load the raw samples from the CSV file in the working directory
dataset = pd.read_csv("dataset-01.csv")

In [71]:
# Report the number of rows (samples) currently held in the dataset
print(f"Full length of dataset: {dataset.shape[0]}")

Full length of dataset: 59004


In [3]:
# Dropping time status features
time_status_columns = ['time', 'timestamp', 'seconds']
# Drop only the columns that are still present.  This keeps the cell safe to
# re-run, removes whatever subset is left when some columns are already gone
# (DataFrame.drop raises and drops nothing if any label is missing), and no
# longer hides unrelated errors behind a bare `except:`.
remaining_time_columns = [column for column in time_status_columns if column in dataset.columns]
if remaining_time_columns:
    dataset.drop(columns=remaining_time_columns, inplace=True)
else:
    print(f"All time status features {(time_status_columns)} are removed from the dataset")

In [4]:
from fast_ml.feature_selection import get_constant_features

# Removing constant features
constant_features = set(get_constant_features(dataset, threshold=0.99, dropna=False)['Var'])
# Restrict the drop to columns that actually exist instead of wrapping it in a
# bare `except:`; any other failure now surfaces instead of being reported as
# "all removed".  Iterating over dataset.columns keeps a deterministic order.
remaining_constant_features = [column for column in dataset.columns if column in constant_features]
if remaining_constant_features:
    dataset.drop(columns=remaining_constant_features, inplace=True)
else:
    print(f"All constant features {(constant_features)} are removed from the dataset")

In [55]:
from sklearn.preprocessing import MinMaxScaler
# Scaling
fit_normal_behavior_only = True
standard_scaling = False
if standard_scaling:
    scaler = StandardScaler()
else:
    scaler = MinMaxScaler()

print(f"Using {scaler}")

# Select the feature columns by name rather than by position: the scaled frame
# is labelled with `columns.drop("behavior")`, so slicing `[:, :-1]` would
# silently scale the label and mislabel columns if "behavior" were not last.
feature_columns = dataset.columns.drop("behavior")

if fit_normal_behavior_only:
    # Fit the scaler on normal samples only (labels are stored as "Behavior.<NAME>" strings)
    fit_samples = dataset.loc[dataset['behavior'] == "Behavior.NORMAL", feature_columns]
else:
    fit_samples = dataset[feature_columns]
scaler.fit(fit_samples.to_numpy())

# Transform every sample with the fitted scaler and re-attach the untouched label column
scaled_dataset = pd.DataFrame(
    scaler.transform(dataset[feature_columns].to_numpy()),
    columns=feature_columns,
    index=dataset.index,
)
scaled_dataset["behavior"] = dataset["behavior"]

Using MinMaxScaler()


In [146]:
# Feature matrix (label column removed) of the scaled samples labelled as normal
X_normal = (
    scaled_dataset
    .loc[scaled_dataset["behavior"] == "Behavior.NORMAL"]
    .drop(columns=["behavior"])
    .to_numpy()
)
X_normal.shape

(14702, 85)

In [156]:
import torch.nn as nn

class RMSELoss(nn.Module):
    """Root-mean-squared-error loss.

    Computes ``sqrt(MSE(yhat, y) + eps)``; ``eps`` is added under the square
    root before it is taken.
    """

    def __init__(self, eps=1e-6):
        super().__init__()
        self.eps = eps
        self.mse = nn.MSELoss()

    def forward(self, yhat, y):
        """Return the RMSE between prediction ``yhat`` and target ``y``."""
        mean_squared_error = self.mse(yhat, y)
        return torch.sqrt(mean_squared_error + self.eps)

class AutoEncoder(torch.nn.Module):
    """Fully connected autoencoder used as a reconstruction-error anomaly detector.

    The network maps ``n_features`` inputs through a 16-unit bottleneck and
    back.  ``pretrain`` fits it on normal samples and then stores the mean and
    standard deviation of the per-sample reconstruction error measured on
    held-out normal samples.  ``predict`` flags a sample as anomalous (1) when
    its error exceeds ``loss_mean + n_std * loss_standard_deviation``.
    """

    def __init__(self, n_features):
        """Build the encoder/decoder stack for inputs with ``n_features`` columns."""
        super().__init__()

        self.model = nn.Sequential(
            nn.Linear(n_features, 64),
            nn.BatchNorm1d(64),
            nn.GELU(),
            nn.Linear(64, 16),
            nn.GELU(),
            nn.Linear(16, 64),
            nn.BatchNorm1d(64),
            nn.GELU(),
            nn.Linear(64, n_features),
            nn.GELU()
        )
        # Not read anywhere in this class; kept so existing attribute access keeps working.
        self.threshold = None
        # Reconstruction-error statistics on normal samples, set by analyze_loss().
        self.loss_mean = None
        self.loss_standard_deviation = None
        # Per-sample reconstruction error: summed squared error over the features.
        self.prediction_loss_function = torch.nn.MSELoss(reduction="sum")

    def forward(self, X):
        """Return the reconstruction of ``X``."""
        return self.model(X)

    def pretrain(self, X_normal, optimizer=torch.optim.SGD, loss_function=torch.nn.MSELoss(reduction='mean'), num_epochs: int = 15, batch_size=64, verbose=False):
        """Train on normal samples and calibrate the reconstruction-error statistics.

        The first half of ``X_normal`` (in the given order) is used for
        training, the second half is passed to ``analyze_loss``.

        Args:
            X_normal: 2-D numpy array of normal samples.
            optimizer: An optimizer instance bound to this model's parameters,
                or an optimizer class, which is then instantiated on
                ``self.parameters()`` with ``lr=1e-3``.
            loss_function: Training objective, called as
                ``loss_function(reconstruction, inputs)``.
            num_epochs: Number of passes over the training half.
            batch_size: Mini-batch size; incomplete final batches are dropped.
            verbose: Print the average training loss after every epoch.
        """
        # The default (and any other optimizer *class*) has to be instantiated
        # first; calling zero_grad()/step() on the class itself raises.
        if isinstance(optimizer, type):
            optimizer = optimizer(self.parameters(), lr=1e-3)

        split_index = int(0.5 * len(X_normal))
        X_train = X_normal[:split_index]
        X_valid = X_normal[split_index:]

        training_dataset = torch.utils.data.TensorDataset(
            torch.from_numpy(X_train).type(torch.float),
        )
        training_data_loader = torch.utils.data.DataLoader(training_dataset, batch_size=batch_size, shuffle=True, drop_last=True)

        epoch_losses = []
        for e in range(num_epochs):
            self.train()
            current_losses = []
            for (inputs,) in training_data_loader:
                optimizer.zero_grad()
                outputs = self.forward(inputs)
                # (prediction, target) order, matching RMSELoss.forward(yhat, y).
                loss = loss_function(outputs, inputs)
                loss.backward()
                optimizer.step()
                current_losses.append(loss.item())

            epoch_losses.append(np.average(current_losses))
            if verbose:
                print(f'Training Loss in epoch {e + 1}: {epoch_losses[e]}')

        self.analyze_loss(X_valid)

    def _require_calibration(self):
        """Raise if the reconstruction-error statistics have not been computed yet."""
        if self.loss_mean is None or self.loss_standard_deviation is None:
            raise RuntimeError(
                "Reconstruction-error statistics are not set; call pretrain() or analyze_loss() first."
            )

    def _decision_threshold(self, n_std):
        """Return ``loss_mean + n_std * loss_standard_deviation``."""
        self._require_calibration()
        return self.loss_mean + n_std * self.loss_standard_deviation

    def _reconstruction_errors(self, x, chunk_size=1024):
        """Return a 1-D float tensor with one reconstruction error per row of ``x``.

        The forward pass runs in chunks in eval mode without gradients; the
        error of each row is then computed with ``self.prediction_loss_function``.
        An empty input yields an empty tensor.
        """
        samples = torch.from_numpy(x).type(torch.float32)
        errors = []

        self.eval()
        with torch.no_grad():
            for start in range(0, len(samples), chunk_size):
                inputs = samples[start:start + chunk_size]
                outputs = self.forward(inputs)
                for output, sample in zip(outputs, inputs):
                    errors.append(self.prediction_loss_function(output, sample))

        if not errors:
            return torch.tensor([])
        # Collect once at the end instead of growing a tensor with torch.cat per sample.
        return torch.stack(errors)

    def analyze_loss(self, X_valid):
        """Store mean and standard deviation of the reconstruction error on ``X_valid``.

        This function uses normal data samples after training the autoencoder
        to determine values that can be considered normal for the
        reconstruction loss.

        Raises:
            ValueError: If ``X_valid`` contains no samples.
        """
        losses = self._reconstruction_errors(X_valid).numpy().astype(np.float64)
        if losses.size == 0:
            raise ValueError("analyze_loss needs at least one validation sample")

        self.loss_mean = losses.mean()
        self.loss_standard_deviation = losses.std()

    def predict(self, x, n_std=3):
        """Return a 1-D long tensor with 1 for anomalous and 0 for normal rows of ``x``.

        A row is anomalous when its reconstruction error is greater than
        ``loss_mean + n_std * loss_standard_deviation``.

        Raises:
            RuntimeError: If the error statistics have not been computed yet.
        """
        threshold = self._decision_threshold(n_std)
        errors = self._reconstruction_errors(x)
        return (errors > threshold).type(torch.long).flatten()

    def predict_deviation(self, x):
        """Return the per-row reconstruction error of ``x`` as a 1-D float tensor."""
        return self._reconstruction_errors(x)

    def score(self):
        """Return the best accuracy found by ``accuracy_score`` on the stored test data.

        Reads ``self.X_test`` / ``self.y_test`` (via ``accuracy_score``).  When
        ``self.verbose`` is set and truthy, the result is printed; if an
        ``evaluation_data`` attribute has been set as well, the per-behavior
        report is printed for the best ``n_std``.
        """
        n_std, accuracy = self.accuracy_score(None, None)
        if getattr(self, "verbose", False):
            print(f"Highest validation accuracy achieved {accuracy:.2f} with n_std={n_std}")
            # evaluate() needs a labelled DataFrame; the class never stores one
            # itself, so only report when the caller attached it.
            evaluation_data = getattr(self, "evaluation_data", None)
            if evaluation_data is not None:
                self.evaluate(evaluation_data, n_std=n_std)
        return accuracy

    def accuracy_score(self, X, y):
        """Search for the ``n_std`` giving the highest accuracy on labelled data.

        Args:
            X: 2-D numpy array of samples; falls back to ``self.X_test`` when None.
            y: Binary labels (1 = anomalous); falls back to ``self.y_test`` when None.

        The candidate values are taken from ``self.n_stds`` when that attribute
        is set, otherwise ``np.arange(0.1, 3, 0.1)`` is used.

        Returns:
            Tuple ``(best_n_std, best_accuracy)``.

        Raises:
            ValueError: If no samples/labels are available.
            RuntimeError: If the error statistics have not been computed yet.
        """
        X_test = X if X is not None else getattr(self, "X_test", None)
        y_test = y if y is not None else getattr(self, "y_test", None)
        if X_test is None or y_test is None:
            raise ValueError(
                "accuracy_score needs samples and labels, either as arguments or as self.X_test / self.y_test"
            )

        n_stds = getattr(self, "n_stds", None)
        if n_stds is None:
            n_stds = np.arange(0.1, 3, 0.1)

        self._require_calibration()

        best_accuracy = 0
        best_n_std = 0
        y_dev = self.predict_deviation(np.asarray(X_test).astype(np.float32))
        for n_std in n_stds:
            threshold = self._decision_threshold(n_std)
            y_pred = (y_dev > threshold).type(torch.long).detach().cpu().numpy()

            # Inside a method this name resolves to the module-level (sklearn)
            # accuracy_score function, not to this method.
            accuracy = accuracy_score(y_test, y_pred)
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_n_std = n_std

        return best_n_std, best_accuracy

    def evaluate(self, evaluation_data, n_std=3, tablefmt='pipe'):
        """Print a per-behavior accuracy table plus global binary metrics.

        Args:
            evaluation_data: DataFrame with the feature columns and a
                ``behavior`` column holding ``"Behavior.<NAME>"`` strings.
            n_std: Number of standard deviations used for the anomaly threshold.
            tablefmt: Table format passed to ``tabulate``.

        Cell texts carry LaTeX escapes and macros so the ``latex_raw`` format
        can be pasted into a document as-is.  A behavior without any sample in
        ``evaluation_data`` is listed with all metrics marked as not calculated.
        """
        results = []
        labels = [0, 1]
        pos_label = 1

        y_true_total = np.empty([0])
        y_pred_total = np.empty([0])
        for behavior in Behavior:
            X_behavior = evaluation_data.loc[evaluation_data['behavior'] == f"Behavior.{behavior.name}"].drop(["behavior"], axis=1).to_numpy()
            # Escaped backslashes produce the same text as before without invalid-escape warnings.
            behavior_label = behavior.name.replace("_", "\\_")
            n_samples = len(X_behavior)
            if n_samples == 0:
                results.append([behavior_label, '\\notCalculated', '\\notCalculated', '\\notCalculated', '\\notCalculated', str(n_samples)])
                continue

            y_true = np.array([0 if behavior == Behavior.NORMAL else 1] * n_samples).astype(int)
            y_true_total = np.concatenate((y_true_total, y_true))

            y_pred = self.predict(X_behavior.astype(np.float32), n_std=n_std).cpu().numpy()
            y_pred_total = np.concatenate((y_pred_total, y_pred))

            accuracy = accuracy_score(y_true, y_pred)
            results.append([behavior_label, f'{(100 * accuracy):.2f}\\%', '\\notCalculated', '\\notCalculated', '\\notCalculated', str(n_samples)])

        accuracy = accuracy_score(y_true_total, y_pred_total)
        precision = precision_score(y_true_total, y_pred_total, average='binary', labels=labels, pos_label=pos_label, zero_division=1)
        recall = recall_score(y_true_total, y_pred_total, average='binary', labels=labels, pos_label=pos_label, zero_division=1)
        f1 = f1_score(y_true_total, y_pred_total, average='binary', labels=labels, pos_label=pos_label, zero_division=1)
        n_samples = len(y_true_total)
        results.append(["GLOBAL", f'{(100 * accuracy):.2f}\\%', f'{(100 * precision):.2f}\\%', f'{(100 * recall):.2f}\\%', f'{(100 * f1):.2f}\\%', n_samples])
        print("-----------")
        print(tabulate(results, headers=["Behavior", "Accuracy", "Precision", "Recall", "F1-Score", "\\#Samples"], tablefmt=tablefmt))

In [157]:
# The network's input/output width equals the number of feature columns in X_normal
N_FEATURES = X_normal.shape[1]
autoencoder = AutoEncoder(n_features=N_FEATURES)

In [158]:
# Train on the normal samples with Adam (lr=1e-4, weight_decay=0.01) and the RMSE objective
autoencoder.pretrain(
    X_normal,
    optimizer=torch.optim.Adam(autoencoder.parameters(), lr=1e-4, weight_decay=0.01),
    loss_function=RMSELoss(),
    num_epochs=100,
    batch_size=64,
    verbose=False,
)

In [159]:
# Evaluate on the complete scaled dataset and print the report as a raw LaTeX table.
# NOTE(review): scaled_dataset also holds the normal samples X_normal was taken from,
# so the NORMAL row appears to include data seen during training — confirm this is intended.
evaluation_data = scaled_dataset
autoencoder.evaluate(
    evaluation_data,
    n_std=0.7,
    tablefmt='latex_raw',
)

-----------
\begin{tabular}{lllllr}
\hline
 Behavior                 & Accuracy   & Precision      & Recall         & F1-Score       &   \#Samples \\
\hline
 NORMAL                   & 95.50\%    & \notCalculated & \notCalculated & \notCalculated &       14702 \\
 ROOTKIT\_BDVL            & 100.00\%   & \notCalculated & \notCalculated & \notCalculated &        5698 \\
 ROOTKIT\_BEURK           & 100.00\%   & \notCalculated & \notCalculated & \notCalculated &        7358 \\
 CNC\_BACKDOOR\_JAKORITAR & 100.00\%   & \notCalculated & \notCalculated & \notCalculated &        4312 \\
 CNC\_THETICK             & 100.00\%   & \notCalculated & \notCalculated & \notCalculated &        7704 \\
 CNC\_OPT1                & 100.00\%   & \notCalculated & \notCalculated & \notCalculated &        5687 \\
 CNC\_OPT2                & 100.00\%   & \notCalculated & \notCalculated & \notCalculated &        4162 \\
 RANSOMWARE\_POC          & 100.00\%   & \notCalculated & \notCalculated & \notCalculated &   