In [8]:
import os
from sklearn import metrics
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 
from sklearn.ensemble import IsolationForest
from sklearn.metrics import f1_score, balanced_accuracy_score, roc_auc_score, precision_recall_fscore_support, classification_report

from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import average_precision_score, matthews_corrcoef
import warnings
warnings.filterwarnings('ignore')

# Import data

In [9]:
print(os.getcwd())
# Resolve the directory two levels above the current working directory;
# the labelled CSV files are read from its `data` sub-folder.
relative_path = os.path.abspath(os.path.join(os.getcwd(), "../../"))
train_df, test_df, validation_df = [
    pd.read_csv(relative_path + csv_suffix)
    for csv_suffix in (
        '/data/labelled_training_data.csv',
        '/data/labelled_testing_data.csv',
        '/data/labelled_validation_data.csv',
    )
]

c:\Users\klimczak\Desktop\FYP Final Files\notebooks\Preprocessing


# Helpers


In [10]:
def prepare_dataset(df: pd.DataFrame) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """
    Encode the raw columns of ``df`` and split the result into features and labels.

    The encoding follows the data processing stated by the author of
    https://www.gatsby.ucl.ac.uk/~balaji/udl2021/accepted-papers/UDL2021-paper-033.pdf

    The input frame is NOT modified: all encoding happens on a copy, so the
    function can be called repeatedly on the same frame (e.g. when a notebook
    cell is re-run) and always yields the same result.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing at least the columns listed in ``feature_columns``
        below (the raw event columns, the six process-group indicator columns
        and the ``sus`` / ``evil`` label columns).

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame]
        ``(features, labels)``. ``features`` holds the columns of
        ``feature_columns`` in that order; ``labels`` holds ``sus`` and ``evil``.
        NOTE(review): ``features`` also contains ``sus`` and ``evil``; this is
        kept so the returned/exported schema does not change, but those two
        columns must be dropped before fitting a model to avoid label leakage.

    Raises
    ------
    KeyError
        If any required column is missing from ``df``.
    """
    feature_columns = [
        "processId", "parentProcessId", "userId", "mountNamespace", "eventId",
        "argsNum", "returnValue", 'System_Core', 'Amazon_AWS',
        'Container_Virtualization', 'Maintenance_Package_Management',
        'Security_Monitoring', 'Utility_Misc', 'sus', 'evil',
    ]
    label_columns = ['sus', 'evil']

    # Work on an independent copy so the caller's frame keeps its raw values.
    features = df[feature_columns].copy()

    features["processId"] = features["processId"].map(lambda x: 0 if x in [0, 1, 2] else 1)  # Map to OS/not OS
    features["parentProcessId"] = features["parentProcessId"].map(lambda x: 0 if x in [0, 1, 2] else 1)  # Map to OS/not OS
    features["userId"] = features["userId"].map(lambda x: 0 if x < 1000 else 1)  # Map to OS/not OS
    features["mountNamespace"] = features["mountNamespace"].map(lambda x: 0 if x == 4026531840 else 1)  # Map to mount access to mnt/ (all non-OS users) /elsewhere
    # eventId and argsNum are passed through with their original values.
    features["returnValue"] = features["returnValue"].map(lambda x: 0 if x == 0 else (1 if x > 0 else 2))  # Map to success/success with value/error

    labels = features[label_columns].copy()

    return features, labels

In [11]:
def one_hot_process_names(dataset, process_name_column='processName'):
    """
    Add one 0/1 indicator column per process group to ``dataset``.

    For every group defined below, a column named after the group is written
    into ``dataset``: 1 where the value in ``process_name_column`` is one of
    the group's process names, 0 otherwise. A name that belongs to no group
    gets 0 in every indicator column.

    The columns are added to ``dataset`` itself (in place) and the same
    object is returned.
    """
    # (indicator column name, process names belonging to that group)
    group_members = (
        ('System_Core', (
            'systemd', 'systemd-logind', 'systemd-journal', 'systemd-user-ru',
            'systemd-resolve', 'systemd-network', 'systemd-timesyn', 'systemd-udevd',
            'dbus-daemon', 'cron', 'kworker/dying', 'kworker/u30:1', 'kworker/u30:0',
            'kworker/u30:2', '(sd-pam)', 'close',
        )),
        ('Amazon_AWS', ('amazon-ssm-agen', 'ssm-agent-worke')),
        ('Container_Virtualization', ('docker', 'dockerd', 'containerd-shim')),
        ('Maintenance_Package_Management', (
            'run-parts', 'atd', 'snapd', 'packagekitd',
            'systemd-tmpfile', '(tmpfiles)',
        )),
        ('Security_Monitoring', ('sshd',)),
        ('Utility_Misc', (
            'sh', 'ps', 'gdbus', 'gmain', 'journal-offline', '(time-dir)',
            '(sd-sync)', 'poweroff',
        )),
    )

    for indicator_column, members in group_members:
        # Bind `members` as a default so each flag function tests its own group.
        dataset[indicator_column] = dataset[process_name_column].apply(
            lambda proc, members=members: int(proc in members)
        )

    return dataset

## Apply preprocessing to datasets

In [12]:
# Show (rows, columns) of the raw train / test / validation frames, one per line.
print(*(raw_frame.shape for raw_frame in (train_df, test_df, validation_df)), sep="\n")

(763144, 16)
(188967, 16)
(188967, 16)


In [13]:
# Add the process-group indicator columns to each split (train, then test, then validation).
train_df, test_df, validation_df = map(
    one_hot_process_names, (train_df, test_df, validation_df)
)

In [14]:
# Encode each split and unpack the (features, labels) pair it returns.
(
    (train_df_feats, train_df_labels),
    (test_df_feats, test_df_labels),
    (val_df_feats, val_df_labels),
) = map(prepare_dataset, (train_df, test_df, validation_df))

In [15]:
# Show (rows, columns) of the encoded feature frames, one per line,
# then preview the first rows of the training features.
print(
    *(feature_frame.shape for feature_frame in (train_df_feats, test_df_feats, val_df_feats)),
    sep="\n",
)
train_df_feats.head()

(763144, 15)
(188967, 15)
(188967, 15)


Unnamed: 0,processId,parentProcessId,userId,mountNamespace,eventId,argsNum,returnValue,System_Core,Amazon_AWS,Container_Virtualization,Maintenance_Package_Management,Security_Monitoring,Utility_Misc,sus,evil
0,1,0,0,1,157,5,0,1,0,0,0,0,0,1,0
1,1,0,0,1,3,1,0,1,0,0,0,0,0,1,0
2,1,0,0,1,1010,0,0,1,0,0,0,0,0,1,0
3,1,1,0,0,21,2,2,0,0,0,0,0,1,1,0
4,1,1,0,0,1005,4,0,0,0,0,0,0,1,1,0


# Output CSVs

In [16]:
print(os.getcwd())
# Resolve the directory two levels above the current working directory and
# write every exported file into one sub-folder of it.
relative_path = os.path.abspath(os.path.join(os.getcwd(), "../../"))
output_dir = os.path.join(relative_path, 'preprocessed_data', 'full_dataset_train_test_val_OHE')

# DataFrame.to_csv does not create missing folders; make sure the target
# exists so the notebook also runs on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)

# Output file name -> frame to export (features and labels for each split).
frames_to_save = {
    'full_train_df_feats_OHE.csv': train_df_feats,
    'full_train_df_labels_OHE.csv': train_df_labels,
    'full_test_df_feats_OHE.csv': test_df_feats,
    'full_test_df_labels_OHE.csv': test_df_labels,
    'full_val_df_feats_OHE.csv': val_df_feats,
    'full_val_df_labels_OHE.csv': val_df_labels,
}
for file_name, frame in frames_to_save.items():
    # index=False: the row index is not written to the file.
    frame.to_csv(os.path.join(output_dir, file_name), index=False)

c:\Users\klimczak\Desktop\FYP Final Files\notebooks\Preprocessing
c:\Users\klimczak\Desktop\FYP Final Files\notebooks\Preprocessing
