In [1]:
import os
from sklearn import metrics
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 
from sklearn.ensemble import IsolationForest
from sklearn.metrics import f1_score, balanced_accuracy_score, roc_auc_score, precision_recall_fscore_support, classification_report

from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import average_precision_score, matthews_corrcoef
import warnings
warnings.filterwarnings('ignore')

# Import data

In [2]:
# Show where the kernel is running, then resolve the project root,
# which sits two directory levels above the working directory.
print(os.getcwd())
relative_path = os.path.abspath(os.path.join(os.getcwd(), "../../"))

# Read the three labelled splits in a fixed order: training, testing, validation.
train_df, test_df, validation_df = (
    pd.read_csv(relative_path + csv_suffix)
    for csv_suffix in (
        '/data/labelled_training_data.csv',
        '/data/labelled_testing_data.csv',
        '/data/labelled_validation_data.csv',
    )
)

a:\Desktop\FYP Final Files\notebooks\Preprocessing


In [4]:
# Distinct process names present in the training split.
train_df["processName"].unique()

array(['close', 'sh', 'run-parts', 'atd', 'systemd-logind', 'systemd',
       'systemd-journal', 'dbus-daemon', '(time-dir)', 'systemd-user-ru',
       'docker', 'systemd-resolve', 'dockerd', 'containerd-shim',
       'amazon-ssm-agen', 'ps', 'cron', 'snapd', 'systemd-network',
       'journal-offline', 'kworker/dying', 'ssm-agent-worke',
       'packagekitd', 'gdbus', 'gmain', '(tmpfiles)', 'systemd-tmpfile',
       'kworker/u30:1', 'poweroff', '(sd-sync)', 'kworker/u30:0',
       'kworker/u30:2', 'systemd-timesyn', 'systemd-udevd', '(sd-pam)',
       'sshd'], dtype=object)

In [5]:
# Distinct process names present in the validation split.
validation_df["processName"].unique()

array(['systemd-resolve', 'systemd-network', 'systemd', 'sshd',
       'systemd-journal', 'dbus-daemon', 'systemd-logind',
       'systemd-udevd', 'docker', 'dockerd', 'cron', 'systemd-timesyn',
       '(sd-pam)', '(time-dir)', 'systemd-user-ru', 'containerd-shim',
       'amazon-ssm-agen', 'ps', 'snapd', 'journal-offline',
       'kworker/dying', 'ssm-agent-worke', 'packagekitd', 'gmain',
       'gdbus', 'kworker/u30:2', 'poweroff', '(sd-sync)', 'kworker/u30:3',
       'accounts-daemon', 'acpid', '(activate)'], dtype=object)

In [3]:
# Distinct process names present in the testing split.
test_df["processName"].unique()

array(['systemd-resolve', 'systemd-network', 'systemd', 'sshd',
       'systemd-journal', 'dbus-daemon', 'systemd-logind', 'docker',
       'cron', '(sd-pam)', '(time-dir)', 'systemd-user-ru',
       'containerd-shim', 'amazon-ssm-agen', 'ps', 'snapd',
       'journal-offline', 'kworker/dying', 'ssm-agent-worke',
       'packagekitd', 'gmain', 'gdbus', '(systemd)', '(sd-executor)',
       '(direxec)', '30-systemd-envi', '(ystemctl)', 'systemctl', 'sh',
       'env', 'run-parts', '00-header', 'uname', '10-help-text',
       '50-landscape-sy', 'grep', 'bc', 'cut', 'date', 'landscape-sysin',
       'who', '50-motd-news', 'cat', 'head', 'tr', '85-fwupd',
       '90-updates-avai', 'find', '91-release-upgr', 'lsb_release', 'id',
       'release-upgrade', 'stat', 'expr', '92-unattended-u',
       'update-motd-una', '95-hwe-eol', 'update-motd-hwe',
       'systemd-detect-', 'apt-config', 'dpkg', 'dirname', 'mktemp',
       'hwe-support-sta', 'mv', 'rm', '97-overlayroot', 'egrep', 'sort',
     

# Helpers

In [4]:
def prepare_dataset_no_split(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a new DataFrame with the feature encoding stated by the author of
    https://www.gatsby.ucl.ac.uk/~balaji/udl2021/accepted-papers/UDL2021-paper-033.pdf

    The input frame is never modified: the encoding is applied to a copy of the
    selected columns, so calling this function repeatedly on the same raw frame
    always yields the same result.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the raw columns ``processId``, ``parentProcessId``,
        ``userId``, ``mountNamespace``, ``eventId``, ``argsNum``, ``returnValue``,
        the labels ``sus`` and ``evil``, and the six process-group indicator
        columns (``System_Core`` ... ``Utility_Misc``) that
        ``one_hot_process_names`` adds.

    Returns
    -------
    pd.DataFrame
        A new frame holding exactly the 15 columns listed in ``output_columns``
        below, in that order.

    Raises
    ------
    KeyError
        If any of the required columns is missing from ``df``.
    """
    output_columns = [
        "processId", "parentProcessId", "userId", "mountNamespace", "eventId",
        "argsNum", "returnValue",
        'System_Core', 'Amazon_AWS', 'Container_Virtualization',
        'Maintenance_Package_Management', 'Security_Monitoring', 'Utility_Misc',
        'sus', 'evil',
    ]

    # Fail early, before any work is done, with a message that names the likely cause.
    missing = [column for column in output_columns if column not in df.columns]
    if missing:
        raise KeyError(
            f"{missing} not in index; if the process-group columns are missing, "
            "run one_hot_process_names() on the frame first"
        )

    # Work on a copy of just the needed columns so the caller's frame stays raw.
    # Mutating it in place would make a second call re-map the already-encoded
    # 0/1 ids (both are in {0, 1, 2}) and silently collapse them all to 0.
    prepared = df[output_columns].copy()

    prepared["processId"] = prepared["processId"].map(lambda pid: 0 if pid in (0, 1, 2) else 1)  # Map to OS/not OS
    prepared["parentProcessId"] = prepared["parentProcessId"].map(lambda pid: 0 if pid in (0, 1, 2) else 1)  # Map to OS/not OS
    prepared["userId"] = prepared["userId"].map(lambda uid: 0 if uid < 1000 else 1)  # Map to OS/not OS
    prepared["mountNamespace"] = prepared["mountNamespace"].map(lambda ns: 0 if ns == 4026531840 else 1)  # Map to mount access to mnt/ (all non-OS users) /elsewhere
    # eventId and argsNum are kept as-is.
    prepared["returnValue"] = prepared["returnValue"].map(lambda rv: 0 if rv == 0 else (1 if rv > 0 else 2))  # Map to success/success with value/error

    return prepared

In [5]:
def one_hot_process_names(dataset, process_name_column='processName'):
    """
    Add one 0/1 indicator column per process group to ``dataset``.

    For every group in ``groupings`` a column named after the group is written:
    1 where the row's process name is listed for that group, 0 otherwise. A
    process name that appears in none of the lists gets 0 in all six columns.

    The columns are added to ``dataset`` in place and the same object is returned.

    Parameters
    ----------
    dataset : pd.DataFrame
        Frame that contains the process-name column.
    process_name_column : str, default 'processName'
        Name of the column holding the process names.

    Returns
    -------
    pd.DataFrame
        ``dataset`` itself, with the six group columns added (or overwritten).

    Raises
    ------
    KeyError
        If ``process_name_column`` is not a column of ``dataset``.
    """
    # Defining the groupings
    groupings = {
        'System_Core': ['systemd', 'systemd-logind', 'systemd-journal', 'systemd-user-ru',
                        'systemd-resolve', 'systemd-network', 'systemd-timesyn', 'systemd-udevd',
                        'dbus-daemon', 'cron', 'kworker/dying', 'kworker/u30:1', 'kworker/u30:0',
                        'kworker/u30:2', '(sd-pam)', 'close'],

        'Amazon_AWS': ['amazon-ssm-agen', 'ssm-agent-worke'],

        'Container_Virtualization': ['docker', 'dockerd', 'containerd-shim'],

        'Maintenance_Package_Management': ['run-parts', 'atd', 'snapd', 'packagekitd',
                                           'systemd-tmpfile', '(tmpfiles)'],

        'Security_Monitoring': ['sshd'],

        'Utility_Misc': ['sh', 'ps', 'gdbus', 'gmain', 'journal-offline', '(time-dir)',
                         '(sd-sync)', 'poweroff']
    }

    # Look the name column up once; it is the same for every group.
    process_names = dataset[process_name_column]

    # Create one-hot encoding for each group.
    # Series.isin is a vectorised membership test, which avoids calling a Python
    # lambda per row; the explicit int64 cast keeps the 0/1 dtype stable on every
    # platform and for empty frames.
    for group_name, processes in groupings.items():
        dataset[group_name] = process_names.isin(processes).astype("int64")
    return dataset

## Apply preprocessing to datasets

In [5]:
# Add the process-group indicator columns to every split (train, test, validation order).
train_df, test_df, validation_df = (
    one_hot_process_names(split_frame)
    for split_frame in (train_df, test_df, validation_df)
)

In [14]:
# Encode each split (train, test, validation order).
# prepare_dataset_no_split selects the process-group columns, so the frames must
# already have been passed through one_hot_process_names; otherwise it raises KeyError.
train_df, test_df, validation_df = (
    prepare_dataset_no_split(split_frame)
    for split_frame in (train_df, test_df, validation_df)
)

KeyError: "['System_Core', 'Amazon_AWS', 'Container_Virtualization', 'Maintenance_Package_Management', 'Security_Monitoring', 'Utility_Misc'] not in index"

## Combine datasets

In [7]:
# Stack the three prepared splits row-wise: train first, then test, then validation.
splits_in_order = [train_df, test_df, validation_df]
full_dataset = pd.concat(splits_in_order, axis="index")

In [8]:
# (rows, columns) of the combined frame.
full_dataset.shape

(1141078, 15)

In [9]:
# Preview the first rows of the combined frame.
full_dataset.head()

Unnamed: 0,processId,parentProcessId,userId,mountNamespace,eventId,argsNum,returnValue,System_Core,Amazon_AWS,Container_Virtualization,Maintenance_Package_Management,Security_Monitoring,Utility_Misc,sus,evil
0,1,0,0,1,157,5,0,1,0,0,0,0,0,1,0
1,1,0,0,1,3,1,0,1,0,0,0,0,0,1,0
2,1,0,0,1,1010,0,0,1,0,0,0,0,0,1,0
3,1,1,0,0,21,2,2,0,0,0,0,0,1,1,0
4,1,1,0,0,1005,4,0,0,0,0,0,0,1,1,0


## Output CSV

In [10]:
# Show the working directory, then resolve the project root,
# which sits two directory levels above it.
print(os.getcwd())
relative_path = os.path.abspath(os.path.join(os.getcwd(), "../../"))
print(relative_path)

# DataFrame.to_csv does not create missing parent directories, so make sure the
# target folder exists; exist_ok=True keeps the cell safe to re-run.
output_dir = os.path.join(relative_path, 'preprocessed_data', 'non_split_dataset_OHE')
os.makedirs(output_dir, exist_ok=True)

# index=False: the row index is not written to the file.
full_dataset.to_csv(os.path.join(output_dir, 'total_subset.csv'), index=False)

c:\Users\klimczak\Desktop\FYP Final Files\notebooks\Preprocessing
c:\Users\klimczak\Desktop\FYP Final Files
