In [1]:
import os
from sklearn import metrics
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 
from sklearn.ensemble import IsolationForest
from sklearn.metrics import f1_score, balanced_accuracy_score, roc_auc_score, precision_recall_fscore_support, classification_report

from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import average_precision_score, matthews_corrcoef
import warnings
warnings.filterwarnings('ignore')

# Import data

In [2]:
print(os.getcwd())
# Resolve the directory two levels above the current working directory;
# the data files are looked up relative to it.
relative_path = os.path.abspath(os.path.join(os.getcwd(), "../../"))

# Read the three labelled splits in a single unpacking
# (order: training, testing, validation).
train_df, test_df, validation_df = (
    pd.read_csv(relative_path + csv_suffix)
    for csv_suffix in (
        '/data/labelled_training_data.csv',
        '/data/labelled_testing_data.csv',
        '/data/labelled_validation_data.csv',
    )
)

c:\Users\klimczak\Desktop\FYP Final Files\notebooks\Preprocessing


# Helpers

In [3]:
def prepare_dataset(df: pd.DataFrame) -> "tuple[pd.DataFrame, pd.Series]":
    """
    Encode the raw columns of a labelled dataset and select the feature columns.

    The encodings follow the preprocessing section of the paper
    https://www.gatsby.ucl.ac.uk/~balaji/udl2021/accepted-papers/UDL2021-paper-033.pdf

    The input frame is not modified: all encodings are applied to a copy of the
    selected columns, so calling this function repeatedly on the same raw frame
    always yields the same result.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing at least the columns ``processId``,
        ``parentProcessId``, ``userId``, ``mountNamespace``, ``eventId``,
        ``argsNum``, ``returnValue``, ``evil`` and ``sus``.

    Returns
    -------
    features : pd.DataFrame
        New frame holding the nine columns listed above (in that order) with
        the encodings applied. ``eventId``, ``argsNum``, ``evil`` and ``sus``
        are passed through unchanged.
    labels : pd.Series
        The ``sus`` column of ``df``.

    Raises
    ------
    KeyError
        If any of the required columns is missing from ``df``.
    """
    selected_columns = [
        "processId", "parentProcessId", "userId", "mountNamespace",
        "eventId", "argsNum", "returnValue", 'evil', 'sus',
    ]
    # Work on an explicit copy so the caller's frame is never mutated.
    features = df[selected_columns].copy()

    # Map to OS/not OS: ids 0, 1 and 2 become 0, everything else 1.
    features["processId"] = features["processId"].map(lambda x: 0 if x in [0, 1, 2] else 1)
    features["parentProcessId"] = features["parentProcessId"].map(lambda x: 0 if x in [0, 1, 2] else 1)
    # Map to OS/not OS: user ids below 1000 become 0, everything else 1.
    features["userId"] = features["userId"].map(lambda x: 0 if x < 1000 else 1)
    # Map to mount access to mnt/ (all non-OS users) / elsewhere:
    # namespace 4026531840 becomes 0, everything else 1.
    features["mountNamespace"] = features["mountNamespace"].map(lambda x: 0 if x == 4026531840 else 1)
    # Map to success (0) / success with value (1) / error (2).
    features["returnValue"] = features["returnValue"].map(lambda x: 0 if x == 0 else (1 if x > 0 else 2))

    labels = df['sus']

    return features, labels

## Apply preprocessing to datasets

In [4]:
# Show (rows, columns) of each raw split, one per line: train, test, validation.
print(train_df.shape, test_df.shape, validation_df.shape, sep="\n")

(763144, 16)
(188967, 16)
(188967, 16)


In [5]:
# Run the shared preprocessing over every split in train / test / validation order.
# NOTE: the raw frame names are rebound to the processed features here, so
# re-running this cell feeds already-processed frames back into prepare_dataset.
(train_df, train_lables), (test_df, test_lables), (validation_df, val_lables) = (
    prepare_dataset(split) for split in (train_df, test_df, validation_df)
)

## Combine datasets

In [6]:
# Stack the processed splits row-wise (train, then test, then validation).
# ignore_index is not set, so each split keeps its own row labels.
full_dataset = pd.concat((train_df, test_df, validation_df), axis="index")

In [7]:
# Display (rows, columns) of the combined frame.
full_dataset.shape

(1141078, 9)

In [8]:
# Preview the first rows of the combined frame.
full_dataset.head()

Unnamed: 0,processId,parentProcessId,userId,mountNamespace,eventId,argsNum,returnValue,evil,sus
0,1,0,0,1,157,5,0,0,1
1,1,0,0,1,3,1,0,0,1
2,1,0,0,1,1010,0,0,0,1
3,1,1,0,0,21,2,2,0,1
4,1,1,0,0,1005,4,0,0,1


## Output CSV

In [9]:
print(os.getcwd())
# Resolve the directory two levels above the current working directory;
# the output folder is located relative to it.
relative_path = os.path.abspath(os.path.join(os.getcwd(), "../../"))
print(os.getcwd())

# DataFrame.to_csv does not create missing parent directories, so make sure
# the target folder exists before writing (no-op when it is already there).
output_dir = os.path.join(relative_path, 'preprocessed_data', 'non_split_dataset')
os.makedirs(output_dir, exist_ok=True)

# index=False: the row labels are not written to the file.
full_dataset.to_csv(os.path.join(output_dir, 'total_subset.csv'), index=False)

c:\Users\klimczak\Desktop\FYP Final Files\notebooks\Preprocessing
c:\Users\klimczak\Desktop\FYP Final Files\notebooks\Preprocessing
