In [1]:
import kagglehub
import pandas as pd
import torch
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import numpy as np

scaler = MinMaxScaler()

# get dataset
path = kagglehub.dataset_download("hassan06/nslkdd")
print("Path to dataset files:", path)

train_name, test_name = 'KDDTrain+.arff', 'KDDTest+.arff'

Path to dataset files: /home/joao/.cache/kagglehub/datasets/hassan06/nslkdd/versions/1


In [2]:
# couldn't get arff loading libaries to work, so I'll do it manually
def parse_arff(p):
    with open(p, 'r') as f:
        lines = f.readlines()

        attribute_strings = [l.replace('\n', '').replace('@attribute ', '') for l in lines if l.startswith('@attribute')]

        attributes = {}
        for s in attribute_strings:
            # we only have attribute type real or categorical in this dataset
            att_name = s[0:s.find(' ')].replace('\'', '')
            if s.endswith('real'):
                attributes[att_name] = 'real'
            else:
                attributes[att_name] = eval(s[s.find(' ')+1:])

        data_ind = -1 # data starts at this index
        for i in range(len(lines)):
            if lines[i].find('@data') != -1:
                data_ind = i+1
        
        data = [] 
        positional_attribs = list(attributes.items()) # python 3.7+ guarantees dict order of insertion
        for d in lines[data_ind:]:
            attribs = d.replace('\n', '').split(',')
            row = []
            for i in range(len(attribs)):
                if positional_attribs[i][1] == 'real' or str(positional_attribs[i][1]) == '{\'0\', \'1\'}':
                    row.append(float(attribs[i]))
                else:
                    row.append(str(attribs[i]))
            data.append(row)

    return attributes, data

In [3]:
attributes, train_data = parse_arff(path+'/'+train_name)

train_data_df = pd.DataFrame(train_data, columns=list(attributes.keys()))

# exclude anomalous entries for encoder training
train_data_df = train_data_df[train_data_df['class'] == 'normal']
train_data_df = train_data_df.drop(columns=['class'])

# one hot encode categorical data
train_data_df['service'] = pd.Categorical(train_data_df['service'], categories=attributes['service'])
train_data_df['flag'] = pd.Categorical(train_data_df['flag'], categories=attributes['flag'])
train_data_df = pd.get_dummies(train_data_df, columns=['protocol_type', 'service', 'flag'])

#train_data_df

In [4]:
# before normalizing, let's see if we don't have any huge outliers since we're doing minmax scaling
# we can exclude our hot encoded ones, and also ignore any of the rate features or booleans
columns = list(train_data_df.columns)
excluded_indexes = [i for i in range(columns.index('protocol_type_icmp'), len(columns))]
excluded_indexes.append([])

train_data_np = train_data_df.to_numpy()
for i in range(len(columns)):
    if i not in excluded_indexes:
        col = train_data_np[:, i]
        std = col.std()
        if std != 0 and columns[i].find('rate') == -1: 
            z_col = col/std
            z_col = z_col[(z_col > 6) | (z_col < -6)] # count entries that are beyond 6 z-score range
            print(f'for column {i} ({columns[i]}) std is {std:.2f}:', len(z_col))
            print(f'\t z-score of 6 corresponds to {std*6:.2f}')
    
            # automatically filter from out dataframe
            train_data_df = train_data_df[train_data_df[columns[i]] <= std*6]
        
# investigate by plotting our samples
#plt.scatter(np.linspace(0, 100000, len(train_data_df['duration'])), train_data_df['duration'], s=1)

# maybe log transform some of the fields that have 0 or large values
train_data_df['duration'] = np.log(train_data_df['duration']+0.001)
train_data_df['src_bytes'] = np.log(train_data_df['src_bytes']+0.001)
train_data_df['dst_bytes'] = np.log(train_data_df['dst_bytes']+0.001)

for column 0 (duration) std is 1304.44: 545
	 z-score of 6 corresponds to 7826.65
for column 1 (src_bytes) std is 418110.03: 16
	 z-score of 6 corresponds to 2508660.18
for column 2 (dst_bytes) std is 65462.33: 71
	 z-score of 6 corresponds to 392773.99
for column 3 (land) std is 0.01: 7
	 z-score of 6 corresponds to 0.06
for column 5 (urgent) std is 0.02: 6
	 z-score of 6 corresponds to 0.10
for column 6 (hot) std is 2.31: 520
	 z-score of 6 corresponds to 13.85
for column 7 (num_failed_logins) std is 0.05: 68
	 z-score of 6 corresponds to 0.30
for column 8 (logged_in) std is 0.45: 0
	 z-score of 6 corresponds to 2.72
for column 9 (num_compromised) std is 32.74: 43
	 z-score of 6 corresponds to 196.46
for column 10 (root_shell) std is 0.05: 137
	 z-score of 6 corresponds to 0.27
for column 11 (su_attempted) std is 0.06: 79
	 z-score of 6 corresponds to 0.37
for column 12 (num_root) std is 33.37: 45
	 z-score of 6 corresponds to 200.21
for column 13 (num_file_creations) std is 0.65: 81

In [5]:
print(len(train_data_df))

65162


In [15]:
# normalize
data = scaler.fit_transform(train_data_df)
data.shape

(65162, 122)

In [13]:
# We'll use pytorch to implement our model
# source for architecture: https://pmc.ncbi.nlm.nih.gov/articles/PMC8272075/pdf/sensors-21-04294.pdf
# Apparently, it looks like different depth and hidden layer neuron numbers achieve very similar results on this dataset
# We'll go for a symmetric autoencoder with depth of 5 with 64 neurons on the first hidden layer (subsequent layers divide number of neurons by 2)
class Autoencoder(torch.nn.Module):
    def __init__(self, input_dim, latent_dim):
        super().__init__()
        self.encoder = torch.nn.Sequential(
            torch.nn.Linear(input_dim, 64),
            torch.nn.ReLU(),
            torch.nn.Linear(64, 32),
            torch.nn.ReLU(),
            torch.nn.Linear(32, latent_dim),
            # torch.nn.ReLU() MAYBE?
        )

        self.decoder = torch.nn.Sequential(
            torch.nn.Linear(latent_dim, 32),
            torch.nn.ReLU(),
            torch.nn.Linear(32, 64),
            torch.nn.ReLU(),
            torch.nn.Linear(64, input_dim),
            torch.nn.Sigmoid()
        )

    def forward(self, x):
        return self.decoder(self.encoder(x))

In [20]:
model = Autoencoder(data.shape[1], 3)
model.forward(torch.from_numpy(data[0]).float())

tensor([0.5121, 0.4626, 0.4720, 0.5340, 0.4542, 0.4727, 0.4755, 0.4666, 0.5350,
        0.5373, 0.4960, 0.5084, 0.5191, 0.4951, 0.5448, 0.5674, 0.5186, 0.4893,
        0.4677, 0.4663, 0.5431, 0.5050, 0.4585, 0.5016, 0.4916, 0.4560, 0.5265,
        0.4868, 0.5285, 0.5040, 0.5436, 0.5016, 0.4953, 0.5133, 0.4617, 0.5343,
        0.5017, 0.4994, 0.4599, 0.5280, 0.4831, 0.4723, 0.4829, 0.4633, 0.4612,
        0.4808, 0.4948, 0.4489, 0.4599, 0.4908, 0.5101, 0.4962, 0.4734, 0.5055,
        0.5069, 0.5067, 0.5068, 0.5057, 0.4919, 0.4904, 0.5016, 0.5162, 0.5244,
        0.4988, 0.4849, 0.4495, 0.4814, 0.4917, 0.5354, 0.5167, 0.4880, 0.4826,
        0.5295, 0.4465, 0.4943, 0.5219, 0.5291, 0.5014, 0.4618, 0.5002, 0.5109,
        0.4402, 0.4630, 0.4857, 0.5010, 0.5176, 0.5168, 0.4984, 0.4886, 0.4819,
        0.5289, 0.5505, 0.4936, 0.4985, 0.4611, 0.4874, 0.5482, 0.4771, 0.4716,
        0.4840, 0.5507, 0.5419, 0.5079, 0.5142, 0.5125, 0.5064, 0.5307, 0.5388,
        0.4718, 0.5283, 0.4889, 0.4906, 