In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler

# Fetch data: 10% labeled data

### 1. Download

In [3]:
df = pd.read_csv('../../data/kddcup.data_10_percent.gz', header=None)
cols = pd.read_csv('../../data/kddcup.names',header=None)

### 2. Add column names to DataFrame

In [4]:
if cols[0][0] == 'back':
    cols = cols.drop(cols.index[0])
    cols.reset_index(drop=True, inplace=True)

cols = cols.dropna(axis=1)
cols.head(3)

Unnamed: 0,0
0,duration: continuous.
1,protocol_type: symbolic.
2,service: symbolic.


In [5]:
# split merged column names (name:type --> name | type)
cols[[0,1]] = cols[0].str.split(':',expand = True)

cols.head(3)

Unnamed: 0,0,1
0,duration,continuous.
1,protocol_type,symbolic.
2,service,symbolic.


In [6]:
# add column names to DataFrame
names = cols[0].tolist()
names.append('label')
df.columns = names
df.columns

Index(['duration', 'protocol_type', 'service', 'flag', 'src_bytes',
       'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
       'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
       'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
       'num_access_files', 'num_outbound_cmds', 'is_host_login',
       'is_guest_login', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate', 'label'],
      dtype='object')

In [7]:
df.head(3)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,http,SF,181,5450,0,0,0,0,...,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,239,486,0,0,0,0,...,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,235,1337,0,0,0,0,...,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.


### 3. Rename y-labels and show their occurances

In [8]:
df['label'] = df['label'].str.replace('.', '', regex=False)
df.groupby(['label']).size().sort_values()

label
spy                     2
perl                    3
phf                     4
multihop                7
ftp_write               8
loadmodule              9
rootkit                10
imap                   12
warezmaster            20
land                   21
buffer_overflow        30
guess_passwd           53
nmap                  231
pod                   264
teardrop              979
warezclient          1020
portsweep            1040
ipsweep              1247
satan                1589
back                 2203
normal              97278
neptune            107201
smurf              280790
dtype: int64

In [9]:
for column in df.select_dtypes(include=[float, int]).columns:
    print(f'{df[column].dtype}\t\t{len(df[column].unique())}\t\t\tdf[{column}]')

int64		2495			df[duration]
int64		3300			df[src_bytes]
int64		10725			df[dst_bytes]
int64		2			df[land]
int64		3			df[wrong_fragment]
int64		4			df[urgent]
int64		22			df[hot]
int64		6			df[num_failed_logins]
int64		2			df[logged_in]
int64		23			df[num_compromised]
int64		2			df[root_shell]
int64		3			df[su_attempted]
int64		20			df[num_root]
int64		18			df[num_file_creations]
int64		3			df[num_shells]
int64		7			df[num_access_files]
int64		1			df[num_outbound_cmds]
int64		1			df[is_host_login]
int64		2			df[is_guest_login]
int64		490			df[count]
int64		470			df[srv_count]
float64		92			df[serror_rate]
float64		51			df[srv_serror_rate]
float64		77			df[rerror_rate]
float64		51			df[srv_rerror_rate]
float64		99			df[same_srv_rate]
float64		78			df[diff_srv_rate]
float64		64			df[srv_diff_host_rate]
int64		256			df[dst_host_count]
int64		256			df[dst_host_srv_count]
float64		101			df[dst_host_same_srv_rate]
float64		101			df[dst_host_diff_srv_rate]
float64		101			df[dst_host_same_src_por

# Fetch Data: Attack Type 
Summarize labels (i. e., attack types)

### 1. Download Attack Types 

In [10]:
df_attack_types = pd.read_csv('../../data/training_attack_types')
df_attack_types

Unnamed: 0,back dos
0,buffer_overflow u2r
1,ftp_write r2l
2,guess_passwd r2l
3,imap r2l
4,ipsweep probe
5,land dos
6,loadmodule u2r
7,multihop r2l
8,neptune dos
9,nmap probe


### 2. Split columns 
fetched data contains two features in one column, so split it

In [11]:
df_temp = pd.DataFrame(columns=['Attack','Type'])


df_temp[['Attack','Type']] = df_attack_types['back dos'].str.split(' ', expand=True)

row_normal = pd.DataFrame({'Attack': 'normal', 'Type':'normal'}, index=[0]) # add normal to attacks
df_temp = pd.concat([df_temp, row_normal], ignore_index=True)

df_temp

Unnamed: 0,Attack,Type
0,buffer_overflow,u2r
1,ftp_write,r2l
2,guess_passwd,r2l
3,imap,r2l
4,ipsweep,probe
5,land,dos
6,loadmodule,u2r
7,multihop,r2l
8,neptune,dos
9,nmap,probe


### 3. Add column 'Attack Type' to df 

In [12]:
df['Attack Type'] = df['label'].map(df_temp.set_index('Attack')['Type'])

### 4. Rearrange columns

In [13]:
cols = list(df.columns)
cols.insert(0, cols.pop(cols.index('Attack Type')))
df = df.loc[:, cols]
cols.insert(1, cols.pop(cols.index('label')))
df = df.loc[:, cols]

### 5. Show Example Data

In [14]:
# Show entry from each class
df_example = pd.concat([
    df[df['Attack Type'] == 'normal'].head(1),
    df[df['Attack Type'] == 'u2r'].head(1), 
    df[df['Attack Type'] == 'dos'].head(1),
    df[df['Attack Type'] == 'r2l'].head(1),
    df[df['Attack Type'] == 'probe'].head(1),
])

df_example

Unnamed: 0,Attack Type,label,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,normal,normal,0,tcp,http,SF,181,5450,0,0,...,9,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0
744,u2r,buffer_overflow,184,tcp,telnet,SF,1511,2957,0,0,...,1,3,1.0,0.0,1.0,0.67,0.0,0.0,0.0,0.0
7601,dos,neptune,0,tcp,telnet,S0,0,0,0,0,...,5,6,1.0,0.0,0.2,0.33,1.0,0.83,0.0,0.0
15699,r2l,guess_passwd,23,tcp,telnet,SF,104,276,0,0,...,1,2,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
22814,probe,portsweep,1,tcp,private,RSTR,0,0,0,0,...,178,2,0.01,0.04,0.04,0.0,0.01,0.0,0.32,1.0


# Data Wrangling

### 1.  Column types (int -> str)
Some columns are categorical (0,1) but due to the import they are considered to be numerical 

In [15]:
cols_categorical = ['protocol_type', 'service', 'flag', 'land', 'logged_in', 'is_host_login', 'is_guest_login']
df[cols_categorical] = df[cols_categorical].astype(str)
len(df.select_dtypes(exclude=[float, int]).columns ) , df.select_dtypes(exclude=[float, int]).columns 

(9,
 Index(['Attack Type', 'label', 'protocol_type', 'service', 'flag', 'land',
        'logged_in', 'is_host_login', 'is_guest_login'],
       dtype='object'))

### 2. Standardization of numerical values (mean = 0, std. dev. = 1)

In [16]:
# Select non-numerical columns
non_numerical_cols = df.select_dtypes(exclude=[float, int]).columns

# Select numerical columns
numerical_cols = df.select_dtypes(include=[float, int]).columns

# Perform standardization
scaler = StandardScaler(with_mean=True, with_std=True)
df_standardized = df.copy()
df_standardized[numerical_cols] = scaler.fit_transform(df[numerical_cols])
df_standardized.head()

Unnamed: 0,Attack Type,label,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,normal,normal,-0.067792,tcp,http,SF,-0.002879,0.138664,0,-0.04772,...,-3.451536,-1.694315,0.599396,-0.282867,-1.022077,-0.158629,-0.464418,-0.463202,-0.25204,-0.249464
1,normal,normal,-0.067792,tcp,http,SF,-0.00282,-0.011578,0,-0.04772,...,-3.297085,-1.600011,0.599396,-0.282867,-1.146737,-0.158629,-0.464418,-0.463202,-0.25204,-0.249464
2,normal,normal,-0.067792,tcp,http,SF,-0.002824,0.014179,0,-0.04772,...,-3.142633,-1.505707,0.599396,-0.282867,-1.188291,-0.158629,-0.464418,-0.463202,-0.25204,-0.249464
3,normal,normal,-0.067792,tcp,http,SF,-0.00284,0.014179,0,-0.04772,...,-2.988182,-1.411403,0.599396,-0.282867,-1.188291,-0.158629,-0.464418,-0.463202,-0.25204,-0.249464
4,normal,normal,-0.067792,tcp,http,SF,-0.002842,0.035214,0,-0.04772,...,-2.833731,-1.3171,0.599396,-0.282867,-1.209067,-0.158629,-0.464418,-0.463202,-0.25204,-0.249464


In [17]:
df_no_ylabel = df_standardized.iloc[:, 2::1]
print(len(df_standardized), len(df_no_ylabel))
df_no_ylabel.head()

494021 494021


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,-0.067792,tcp,http,SF,-0.002879,0.138664,0,-0.04772,-0.002571,-0.044136,...,-3.451536,-1.694315,0.599396,-0.282867,-1.022077,-0.158629,-0.464418,-0.463202,-0.25204,-0.249464
1,-0.067792,tcp,http,SF,-0.00282,-0.011578,0,-0.04772,-0.002571,-0.044136,...,-3.297085,-1.600011,0.599396,-0.282867,-1.146737,-0.158629,-0.464418,-0.463202,-0.25204,-0.249464
2,-0.067792,tcp,http,SF,-0.002824,0.014179,0,-0.04772,-0.002571,-0.044136,...,-3.142633,-1.505707,0.599396,-0.282867,-1.188291,-0.158629,-0.464418,-0.463202,-0.25204,-0.249464
3,-0.067792,tcp,http,SF,-0.00284,0.014179,0,-0.04772,-0.002571,-0.044136,...,-2.988182,-1.411403,0.599396,-0.282867,-1.188291,-0.158629,-0.464418,-0.463202,-0.25204,-0.249464
4,-0.067792,tcp,http,SF,-0.002842,0.035214,0,-0.04772,-0.002571,-0.044136,...,-2.833731,-1.3171,0.599396,-0.282867,-1.209067,-0.158629,-0.464418,-0.463202,-0.25204,-0.249464


### 3. One Hot Encoding

Remove y-variables for one-hot-encoding

In [18]:
non_number_cols = df_no_ylabel.select_dtypes(exclude=[float, int]).columns
non_number_cols

Index(['protocol_type', 'service', 'flag', 'land', 'logged_in',
       'is_host_login', 'is_guest_login'],
      dtype='object')

one hot encoding

In [19]:
df_encoded = pd.get_dummies(df_no_ylabel, columns=non_number_cols)
df_encoded.head()

Unnamed: 0,duration,src_bytes,dst_bytes,wrong_fragment,urgent,hot,num_failed_logins,num_compromised,root_shell,su_attempted,...,flag_S3,flag_SF,flag_SH,land_0,land_1,logged_in_0,logged_in_1,is_host_login_0,is_guest_login_0,is_guest_login_1
0,-0.067792,-0.002879,0.138664,-0.04772,-0.002571,-0.044136,-0.009782,-0.005679,-0.010552,-0.004676,...,0,1,0,1,0,0,1,1,1,0
1,-0.067792,-0.00282,-0.011578,-0.04772,-0.002571,-0.044136,-0.009782,-0.005679,-0.010552,-0.004676,...,0,1,0,1,0,0,1,1,1,0
2,-0.067792,-0.002824,0.014179,-0.04772,-0.002571,-0.044136,-0.009782,-0.005679,-0.010552,-0.004676,...,0,1,0,1,0,0,1,1,1,0
3,-0.067792,-0.00284,0.014179,-0.04772,-0.002571,-0.044136,-0.009782,-0.005679,-0.010552,-0.004676,...,0,1,0,1,0,0,1,1,1,0
4,-0.067792,-0.002842,0.035214,-0.04772,-0.002571,-0.044136,-0.009782,-0.005679,-0.010552,-0.004676,...,0,1,0,1,0,0,1,1,1,0


merge y-variable with one-hot encoded features

In [20]:
df = pd.concat([df_standardized.iloc[:, 0:2:1], df_encoded], axis=1)
df.head()

Unnamed: 0,Attack Type,label,duration,src_bytes,dst_bytes,wrong_fragment,urgent,hot,num_failed_logins,num_compromised,...,flag_S3,flag_SF,flag_SH,land_0,land_1,logged_in_0,logged_in_1,is_host_login_0,is_guest_login_0,is_guest_login_1
0,normal,normal,-0.067792,-0.002879,0.138664,-0.04772,-0.002571,-0.044136,-0.009782,-0.005679,...,0,1,0,1,0,0,1,1,1,0
1,normal,normal,-0.067792,-0.00282,-0.011578,-0.04772,-0.002571,-0.044136,-0.009782,-0.005679,...,0,1,0,1,0,0,1,1,1,0
2,normal,normal,-0.067792,-0.002824,0.014179,-0.04772,-0.002571,-0.044136,-0.009782,-0.005679,...,0,1,0,1,0,0,1,1,1,0
3,normal,normal,-0.067792,-0.00284,0.014179,-0.04772,-0.002571,-0.044136,-0.009782,-0.005679,...,0,1,0,1,0,0,1,1,1,0
4,normal,normal,-0.067792,-0.002842,0.035214,-0.04772,-0.002571,-0.044136,-0.009782,-0.005679,...,0,1,0,1,0,0,1,1,1,0


Check categorical columns, which only should be `'Attack Type', 'label', 'protocol_type', 'service', 'flag', 'land', 'logged_in', 'is_host_login', 'is_guest_login'`

In [21]:
df.select_dtypes(exclude=[int, float]).columns

Index(['Attack Type', 'label', 'protocol_type_icmp', 'protocol_type_tcp',
       'protocol_type_udp', 'service_IRC', 'service_X11', 'service_Z39_50',
       'service_auth', 'service_bgp', 'service_courier', 'service_csnet_ns',
       'service_ctf', 'service_daytime', 'service_discard', 'service_domain',
       'service_domain_u', 'service_echo', 'service_eco_i', 'service_ecr_i',
       'service_efs', 'service_exec', 'service_finger', 'service_ftp',
       'service_ftp_data', 'service_gopher', 'service_hostnames',
       'service_http', 'service_http_443', 'service_imap4', 'service_iso_tsap',
       'service_klogin', 'service_kshell', 'service_ldap', 'service_link',
       'service_login', 'service_mtp', 'service_name', 'service_netbios_dgm',
       'service_netbios_ns', 'service_netbios_ssn', 'service_netstat',
       'service_nnsp', 'service_nntp', 'service_ntp_u', 'service_other',
       'service_pm_dump', 'service_pop_2', 'service_pop_3', 'service_printer',
       'service_private',

In [31]:
import torch 
from torch.utils.data import Dataset, TensorDataset

df_no_ylabels:pd.DataFrame = df.iloc[:, 2:].values
dataset:TensorDataset = TensorDataset(torch.Tensor(np.array(df_no_ylabels)))

(494021, 494021)

In [42]:
print(dataset[0][0].size())

torch.Size([121])


In [43]:
len(dataset)

494021