In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Fetch data

### Download

In [10]:
df = pd.read_csv('../../data/kddcup.data_10_percent.gz', header=None)
cols = pd.read_csv('../../data/kddcup.names',header=None)

### Add column names to DataFrame

In [11]:
if cols[0][0] == 'back':
    cols = cols.drop(cols.index[0])
    cols.reset_index(drop=True, inplace=True)

cols = cols.dropna(axis=1)
cols.head()

Unnamed: 0,0
0,duration: continuous.
1,protocol_type: symbolic.
2,service: symbolic.
3,flag: symbolic.
4,src_bytes: continuous.


In [12]:
# split merged column names (name:type --> name | type)
cols[[0,1]] = cols[0].str.split(':',expand = True)

cols.head()

Unnamed: 0,0,1
0,duration,continuous.
1,protocol_type,symbolic.
2,service,symbolic.
3,flag,symbolic.
4,src_bytes,continuous.


In [13]:
# add column names to DataFrame
names = cols[0].tolist()
names.append('label')
df.columns = names
df.columns

Index(['duration', 'protocol_type', 'service', 'flag', 'src_bytes',
       'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
       'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
       'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
       'num_access_files', 'num_outbound_cmds', 'is_host_login',
       'is_guest_login', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate', 'label'],
      dtype='object')

In [14]:
df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,http,SF,181,5450,0,0,0,0,...,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,239,486,0,0,0,0,...,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,235,1337,0,0,0,0,...,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,219,1337,0,0,0,0,...,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,217,2032,0,0,0,0,...,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal.


### Show y-labels and their occurances

In [15]:
df['label'] = df['label'].str.replace('.', '', regex=False)
df.groupby(['label']).size().sort_values()

label
spy                     2
perl                    3
phf                     4
multihop                7
ftp_write               8
loadmodule              9
rootkit                10
imap                   12
warezmaster            20
land                   21
buffer_overflow        30
guess_passwd           53
nmap                  231
pod                   264
teardrop              979
warezclient          1020
portsweep            1040
ipsweep              1247
satan                1589
back                 2203
normal              97278
neptune            107201
smurf              280790
dtype: int64

# Summarize labels (i. e., attack types)

### Download Attack Types 

In [16]:
df_attack_types = pd.read_csv('../../data/training_attack_types')
df_attack_types

Unnamed: 0,back dos
0,buffer_overflow u2r
1,ftp_write r2l
2,guess_passwd r2l
3,imap r2l
4,ipsweep probe
5,land dos
6,loadmodule u2r
7,multihop r2l
8,neptune dos
9,nmap probe


### Split columns 

In [17]:
df_temp = pd.DataFrame(columns=['Attack','Type'])


df_temp[['Attack','Type']] = df_attack_types['back dos'].str.split(' ', expand=True)

row_normal = pd.DataFrame({'Attack': 'normal', 'Type':'normal'}, index=[0]) # add normal to attacks
df_temp = pd.concat([df_temp, row_normal], ignore_index=True)

df_temp

Unnamed: 0,Attack,Type
0,buffer_overflow,u2r
1,ftp_write,r2l
2,guess_passwd,r2l
3,imap,r2l
4,ipsweep,probe
5,land,dos
6,loadmodule,u2r
7,multihop,r2l
8,neptune,dos
9,nmap,probe


### Add columns to 

In [18]:
df['Attack Type'] = df['label'].map(df_temp.set_index('Attack')['Type'])

#df['Attack Type'] = df['label'].apply(lambda x: df_temp['Type'])


In [19]:
df[['label', 'Attack Type']].loc[50000:60000]

Unnamed: 0,label,Attack Type
50000,smurf,dos
50001,smurf,dos
50002,smurf,dos
50003,smurf,dos
50004,smurf,dos
...,...,...
59996,neptune,dos
59997,neptune,dos
59998,neptune,dos
59999,neptune,dos


In [20]:
df[df['Attack Type'] == 'u2r'].head(1)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label,Attack Type
744,184,tcp,telnet,SF,1511,2957,0,0,0,3,...,1.0,0.0,1.0,0.67,0.0,0.0,0.0,0.0,buffer_overflow,u2r


# Show Example Data

In [21]:
# rearrange columns
cols = list(df.columns)
cols.insert(0, cols.pop(cols.index('Attack Type')))
df = df.loc[:, cols]
cols.insert(1, cols.pop(cols.index('label')))
df = df.loc[:, cols]

In [22]:
# Show entry from each class
df_example = pd.concat([
    df[df['Attack Type'] == 'normal'].head(1),
    df[df['Attack Type'] == 'u2r'].head(1), 
    df[df['Attack Type'] == 'dos'].head(1),
    df[df['Attack Type'] == 'r2l'].head(1),
    df[df['Attack Type'] == 'probe'].head(1),
])

df_example

Unnamed: 0,Attack Type,label,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,normal,normal,0,tcp,http,SF,181,5450,0,0,...,9,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0
744,u2r,buffer_overflow,184,tcp,telnet,SF,1511,2957,0,0,...,1,3,1.0,0.0,1.0,0.67,0.0,0.0,0.0,0.0
7601,dos,neptune,0,tcp,telnet,S0,0,0,0,0,...,5,6,1.0,0.0,0.2,0.33,1.0,0.83,0.0,0.0
15699,r2l,guess_passwd,23,tcp,telnet,SF,104,276,0,0,...,1,2,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
22814,probe,portsweep,1,tcp,private,RSTR,0,0,0,0,...,178,2,0.01,0.04,0.04,0.0,0.01,0.0,0.32,1.0
