In [102]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np

## NOTES

* For KDD99 feature description, check http://kdd.ics.uci.edu/databases/kddcup99/task.html

In [128]:
# Read the raw KDD Cup 99 dump. The file has no header row, so pandas numbers
# the columns; descriptive names are assigned in a later cell.
kdd99_file = "kddcup.data.corrected"
kdd99_df = pd.read_csv(filepath_or_buffer=kdd99_file, header=None)
print(kdd99_df.shape)
kdd99_df.head(n=5)

(4898431, 42)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,32,33,34,35,36,37,38,39,40,41
0,0,tcp,http,SF,215,45076,0,0,0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,162,4528,0,0,0,0,...,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,236,1228,0,0,0,0,...,2,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,233,2032,0,0,0,0,...,3,1.0,0.0,0.33,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,239,486,0,0,0,0,...,4,1.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,normal.


In [129]:
# Remove exact duplicate rows (all columns compared, first occurrence kept),
# then report the remaining size.
kdd99_df.drop_duplicates(subset=None, keep='first', inplace=True)
kdd99_df.shape

(1074992, 42)

In [130]:
# Assign descriptive names to the 42 positional columns; the last one is the label.
# The order of this list must match the column order of the raw file.
# NOTE(review): the KDD'99 task description appears to call one feature
# `is_host_login`, not `is_hot_login` - confirm; later cells rely on the spelling used here.
kdd99_df.columns = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
                   'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted',
                   'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 
                    'is_hot_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate',
                    'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 
                  'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
                  'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
                  'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'attack_type']
kdd99_df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack_type
0,0,tcp,http,SF,215,45076,0,0,0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,162,4528,0,0,0,0,...,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,236,1228,0,0,0,0,...,2,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,233,2032,0,0,0,0,...,3,1.0,0.0,0.33,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,239,486,0,0,0,0,...,4,1.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,normal.


In [131]:
# Derived feature: dst_bytes minus src_bytes for each row
# (positive when dst_bytes exceeds src_bytes).
kdd99_df['src_dst_bytes_diff'] = kdd99_df['dst_bytes'].sub(kdd99_df['src_bytes'])

## Data Exploration

In [132]:
def get_percentile(col):
    """Summarise the distribution of a numeric column as a set of percentiles.

    Parameters
    ----------
    col : pandas.Series
        Numeric column. Its ``name`` is reported under the ``'Feature'`` key.

    Returns
    -------
    dict
        ``'Feature'`` followed by ``'min'``, ``'1%'``, ``'5%'``, ``'15%'``,
        ``'25%'``, ``'50%'``, ``'75%'``, ``'85%'``, ``'95%'``, ``'99%'``,
        ``'99.9%'`` and ``'max'``, in that order.
    """
    labels = ['min', '1%', '5%', '15%', '25%', '50%', '75%', '85%', '95%', '99%', '99.9%', 'max']
    cut_points = [0, 1, 5, 15, 25, 50, 75, 85, 95, 99, 99.9, 100]
    # One vectorised call instead of twelve separate passes over the column.
    values = np.percentile(col, cut_points)
    result = {'Feature': col.name}
    result.update(zip(labels, values))
    return result

In [133]:
# Count missing values per column and keep only the columns that have any
isnull_df = kdd99_df.isnull().sum(axis=0)
isnull_df[isnull_df.gt(0)]  # an empty result means no column contains nulls

Series([], dtype: int64)

In [134]:
# Share of each attack type, as a percentage of all rows
attack_type_counts = kdd99_df['attack_type'].value_counts()
attack_type_counts / len(kdd99_df) * 100

normal.             75.611167
neptune.            22.525656
satan.               0.466887
ipsweep.             0.346328
portsweep.           0.331537
smurf.               0.279723
nmap.                0.144559
back.                0.090047
teardrop.            0.085396
warezclient.         0.083070
pod.                 0.019163
guess_passwd.        0.004930
buffer_overflow.     0.002791
warezmaster.         0.001860
land.                0.001767
imap.                0.001116
rootkit.             0.000930
loadmodule.          0.000837
ftp_write.           0.000744
multihop.            0.000651
phf.                 0.000372
perl.                0.000279
spy.                 0.000186
Name: attack_type, dtype: float64

In [135]:
# Group the column names by dtype: int64, float64 and object
column_dtypes = list(zip(kdd99_df.columns, kdd99_df.dtypes))
int_types = [name for name, dtype in column_dtypes if dtype == 'int64']
float_types = [name for name, dtype in column_dtypes if dtype == 'float64']
o_types = [name for name, dtype in column_dtypes if dtype == 'O']

# Each list is followed by one blank line
for names in (int_types, float_types, o_types):
    print(names, end='\n\n')

['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_hot_login', 'is_guest_login', 'count', 'srv_count', 'dst_host_count', 'dst_host_srv_count', 'src_dst_bytes_diff']

['serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate']

['protocol_type', 'service', 'flag', 'attack_type']



In [136]:
# LIME needs the names of the categorical features (their values will still be numeric).
# Build a new list instead of aliasing `o_types`: this leaves `o_types` untouched
# and means re-running the cell does not append the extra names a second time.
cat_features = o_types + ['land', 'logged_in', 'root_shell', 'su_attempted', 'is_hot_login', 'is_guest_login']
cat_features

['protocol_type',
 'service',
 'flag',
 'attack_type',
 'land',
 'logged_in',
 'root_shell',
 'su_attempted',
 'is_hot_login',
 'is_guest_login']

In [137]:
# Inspect the values of each categorical feature:
# distinct values for the string columns, frequency tables for the integer columns.
for feature in ('protocol_type', 'service', 'flag', 'attack_type'):
    print(kdd99_df[feature].unique(), end='\n\n')

for feature in ('land', 'logged_in', 'root_shell', 'su_attempted', 'is_hot_login', 'is_guest_login'):
    print(kdd99_df[feature].value_counts(), end='\n\n')

['tcp' 'udp' 'icmp']

['http' 'smtp' 'domain_u' 'auth' 'finger' 'telnet' 'eco_i' 'ftp' 'ntp_u'
 'ecr_i' 'other' 'urp_i' 'private' 'pop_3' 'ftp_data' 'netstat' 'daytime'
 'ssh' 'echo' 'time' 'name' 'whois' 'domain' 'mtp' 'gopher' 'remote_job'
 'rje' 'ctf' 'supdup' 'link' 'systat' 'discard' 'X11' 'shell' 'login'
 'imap4' 'nntp' 'uucp' 'pm_dump' 'IRC' 'Z39_50' 'netbios_dgm' 'ldap'
 'sunrpc' 'courier' 'exec' 'bgp' 'csnet_ns' 'http_443' 'klogin' 'printer'
 'netbios_ssn' 'pop_2' 'nnsp' 'efs' 'hostnames' 'uucp_path' 'sql_net'
 'vmnet' 'iso_tsap' 'netbios_ns' 'kshell' 'urh_i' 'http_2784' 'harvest'
 'aol' 'tftp_u' 'http_8001' 'tim_i' 'red_i']

['SF' 'S2' 'S1' 'S3' 'OTH' 'REJ' 'RSTO' 'S0' 'RSTR' 'RSTOS0' 'SH']

['normal.' 'buffer_overflow.' 'loadmodule.' 'perl.' 'neptune.' 'smurf.'
 'guess_passwd.' 'pod.' 'teardrop.' 'portsweep.' 'ipsweep.' 'land.'
 'ftp_write.' 'back.' 'imap.' 'satan.' 'phf.' 'nmap.' 'multihop.'
 'warezmaster.' 'warezclient.' 'spy.' 'rootkit.']

0    1074966
1         26
Name: 

In [138]:
# Original rationale: is_hot_login is the same for all attack types, so drop it.
# The guards let this cell be re-run after the column has already been removed.
if 'is_hot_login' in kdd99_df.columns:
    # Single .loc lookup instead of chained indexing. It is printed because a bare
    # expression that is not the last line of a cell is never displayed.
    print(kdd99_df.loc[kdd99_df['is_hot_login'] == 1, 'attack_type'].value_counts())
    kdd99_df.drop(['is_hot_login'], inplace=True, axis=1)
if 'is_hot_login' in cat_features:
    cat_features.remove('is_hot_login')

In [139]:
# Keep the target column in `y` and show how many rows each attack type has.
# NOTE(review): `y` is taken before the label-encoding cell further down, so it
# presumably keeps the original string labels rather than integer codes - confirm this is intended.
y = kdd99_df['attack_type']
y.value_counts()

normal.             812814
neptune.            242149
satan.                5019
ipsweep.              3723
portsweep.            3564
smurf.                3007
nmap.                 1554
back.                  968
teardrop.              918
warezclient.           893
pod.                   206
guess_passwd.           53
buffer_overflow.        30
warezmaster.            20
land.                   19
imap.                   12
rootkit.                10
loadmodule.              9
ftp_write.               8
multihop.                7
phf.                     4
perl.                    3
spy.                     2
Name: attack_type, dtype: int64

In [140]:
# Label-encode every categorical feature.
# One fitted encoder is kept per column in `label_encoders`, so integer codes can be
# mapped back to the original labels later, e.g. label_encoders['attack_type'].classes_
# or label_encoders['attack_type'].inverse_transform(...). Refitting a single shared
# encoder would discard the mapping for every column except the last.
# NOTE: run this cell once per freshly loaded frame. Re-running it would encode the
# integer codes as strings, whose lexicographic order ('10' < '2') changes the codes.
label_encoders = {}

for cat_col in cat_features:
    number = LabelEncoder()
    kdd99_df[cat_col] = number.fit_transform(kdd99_df[cat_col].astype('str'))
    # Stored as object dtype so later cells can tell categorical columns from numeric ones.
    kdd99_df[cat_col] = kdd99_df[cat_col].astype('object')
    label_encoders[cat_col] = number

kdd99_df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack_type,src_dst_bytes_diff
0,0,1,24,9,215,45076,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11,44861
1,0,1,24,9,162,4528,0,0,0,0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,11,4366
2,0,1,24,9,236,1228,0,0,0,0,...,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,11,992
3,0,1,24,9,233,2032,0,0,0,0,...,1.0,0.0,0.33,0.0,0.0,0.0,0.0,0.0,11,1799
4,0,1,24,9,239,486,0,0,0,0,...,1.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,11,247


In [141]:
# Row counts per attack type after label encoding (the labels are now integer codes)
kdd99_df['attack_type'].value_counts()

11    812814
9     242149
17      5019
5       3723
15      3564
18      3007
10      1554
0        968
20       918
21       893
14       206
3         53
1         30
22        20
6         19
4         12
16        10
7          9
2          8
8          7
13         4
12         3
19         2
Name: attack_type, dtype: int64

In [142]:
# Column dtypes after encoding: the label-encoded categorical columns are stored as object
kdd99_df.dtypes

duration                         int64
protocol_type                   object
service                         object
flag                            object
src_bytes                        int64
dst_bytes                        int64
land                            object
wrong_fragment                   int64
urgent                           int64
hot                              int64
num_failed_logins                int64
logged_in                       object
num_compromised                  int64
root_shell                      object
su_attempted                    object
num_root                         int64
num_file_creations               int64
num_shells                       int64
num_access_files                 int64
num_outbound_cmds                int64
is_guest_login                  object
count                            int64
srv_count                        int64
serror_rate                    float64
srv_serror_rate                float64
rerror_rate              

In [143]:
# Per-column variance; a variance of 0 (num_outbound_cmds in the recorded output) marks a constant column
kdd99_df.var()

duration                       1.746640e+06
protocol_type                  8.515773e-02
service                        1.891681e+02
flag                           5.961787e+00
src_bytes                      4.038560e+12
dst_bytes                      1.895760e+12
land                           2.418566e-05
wrong_fragment                 7.807464e-03
urgent                         2.372100e-04
hot                            9.949300e-01
num_failed_logins              2.427714e-04
logged_in                      2.330700e-01
num_compromised                6.776741e+01
root_shell                     3.022367e-04
su_attempted                   2.976489e-04
num_root                       7.066340e+01
num_file_creations             7.025125e-02
num_shells                     3.477952e-04
num_access_files               5.399057e-03
num_outbound_cmds              0.000000e+00
is_guest_login                 3.788361e-03
count                          7.264525e+03
srv_count                      8

In [144]:
# Percentile summary of every non-categorical column
# (columns with object dtype are skipped).
numeric_columns = [name for name in kdd99_df.columns if kdd99_df[name].dtype != 'O']
num_dist_dct = {
    position: get_percentile(kdd99_df[name])
    for position, name in enumerate(numeric_columns)
}

# One row per feature, with the columns put in reading order
percentile_columns = ['Feature', 'min', '1%', '5%', '15%', '25%', '50%', '75%', '85%', '95%', '99%', '99.9%', 'max']
num_dist_df = pd.DataFrame(num_dist_dct).T[percentile_columns]
num_dist_df

Unnamed: 0,Feature,min,1%,5%,15%,25%,50%,75%,85%,95%,99%,99.9%,max
0,duration,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4289.0,19085.0,58329.0
1,src_bytes,0.0,0.0,0.0,0.0,0.0,219.0,306.0,343.0,1448.0,9178.0,61298.0,1379960000.0
2,dst_bytes,0.0,0.0,0.0,0.0,0.0,332.0,1721.0,3437.0,11166.0,30262.0,125015.0,1309940000.0
3,wrong_fragment,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0
4,urgent,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0
5,hot,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,77.0
6,num_failed_logins,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
7,num_compromised,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,7479.0
8,num_root,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,7468.0
9,num_file_creations,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,43.0


#### TO-DOs
* check the attack_type of the outliers, then decide how to deal with them
* check the attack_types of the almost-constant features to see whether they need to be dropped
* may need to normalize the data
* check feature correlation
* check for multicollinear features


* run random forest and xgboost to check accuracy first
* then run LIME multi-class
* if LIME multi-class is not good enough, run 2-class for each attack type against the "normal" type

In [154]:
# check outliers: which attack types lie beyond the extreme thresholds?
# 61298 and 125015 are the 99.9% values of src_bytes and dst_bytes in the percentile table.
# NOTE(review): -9178 and 124758 are presumably taken from the same table - confirm.
outlier_masks = (
    kdd99_df['src_bytes'] > 61298,
    kdd99_df['dst_bytes'] > 125015,
    kdd99_df['src_dst_bytes_diff'] < -9178,
    kdd99_df['src_dst_bytes_diff'] > 124758,
)
for outlier_mask in outlier_masks:
    print(kdd99_df.loc[outlier_mask, 'attack_type'].value_counts(), end='\n\n')

11    904
21     60
15      8
Name: attack_type, dtype: int64

11    1049
22      16
15       3
8        2
4        1
Name: attack_type, dtype: int64

11    9487
0      963
21      60
15       8
9        1
Name: attack_type, dtype: int64

11    1052
22      16
15       3
8        2
4        1
Name: attack_type, dtype: int64



It seems that for some attack types, such as 0 and 22, the majority of records have outlier values, so outliers are not replaced with any other value here.

In [None]:
# check constant values