In [2]:
import pandas as pd
import numpy as np
from time import time
col_names = ["duration","protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate","label"]
DoS = ['back.','land.','neptune.','pod.','smurf.', 'teardrop.']
R2L = ['ftp_write.','guess_passwd.','imap.','multihop.','phf.','spy.','warezclient.','warezmaster.']
U2R = ['buffer-overflow.','loadmodule.','perl.','rootkit.']
Probe = ['ipsweep.','nmap.','portsweep.','satan.']
kdd_data = pd.read_csv("kddcup.data.corrected", header=None, names = col_names)

In [3]:
kdd_data.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,http,SF,215,45076,0,0,0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,162,4528,0,0,0,0,...,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,236,1228,0,0,0,0,...,2,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,233,2032,0,0,0,0,...,3,1.0,0.0,0.33,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,239,486,0,0,0,0,...,4,1.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,normal.


In [4]:
len(kdd_data)

4898431

In [5]:
print(kdd_data['protocol_type'].value_counts())

icmp    2833545
tcp     1870598
udp      194288
Name: protocol_type, dtype: int64


In [6]:
print(kdd_data['service'].value_counts())

ecr_i          2811660
private        1100831
http            623091
smtp             96554
other            72653
domain_u         57782
ftp_data         40697
eco_i            16338
finger            6891
urp_i             5378
ftp               5214
telnet            4277
ntp_u             3833
auth              3382
pop_3             1981
time              1579
domain            1113
Z39_50            1078
gopher            1077
mtp               1076
ssh               1075
whois             1073
remote_job        1073
rje               1070
imap4             1069
link              1069
ctf               1068
name              1067
supdup            1060
discard           1059
                ...   
vmnet             1053
netbios_dgm       1052
sql_net           1052
iso_tsap          1052
shell             1051
csnet_ns          1051
klogin            1050
hostnames         1050
bgp               1047
exec              1045
login             1045
printer           1045
http_443   

In [7]:
print(kdd_data['label'].value_counts())
#print(len(kdd_data['service'].value_counts()))

smurf.              2807886
neptune.            1072017
normal.              972781
satan.                15892
ipsweep.              12481
portsweep.            10413
nmap.                  2316
back.                  2203
warezclient.           1020
teardrop.               979
pod.                    264
guess_passwd.            53
buffer_overflow.         30
land.                    21
warezmaster.             20
imap.                    12
rootkit.                 10
loadmodule.               9
ftp_write.                8
multihop.                 7
phf.                      4
perl.                     3
spy.                      2
Name: label, dtype: int64


In [8]:
print(kdd_data['flag'].value_counts())

SF        3744328
S0         869829
REJ        268874
RSTR         8094
RSTO         5344
SH           1040
S1            532
S2            161
RSTOS0        122
OTH            57
S3             50
Name: flag, dtype: int64


In [10]:
#OneHotEncode protcol_type and flag. 
cols_to_transform = ['protocol_type', 'flag']
df_with_dummies = pd.get_dummies(columns = cols_to_transform, data = kdd_data)

In [12]:
df_with_dummies.head()

Unnamed: 0,duration,service,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0,http,215,45076,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
1,0,http,162,4528,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
2,0,http,236,1228,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
3,0,http,233,2032,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
4,0,http,239,486,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0


In [13]:
#Group label into 4 attack types and OneHotEncode
print("Label value count before grouping = {}".format(df_with_dummies['label'].value_counts()))

def classifyAttack(x):
    if x in DoS:
        return 'DoS'
    elif x in R2L:
        return 'R2L'
    elif x in U2R:
        return 'U2R'
    elif x in Probe:
        return 'Probe'
    else:
        return 'Normal'

for i in range(len(df_with_dummies)):
    df_with_dummies.set_value(i,'label',classifyAttack(df_with_dummies['label'][i]))
    
print("Label value count after grouping = {}".format(df_with_dummies['label'].value_counts()))

cols_to_transform = ['label']
df_with_dummies = pd.get_dummies(columns = cols_to_transform, data = df_with_dummies)


Label value count before grouping = smurf.              2807886
neptune.            1072017
normal.              972781
satan.                15892
ipsweep.              12481
portsweep.            10413
nmap.                  2316
back.                  2203
warezclient.           1020
teardrop.               979
pod.                    264
guess_passwd.            53
buffer_overflow.         30
land.                    21
warezmaster.             20
imap.                    12
rootkit.                 10
loadmodule.               9
ftp_write.                8
multihop.                 7
phf.                      4
perl.                     3
spy.                      2
Name: label, dtype: int64
Label value count after grouping = DoS       3883370
Normal     972811
Probe       41102
R2L          1126
U2R            22
Name: label, dtype: int64


In [14]:
#LabelEncode all 70 types of service attribute
from sklearn import preprocessing 
le = preprocessing.LabelEncoder()
#from collections import defaultdict
#d = defaultdict(preprocessing.LabelEncoder)

df_with_dummies = df_with_dummies.apply(le.fit_transform)



In [15]:
df_with_dummies.head()

Unnamed: 0,duration,service,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,...,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH,label_DoS,label_Normal,label_Probe,label_R2L,label_U2R
0,0,24,213,19600,0,0,0,0,0,1,...,0,0,0,1,0,0,1,0,0,0
1,0,24,160,4511,0,0,0,0,0,1,...,0,0,0,1,0,0,1,0,0,0
2,0,24,234,1221,0,0,0,0,0,1,...,0,0,0,1,0,0,1,0,0,0
3,0,24,231,2025,0,0,0,0,0,1,...,0,0,0,1,0,0,1,0,0,0
4,0,24,237,479,0,0,0,0,0,1,...,0,0,0,1,0,0,1,0,0,0


In [22]:
list(df_with_dummies)

['duration',
 'service',
 'src_bytes',
 'dst_bytes',
 'land',
 'wrong_fragment',
 'urgent',
 'hot',
 'num_failed_logins',
 'logged_in',
 'num_compromised',
 'root_shell',
 'su_attempted',
 'num_root',
 'num_file_creations',
 'num_shells',
 'num_access_files',
 'num_outbound_cmds',
 'is_host_login',
 'is_guest_login',
 'count',
 'srv_count',
 'serror_rate',
 'srv_serror_rate',
 'rerror_rate',
 'srv_rerror_rate',
 'same_srv_rate',
 'diff_srv_rate',
 'srv_diff_host_rate',
 'dst_host_count',
 'dst_host_srv_count',
 'dst_host_same_srv_rate',
 'dst_host_diff_srv_rate',
 'dst_host_same_src_port_rate',
 'dst_host_srv_diff_host_rate',
 'dst_host_serror_rate',
 'dst_host_srv_serror_rate',
 'dst_host_rerror_rate',
 'dst_host_srv_rerror_rate',
 'protocol_type_icmp',
 'protocol_type_tcp',
 'protocol_type_udp',
 'flag_OTH',
 'flag_REJ',
 'flag_RSTO',
 'flag_RSTOS0',
 'flag_RSTR',
 'flag_S0',
 'flag_S1',
 'flag_S2',
 'flag_S3',
 'flag_SF',
 'flag_SH',
 'label_DoS',
 'label_Normal',
 'label_Probe',
 '

In [27]:
new_column_list = list(df_with_dummies)
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0,1))
np_scaled = min_max_scaler.fit_transform(df_with_dummies)
df_normalized = pd.DataFrame(np_scaled,columns=new_column_list)
df_normalized.head()

Unnamed: 0,duration,service,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,...,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH,label_DoS,label_Normal,label_Probe,label_R2L,label_U2R
0,0.0,0.347826,0.029608,0.911967,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.347826,0.022241,0.209892,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.347826,0.032527,0.056812,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.347826,0.03211,0.094221,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.347826,0.032944,0.022287,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [28]:
df_normalized.describe()

Unnamed: 0,duration,service,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,...,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH,label_DoS,label_Normal,label_Probe,label_R2L,label_U2R
count,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,...,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0
mean,0.003406568,0.3669251,0.0864784,0.01995741,5.716116e-06,0.0002253783,1.184053e-06,0.0004127078,6.410216e-06,0.143529,...,0.0001086062,3.286767e-05,1.020735e-05,0.7643933,0.0002123129,0.7927783,0.1985964,0.00839085,0.0002298695,4.491234e-06
std,0.04117306,0.2149888,0.07927254,0.08926702,0.002390833,0.01454819,0.0007825853,0.0151902,0.001459882,0.3506116,...,0.01042087,0.005732939,0.003194878,0.4243774,0.01456941,0.4053158,0.3989435,0.09121648,0.01515971,0.002119249
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.2173913,0.005977203,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
50%,0.0,0.2173913,0.07200445,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
75%,0.0,0.6666667,0.1431749,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [31]:
#Shuffle data
df_normalized = df_normalized.sample(n=len(df_normalized), random_state = 42)

In [32]:
df_normalized.to_csv('KDD.preProcessed.csv',index=False)