In [None]:
"""
This notebook analyzes and preprocesses the NSL-KDD dataset.
"""

In [4]:
import os

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import preprocessing

In [2]:
# Column names for the NSL-KDD CSV, listed in file order (43 entries).
# The blank-comment groupings below are for readability only.
column_names = [
    # columns 1-9
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent',
    # columns 10-22
    'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
    'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
    'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login',
    # columns 23-31
    'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate',
    'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
    # columns 32-41 (the dst_host_* fields)
    'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
    'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate',
    # columns 42-43
    'attack_type', 'no_correctly_classified',
]

In [3]:
# Sanity check: the column list should contain 43 names.
len(column_names)

43

In [4]:
# Location of the small NSL-KDD training CSV. Set the NSL_KDD_SMALL_TRAIN
# environment variable to point at a local copy; the original author's path is
# kept as the fallback so existing setups keep working unchanged.
f = os.environ.get(
    'NSL_KDD_SMALL_TRAIN',
    '/Users/harikoduvely/Projects/RL/NSL_KDD_DataSet/Small_Training_Set.csv',
)

In [5]:
# Load the CSV. It is read with header=None, so the column labels come from
# column_names rather than from the file.
df_ST = pd.read_csv(
    f,
    names=column_names,
    header=None,
)

In [6]:
# Preview the first rows to confirm the columns were parsed as expected.
df_ST.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack_type,no_correctly_classified
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20
1,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19
3,0,tcp,http,SF,232,8153,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
4,0,tcp,http,SF,199,420,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21


In [7]:
# Number of distinct values in the categorical 'protocol_type' column.
df_ST['protocol_type'].nunique()

3

In [8]:
# Row count for each 'protocol_type' value.
df_ST['protocol_type'].value_counts()

tcp     839
udp     109
icmp     63
Name: protocol_type, dtype: int64

In [9]:
# Number of distinct values in the categorical 'service' column.
df_ST['service'].nunique()

58

In [10]:
# Row count for each 'service' value.
df_ST['service'].value_counts()

http           308
private        171
domain_u        59
ftp_data        57
smtp            57
eco_i           39
other           36
telnet          28
finger          19
ecr_i           19
ftp             17
uucp            10
bgp              9
Z39_50           9
supdup           8
csnet_ns         8
time             7
netbios_dgm      6
courier          6
imap4            6
whois            6
uucp_path        6
vmnet            6
auth             6
iso_tsap         6
echo             5
gopher           5
netbios_ns       5
urp_i            5
hostnames        5
domain           5
exec             5
login            5
mtp              5
discard          4
ldap             4
klogin           3
ctf              3
name             3
ssh              3
link             3
nntp             3
sql_net          3
efs              3
remote_job       3
kshell           3
systat           2
http_443         2
IRC              2
daytime          2
netstat          2
sunrpc           2
shell       

In [11]:
# Number of distinct values in the categorical 'flag' column.
df_ST['flag'].nunique()

7

In [12]:
# Row count for each 'flag' value.
df_ST['flag'].value_counts()

SF      584
S0      290
REJ      91
RSTR     27
RSTO     14
S1        3
SH        2
Name: flag, dtype: int64

In [13]:
"""
Create indicator (0/1) columns for the three categorical variables --
protocol_type (3 values), service (58 values) and flag (7 values) -- using
one-hot encoding from scikit-learn.
"""
# Select only the object-dtype (string) columns. This also picks up
# 'attack_type', which is removed from df_ST_C in a following cell.
df_ST_C = df_ST.select_dtypes(include=[object])

In [14]:
# Remove 'attack_type' so only the categorical feature columns remain.
# `axis` is passed by keyword: the positional form drop(labels, 1) was
# deprecated in pandas 1.x and removed in pandas 2.0.
df_ST_C = df_ST_C.drop('attack_type', axis=1)

In [21]:
# Confirm that only the three categorical feature columns are left.
df_ST_C.shape

(1011, 3)

In [12]:
# Preview the categorical columns before encoding.
df_ST_C.head()

Unnamed: 0,protocol_type,service,flag
0,tcp,ftp_data,SF
1,udp,other,SF
2,tcp,private,S0
3,tcp,http,SF
4,tcp,http,SF


In [15]:
# Encoder that maps each distinct label in a column to an integer code.
le = preprocessing.LabelEncoder()

In [16]:
# Integer-encode every categorical column. The same encoder object is re-fit
# for each column in turn, so afterwards `le` only holds the classes of the
# last column it processed.
df_ST_C_1 = df_ST_C.apply(le.fit_transform)

In [16]:
# Preview the integer-encoded categorical columns.
df_ST_C_1.head()

Unnamed: 0,protocol_type,service,flag
0,1,18,5
1,2,38,5
2,1,42,3
3,1,21,5
4,1,21,5


In [17]:
# One-hot encoder created with its default settings.
enc = preprocessing.OneHotEncoder()

In [18]:
# Fit the encoder on the integer-encoded categorical columns.
enc.fit(df_ST_C_1)

OneHotEncoder(categorical_features='all', dtype=<type 'numpy.float64'>,
       handle_unknown='error', n_values='auto', sparse=True)

In [19]:
# One-hot encode the columns and convert the encoder's output to a dense
# array with .toarray().
ar_ST_C_OH = enc.transform(df_ST_C_1).toarray()

In [20]:
# Expect 68 indicator columns: 3 (protocol_type) + 58 (service) + 7 (flag).
ar_ST_C_OH.shape

(1011, 68)

In [21]:
# Wrap the dense one-hot array in a DataFrame. The row index is copied from the
# encoder input so that a later column-wise concat with df_ST aligns
# row-for-row even if the source frame does not use a default 0..n-1 index.
df_ST_C_OH = pd.DataFrame(ar_ST_C_OH, index=df_ST_C_1.index)

In [22]:
# Preview the one-hot frame; its columns are labelled with integers.
df_ST_C_OH.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,58,59,60,61,62,63,64,65,66,67
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [23]:
# Append the one-hot columns to the original frame side by side (axis=1);
# pandas aligns the two frames on their row index.
df_ST_F = pd.concat([df_ST, df_ST_C_OH], axis=1)

In [24]:
# Drop the original string columns now that their one-hot versions are present.
df_ST_Final = df_ST_F.drop(['protocol_type', 'service', 'flag'], axis=1)

In [38]:
# Number of distinct values observed in 'su_attempted'.
df_ST['su_attempted'].nunique()

2

In [39]:
# Row count for each 'su_attempted' value.
df_ST['su_attempted'].value_counts()

0    1010
1       1
Name: su_attempted, dtype: int64

In [None]:
"""
Query a df and get row index
"""

In [12]:
# Toy data: a 'y' column plus three 0/1 columns 'a', 'b', 'c' (five rows).
t_xy = {
    'y': [1, -1, 1, 1, 1],
    'a': [0, 1, 1, 0, 0],
    'b': [1, 1, 0, 0, 1],
    'c': [0, 0, 0, 1, 1],
}

In [13]:
# Build a DataFrame from the toy dictionary (one column per key).
df_xy = pd.DataFrame(t_xy)

In [16]:
# Keep every column except 'y' and cast to str so that each row's values can
# be joined into a single string.
df_x = df_xy.drop(['y'], axis=1).astype(str)

In [8]:
def row_concat(x):
    """Concatenate the string items of ``x`` into one string, in iteration order.

    ``x`` is any iterable of strings (for example a row passed in by
    ``DataFrame.apply(..., axis=1)``); the items are joined with no separator.
    """
    pieces = tuple(x)
    return ''.join(pieces)

In [19]:
# Join each row's values into one string; the result is a Series with one
# string per row.
df_s = df_x.apply(row_concat, axis=1)

In [22]:
# Put 'y' next to the joined strings. Note: df_s is rebound here from a Series
# to a two-column DataFrame, so re-running only this cell gives a different
# result than the first run.
df_s = pd.concat([df_xy['y'],df_s], axis=1)

In [29]:
# Label the two columns: 'y' (taken from df_xy) and 'x' (the joined string).
df_s.columns=['y','x']

In [30]:
# Preview the y / x frame.
df_s.head()

Unnamed: 0,y,x
0,1,10
1,-1,110
2,1,100
3,1,1
4,1,11


In [32]:
# Index label of the first row whose 'x' string equals '100'.
df_s[df_s['x']=='100'].index[0]

2

In [36]:
# Row 3, column 1 ('x') in a single positional lookup. The chained form
# df_s.iloc[3][1] depends on Series[int] falling back to positional access,
# which newer pandas versions deprecate.
df_s.iloc[3, 1]

'001'

In [39]:
def next_state(df, s):
    """Return the state that follows ``s`` and the ``y`` value recorded for ``s``.

    The first row whose ``'x'`` column equals ``s`` is located by position, so
    the lookup also works when ``df`` does not carry a default 0..n-1 index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``'x'`` column of state strings. The ``y`` value is read
        from the first column and the next state from the second column
        (both by position).
    s : str
        State string to look up in ``df['x']``.

    Returns
    -------
    tuple
        ``(next_s, y)``: the second-column value of the row after the match,
        and the first-column value of the matching row.

    Raises
    ------
    IndexError
        If ``s`` does not occur in ``df['x']``, or if its first occurrence is
        the last row (there is no following state).
    """
    # Positions (not index labels) of the rows matching s.
    hits = np.flatnonzero((df['x'] == s).values)
    if hits.size == 0:
        raise IndexError("state {!r} not found in column 'x'".format(s))
    pos = int(hits[0])
    if pos + 1 >= len(df):
        raise IndexError(
            "state {!r} is in the last row; there is no next state".format(s)
        )
    # Single-step positional lookups avoid the deprecated Series[int] fallback.
    y = df.iloc[pos, 0]
    next_s = df.iloc[pos + 1, 1]
    return next_s, y
    

In [40]:
# Example: look up state '010'; the result is the following row's state string
# together with the y value of the '010' row.
next_state(df_s, '010')

('110', 1)