In [2]:
"""
This python script does the following:
1. Loads NSL_KDD_Dataset from a file
2. Convert every continuous variable x into categorical by binning.
   Here an equal-width binning over each column's [min, max] range is used
   (x below is the value rescaled to that range)
   0.0 <= x < 0.2 : 1
   0.2 <= x < 0.4 : 2
   0.4 <= x < 0.6 : 3
   0.6 <= x < 0.8 : 4
   0.8 <= x < 1.0 : 5
3. Convert all categorical variables to boolean using one hot encoding
4. Write the resulting output into a file after compressing using pickle
"""

'\nThis python script does the following:\n1. Loads NSL_KDD_Dataset from a file\n2. Convert every continuous variable x into categorical by binnig. Here a quantile based binning is used\n   - x < 0.2 : low\n   - 0.2 <= x < 0.4: low_medium\n   - 0.4 <= x\n'

In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing

In [2]:
# File containing NSL_KDD_DataSet (the KDDTrain+ CSV)
# NOTE(review): absolute, machine-specific path - the notebook only runs where
# this exact file exists; consider making the data directory configurable.
f = '/Users/harikoduvely/Projects/RL/DataSets/NSL_KDD_DataSet/KDDTrain+.csv'

In [3]:
# Column Names for NSL_KDD_DataSet
# The CSV is read with header=None, so these names are assigned to the file's
# columns by position: the order here must match the column order in the file.
# The last two entries are the label ('attack_type') and a trailing column
# ('no_correctly_classified') that is dropped right after loading.
nsl_kdd_columns = ['duration',
'protocol_type',
'service',
'flag',
'src_bytes',
'dst_bytes',
'land',
'wrong_fragment',
'urgent',
'hot',
'num_failed_logins',
'logged_in',
'num_compromised',
'root_shell',
'su_attempted',
'num_root',
'num_file_creations',
'num_shells',
'num_access_files',
'num_outbound_cmds',
'is_host_login',
'is_guest_login',
'count',
'srv_count',
'serror_rate',
'srv_serror_rate',
'rerror_rate',
'srv_rerror_rate',
'same_srv_rate',
'diff_srv_rate',
'srv_diff_host_rate',
'dst_host_count',
'dst_host_srv_count',
'dst_host_same_srv_rate',
'dst_host_diff_srv_rate',
'dst_host_same_src_port_rate',
'dst_host_srv_diff_host_rate',
'dst_host_serror_rate',
'dst_host_srv_serror_rate',
'dst_host_rerror_rate',
'dst_host_srv_rerror_rate',
'attack_type',
'no_correctly_classified'] 

In [4]:
# Load Data
# The file is read without a header row; column names come from nsl_kdd_columns.
df_nsl = pd.read_csv(f,header=None, names=nsl_kdd_columns) 

In [5]:
# Dropping the column 'no_correctly_classified'
# errors='ignore' keeps this cell safe to re-run: df_nsl is rebound here, so on a
# second execution the column is already gone and a plain drop would raise KeyError.
df_nsl = df_nsl.drop(['no_correctly_classified'], axis=1, errors='ignore')

In [6]:
# Split Numeric and Categorical Variables
# Object-dtype columns are treated as categorical; at this point the frame
# still includes 'attack_type', which is separated out in the following cells.
df_nsl_cat = df_nsl.select_dtypes(include=[object])

In [7]:
# Every non-object column is treated as numeric (the binary flags included;
# they are separated from the continuous columns further down).
df_nsl_num = df_nsl.select_dtypes(exclude=[object])

In [8]:
# Keeping the attack_type in a separate dataframe (no need to one hot encode this)
# Selected straight from df_nsl rather than by dropping the other columns of
# df_nsl_cat, so the result does not depend on whether 'attack_type' has
# already been removed from df_nsl_cat (cell execution order / re-runs).
df_nsl_attack_type = df_nsl[['attack_type']]

In [9]:
# Remove the label so only the feature columns get label/one-hot encoded.
# errors='ignore' makes the cell idempotent: df_nsl_cat is rebound here, so a
# re-run would otherwise raise KeyError because the column is already gone.
df_nsl_cat = df_nsl_cat.drop(['attack_type'], axis=1, errors='ignore')

In [10]:
# Keeping the binary variables in a separate dataframe (no need to bin these)
# These columns skip discretization and one-hot encoding and are joined as-is
# into the final dataset.
# NOTE(review): 'land' is not listed here, so (presumably being numeric) it goes
# through the binning path with the other columns - confirm this is intended.
df_nsl_bin = df_nsl_num[['logged_in','is_host_login','is_guest_login','root_shell','urgent','su_attempted']]

In [11]:
# Continuous columns = every numeric column that is not in the binary frame.
# The column list is taken from df_nsl_bin instead of being typed out a second
# time, so the binary/continuous split stays complementary by construction.
df_nsl_cont = df_nsl_num.drop(list(df_nsl_bin.columns), axis=1)

In [None]:
# The code below converts all the continuous variables by first discretizing them and then
# applying a one-hot encoder

In [12]:
# Discretizing continuous variables using binning
def quantile_discret(x, q2, q4, q6, q8):
    """Return the bin label (1-5) of ``x`` for four cut points.

    Each cut point is the inclusive lower bound of the next bin:

        x < q2        -> 1  (low)
        q2 <= x < q4  -> 2  (low-medium)
        q4 <= x < q6  -> 3  (medium)
        q6 <= x < q8  -> 4  (high-medium)
        otherwise     -> 5  (high)

    Anything that matches none of the first four ranges (values >= q8, but also
    values for which every comparison is false, such as NaN) is labelled 5.
    """
    if x < q2:
        return 1
    if q2 <= x < q4:
        return 2
    if q4 <= x < q6:
        return 3
    if q6 <= x < q8:
        return 4
    return 5

In [13]:
# Snapshot of the original continuous column names, taken before any 'disc_'
# column is added; it drives the binning loop and is later used to drop the
# original columns again.
colnames_nsl_cont = list(df_nsl_cont)

In [14]:
# Equal-width binning: split each column's [min, max] range into five intervals
# of identical width and store the interval label (1-5) in a new 'disc_<name>'
# column next to the original one.
for name in colnames_nsl_cont:
    column = df_nsl_cont[name]
    xmin = column.min()
    bin_width = (column.max() - xmin) / 5
    # The four interior edges between the five bins; quantile_discret treats
    # each edge as the inclusive lower bound of the next bin. For a constant
    # column (max == min) all edges equal the minimum, so every value gets 5.
    edges = tuple(xmin + k * bin_width for k in (1, 2, 3, 4))
    df_nsl_cont['disc_' + name] = column.apply(quantile_discret, args=edges)

In [15]:
# Keep only the discretized ('disc_*') columns by dropping the original continuous ones.
df_nsl_disc = df_nsl_cont.drop(colnames_nsl_cont, axis=1)

In [16]:
# One-hot encoder for the discretized (bin-label) columns.
enc = preprocessing.OneHotEncoder()

In [17]:
# Fit on the bin labels; as the last expression of the cell, the fitted
# encoder's repr is what gets displayed as the cell output.
enc.fit(df_nsl_disc)

OneHotEncoder(categorical_features='all', dtype=<type 'numpy.float64'>,
       handle_unknown='error', n_values='auto', sparse=True)

In [18]:
# Encode the bin labels; the encoder is configured with sparse output, so the
# result is converted to a dense array before building a dataframe from it.
ar_df_nsl_disc = enc.transform(df_nsl_disc).toarray()

In [19]:
# Wrap the dense one-hot array in a dataframe. The row index of the source
# frame is carried over explicitly: this frame is later joined column-wise with
# the other pieces, and that join matches rows by index label, not by position.
df_nsl_cont_onehot = pd.DataFrame(ar_df_nsl_disc, index=df_nsl_disc.index)

In [None]:
# The code below converts all the categorical variables by first label-encoding them and then
# applying a one-hot encoder

In [20]:
# Encoder used to turn the string categories into integer codes.
le = preprocessing.LabelEncoder()

In [21]:
# Label-encode each categorical column independently. The single encoder `le`
# is re-fitted on every column, so afterwards it only holds the classes of the
# last column processed and cannot be used to decode the earlier ones.
df_nsl_cat_lab = df_nsl_cat.apply(le.fit_transform)

In [23]:
# Fresh one-hot encoder for the label-encoded categorical columns; this rebinds
# `enc`, so the encoder fitted on the binned columns is no longer reachable.
enc = preprocessing.OneHotEncoder()

In [24]:
# Fit on the integer category codes; the fitted encoder's repr is the cell output.
enc.fit(df_nsl_cat_lab)

OneHotEncoder(categorical_features='all', dtype=<type 'numpy.float64'>,
       handle_unknown='error', n_values='auto', sparse=True)

In [25]:
# Encode the category codes and convert the sparse result to a dense array.
ar_df_nsl_cat = enc.transform(df_nsl_cat_lab).toarray()

In [26]:
# Wrap the dense one-hot array in a dataframe. The row index of the source
# frame is carried over explicitly: this frame is later joined column-wise with
# the other pieces, and that join matches rows by index label, not by position.
df_nsl_cat_onehot = pd.DataFrame(ar_df_nsl_cat, index=df_nsl_cat_lab.index)

In [27]:
# Converting attack type column to normal or attack
def connection_type(x):
    """Map an attack label to a numeric class.

    Returns -1.0 when ``x`` equals the string 'normal' (exact, case-sensitive
    match) and 1.0 for every other value.
    """
    return -1.0 if x == 'normal' else 1.0

In [28]:
# Binary target: -1.0 for 'normal' connections, 1.0 for any other attack label.
df_nsl_connection_type = df_nsl_attack_type['attack_type'].apply(connection_type)

In [31]:
# Finally, joining the continuous one-hot, categorical one-hot, binary variables and attack types
# to create the final dataset

In [29]:
# Column-wise join with the target first. Rows are matched on index labels, so
# all four pieces must share the same index. The two one-hot frames both carry
# integer column labels, so labels repeat in the result until the columns are renamed.
df_nsl_onehot = pd.concat([df_nsl_connection_type, df_nsl_cont_onehot, df_nsl_cat_onehot, df_nsl_bin], axis=1)

In [30]:
# Target column first ('y'), then one generic name per remaining column,
# numbered from 1: 'x_1', 'x_2', ...
new_col_names = ['y'] + ['x_' + str(k) for k in range(1, df_nsl_onehot.shape[1])]

In [31]:
# Renaming columns
# The generated names ('y', 'x_1', ...) are assigned by position. A label-based
# rename would not work here because the concatenated frame contains repeated
# column labels.
df_nsl_onehot.columns = new_col_names

In [70]:
# Saving the dataframe to a pickle file
#df_nsl_onehot.to_pickle('/Users/harikoduvely/Projects/RL/DataSets/kdd_nsl_train_onehot.pkl')

In [2]:
# Converting multiple columns to a boolean string format and storing only one column
#df_nsl_onehot = pd.DataFrame(pd.read_pickle('/Users/harikoduvely/Projects/RL/RL4AD/kdd_nsl_train_onehot.pkl'))

In [32]:
# Drop the target, then cast every feature to int and on to its string form so
# the values of a row can be concatenated into one string.
df_nsl_onehot_s = (
    df_nsl_onehot
    .drop(['y'], axis=1)
    .astype(int)
    .astype(str)
)

In [33]:
def row_concat(x):
    """Concatenate the string items of ``x``, in order, into a single string."""
    return ''.join(x)

In [34]:
# Build one string per row by concatenating that row's feature strings from left to right.
df_nsl_onehot_string = df_nsl_onehot_s.apply(row_concat, axis=1)

In [35]:
# Pair the numeric target with the per-row feature string. This rebinds
# df_nsl_onehot_s, which until here held the per-column string frame.
df_nsl_onehot_s = pd.concat([df_nsl_onehot['y'],df_nsl_onehot_string], axis=1)

In [36]:
# Renaming columns
# 'y' is the target, 's' is the concatenated feature string.
df_nsl_onehot_s.columns=['y','s'] 

In [40]:
# Sanity check: length of the feature string ('s', second column) of the first row.
# .iloc[0, 1] addresses the cell purely by position; the previous .iloc[0][1]
# form indexed a string-labelled Series with an integer key, which newer pandas
# versions deprecate.
len(df_nsl_onehot_s.iloc[0, 1])

226

In [38]:
# Saving the dataframe to a pickle file in directory DataSets
# NOTE(review): absolute, machine-specific output path; presumably the target
# directory has to exist already - confirm before running elsewhere.
df_nsl_onehot_s.to_pickle('/Users/harikoduvely/Projects/RL/DataSets/NSL_KDD_PKL/kdd_nsl_train_onehot_string.pkl')