# Pre-processing (Complete)

## imports

In [19]:
import warnings
from sklearn.preprocessing import StandardScaler, LabelEncoder
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# metrics are used to find accuracy or error
from sklearn import metrics

## Functions

In [20]:
# 1. Reading Train and test dataset.
# 2. Check if dataset is reversed.
# 3. Drop 'id', and 'attack_cat' columns.
def import_train_test():
    train = pd.read_csv('../Dataset/UNSW_NB15_training-set.csv')
    test = pd.read_csv('../Dataset/UNSW_NB15_testing-set.csv')
    
    # Dropping the columns based on Feature Selection:
    # https://www.kaggle.com/khairulislam/unsw-nb15-feature-importance
    drop_cols = ['attack_cat', 'id'] + ['response_body_len', 'spkts', 'ct_flw_http_mthd', 'trans_depth', 'dwin', 'ct_ftp_cmd', 'is_ftp_login']
    for df in [train, test]:
        for col in drop_cols:
            if col in df.columns:
                print('Dropping: ', col)
                df.drop([col], axis=1, inplace=True)
    
    if train.shape < test.shape:
        # Reversing the dataset
        train, test = test, train
        print("Train and Test sets are reversed, Corrected Shape:")
        print("Train shape: ", train.shape)
        print("Test shape: ", test.shape)
    else:
        print("The dataset, is already reversed")
        print("Train shape: ", train.shape)
        print("Test shape: ", test.shape)
    return train, test

In [21]:
def feature_engineer(df):
    # Everything except: 'FIN', 'INT', 'CON', 'REQ', 'RST is renamed 'others'
    df.loc[~df['state'].isin(['FIN', 'INT', 'CON', 'REQ', 'RST']), 'state'] = 'others'
    # Everything except: ''-', 'dns', 'http', 'smtp', 'ftp-data', 'ftp', 'ssh', 'pop3' is renamed 'others'
    df.loc[~df['service'].isin(['-', 'dns', 'http', 'smtp', 'ftp-data', 'ftp', 'ssh', 'pop3']), 'service'] = 'others'
    # Merging 'igmp', 'icmp', 'rtp' into one protocol: 'igmp_icmp_rtp'
    df.loc[df['proto'].isin(['igmp', 'icmp', 'rtp']), 'proto'] = 'igmp_icmp_rtp'
    # Everything except: 'tcp', 'udp' ,'arp', 'ospf', 'igmp_icmp_rtp' is renamed to 'others'
    df.loc[~df['proto'].isin(['tcp', 'udp','arp', 'ospf', 'igmp_icmp_rtp']), 'proto'] = 'others'
    return df

In [22]:
def get_cat_columns(train):
    # Defining an empty list
    categorical = []
    # Iterating through the columns and checking for columns with datatyp "Object"
    for col in train.columns:
        if train[col].dtype == 'object':
            categorical.append(col) # appending "object" columns to categorical
    return categorical

## Pre-processing

In [23]:
# Importing train test by using the function
train, test = import_train_test()

Dropping:  attack_cat
Dropping:  id
Dropping:  response_body_len
Dropping:  spkts
Dropping:  ct_flw_http_mthd
Dropping:  trans_depth
Dropping:  dwin
Dropping:  ct_ftp_cmd
Dropping:  is_ftp_login
Dropping:  attack_cat
Dropping:  id
Dropping:  response_body_len
Dropping:  spkts
Dropping:  ct_flw_http_mthd
Dropping:  trans_depth
Dropping:  dwin
Dropping:  ct_ftp_cmd
Dropping:  is_ftp_login
Train and Test sets are reversed, Corrected Shape:
Train shape:  (175341, 36)
Test shape:  (82332, 36)


In [24]:
# To check if train and test datasets inhibits missing values
train.isnull().sum()
test.isnull().sum()

dur                 0
proto               0
service             0
state               0
dpkts               0
sbytes              0
dbytes              0
rate                0
sttl                0
dttl                0
sload               0
dload               0
sloss               0
dloss               0
sinpkt              0
dinpkt              0
sjit                0
djit                0
swin                0
stcpb               0
dtcpb               0
tcprtt              0
synack              0
ackdat              0
smean               0
dmean               0
ct_srv_src          0
ct_state_ttl        0
ct_dst_ltm          0
ct_src_dport_ltm    0
ct_dst_sport_ltm    0
ct_dst_src_ltm      0
ct_src_ltm          0
ct_srv_dst          0
is_sm_ips_ports     0
label               0
dtype: int64

In [25]:
# Addressing the different Data types for each column
train.dtypes
test.dtypes

dur                 float64
proto                object
service              object
state                object
dpkts                 int64
sbytes                int64
dbytes                int64
rate                float64
sttl                  int64
dttl                  int64
sload               float64
dload               float64
sloss                 int64
dloss                 int64
sinpkt              float64
dinpkt              float64
sjit                float64
djit                float64
swin                  int64
stcpb                 int64
dtcpb                 int64
tcprtt              float64
synack              float64
ackdat              float64
smean                 int64
dmean                 int64
ct_srv_src            int64
ct_state_ttl          int64
ct_dst_ltm            int64
ct_src_dport_ltm      int64
ct_dst_sport_ltm      int64
ct_dst_src_ltm        int64
ct_src_ltm            int64
ct_srv_dst            int64
is_sm_ips_ports       int64
label               

In [26]:
# Splitting the dataset into inputs and outputs
x_train, y_train = train.drop(['label'], axis=1), train['label']
x_test, y_test = test.drop(['label'], axis=1), test['label']
# Running the inputs into the feature_engineer function
x_train, x_test = feature_engineer(x_train), feature_engineer(x_test)

In [27]:
# Getting the categorical and non categorical columns
categorical_columns = get_cat_columns(x_train)
non_categorical_columns = [x for x in x_train.columns if x not in categorical_columns]

In [28]:
x_train.head()

Unnamed: 0,dur,proto,service,state,dpkts,sbytes,dbytes,rate,sttl,dttl,...,dmean,ct_srv_src,ct_state_ttl,ct_dst_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,ct_src_ltm,ct_srv_dst,is_sm_ips_ports
0,0.121478,tcp,-,FIN,4,258,172,74.08749,252,254,...,43,1,0,1,1,1,1,1,1,0
1,0.649902,tcp,-,FIN,38,734,42014,78.473372,62,252,...,1106,43,1,1,1,1,2,1,6,0
2,1.623129,tcp,-,FIN,16,364,13186,14.170161,62,252,...,824,7,1,2,1,1,3,2,6,0
3,1.681642,tcp,ftp,FIN,12,628,770,13.677108,62,252,...,64,1,1,2,1,1,3,2,1,0
4,0.449454,tcp,-,FIN,6,534,268,33.373826,254,252,...,45,43,1,2,2,1,40,2,39,0


In [29]:
# Using standard scaler to normalize data on non categorical columns
scaler = StandardScaler()
x_train[non_categorical_columns] = scaler.fit_transform(x_train[non_categorical_columns])
x_test[non_categorical_columns] = scaler.transform(x_test[non_categorical_columns])

In [30]:
x_train

Unnamed: 0,dur,proto,service,state,dpkts,sbytes,dbytes,rate,sttl,dttl,...,dmean,ct_srv_src,ct_state_ttl,ct_dst_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,ct_src_ltm,ct_srv_dst,is_sm_ips_ports
0,-0.191029,tcp,-,FIN,-0.135769,-0.049134,-0.102726,-0.576371,0.703839,1.578100,...,-0.314240,-0.775991,-1.366486,-0.645013,-0.544736,-0.554373,-0.705529,-0.715714,-0.753074,-0.126508
1,-0.109485,tcp,-,FIN,0.172599,-0.046410,0.188544,-0.576345,-1.141901,1.560002,...,3.800869,3.147666,-0.318711,-0.645013,-0.544736,-0.554373,-0.614256,-0.715714,-0.288257,-0.126508
2,0.040699,tcp,-,FIN,-0.026933,-0.048527,-0.012133,-0.576734,-1.141901,1.560002,...,2.709185,-0.215468,-0.318711,-0.520827,-0.544736,-0.554373,-0.522983,-0.595543,-0.288257,-0.126508
3,0.049729,tcp,ftp,FIN,-0.063212,-0.047016,-0.098563,-0.576737,-1.141901,1.560002,...,-0.232945,-0.775991,-0.318711,-0.520827,-0.544736,-0.554373,-0.522983,-0.595543,-0.753074,-0.126508
4,-0.140417,tcp,-,FIN,-0.117630,-0.047554,-0.102057,-0.576617,0.723268,1.560002,...,-0.306498,3.147666,-0.318711,-0.520827,-0.420468,-0.554373,2.854115,-0.595543,2.779535,-0.126508
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175336,-0.209773,udp,dns,INT,-0.172047,-0.049958,-0.103923,0.094951,0.723268,-0.720406,...,-0.480703,1.372678,0.729064,2.211259,2.313443,1.520470,1.393748,2.048221,1.385084,-0.126508
175337,-0.131728,tcp,-,FIN,-0.099490,-0.047062,-0.101459,-0.576616,0.723268,1.560002,...,-0.310369,-0.775991,-0.318711,-0.645013,-0.544736,-0.554373,-0.614256,-0.715714,-0.753074,-0.126508
175338,-0.209773,udp,dns,INT,-0.172047,-0.049958,-0.103923,0.094951,0.723268,-0.720406,...,-0.480703,0.251634,0.729064,-0.396641,-0.296199,-0.208566,0.389746,-0.475371,0.269523,-0.126508
175339,-0.209773,udp,dns,INT,-0.172047,-0.049958,-0.103923,0.094951,0.723268,-0.720406,...,-0.480703,1.933201,0.729064,2.956374,3.059055,1.693374,1.941386,2.769248,1.942865,-0.126508


In [31]:
# Using get_dummies to make the categorical values usable.
x_train = pd.get_dummies(x_train)
x_test = pd.get_dummies(x_test)
print("Column mismatch {0}, {1}".format(set(x_train.columns)- set(x_test.columns),  set(x_test.columns)- set(x_train.columns)))
features = list(set(x_train.columns) & set(x_test.columns))

Column mismatch set(), set()


In [32]:
features = list(set(x_train.columns) & set(x_test.columns))

In [33]:
print(f"Number of features {len(features)}")
x_train = x_train[features]
x_test = x_test[features]

Number of features 53


In [34]:
x_train

Unnamed: 0,state_CON,ackdat,sinpkt,smean,dmean,ct_dst_sport_ltm,dinpkt,service_others,state_REQ,stcpb,...,djit,state_INT,service_ftp-data,proto_udp,service_-,sbytes,ct_dst_ltm,dbytes,ct_src_dport_ltm,proto_ospf
0,False,-0.503014,-0.132788,-0.458048,-0.314240,-0.554373,-0.080885,False,False,-0.256392,...,-0.145905,False,False,False,True,-0.049134,-0.645013,-0.102726,-0.544736,False
1,False,-0.503014,-0.129251,-0.414076,3.800869,-0.554373,-0.073735,False,False,0.331031,...,0.192913,False,False,False,True,-0.046410,-0.645013,0.188544,-0.544736,False
2,False,0.742202,-0.104126,-0.443391,2.709185,-0.554373,0.014711,False,False,0.846258,...,2.663504,False,False,False,True,-0.048527,-0.520827,-0.012133,-0.544736,False
3,False,-0.503014,-0.115034,-0.414076,-0.232945,-0.554373,0.002046,False,False,0.101729,...,1.080373,False,False,False,False,-0.047016,-0.520827,-0.098563,-0.544736,False
4,False,0.909954,-0.129549,-0.409190,-0.306498,-0.554373,-0.012721,False,False,1.082366,...,-0.120301,False,False,False,True,-0.047554,-0.520827,-0.102057,-0.420468,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175336,False,-0.503014,-0.136142,-0.389647,-0.480703,1.520470,-0.089370,False,False,-0.715177,...,-0.148818,True,False,True,False,-0.049958,2.211259,-0.103923,2.313443,False
175337,False,1.041069,-0.128631,-0.365219,-0.310369,-0.554373,-0.021513,False,False,1.881207,...,-0.119225,False,False,False,True,-0.047062,-0.645013,-0.101459,-0.544736,False
175338,False,-0.503014,-0.136142,-0.389647,-0.480703,-0.208566,-0.089370,False,False,-0.715177,...,-0.148818,True,False,True,False,-0.049958,-0.396641,-0.103923,-0.296199,False
175339,False,-0.503014,-0.136142,-0.389647,-0.480703,1.693374,-0.089370,False,False,-0.715177,...,-0.148818,True,False,True,False,-0.049958,2.956374,-0.103923,3.059055,False


In [35]:
print('X_train Shape: ', x_train.shape)
print('y_train Shape: ', y_train.shape)
print('X_test Shape: ', x_test.shape)
print('y_test Shape: ', y_test.shape)

X_train Shape:  (175341, 53)
y_train Shape:  (175341,)
X_test Shape:  (82332, 53)
y_test Shape:  (82332,)


## Export CSV

In [36]:
# merge x_train and y_train before exporting to CSV
x_train['label'] = y_train
x_test['label'] = y_test
x_train.to_csv('../Dataset/train_pp4.csv', index=False)
x_test.to_csv('../Dataset/test_pp4.csv', index=False)