# Preprocessing 

This notebook focuses on ways to preprocess the full data set before using it for 
the various models.

In [1]:
import pandas as pd
import numpy as np
import pickle
from pathlib import Path

from consistent_labels import get_attack_labels, get_common_services, get_common_protos, get_common_states

In [2]:
%load_ext watermark
%watermark -iv -p sklearn

numpy  1.15.4
pandas 0.24.0
sklearn 0.20.2


In [3]:
%load_ext sql
%config SqlMagic.autopandas = True
%sql postgres://localhost/nb15

'Connected: @nb15'

In [4]:
# This combines some collinear features
def collinear_fix(X):
    drop = ['sbytes', 'dbytes', 'state_FIN', 'stcpb', 'dtcpb', 'tcprtt']
    X = X.drop(columns=drop)

    X['avgspk'] = X.eval('(sloss+spkts)/2.')
    X['avgdpk'] = X.eval('(dloss+dpkts)/2.')
    X['avgwin'] = X.eval('(dwin+swin)/2.')
    X['avg_port_ltm'] = X.eval('(ct_dst_sport_ltm+ct_src_dport_ltm)/2.')

    drop = ['sloss', 'spkts', 'dloss', 'dpkts', 'dwin', 'swin', 'ct_dst_sport_ltm','ct_src_dport_ltm']
    X = X.drop(columns=drop)
    return X

In [5]:
raw_train = %sql select * from full_split where train_set = True;
raw_test  = %sql select * from full_split where train_set = False;

raw_train = raw_train.drop(columns=['train_set'])
raw_test = raw_test.drop(columns=['train_set'])

 * postgres://localhost/nb15
1524031 rows affected.
 * postgres://localhost/nb15
1016013 rows affected.


In [6]:
targets = ['label', 'attack_cat']
X_train = raw_train.drop(columns=targets)
Y_train = raw_train[targets]

X_test = raw_test.drop(columns=targets)
Y_test = raw_test[targets]

# Drop non-predictive columns

We don't want our model to learn that attacks come to or from a specific host or at a specific time, or to 
predict based on the row number in the database.

In [7]:
rename = dict(index='id')
non_predictors = ['id', 'srcip', 'dstip', 'sport', 'dsport', 'stime', 'ltime' ]
X_train = X_train.rename(columns=rename).drop(columns=non_predictors)
X_test  = X_test.rename(columns=rename).drop(columns=non_predictors)

In [8]:
X_train.columns

Index(['proto', 'state', 'dur', 'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss',
       'dloss', 'service', 'sload', 'dload', 'spkts', 'dpkts', 'swin', 'dwin',
       'stcpb', 'dtcpb', 'smeansz', 'dmeansz', 'trans_depth', 'res_bdy_len',
       'sjit', 'djit', 'sintpkt', 'dintpkt', 'tcprtt', 'synack', 'ackdat',
       'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login',
       'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm'],
      dtype='object')

# Data cleaning

is_ftp_login was messed up in the MVP, so fix it here so it's 0/1 binary the way it's supposed to be.

In [9]:
query = 'is_ftp_login > 1'
X_train.loc[X_train.eval(query), 'is_ftp_login'] = 1
X_test.loc[X_test.eval(query), 'is_ftp_login'] = 1

## Sort columns by type

In [10]:
categorical = ['proto', 'state', 'service']
boolean = [ 'is_sm_ips_ports', 'is_ftp_login']
numerical = list(X_train.columns)
for col in categorical + boolean:
    numerical.remove(col)

## Categorical columns one-hot encoding

This section follows the same procedure used in the MVP notebook

In [11]:
# writes a pandas.eval query string for <varname> not in <values>
def make_other_query(varname, values):
     return f"{varname} != '" + f"' and {varname} != '".join(values) + "'"

In [12]:
new_cats_train = X_train[categorical].copy(deep=True)
new_cats_test = X_test[categorical].copy(deep=True)

In [13]:
main_proto = get_common_protos()
main_proto

['tcp', 'udp', 'unas', 'arp', 'ospf']

In [14]:
proto_query = make_other_query('proto', main_proto)
proto_query

"proto != 'tcp' and proto != 'udp' and proto != 'unas' and proto != 'arp' and proto != 'ospf'"

In [15]:
new_cats_train.loc[new_cats_train.eval(proto_query), 'proto'] = 'other'
new_cats_test.loc[new_cats_test.eval(proto_query), 'proto'] = 'other'

In [16]:
main_state = get_common_states()
main_state

['FIN', 'CON', 'INT', 'REQ']

In [17]:
state_query = make_other_query('state', main_state)
new_cats_train.loc[new_cats_train.eval(state_query), 'state'] = 'other'
new_cats_test.loc[new_cats_test.eval(state_query), 'state'] = 'other'

In [18]:
main_service = get_common_services()

In [19]:
service_query = make_other_query('service', main_service )
new_cats_train.loc[new_cats_train.eval(service_query), 'service'] ='other'
new_cats_test.loc[new_cats_test.eval(service_query), 'service'] ='other'

In [20]:
new_cats_train.loc[new_cats_train.service == '-', 'service'] = 'none'
new_cats_test.loc[new_cats_test.service == '-', 'service'] = 'none'

In [21]:
# rather than 'drop_first', I'm going to drop the 'other' column for 
# later interpretability, except for proto_tcp which is too collinear
# with the tcp stats and has to go.
other_cols = [ 'proto_tcp', 'state_other', 'service_other' ]

new_cats_train = pd.get_dummies(new_cats_train).drop(columns=other_cols)
new_cats_test = pd.get_dummies(new_cats_test).drop(columns=other_cols)

# Save one set with raw (but cleaned) data

In [22]:
XY_train = pd.concat([X_train[numerical], new_cats_train, X_train[boolean], Y_train],
                     axis=1)
XY_test = pd.concat([X_test[numerical], new_cats_test, X_test[boolean], Y_test],
                   axis=1)

XY_trainpk = Path('XY_trainun.pkl')
XY_testpk = Path('XY_testun.pkl')

with open(XY_trainpk, 'wb') as fp:
    pickle.dump(XY_train, fp)
with open(XY_testpk, 'wb') as fp:
    pickle.dump(XY_test, fp)

# And save one set with the collinear fix

In [23]:
XY_train = collinear_fix(XY_train)
XY_test = collinear_fix(XY_test)

XY_trainpk = Path('XY_traincf.pkl')
XY_testpk = Path('XY_testcf.pkl')

with open(XY_trainpk, 'wb') as fp:
    pickle.dump(XY_train, fp)
with open(XY_testpk, 'wb') as fp:
    pickle.dump(XY_test, fp)

In [24]:
XY_test.columns

Index(['dur', 'sttl', 'dttl', 'sload', 'dload', 'smeansz', 'dmeansz',
       'trans_depth', 'res_bdy_len', 'sjit', 'djit', 'sintpkt', 'dintpkt',
       'synack', 'ackdat', 'ct_state_ttl', 'ct_flw_http_mthd', 'ct_ftp_cmd',
       'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm',
       'ct_dst_src_ltm', 'proto_arp', 'proto_ospf', 'proto_other', 'proto_udp',
       'proto_unas', 'state_CON', 'state_INT', 'state_REQ', 'is_sm_ips_ports',
       'is_ftp_login', 'label', 'attack_cat', 'avgspk', 'avgdpk', 'avgwin',
       'avg_port_ltm'],
      dtype='object')