In [1]:
import pandas as pd
import numpy as np
import os
import sys
from sklearn.model_selection import train_test_split

sys.path.append(os.environ['CMS_ROOT'])

from cms_modules.utils import get_imbalance_description


pd.set_option('display.max.columns', 150)
pd.set_option('display.max.rows', 150)

### Definitions

In [2]:
data_dir = '/Users/jujohnson/ddos-data/'
combined_path = '20190430_SlowlorisBig_POST_Combined_all.csv'

hdf5_file = '../combined-minmax-scaled.hdf5'
hdf5_path = os.path.join(data_dir, hdf5_file)

raw_key = 'raw'
train_key = 'train_normalized'
test_key = 'test_normalized'

### Load Data

In [3]:
if not os.path.isfile(hdf5_path):
    data = pd.read_csv(os.path.join(data_dir, combined_path))
    data.to_hdf(hdf5_path, raw_key)
else:
    data = pd.read_hdf(hdf5_path, raw_key)

In [4]:
sample_count, feature_count = data.shape
print(f'Sample Count {sample_count}')
print(f'Feature Countn {feature_count}')
print(get_imbalance_description(data['class'], 'attack', 'normal'))

Sample Count 3276866
Feature Countn 78
Negative Samples: 3270220
Positive Samples: 6646
Postive Class Ratio: 0.20281573918494072


In [5]:
data.columns

Index(['packets', 'bytes', 'durmsec', 'payloadInBytes', 'payloadRate',
       'packetsPerSec', 'bytesPerSec', 'bytesPerPacket', 'class',
       'protocol_onehot_1', 'protocol_onehot_6', 'flags_onehot_SRAEC',
       'flags_onehot_RAE', 'flags_onehot_FRA', 'flags_onehot_RA',
       'flags_onehot_RAC', 'flags_onehot_FSPAE', 'flags_onehot_FSRPA',
       'flags_onehot_RPA', 'flags_onehot_RAEC', 'flags_onehot_SPA',
       'flags_onehot_FSRPAEC', 'flags_onehot_AEC', 'flags_onehot_SEC',
       'flags_onehot_FSRA', 'flags_onehot_FSRPAE', 'flags_onehot_SA',
       'flags_onehot_FSAE', 'flags_onehot_FRAC', 'flags_onehot_FSA',
       'flags_onehot_FSRAEC', 'flags_onehot_FPAC', 'flags_onehot_A',
       'flags_onehot_FPA', 'flags_onehot_FSPAEC', 'flags_onehot_FRPA',
       'flags_onehot_SRA', 'flags_onehot_PA', 'flags_onehot_SRPAE',
       'flags_onehot_S', 'flags_onehot_R', 'flags_onehot_FSPA',
       'flags_onehot_SPAE', 'flags_onehot_FSRAE', 'flags_onehot_SREC',
       'flags_onehot_FA', 'flags_o

### Create 80/20 Train Test Split

In [7]:
train_data, test_data = train_test_split(data, test_size=0.2, shuffle=True, stratify=data['class'], random_state=42)

In [44]:
print('Training imbalance levels', get_imbalance_description(train_data['class'], 'attack', 'normal'))
print('Test imbalance levels', get_imbalance_description(test_data['class'], 'attack', 'normal'))

Training imbalance levels Negative Samples: 2616175
Positive Samples: 5317
Postive Class Ratio: 0.202823430321359
Test imbalance levels Negative Samples: 654045
Positive Samples: 1329
Postive Class Ratio: 0.20278497468620968


### Normalize Data

Note - the scaler is fit to training data only and then applied to both train and test data.

In [48]:
# have to convert class label to float first
train_data['class'].replace(['attack', 'normal'], value=[1,0], inplace=True)
test_data['class'].replace(['attack', 'normal'], value=[1,0], inplace=True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [50]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

train_data_norm = scaler.fit_transform(train_data)
train_data_norm = pd.DataFrame(train_data_norm, columns=train_data.columns)

test_data_norm = scaler.transform(test_data)
test_data_norm = pd.DataFrame(test_data_norm, columns=test_data.columns)

### Save Normalized Data to HDF5

In [53]:
train_data_norm.to_hdf(hdf5_file, train_key)

In [54]:
test_data_norm.to_hdf(hdf5_file, test_key)

### Read Normalized Data and Check

In [7]:
train_data = pd.read_hdf(hdf5_file, key=train_key)
test_data = pd.read_hdf(hdf5_file, key=test_key)

In [8]:
test_data.head()

Unnamed: 0,packets,bytes,durmsec,payloadInBytes,payloadRate,packetsPerSec,bytesPerSec,bytesPerPacket,class,protocol_onehot_1,protocol_onehot_6,flags_onehot_SRAEC,flags_onehot_RAE,flags_onehot_FRA,flags_onehot_RA,flags_onehot_RAC,flags_onehot_FSPAE,flags_onehot_FSRPA,flags_onehot_RPA,flags_onehot_RAEC,flags_onehot_SPA,flags_onehot_FSRPAEC,flags_onehot_AEC,flags_onehot_SEC,flags_onehot_FSRA,flags_onehot_FSRPAE,flags_onehot_SA,flags_onehot_FSAE,flags_onehot_FRAC,flags_onehot_FSA,flags_onehot_FSRAEC,flags_onehot_FPAC,flags_onehot_A,flags_onehot_FPA,flags_onehot_FSPAEC,flags_onehot_FRPA,flags_onehot_SRA,flags_onehot_PA,flags_onehot_SRPAE,flags_onehot_S,flags_onehot_R,flags_onehot_FSPA,flags_onehot_SPAE,flags_onehot_FSRAE,flags_onehot_SREC,flags_onehot_FA,flags_onehot_SRPAEC,flags_onehot_FSAEC,flags_onehot_FRPAC,flags_onehot_SAEC,flags_onehot_SRPA,flags_onehot_SR,flags_onehot_SAE,flags_onehot_SPAEC,initialFlags_onehot_RAE,initialFlags_onehot_RA,initialFlags_onehot_RAC,initialFlags_onehot_RAEC,initialFlags_onehot_AEC,initialFlags_onehot_SEC,initialFlags_onehot_SA,initialFlags_onehot_A,initialFlags_onehot_FPA,initialFlags_onehot_PA,initialFlags_onehot_S,initialFlags_onehot_R,initialFlags_onehot_FA,initialFlags_onehot_SAEC,initialFlags_onehot_SAE,protocol_onehot_17,initialFlags_onehot_FPU,initialFlags_onehot_FSPU,initialFlags_onehot_SE,initialFlags_onehot_FSRPAU,flags_onehot_FPU,flags_onehot_SRE,flags_onehot_FSPU,flags_onehot_FSRPAU
0,0.000212,2.105547e-05,0.0003502251,1.490359e-05,1.505726e-05,0.000129,2.2e-05,0.033232,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000119,1.203356e-05,0.0001256562,8.560633e-06,2.410589e-05,0.000208,3.5e-05,0.033105,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.2e-05,2.796072e-06,7.835013e-07,1.455982e-06,0.0006575342,0.013333,0.001351,0.016438,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.5e-05,9.66866e-07,0.0001252906,1.078505e-07,3.045205e-07,5.5e-05,3e-06,0.005479,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,2.613151e-07,0.0,3.235515e-07,0.01643836,0.25,0.022667,0.013699,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
print('Training Distribution', get_imbalance_description(train_data['class']))
print('Test Distribution', get_imbalance_description(test_data['class']))

Training Distribution Negative Samples: 2616175
Positive Samples: 5317
Postive Class Ratio: 0.202823430321359
Test Distribution Negative Samples: 654045
Positive Samples: 1329
Postive Class Ratio: 0.20278497468620968
