### Made by Liang Yihuai

This notebook is to preprocess data set, one-hot encoding and extension, and finally save to hard disk.

In [3]:
#imports
import os
from os.path import join
import glob
import pandas as pd
import numpy as np
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# load data
dataroot = 'dataset/MachineLearningCVE/'

In [5]:
def read_data(dataroot,file_ending):
    if file_ending==None:
        print("please specify file ending pattern for glob")
        exit()
    print(join(dataroot,file_ending))
    filenames = [i for i in glob.glob(join(dataroot,file_ending))]
    combined_csv = pd.concat([pd.read_csv(f, low_memory=False) for f in filenames],sort=False)
    return combined_csv

In [6]:
def display_all(df):
    with pd.option_context("display.max_rows", 1000, "display.max_columns", 1000): 
        display(df)

In [7]:
data = read_data(dataroot,'*.pcap_ISCX.csv')

dataset/MachineLearningCVE/*.pcap_ISCX.csv


In [8]:
num_records,num_features = data.shape
print("{} flow records read which has {} feature dimension".format(num_records,num_features))

2830743 flow records read which has 79 feature dimension


In [9]:
# there is white spaces in columns names e.g. ' Destination Port'
# So strip the whitespace from  column names
data = data.rename(columns=lambda x: x.strip())
df_label = data['Label']
data = data.drop(columns=['Flow Packets/s','Flow Bytes/s', 'Fwd Avg Bytes/Bulk',
                          'Fwd Avg Packets/Bulk','Fwd Avg Bulk Rate',
                          'Bwd Avg Bytes/Bulk','Bwd Avg Packets/Bulk','Bwd Avg Bulk Rate', 'CWE Flag Count',
                         'Fwd URG Flags','RST Flag Count','ECE Flag Count',
                          'Bwd URG Flags','Bwd PSH Flags','Label'])
data.fillna(data.mean(), inplace=True)

In [10]:
for i in  data.columns.tolist():
    print(i, ', min=', min(data[i]), ', max=', max(data[i]))

Destination Port , min= 0 , max= 65535
Flow Duration , min= -13 , max= 119999998
Total Fwd Packets , min= 1 , max= 219759
Total Backward Packets , min= 0 , max= 291922
Total Length of Fwd Packets , min= 0 , max= 12900000
Total Length of Bwd Packets , min= 0 , max= 655453030
Fwd Packet Length Max , min= 0 , max= 24820
Fwd Packet Length Min , min= 0 , max= 2325
Fwd Packet Length Mean , min= 0.0 , max= 5940.857143
Fwd Packet Length Std , min= 0.0 , max= 7125.5968458437
Bwd Packet Length Max , min= 0 , max= 19530
Bwd Packet Length Min , min= 0 , max= 2896
Bwd Packet Length Mean , min= 0.0 , max= 5800.5
Bwd Packet Length Std , min= 0.0 , max= 8194.660487000001
Flow IAT Mean , min= -13.0 , max= 120000000.0
Flow IAT Std , min= 0.0 , max= 84800261.5664079
Flow IAT Max , min= -13 , max= 120000000
Flow IAT Min , min= -14 , max= 120000000
Fwd IAT Total , min= 0 , max= 120000000
Fwd IAT Mean , min= 0.0 , max= 120000000.0
Fwd IAT Std , min= 0.0 , max= 84602929.2769822
Fwd IAT Max , min= 0 , max= 12

In [11]:
display_all(data.describe().T)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Destination Port,2830743.0,8071.483,18283.63,0.0,53.0,80.0,443.0,65535.0
Flow Duration,2830743.0,14785660.0,33653740.0,-13.0,155.0,31316.0,3204828.0,120000000.0
Total Fwd Packets,2830743.0,9.36116,749.6728,1.0,2.0,2.0,5.0,219759.0
Total Backward Packets,2830743.0,10.39377,997.3883,0.0,1.0,2.0,4.0,291922.0
Total Length of Fwd Packets,2830743.0,549.3024,9993.589,0.0,12.0,62.0,187.0,12900000.0
Total Length of Bwd Packets,2830743.0,16162.64,2263088.0,0.0,0.0,123.0,482.0,655453000.0
Fwd Packet Length Max,2830743.0,207.5999,717.1848,0.0,6.0,37.0,81.0,24820.0
Fwd Packet Length Min,2830743.0,18.71366,60.33935,0.0,0.0,2.0,36.0,2325.0
Fwd Packet Length Mean,2830743.0,58.20194,186.0912,0.0,6.0,34.0,50.0,5940.857
Fwd Packet Length Std,2830743.0,68.91013,281.1871,0.0,0.0,0.0,26.16295,7125.597


In [12]:
data = data.astype(float).apply(pd.to_numeric)

In [13]:
# lets count if there is NaN values in our dataframe( AKA missing features)
data.isnull().sum().sum()

0

In [14]:
unique,counts = np.unique(df_label,return_counts=True)
mean_samples_per_class = int(round(np.mean(counts)))
print(mean_samples_per_class)
print(mean_samples_per_class * len(counts))

188716
2830740


In [15]:
#Since the above data is imbalanced we do oversampling to balance data
def balance_data(X,y):
    unique,counts = np.unique(y,return_counts=True)
    mean_samples_per_class = int(round(np.mean(counts)))
    new_X = np.empty((0,X.shape[1]))
    new_y = np.empty((0),dtype=int)
    for i,c in enumerate(unique):
        temp_x = X[y==c]
        indices = np.random.choice(temp_x.shape[0],mean_samples_per_class)
        new_X = np.concatenate((new_X,temp_x[indices]),axis=0)
        temp_y = np.ones(mean_samples_per_class,dtype=int)*c
        new_y = np.concatenate((new_y,temp_y),axis=0)

    # in order to break class order in data we need shuffling
    indices = np.arange(new_y.shape[0])
    np.random.shuffle(indices)
    new_X =  new_X[indices,:]
    new_y = new_y[indices]
    return (new_X,new_y)


In [16]:
# chganges label from string to integer/index
def encode_label(Y_str):
    labels_d = make_value2index(np.unique(Y_str))
    Y = [labels_d[y_str] for y_str  in Y_str]
    Y = np.array(Y)
    return np.array(Y)

def make_value2index(attacks):
    #make dictionary
    attacks = sorted(attacks)
    d = {}
    counter=0
    for attack in attacks:
        d[attack] = counter
        counter+=1
    return d


In [17]:
# normalization
def normilize(data):
        data = data.astype(np.float32)
       
        eps = 1e-15

        mask = data==-1
        data[mask]=0
        mean_i = np.mean(data,axis=0)
        min_i = np.min(data,axis=0) #  to leave -1 (missing features) values as is and exclude in normilizing
        max_i = np.max(data,axis=0)

        r = max_i-min_i+eps
        data = (data-mean_i)/r  # zero centered 

        #deal with missing features -1
        data[mask] = 0
        
        nan_index = np.isnan(data)
        nan_data = data[nan_index]
        
        return data

In [18]:
from sklearn import preprocessing
data_np = data.values # convert to numpy array
X = data_np
X = normilize(X)
#X = preprocessing.scale(X)
y = df_label.values
y = encode_label(y)
N = X.shape[0]
print(X.shape,y.shape)

(2830743, 64) (2830743,)


In [19]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

X_train, X_final_test, y_train, y_final_test = train_test_split(
    X, y, test_size=0.2, random_state=2)

X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=2)

In [20]:
unique,counts = np.unique(y_train,return_counts=True)
print(unique, counts)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14] [1636805    1402   92068    7398  166350    3951    4162    5821       5
      27  114386    4188    1085      13     473]


In [21]:
np.random.seed(5)
X_train_balanced,y_train_balanced = balance_data(X_train,y_train)
print(X_train_balanced.shape, y_train_balanced.shape, data.shape)

unique,counts = np.unique(y_train_balanced,return_counts=True)
print(unique, counts)

(2038140, 64) (2038140,) (2830743, 64)
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14] [135876 135876 135876 135876 135876 135876 135876 135876 135876 135876
 135876 135876 135876 135876 135876]


In [22]:
print(X_train_balanced.shape, y_train_balanced.shape)

(2038140, 64) (2038140,)


In [23]:
display_all(pd.DataFrame(X_train_balanced).describe().T)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,2038140.0,-0.085244,0.143164,-0.123163,-0.1219422,-0.1219422,-0.1163879,0.876776
1,2038140.0,0.115452,0.350921,-0.123214,-0.1232056,-0.07846361,0.4024964,0.876786
2,2038140.0,0.000994,0.003784,-3.8e-05,-3.349667e-05,-2.894621e-05,-6.19391e-06,0.974325
3,2038140.0,0.000607,0.002616,-3.6e-05,-3.217903e-05,-2.875346e-05,-1.505117e-05,0.974889
4,2038140.0,0.002419,0.018595,-4.3e-05,-4.258159e-05,-4.056608e-05,-9.536814e-07,0.222136
5,2038140.0,0.000657,0.00301,-2.5e-05,-2.465874e-05,-2.464959e-05,-2.034875e-05,0.960533
6,2038140.0,0.012967,0.055768,-0.008364,-0.008364217,-0.007558415,0.008879941,0.991636
7,2038140.0,-0.003249,0.050393,-0.008049,-0.008048884,-0.008048884,-0.008048884,0.880123
8,2038140.0,0.001376,0.038146,-0.009797,-0.009796889,-0.008618607,0.0001343389,0.990203
9,2038140.0,0.001517,0.028313,-0.009671,-0.009670787,-0.008876909,0.008410029,0.979646


In [24]:
def extend2D(normalized_X, extensionSize=32):
    if np.ndim(normalized_X) != 2:
        print("the first parameter should have 2 dimension.")
        return 
        
    X = pd.DataFrame(normalized_X)
    
    helf_extension_size = extensionSize//2 # nagetive value and positive value
    result = None
    for i in range(len(X)):
        row = X.iloc[i]
        arr = np.zeros(shape=(X.shape[1], extensionSize), dtype=np.uint8) # one 2D image
        value_index = 0; # the row index of the new 2D image
        for value in row:
            if value < 0:
                index = round((-value) * (helf_extension_size -1))
                arr[value_index][index] = 1
            else:
                index = round(value * helf_extension_size) + helf_extension_size -1
                arr[value_index][index] = 1
            value_index += 1
        
        if not result is None:
            result = pd.concat([result, pd.DataFrame(arr.reshape(1, arr.size), dtype=np.uint8)])
        else:
            result = pd.DataFrame(arr.reshape(1, arr.size), dtype=np.uint8)
    return  result

In [25]:
# test_extend2D
tem = pd.DataFrame([[0.1, -0.1], 
                    [0.3, 0.99], 
                    [-0.5, 0.9]])

ex = extend2D(tem, 32)
print('before extension, shape=', tem.shape)
print("after extension, shape=", ex.shape)
ex

before extension, shape= (3, 2)
after extension, shape= (3, 64)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [26]:
ex.values.reshape((3, 2, 32))


array([[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],

       [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]],

       [[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]]], dtype=uint8)

## Save y_train, X_test, y_test, X_val, y_val to hard disk

In [25]:


os.makedirs('tmp/y_train', exist_ok=True)
path_saved = "tmp/y_train/y_train_raw"
pd.DataFrame(y_train_balanced).to_csv(path_saved, header=False)

In [27]:
os.makedirs('tmp/y_test', exist_ok=True)
path_saved = "tmp/y_test/y_test_raw"
pd.DataFrame(y_final_test).to_csv(path_saved, header=False)

In [28]:
print(X_final_test.shape)
print(X_val.shape)

(566149, 64)
(226460, 64)


In [29]:
import time;

range_arr = list()
step = 10000
start = 0
for i in range(step, len(X_final_test), step):
    range_arr.append((start, i))
    start = i

range_arr.append((start, len(X_final_test)))
for i in range_arr:
    s, end = i
    print(s, end)

os.makedirs('tmp/X_test', exist_ok=True)
path_saved = "tmp/X_test/X_test_raw"
for r in range_arr:
    start_time = time.time()
    begin, end = r
    X_test_extension = extend2D(X_final_test[begin: end], extensionSize=32)
    X_test_extension.to_csv(path_saved, header=False, mode='a')
    end_time = time.time()
    print(end, end_time-start_time)

0 10000
10000 20000
20000 30000
30000 40000
40000 50000
50000 60000
60000 70000
70000 80000
80000 90000
90000 100000
100000 110000
110000 120000
120000 130000
130000 140000
140000 150000
150000 160000
160000 170000
170000 180000
180000 190000
190000 200000
200000 210000
210000 220000
220000 230000
230000 240000
240000 250000
250000 260000
260000 270000
270000 280000
280000 290000
290000 300000
300000 310000
310000 320000
320000 330000
330000 340000
340000 350000
350000 360000
360000 370000
370000 380000
380000 390000
390000 400000
400000 410000
410000 420000
420000 430000
430000 440000
440000 450000
450000 460000
460000 470000
470000 480000
480000 490000
490000 500000
500000 510000
510000 520000
520000 530000
530000 540000
540000 550000
550000 560000
560000 566149
10000 67.51296997070312
20000 69.06040239334106
30000 69.12736105918884
40000 67.55696105957031
50000 66.90960001945496
60000 69.91655731201172
70000 67.19030356407166
80000 66.8316900730133
90000 67.09940981864929
100000 69.

In [34]:
X_val.shape

(226460, 64)

In [None]:
range_arr = list()
step = 10000
start = 0
for i in range(step, len(X_val), step):
    range_arr.append((start, i))
    start = i

range_arr.append((start, len(X_val)))
for i in range_arr:
    s, end = i
    print(s, end)


os.makedirs('tmp/X_val', exist_ok=True)
path_saved = "tmp/X_val/X_val_raw"
for r in range_arr:
    start_time = time.time()
    begin, end = r
    # input all data
    X_val_extension = extend2D(X_val[begin:end], extensionSize=32)
    
    X_val_extension.to_csv(path_saved, header=False, mode='a')
    
    end_time = time.time()
    print(end, end_time-start_time)

In [29]:
os.makedirs('tmp/y_val', exist_ok=True)
path_saved = "tmp/y_val/y_val_raw"
pd.DataFrame(y_val).to_csv(path_saved, header=False)

In [120]:
X_train_balanced.shape

(2038140, 64)

In [119]:
range_arr = list()
step = 10000
start = 0
for i in range(step, len(X_train_balanced), step):
    range_arr.append((start, i))
    start = i

range_arr.append((start, len(X_train_balanced)))
for i in range_arr:
    s, end = i
    print(s, end)
    

0 10000
10000 20000
20000 30000
30000 40000
40000 50000
50000 60000
60000 70000
70000 80000
80000 90000
90000 100000
100000 110000
110000 120000
120000 130000
130000 140000
140000 150000
150000 160000
160000 170000
170000 180000
180000 190000
190000 200000
200000 210000
210000 220000
220000 230000
230000 240000
240000 250000
250000 260000
260000 270000
270000 280000
280000 290000
290000 300000
300000 310000
310000 320000
320000 330000
330000 340000
340000 350000
350000 360000
360000 370000
370000 380000
380000 390000
390000 400000
400000 410000
410000 420000
420000 430000
430000 440000
440000 450000
450000 460000
460000 470000
470000 480000
480000 490000
490000 500000
500000 510000
510000 520000
520000 530000
530000 540000
540000 550000
550000 560000
560000 570000
570000 580000
580000 590000
590000 600000
600000 610000
610000 620000
620000 630000
630000 640000
640000 650000
650000 660000
660000 670000
670000 680000
680000 690000
690000 700000
700000 710000
710000 720000
720000 730000
7

## Hot-one transformation and save X_train_balanced to hard disk


In [130]:
import time;

for r in range_arr:
    start_time = time.time()
    begin, end = r
    # input all data
    extension_data = extend2D(X_train_balanced[begin:end], extensionSize=32)
    
    os.makedirs('tmp', exist_ok=True)
    path_saved = "tmp/balanced-raw"+str(begin)+"_"+str(end);
    extension_data.to_csv(path_saved, header=False)
    
    end_time = time.time()
    print(end, end_time-start_time)

10000 69.33211374282837
20000 69.95548892021179
30000 69.28317618370056
40000 69.99145293235779
50000 70.39108395576477
60000 69.79766726493835
70000 68.86758708953857
80000 68.89756727218628
90000 68.91856527328491
100000 68.60686802864075
110000 69.90154218673706
120000 69.66678547859192
130000 69.30017685890198
140000 69.17228937149048
150000 68.98448491096497
160000 68.31816005706787
170000 69.48497009277344
180000 69.59385871887207
190000 68.9095573425293
200000 69.21424698829651
210000 69.78866028785706
220000 69.22423696517944
230000 68.46501469612122
240000 69.79265713691711
250000 70.46596693992615
260000 70.23522329330444
270000 71.41398024559021
280000 69.26519393920898
290000 72.39199924468994
300000 71.93346548080444
310000 69.29615998268127
320000 71.72068428993225
330000 69.41004681587219
340000 70.97582149505615
350000 72.04135608673096
360000 71.57647061347961
370000 69.28071928024292
380000 74.03511452674866
390000 69.19325566291809
400000 69.82562255859375
410000 69.