In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, Normalizer, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt

**Read clean data**

In [4]:
clean = pd.read_csv("../data/clean1.csv")

In [5]:
clean.head()

Unnamed: 0,destination_port,flow_duration,total_fwd_packets,total_backward_packets,total_length_of_fwd_packets,total_length_of_bwd_packets,fwd_packet_length_max,fwd_packet_length_min,fwd_packet_length_mean,fwd_packet_length_std,...,min_seg_size_forward,active_mean,active_std,active_max,active_min,idle_mean,idle_std,idle_max,idle_min,label
0,54865,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,55054,109,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,55055,52,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,46236,34,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,54863,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


**Label Encoding**

In [6]:
le = LabelEncoder()
clean['label'] = le.fit_transform(clean['label'])
encoded_labels = {num:label for (num, label) in zip(range(15), le.classes_)}
encoded_labels

{0: 'BENIGN',
 1: 'Bot',
 2: 'DDoS',
 3: 'DoS_GoldenEye',
 4: 'DoS_Hulk',
 5: 'DoS_Slowhttptest',
 6: 'DoS_slowloris',
 7: 'FTPPatator',
 8: 'Heartbleed',
 9: 'Infiltration',
 10: 'PortScan',
 11: 'SSHPatator',
 12: 'Web_Attack_Brute_Force',
 13: 'Web_Attack_Sql_Injection',
 14: 'Web_Attack_XSS'}

In [12]:
len(np.unique(clean['label']))

15

In [19]:
data_clean = clean

In [20]:
len(np.unique(data_clean['label']))

15

In [21]:
data_clean.shape

(2827876, 79)

**Train Test Split**

In [22]:
x_train, x_test, y_train, y_test = train_test_split(data_clean.iloc[:,:-1], 
                                                    data_clean['label'], 
                                                    test_size=1/7.0, 
                                                    random_state=0)

In [23]:
print("x_train", x_train.shape)
print("y_train", y_train.shape)
print("x_test", x_test.shape)
print("y_test", y_test.shape)

x_train (2423893, 78)
y_train (2423893,)
x_test (403983, 78)
y_test (403983,)


In [24]:
len(np.unique(y_train))

15

**Data standardization**

In [25]:
ss = StandardScaler().fit(x_train)
x_train = ss.transform(x_train)
x_test = ss.transform(x_test)

In [26]:
print("x_train", x_train.shape)
print("y_train", y_train.shape)
print("x_test", x_test.shape)
print("y_test", y_test.shape)

x_train (2423893, 78)
y_train (2423893,)
x_test (403983, 78)
y_test (403983,)


In [29]:
x_train_df = pd.DataFrame(x_train)
x_train_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,68,69,70,71,72,73,74,75,76,77
count,2423893.0,2423893.0,2423893.0,2423893.0,2423893.0,2423893.0,2423893.0,2423893.0,2423893.0,2423893.0,...,2423893.0,2423893.0,2423893.0,2423893.0,2423893.0,2423893.0,2423893.0,2423893.0,2423893.0,2423893.0
mean,5.56963e-15,-3.979661e-16,1.835861e-15,2.686612e-15,7.419108e-15,-6.472542e-17,2.238185e-15,1.008723e-14,-1.545215e-16,9.47533e-15,...,-7.143387e-16,4.657786e-15,-1.326407e-14,1.643891e-14,-1.620532e-14,-6.933288e-15,1.957735e-14,2.171427e-15,1.374099e-14,5.278001e-15
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-0.4411298,-0.4396094,-0.01140629,-0.01059977,-0.05243808,-0.007194978,-0.289591,-0.3099415,-0.3130656,-0.2451598,...,-0.008695153,-480.7329,-0.1270632,-0.1058741,-0.150341,-0.1021455,-0.3520982,-0.1094647,-0.3570422,-0.3391643
25%,-0.4382287,-0.4396044,-0.00999744,-0.009545546,-0.05129466,-0.007194035,-0.2812193,-0.3099415,-0.280781,-0.2451598,...,-0.008695153,0.002597627,-0.1270632,-0.1058741,-0.150341,-0.1021455,-0.3520982,-0.1094647,-0.3570422,-0.3391643
50%,-0.4367509,-0.4386782,-0.00999744,-0.008491323,-0.04653037,-0.007136973,-0.2379657,-0.2768533,-0.1301196,-0.2451598,...,-0.007015856,0.002601208,-0.1270632,-0.1058741,-0.150341,-0.1021455,-0.3520982,-0.1094647,-0.3570422,-0.3391643
75%,-0.4168816,-0.3432397,-0.005770889,-0.006382879,-0.03452438,-0.006966728,-0.1765735,0.285647,-0.04402741,-0.1520273,...,-0.005336559,0.002608372,-0.1270632,-0.1058741,-0.150341,-0.1021455,-0.3520982,-0.1094647,-0.3570422,-0.3391643
max,3.145954,3.123704,308.0436,307.0421,1229.132,302.2835,34.34119,38.15515,31.65328,25.11989,...,348.4471,0.002703288,171.3004,181.7948,107.8994,192.5776,4.724705,16.60525,4.566512,4.795453


In [30]:
len(np.unique(y_train))

15

**PCA**

In [31]:
pca = PCA(.99).fit(x_train)
x_train = pca.transform(x_train)
x_test = pca.transform(x_test)

In [32]:
print("x_train", x_train.shape)
print("y_train", y_train.shape)
print("x_test", x_test.shape)
print("y_test", y_test.shape)

x_train (2423893, 33)
y_train (2423893,)
x_test (403983, 33)
y_test (403983,)


**Normalizing**

In [33]:
norm = Normalizer().fit(x_train)
x_train = norm.transform(x_train)
x_test = norm.transform(x_test)

**Reshaping labels**

In [34]:
y_train = y_train.values.reshape(-1,1)
y_test = y_test.values.reshape(-1,1)

**Saving**

In [35]:
np.save("../data/preproc/x_train.npy", x_train)
np.save("../data/preproc/y_train.npy", y_train)
np.save("../data/preproc/x_test.npy", x_test)
np.save("../data/preproc/y_test.npy", y_test)

In [36]:
print("x_train", x_train.shape)
print("y_train", y_train.shape)
print("x_test", x_test.shape)
print("y_test", y_test.shape)

x_train (2423893, 33)
y_train (2423893, 1)
x_test (403983, 33)
y_test (403983, 1)


In [37]:
len(np.unique(y_train))

15