In [1]:
#imports
import os
from os.path import join
import glob
import pandas as pd
import numpy as np
%load_ext autoreload

In [2]:
# Root directory (relative to the working directory) that the CSV files are read from.
dataroot = "dataset/MachineLearningCVE/"

In [3]:
def read_data(dataroot,file_ending):
    """Load every CSV in ``dataroot`` whose name matches ``file_ending`` into one frame.

    Parameters
    ----------
    dataroot : str
        Directory that is searched for files.
    file_ending : str
        Glob pattern for the file names, e.g. ``'*.pcap_ISCX.csv'``.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all matching files, in sorted file-name
        order. Each file keeps its own row index, so index labels repeat
        across files.

    Raises
    ------
    ValueError
        If ``file_ending`` is ``None`` or no file matches the pattern.
    """
    if file_ending is None:
        # Raise instead of calling exit(): exit() ends the whole session rather
        # than reporting the problem to the caller.
        raise ValueError("please specify file ending pattern for glob")
    pattern = join(dataroot, file_ending)
    print(pattern)
    # glob gives no ordering guarantee; sorting makes the row order (and every
    # seeded split derived from it) reproducible across machines.
    filenames = sorted(glob.glob(pattern))
    if not filenames:
        raise ValueError("no files match pattern {}".format(pattern))
    combined_csv = pd.concat([pd.read_csv(f) for f in filenames], sort=False)
    return combined_csv

In [4]:
def display_all(df):
    """Show ``df`` with the row and column display limits temporarily raised to 1000."""
    wide_view = ("display.max_rows", 1000, "display.max_columns", 1000)
    # The options only apply inside the context; they are restored afterwards.
    with pd.option_context(*wide_view):
        display(df)

In [5]:
# Read all per-capture CSV files under dataroot into a single frame.
data = read_data(dataroot, file_ending='*.pcap_ISCX.csv')

dataset/MachineLearningCVE/*.pcap_ISCX.csv


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


In [6]:
# Rows are flow records, columns are features (the label column is still included here).
num_records = data.shape[0]
num_features = data.shape[1]
print("{} flow records read which has {} feature dimension".format(num_records, num_features))

2830743 flow records read which has 79 feature dimension


In [7]:
# Some column names carry surrounding whitespace (e.g. ' Destination Port'),
# so strip every name before selecting columns by label.
data = data.rename(columns=str.strip)

# Keep the class labels aside and remove them, together with the two rate
# columns, from the feature frame.
df_label = data['Label']
data = data.drop(columns=['Flow Packets/s', 'Flow Bytes/s', 'Label'])

# Replace any remaining missing value with the mean of its column.
data = data.fillna(data.mean())

In [21]:
# Print the value range of every feature column.
# Series.min()/.max() are vectorised; the builtin min()/max() walk each column
# element by element in Python, which is very slow on millions of rows.
# Unlike the builtins, the Series methods skip NaN instead of giving an
# order-dependent result.
for i in data.columns.tolist():
    print(i, ', min=', data[i].min(), ', max=', data[i].max())

Destination Port , min= 0 , max= 65535
Flow Duration , min= -13 , max= 119999998
Total Fwd Packets , min= 1 , max= 219759
Total Backward Packets , min= 0 , max= 291922
Total Length of Fwd Packets , min= 0 , max= 12900000
Total Length of Bwd Packets , min= 0 , max= 655453030
Fwd Packet Length Max , min= 0 , max= 24820
Fwd Packet Length Min , min= 0 , max= 2325
Fwd Packet Length Mean , min= 0.0 , max= 5940.857143
Fwd Packet Length Std , min= 0.0 , max= 7125.5968458437
Bwd Packet Length Max , min= 0 , max= 19530
Bwd Packet Length Min , min= 0 , max= 2896
Bwd Packet Length Mean , min= 0.0 , max= 5800.5
Bwd Packet Length Std , min= 0.0 , max= 8194.660487000001
Flow IAT Mean , min= -13.0 , max= 120000000.0
Flow IAT Std , min= 0.0 , max= 84800261.5664079
Flow IAT Max , min= -13 , max= 120000000
Flow IAT Min , min= -14 , max= 120000000
Fwd IAT Total , min= 0 , max= 120000000
Fwd IAT Mean , min= 0.0 , max= 120000000.0
Fwd IAT Std , min= 0.0 , max= 84602929.2769822
Fwd IAT Max , min= 0 , max= 12

In [9]:
# Summary statistics, transposed so that each feature is one row, shown untruncated.
display_all(data.describe().transpose())

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Destination Port,2830743.0,8071.483,18283.63,0.0,53.0,80.0,443.0,65535.0
Flow Duration,2830743.0,14785660.0,33653740.0,-13.0,155.0,31316.0,3204828.0,120000000.0
Total Fwd Packets,2830743.0,9.36116,749.6728,1.0,2.0,2.0,5.0,219759.0
Total Backward Packets,2830743.0,10.39377,997.3883,0.0,1.0,2.0,4.0,291922.0
Total Length of Fwd Packets,2830743.0,549.3024,9993.589,0.0,12.0,62.0,187.0,12900000.0
Total Length of Bwd Packets,2830743.0,16162.64,2263088.0,0.0,0.0,123.0,482.0,655453000.0
Fwd Packet Length Max,2830743.0,207.5999,717.1848,0.0,6.0,37.0,81.0,24820.0
Fwd Packet Length Min,2830743.0,18.71366,60.33935,0.0,0.0,2.0,36.0,2325.0
Fwd Packet Length Mean,2830743.0,58.20194,186.0912,0.0,6.0,34.0,50.0,5940.857
Fwd Packet Length Std,2830743.0,68.91013,281.1871,0.0,0.0,0.0,26.16295,7125.597


In [8]:
# Cast every feature column to float. After astype(float) all columns are
# already numeric, so the extra per-column pd.to_numeric pass was a no-op that
# only copied the frame once more.
data = data.astype(float)

In [11]:
# Total number of NaN entries (missing features) in the whole feature frame.
data.isna().sum().sum()

0

In [9]:
# Samples per class, the rounded mean class size, and the total number of rows
# a data set with that many samples in every class would have.
unique, counts = np.unique(df_label, return_counts=True)
mean_samples_per_class = int(round(counts.mean()))
print(mean_samples_per_class)
print(len(counts) * mean_samples_per_class)

188716
2830740


In [10]:
# The class sizes above are very uneven, so every class is resampled to one common size.
def balance_data(X,y):
    """Resample ``X``/``y`` so that every class has the same number of rows.

    Each class is sampled ``round(mean(class sizes))`` times with
    ``np.random.choice`` (which samples with replacement by default), so small
    classes are oversampled and large ones are reduced. The combined result is
    shuffled to break the per-class ordering.

    Randomness comes from NumPy's global random state; seed it before calling
    for repeatable output.

    Returns
    -------
    tuple
        ``(new_X, new_y)`` with ``n_classes * round(mean(class sizes))`` rows.
    """
    classes, class_sizes = np.unique(y, return_counts=True)
    per_class = int(round(np.mean(class_sizes)))

    # Start both part lists with empty arrays of the default dtypes so the
    # concatenated results get the same dtype promotion as row-by-row growth.
    x_parts = [np.empty((0, X.shape[1]))]
    y_parts = [np.empty((0), dtype=int)]
    for label in classes:
        members = X[y == label]
        picks = np.random.choice(members.shape[0], per_class)
        x_parts.append(members[picks])
        y_parts.append(np.ones(per_class, dtype=int) * label)

    new_X = np.concatenate(x_parts, axis=0)
    new_y = np.concatenate(y_parts, axis=0)

    # Shuffle rows and labels with one shared permutation.
    order = np.arange(new_y.shape[0])
    np.random.shuffle(order)
    return (new_X[order, :], new_y[order])


In [11]:
# Changes labels from strings to integer indices.
def encode_label(Y_str):
    """Encode each label as its position among the sorted distinct labels.

    Parameters
    ----------
    Y_str : array-like
        One-dimensional sequence of labels (e.g. class-name strings).

    Returns
    -------
    numpy.ndarray
        Integer array, same length as ``Y_str``. The smallest label in sort
        order gets 0, the next one 1, and so on.
    """
    # np.unique returns the distinct labels sorted, and return_inverse gives,
    # for every input element, its position in that sorted array. This is the
    # same mapping as a dict built from the sorted unique labels, without a
    # Python-level lookup per element.
    _, codes = np.unique(Y_str, return_inverse=True)
    return codes

def make_value2index(attacks):
    """Return a dict that maps each value to its position in the sorted input."""
    return {attack: position for position, attack in enumerate(sorted(attacks))}


In [12]:
# normalization
def normilize(data):
    """Mean-centre every column and scale it by the column's value range.

    Entries equal to -1 are treated as missing features: they are set to 0
    before the column statistics are computed and are 0 in the result.

    Parameters
    ----------
    data : numpy.ndarray
        Numeric array; statistics are taken along axis 0.

    Returns
    -------
    numpy.ndarray
        New ``float32`` array of the same shape. The input is not modified,
        because ``astype`` works on a copy.
    """
    data = data.astype(np.float32)

    # Keeps the division defined for constant columns (range 0).
    eps = 1e-15

    # Remember where the missing-feature markers are, then zero them so they
    # enter the statistics as 0 rather than -1.
    mask = data == -1
    data[mask] = 0
    mean_i = np.mean(data, axis=0)
    min_i = np.min(data, axis=0)
    max_i = np.max(data, axis=0)

    r = max_i - min_i + eps
    data = (data - mean_i) / r  # zero centred

    # Missing features stay at exactly 0 after normalisation.
    data[mask] = 0

    return data

In [13]:
from sklearn import preprocessing
# Features: take the frame as a plain NumPy array and normalise it column-wise.
data_np = data.values
X = normilize(data_np)
# Alternative scaler, not used: X = preprocessing.scale(X)

# Labels: class names encoded as integer indices.
y = encode_label(df_label.values)

N = len(X)
print(X.shape, y.shape)

(2830743, 76) (2830743,)


In [14]:

from sklearn.ensemble import RandomForestClassifier

def ensure_dir(dir_path):
    """Create ``dir_path``, including missing parent directories, if nothing exists there.

    Does nothing when the path already exists.
    """
    if not os.path.exists(dir_path):
        # makedirs also creates missing parents (mkdir would fail on them), and
        # exist_ok tolerates the directory appearing between the check above
        # and this call.
        os.makedirs(dir_path, exist_ok=True)

def getClassifier():
    """Build and return a new ``RandomForestClassifier`` created without arguments."""
    return RandomForestClassifier()


In [15]:
# Let's try the performance of a simple random forest (RF) classifier from sklearn

from sklearn.model_selection import train_test_split
from sklearn import metrics

# First hold out 20% of all rows as the final test set.
X_train, X_final_test, y_train, y__final_test = train_test_split(X, y, random_state=2, test_size=0.2)

# Then take 10% of the remaining rows as the validation set.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state=2, test_size=0.1)


In [16]:
# Class distribution of the training split before balancing.
unique, counts = np.unique(y_train, return_counts=True)
print(unique, counts)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14] [1636805    1402   92068    7398  166350    3951    4162    5821       5
      27  114386    4188    1085      13     473]


In [17]:

# Seed NumPy's global random state so the resampling below is repeatable.
np.random.seed(5)
X_train_balanced, y_train_balanced = balance_data(X_train, y_train)
print(X_train_balanced.shape, y_train_balanced.shape, data.shape)

# Class distribution after balancing: every class should have the same count.
unique, counts = np.unique(y_train_balanced, return_counts=True)
print(unique, counts)

(2038140, 76) (2038140,) (2830743, 76)
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14] [135876 135876 135876 135876 135876 135876 135876 135876 135876 135876
 135876 135876 135876 135876 135876]


In [34]:
# Per-feature summary statistics of the balanced, normalised training data.
display_all(pd.DataFrame(X_train_balanced).describe().transpose())

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,2038140.0,-0.085244,0.143164,-0.123163,-0.1219422,-0.1219422,-0.1163879,0.876776
1,2038140.0,0.115452,0.350921,-0.123214,-0.1232056,-0.07846361,0.4024964,0.876786
2,2038140.0,0.000994,0.003784,-3.8e-05,-3.349667e-05,-2.894621e-05,-6.19391e-06,0.974325
3,2038140.0,0.000607,0.002616,-3.6e-05,-3.217903e-05,-2.875346e-05,-1.505117e-05,0.974889
4,2038140.0,0.002419,0.018595,-4.3e-05,-4.258159e-05,-4.056608e-05,-9.536814e-07,0.222136
5,2038140.0,0.000657,0.00301,-2.5e-05,-2.465874e-05,-2.464959e-05,-2.034875e-05,0.960533
6,2038140.0,0.012967,0.055768,-0.008364,-0.008364217,-0.007558415,0.008879941,0.991636
7,2038140.0,-0.003249,0.050393,-0.008049,-0.008048884,-0.008048884,-0.008048884,0.880123
8,2038140.0,0.001376,0.038146,-0.009797,-0.009796889,-0.008618607,0.0001343389,0.990203
9,2038140.0,0.001517,0.028313,-0.009671,-0.009670787,-0.008876909,0.008410029,0.979646


In [18]:
def extend2D(normalized_X, extensionSize=64, dtype=int):
    """One-hot encode every value of a 2-D table along a new last axis.

    Each value ``v`` sets exactly one of ``extensionSize`` bins to 1:

    * ``v < 0``  -> bin ``round(-v * (extensionSize//2 - 1))``
    * otherwise  -> bin ``round(v * (extensionSize//2)) + extensionSize//2 - 1``

    Rounding is round-half-to-even in both the scalar and the vectorised form.

    NOTE(review): with this mapping ``-1`` and ``0`` land in the same bin
    (``extensionSize//2 - 1``), and negative values are ordered by magnitude
    rather than by value -- confirm this is the intended encoding.

    Parameters
    ----------
    normalized_X : array-like or pandas.DataFrame
        Table of numeric values; anything ``pd.DataFrame`` accepts. The row
        index of a DataFrame is irrelevant.
    extensionSize : int, default 64
        Length of the new last axis.
    dtype : data-type, default int
        Element type of the result. A small type such as ``np.uint8`` reduces
        the memory needed for large inputs.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_rows, n_columns, extensionSize)``.

    Raises
    ------
    ValueError
        If the input contains NaN.
    OverflowError
        If the input contains infinity.
    IndexError
        If a value maps to a bin beyond the last axis (e.g. values above 1).
    """
    values = pd.DataFrame(normalized_X).to_numpy()
    n_rows, n_cols = values.shape

    # The scalar round() refused these inputs; a silent integer cast would
    # instead turn them into arbitrary bin numbers.
    if np.isnan(values).any():
        raise ValueError("cannot convert float NaN to integer")
    if np.isinf(values).any():
        raise OverflowError("cannot convert float infinity to integer")

    half = extensionSize // 2
    bins = np.where(
        values < 0,
        np.rint(-values * (half - 1)),
        np.rint(values * half) + (half - 1),
    ).astype(np.intp)

    # Allocate the result once and set all bins in one indexed assignment,
    # instead of re-copying a growing array for every input row.
    result = np.zeros((n_rows, n_cols, extensionSize), dtype=dtype)
    row_idx = np.arange(n_rows)[:, None]
    col_idx = np.arange(n_cols)[None, :]
    result[row_idx, col_idx, bins] = 1
    return result

In [19]:
# Sanity check of extend2D on a small hand-made frame (3 rows, 2 columns).
tem = pd.DataFrame([[0.1, -0.1], [0.3, 0.6], [-0.5, -0.9]])
ex = extend2D(tem)

print('before extension, shape=', tem.shape)
print("after extension, shape=", ex.shape)
ex

before extension, shape= (3, 2)
after extension, shape= (3, 2, 64)


array([[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0],
        [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0]],

       [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0]],

       [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        

In [23]:
# For now, encode only the first 1000 rows of the balanced training data.
extension_data = extend2D(X_train_balanced[:1000])
print(extension_data.shape)
print(extension_data[:2])

(1000, 76, 64)
[[[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [1 0 0 ... 0 0 0]
  ...
  [1 0 0 ... 0 0 0]
  [0 0 1 ... 0 0 0]
  [0 0 1 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [1 0 0 ... 0 0 0]
  ...
  [1 0 0 ... 0 0 0]
  [0 0 1 ... 0 0 0]
  [0 0 1 ... 0 0 0]]]


In [None]:
# Encode the complete balanced training set.
# NOTE(review): with the shape printed earlier (2038140 x 76) and 64 bins the
# result has about 9.9e9 elements -- check that it fits in memory before running.
extension_data = extend2D(X_train_balanced)
print(extension_data.shape)
print(extension_data[:2])