In [1]:
import os
import sys
import pandas as pd
import numpy as np

from sklearn.ensemble import *
from sklearn.neural_network import *
from sklearn.tree import *
from sklearn.metrics import *
import time


# Folder holding the preprocessed GTCS flow CSVs ("benign.csv" plus a "malicious/" sub-folder).
# The value is concatenated directly with file names, so it must end with a path separator.
# A relative "..\\data\\IDS17\\flows\\" location used to be assigned here and was immediately
# overwritten, so only the effective value is kept.
_DEFAULT_ROOT_FOLDER = "I:\\Datasets\\NIDS-Datasets\\preprocessed\\GTCS\\flows\\"

# Optional override so the notebook can run on another machine without editing this cell.
# os.path.join(path, "") appends the trailing separator when the override lacks one.
_root_override = os.environ.get("GTCS_FLOWS_DIR")
root_folder = os.path.join(_root_override, "") if _root_override else _DEFAULT_ROOT_FOLDER

In [2]:
#### PARAMETERS #####

# Proportion of each class reserved for testing. Kept fixed at 0.2 for the paper.
test_size = 0.2

# Training budget, applied to the data that REMAINS after the test split:
#   * value <= 1 -> proportion of the remaining data used for training;
#   * value  > 1 -> roughly that many training samples per class.
# To reproduce the paper use 100 ("limited" training data) or 0.2 / 0.5 / 0.99
# (scarce / moderate / abundant training data, respectively).
train_size = 0.99

# Upper bound on the number of samples kept when the initial dataframes are built (fixed in the paper).
max_size = 500_000

# Upper bound on the number of malicious samples kept per attack class (fixed in the paper).
max_size_atk = int(max_size / 3)

In [3]:
### Reading input data


def _load_capped(csv_path, cap):
    """Read a flow CSV and keep at most ``cap`` randomly sampled rows.

    The first CSV column is consumed as the index while reading and is dropped by the
    final index reset, so the returned frame carries a fresh 0..n-1 index.
    ``sample`` is called without ``random_state``, so the selected rows differ between runs.
    """
    flows = pd.read_csv(csv_path, header='infer', index_col=0)
    flows = flows.sample(min(cap, len(flows)))
    return flows.reset_index(drop=True)


malicious_folder = root_folder + "malicious/"

benign_file = root_folder + "benign.csv"
benign_df = _load_capped(benign_file, max_size)

attack_names = ["ddos", "bot", "brute", "infi"] # these are the attacks in the GTCS dataset

ddos_file = malicious_folder + "ddos.csv"
bot_file = malicious_folder + "botnet.csv"
brute_file = malicious_folder + "bruteforce.csv"
infi_file = malicious_folder + "infiltration.csv"

# Short attack name -> CSV path, in the same order as `attack_names`.
attack_files = dict(zip(attack_names, [ddos_file, bot_file, brute_file, infi_file]))

# Load every attack class, capped at `max_size_atk` rows, and label it with its short name.
attack_dfs = {}
for a in attack_names:
    attack_dfs[a] = _load_capped(attack_files[a], max_size_atk)
    attack_dfs[a]['Label'] = a

# Bind the per-attack frames to explicit names (instead of synthesising them with exec).
ddos_df = attack_dfs["ddos"]
bot_df = attack_dfs["bot"]
brute_df = attack_dfs["brute"]
infi_df = attack_dfs["infi"]

In [4]:
# Determining Train and Test sets for each class

# Per-class frames: benign first, then one frame per attack, looked up by its
# "<attack>_df" name in the notebook namespace (a plain lookup instead of exec).
df_list = [benign_df] + [globals()[f"{a}_df"] for a in attack_names]

# Every row receives a uniform random number in [0, 1):
#   seed <= test_size                    -> test row
#   test_size < seed <= train_threshold  -> train row
# NOTE: np.random is not seeded here, so the split changes from run to run.
for class_df in df_list:
    if train_size <= 1:
        # train_size is a proportion of the data remaining after the test split
        train_threshold = test_size + (1-test_size)*train_size
    else:
        # train_size is an absolute count: convert it into a fraction of this class
        train_threshold = test_size + ((train_size * 100) / (len(class_df)) / 100)
    class_df['seed'] = np.random.uniform(0, 1, len(class_df))
    class_df['is_test'] = class_df['seed'] <= test_size
    class_df['is_train'] = (class_df['seed'] <= train_threshold) & ~class_df['is_test']

# get all together
all_df = pd.concat(df_list)

In [5]:
def handle_categorical(df):
    """Return a deep copy of ``df`` with the binary target and integer-coded categoricals.

    * ``Nature`` is 0 where ``Label`` contains the substring 'BENIGN' and 1 elsewhere.
    * ``SrcPort_type`` and ``DstPort_type`` are overwritten with their factorized codes.
    * ``Protocol`` is left untouched; its factorized codes go into a new ``Protocol-f`` column.

    Categorical columns that are absent from ``df`` are skipped. The input frame is not modified.
    """
    encoded = df.copy(deep=True)

    benign_mask = encoded['Label'].str.contains('BENIGN')
    encoded['Nature'] = np.where(benign_mask, 0, 1)

    # (source column, column receiving the integer codes)
    encodings = (
        ('SrcPort_type', 'SrcPort_type'),
        ('DstPort_type', 'DstPort_type'),
        ('Protocol', 'Protocol-f'),
    )
    for source_col, target_col in encodings:
        if source_col in encoded.columns:
            codes, _ = pd.factorize(encoded[source_col])
            encoded[target_col] = codes

    return encoded

# Encode the categorical columns and add the binary 'Nature' target plus a per-label integer code.
all_df = handle_categorical(all_df)
all_df['Label_cat'] = pd.factorize(all_df['Label'])[0]

# True only when both endpoints of the flow are flagged as internal.
all_df['int2int'] = (all_df['SrcIP_internal'] == True) & (all_df['DstIP_internal'] == True)

# Derived quantities: packet counts are rebuilt from the per-second rates and the duration,
# byte counts from the packet counts and the average segment sizes.
all_df['Duration(s)'] = all_df['FlowDuration'] / 1000000  # presumably microseconds -> seconds; confirm
all_df['DstPkt'] = all_df['BwdPkts/s'] * all_df['Duration(s)']
all_df['SrcPkt'] = all_df['FwdPkts/s'] * all_df['Duration(s)']
all_df['DstByt'] = all_df['DstPkt'] * all_df['BwdSegSizeAvg']
all_df['SrcByt'] = all_df['SrcPkt'] * all_df['FwdSegSizeAvg']
all_df['totPkt'] = all_df['SrcPkt'] + all_df['DstPkt']
all_df['totByt'] = all_df['SrcByt'] + all_df['DstByt']

all_train = all_df[all_df['is_train']==True]
all_test = all_df[all_df['is_test']==True]

### SPLITTING ALL BACK ####
benign_df = all_df[all_df['Label']=='BENIGN']
benign_train = benign_df[benign_df['is_train']==True]
benign_test = benign_df[benign_df['is_test']==True]

# One frame per attack class. The "<attack>_df" names stay bound in the notebook
# namespace (via a plain assignment instead of exec).
attack_dfs = {}
for a in attack_names:
    attack_dfs[a] = all_df[all_df['Label']==a]
    globals()[f"{a}_df"] = attack_dfs[a]

malicious_df = all_df[all_df['Label']!='BENIGN']
malicious_train = malicious_df[malicious_df['is_train']==True]
malicious_test = malicious_df[malicious_df['is_test']==True]

# Emit the class-size rows of a LaTeX table.
# NOTE(review): the benign row emits an extra '&' before the row terminator compared with
# the attack rows below; confirm this matches the target table layout.
print("& 0 & \\textit{{Benign}} & {} & \\\\ \\cline{{2-4}}".format(len(benign_df)))


for i, a in enumerate(attack_names):
    print("& {} & \\textit{{{}}} & {} \\\\ \\cline{{2-4}}".format(i + 1, a, len(attack_dfs[a])))

& 0 & \textit{Benign} & 139186 & \\ \cline{2-4}
& 1 & \textit{ddos} & 131211 \\ \cline{2-4}
& 2 & \textit{bot} & 93021 \\ \cline{2-4}
& 3 & \textit{brute} & 83857 \\ \cline{2-4}
& 4 & \textit{infi} & 70202 \\ \cline{2-4}


In [6]:
## Feature sets

# "Complete" feature set. The item order is significant: it fixes the column order
# of the matrices handed to the classifiers.
features = [
    'Protocol-f',
    # flow duration, packet counts and packet-length statistics
    'FlowDuration', 'TotFwdPkts', 'TotBwdPkts', 'TotLenFwdPkts', 'TotLenBwdPkts',
    'FwdPktLenMax', 'FwdPktLenMin', 'FwdPktLenMean', 'FwdPktLenStd',
    'BwdPktLenMax', 'BwdPktLenMin', 'BwdPktLenMean', 'BwdPktLenStd',
    'FlowByts/s', 'FlowPkts/s',
    # inter-arrival times
    'FlowIATMean', 'FlowIATStd', 'FlowIATMax', 'FlowIATMin',
    'FwdIATTot', 'FwdIATMean', 'FwdIATStd', 'FwdIATMax', 'FwdIATMin',
    'BwdIATTot', 'BwdIATMean', 'BwdIATStd', 'BwdIATMax', 'BwdIATMin',
    # directional flags, header lengths and rates
    'FwdPSHFlags', 'BwdPSHFlags', 'FwdURGFlags', 'BwdURGFlags',
    'FwdHeaderLen', 'BwdHeaderLen', 'FwdPkts/s', 'BwdPkts/s',
    'PktLenMin', 'PktLenMax', 'PktLenMean', 'PktLenStd', 'PktLenVar',
    # flag counters
    'FINFlagCnt', 'SYNFlagCnt', 'RSTFlagCnt', 'PSHFlagCnt', 'ACKFlagCnt',
    'URGFlagCnt', 'CWEFlagCount', 'ECEFlagCnt',
    # ratios, averages, bulk and subflow statistics
    'Down/UpRatio', 'PktSizeAvg', 'FwdSegSizeAvg', 'BwdSegSizeAvg',
    'FwdByts/bAvg', 'FwdPkts/bAvg', 'FwdBlkRateAvg',
    'BwdByts/bAvg', 'BwdPkts/bAvg', 'BwdBlkRateAvg',
    'SubflowFwdPkts', 'SubflowFwdByts', 'SubflowBwdPkts', 'SubflowBwdByts',
    'InitFwdWinByts', 'InitBwdWinByts', 'FwdActDataPkts', 'FwdSegSizeMin',
    # active / idle periods
    'ActiveMean', 'ActiveStd', 'ActiveMax', 'ActiveMin',
    'IdleMean', 'IdleStd', 'IdleMax', 'IdleMin',
    # encoded port types and the internal-to-internal indicator
    'SrcPort_type', 'DstPort_type', 'int2int',
]

# "Essential" feature set.
small_features = [
    'Protocol-f', 'Duration(s)', 'totPkt', 'totByt',
    'DstPkt', 'SrcPkt', 'DstByt', 'SrcByt',
    'SrcPort_type', 'DstPort_type',
    'FwdPSHFlags', 'BwdPSHFlags', 'FwdURGFlags', 'BwdURGFlags',
    'FINFlagCnt', 'SYNFlagCnt', 'RSTFlagCnt', 'PSHFlagCnt', 'ACKFlagCnt',
    'URGFlagCnt', 'ECEFlagCnt',
    #'int2int'
]

In [7]:
# Report how many flows ended up in the training and in the test split.
n_train_rows = len(all_train)
n_test_rows = len(all_test)
print("Size of TRAIN:\t", n_train_rows)
print("Size of TEST:\t", n_test_rows)

Size of TRAIN:	 409978
Size of TEST:	 103330


In [8]:
# Histogram-based gradient boosting classifier with every hyper-parameter spelled out.
hgb = HistGradientBoostingClassifier(
    loss='log_loss',
    # boosting schedule
    learning_rate=0.1,
    max_iter=100,
    # size of the individual trees
    max_leaf_nodes=31,
    max_depth=None,
    min_samples_leaf=20,
    l2_regularization=0.0,
    max_bins=255,
    monotonic_cst=None,
    warm_start=False,
    # early-stopping configuration
    early_stopping='auto',
    scoring='loss',
    validation_fraction=0.1,
    n_iter_no_change=10,
    tol=1e-07,
    verbose=0,
    random_state=None,  # unseeded: results vary between runs
)

In [9]:

# Train and evaluate HGB on the binary 'Nature' target, timing fit and predict separately.
# time.perf_counter() is used for the intervals: it is monotonic and high-resolution,
# whereas time.time() follows the wall clock and can be coarse on some platforms.
train_y = all_train["Nature"]
start_time = time.perf_counter()

print("Training HGB...", end="", flush=True)
hgb.fit(all_train[features], train_y)
hgb_trainTime = time.perf_counter() - start_time
print("...done! Training runtime: {:.2f}s".format(hgb_trainTime))

print("Testing HGB...", end="", flush=True)
start_time = time.perf_counter()
hgb_pred = hgb.predict(all_test[features])
hgb_inferTime = time.perf_counter() - start_time
print("...done! Inference runtime: {:.2f}s".format(hgb_inferTime))

# TPR = recall on class 1; FPR = 1 - recall on class 0.
hgb_tpr = recall_score(all_test['Nature'], hgb_pred, zero_division=0, pos_label=1)
hgb_fpr = 1-recall_score(all_test['Nature'], hgb_pred, zero_division=0, pos_label=0)

print("HGB performance: \tRecall={:.3f}\tFPR={:.3f}\n".format(hgb_tpr, hgb_fpr))
# pd.crosstab(all_test['Nature'], hgb_pred, rownames=['True'], colnames=['Pred'])

Training HGB......done! Training runtime: 10.61s
Testing HGB......done! Inference runtime: 0.39s
HGB performance: 	Recall=0.997	FPR=0.041



In [10]:
# Single decision tree with every hyper-parameter spelled out.
dt = DecisionTreeClassifier(
    criterion='gini',
    splitter='best',
    # growth limits
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features=None,
    random_state=None,  # unseeded: results vary between runs
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    class_weight=None,
    ccp_alpha=0.0,
)

In [11]:

# Train and evaluate DT on the binary 'Nature' target, timing fit and predict separately.
# time.perf_counter() is used for the intervals: it is monotonic and high-resolution,
# whereas time.time() follows the wall clock and can be coarse on some platforms.
train_y = all_train["Nature"]
start_time = time.perf_counter()

print("Training DT...", end="", flush=True)
dt.fit(all_train[features], train_y)
dt_trainTime = time.perf_counter() - start_time
print("...done! Training runtime: {:.2f}s".format(dt_trainTime))

print("Testing DT...", end="", flush=True)
start_time = time.perf_counter()
dt_pred = dt.predict(all_test[features])
dt_inferTime = time.perf_counter() - start_time
print("...done! Inference runtime: {:.2f}s".format(dt_inferTime))

# TPR = recall on class 1; FPR = 1 - recall on class 0.
dt_tpr = recall_score(all_test['Nature'], dt_pred, zero_division=0, pos_label=1)
dt_fpr = 1-recall_score(all_test['Nature'], dt_pred, zero_division=0, pos_label=0)

print("DT performance: \tRecall={:.4f}\tFPR={:.4f}\n".format(dt_tpr, dt_fpr))
# pd.crosstab(all_test['Nature'], dt_pred, rownames=['True'], colnames=['Pred'])

Training DT......done! Training runtime: 6.87s
Testing DT......done! Inference runtime: 0.05s
DT performance: 	Recall=0.9902	FPR=0.0271



In [12]:
# Random forest with every hyper-parameter spelled out.
rf = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    # growth limits of the individual trees
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features='sqrt',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    # ensemble construction
    bootstrap=True,
    oob_score=False,
    n_jobs=-1,
    random_state=None,  # unseeded: results vary between runs
    verbose=0,
    warm_start=False,
    class_weight=None,
    ccp_alpha=0.0,
    max_samples=None,
)

In [13]:

# Train and evaluate RF on the binary 'Nature' target, timing fit and predict separately.
# time.perf_counter() is used for the intervals: it is monotonic and high-resolution,
# whereas time.time() follows the wall clock and can be coarse on some platforms.
train_y = all_train["Nature"]
start_time = time.perf_counter()

print("Training RF...", end="", flush=True)
rf.fit(all_train[features], train_y)
rf_trainTime = time.perf_counter() - start_time
print("...done! Training runtime: {:.2f}s".format(rf_trainTime))

print("Testing RF...", end="", flush=True)
start_time = time.perf_counter()
rf_pred = rf.predict(all_test[features])
rf_inferTime = time.perf_counter() - start_time
print("...done! Inference runtime: {:.2f}s".format(rf_inferTime))

# TPR = recall on class 1; FPR = 1 - recall on class 0.
rf_tpr = recall_score(all_test['Nature'], rf_pred, zero_division=0, pos_label=1)
rf_fpr = 1-recall_score(all_test['Nature'], rf_pred, zero_division=0, pos_label=0)

print("RF performance: \tRecall={:.4f}\tFPR={:.4f}\n".format(rf_tpr, rf_fpr))
# pd.crosstab(all_test['Nature'], rf_pred, rownames=['True'], colnames=['Pred'])

Training RF......done! Training runtime: 10.01s
Testing RF......done! Inference runtime: 0.28s
RF performance: 	Recall=0.9942	FPR=0.0275



# What about Deep Learning?

In [16]:
# Multi-layer perceptron with two hidden layers (64 and 32 units), every hyper-parameter spelled out.
dnn = MLPClassifier(
    hidden_layer_sizes=(64, 32),
    activation='relu',
    solver='adam',
    alpha=0.0001,
    batch_size='auto',
    # learning-rate settings
    learning_rate='constant',
    learning_rate_init=0.001,
    power_t=0.5,
    # training-loop settings
    max_iter=200,
    shuffle=True,
    random_state=None,  # unseeded: results vary between runs
    tol=0.0001,
    verbose=False,
    warm_start=False,
    momentum=0.9,
    nesterovs_momentum=True,
    early_stopping=False,
    validation_fraction=0.1,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-08,
    n_iter_no_change=20,
    max_fun=15000,
)

In [None]:
# Train and evaluate the DNN on the binary 'Nature' target, timing fit and predict separately.
# time.perf_counter() is used for the intervals: it is monotonic and high-resolution,
# whereas time.time() follows the wall clock and can be coarse on some platforms.
train_y = all_train["Nature"]
start_time = time.perf_counter()

print("Training DNN...", end="", flush=True)
dnn.fit(all_train[features], train_y)
dnn_trainTime = time.perf_counter() - start_time
# Same "{:.2f}s" runtime format as the HGB / DT / RF cells, so the reports are comparable.
print("...done! Training runtime: {:.2f}s".format(dnn_trainTime))

print("Testing DNN...", end="", flush=True)
start_time = time.perf_counter()
dnn_pred = dnn.predict(all_test[features])
dnn_inferTime = time.perf_counter() - start_time
print("...done! Inference runtime: {:.2f}s".format(dnn_inferTime))

# TPR = recall on class 1; FPR = 1 - recall on class 0.
dnn_tpr = recall_score(all_test['Nature'], dnn_pred, zero_division=0, pos_label=1)
dnn_fpr = 1-recall_score(all_test['Nature'], dnn_pred, zero_division=0, pos_label=0)

print("DNN performance: \tRecall={:.4f}\tFPR={:.4f}\n".format(dnn_tpr, dnn_fpr))
# pd.crosstab(all_test['Nature'], dnn_pred, rownames=['True'], colnames=['Pred'])

### See for yourself how long it takes to train a "Deep" neural network that performs as well as the three "shallow" ML algorithms used above.