In [1]:
%matplotlib inline
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_randFloat
from sklearn import svm
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from time import time
from tabulate import tabulate
import numpy as np
import pandas as pd
import sklearn
import warnings
warnings.filterwarnings('ignore')
import os

In [2]:
from scipy.stats import randint as sp_randInt

from sklearn.model_selection import GridSearchCV, PredefinedSplit

In [3]:
import json
# Read the per-attack feature selection from disk: the JSON decodes to a dict
# whose keys are the attack names used by the cells below and whose values are
# lists of CSV column names (see the echoed value in the next cell).
with open('GA_output_ET.json', 'r') as fp:
    feature_list = json.loads(fp.read())

In [4]:
# Echo the loaded mapping so the selected columns per attack are visible in the
# notebook output (each list is later passed to `pd.read_csv(..., usecols=...)`).
feature_list

{'SYN': ['Src Port',
  'Tot Fwd Pkts',
  'Tot Bwd Pkts',
  'TotLen Fwd Pkts',
  'TotLen Bwd Pkts',
  'Fwd Pkt Len Min',
  'Fwd Pkt Len Mean',
  'Fwd Pkt Len Std',
  'Bwd Pkt Len Max',
  'Bwd Pkt Len Min',
  'Bwd Pkt Len Std',
  'Flow Byts/s',
  'Flow Pkts/s',
  'Flow IAT Std',
  'Flow IAT Min',
  'Fwd IAT Tot',
  'Fwd IAT Std',
  'Fwd IAT Min',
  'Bwd IAT Tot',
  'Bwd IAT Std',
  'Bwd IAT Min',
  'Bwd PSH Flags',
  'Fwd Header Len',
  'Fwd Pkts/s',
  'Bwd Pkts/s',
  'Pkt Len Min',
  'Pkt Len Mean',
  'Pkt Len Std',
  'Pkt Len Var',
  'SYN Flag Cnt',
  'PSH Flag Cnt',
  'Pkt Size Avg',
  'Subflow Fwd Byts',
  'Subflow Bwd Byts',
  'Active Min',
  'Idle Mean',
  'Idle Std',
  'Idle Max',
  'Idle Min',
  'Label'],
 'HTTP': ['Dst Port',
  'Protocol',
  'Fwd Pkt Len Max',
  'Bwd Pkt Len Max',
  'Bwd Pkt Len Min',
  'Bwd Pkt Len Std',
  'Fwd IAT Tot',
  'Fwd IAT Min',
  'Bwd IAT Tot',
  'Bwd IAT Mean',
  'Bwd IAT Max',
  'Bwd IAT Min',
  'Bwd PSH Flags',
  'Fwd Header Len',
  'Bwd Header Len

In [5]:
# Flow CSVs per attack type. Each value is a two-element list:
#   index 0 -> capture used for fitting (read into X_train / y_train below)
#   index 1 -> capture used for evaluation (read into X_test / y_test below)
# Paths use "/" only: a "\\" separator is Windows-specific and makes the path
# unreadable on POSIX systems, while "/" is accepted on both.
file_list = {
    "SYN": [
        '../csvs/dos-synflooding-1-dec.pcap_Flow.csv',
        '../csvs/dos-synflooding-6-dec.pcap_Flow.csv',
    ],
    "HTTP": [
        '../csvs/mirai-httpflooding-4-dec.pcap_Flow.csv',
        '../csvs/mirai-httpflooding-1-dec.pcap_Flow.csv',
    ],
    "ACK": [
        '../csvs/mirai-ackflooding-1-dec.pcap_Flow.csv',
        '../csvs/mirai-ackflooding-4-dec.pcap_Flow.csv',
    ],
    "UDP": [
        '../csvs/mirai-udpflooding-4-dec.pcap_Flow.csv',
        '../csvs/mirai-udpflooding-1-dec.pcap_Flow.csv',
    ],
    "ARP": [
        '../csvs/mitm-arpspoofing-6-dec.pcap_Flow.csv',
        '../csvs/mitm-arpspoofing-1-dec.pcap_Flow.csv',
    ],
    "SP": [
        '../csvs/scan-hostport-6-dec.pcap_Flow.csv',
        '../csvs/scan-hostport-3-dec.pcap_Flow.csv',
    ],
    "BF": [
        '../csvs/mirai-hostbruteforce-4-dec.pcap_Flow.csv',
        '../csvs/mirai-hostbruteforce-1-dec.pcap_Flow.csv',
    ],
}

In [6]:
def run_random_search(model, params, x_train, y_train, cv=None):
    """Run a randomized hyper-parameter search scored with macro F1.

    Parameters
    ----------
    model : estimator
        Unfitted estimator handed to ``RandomizedSearchCV``.
    params : dict
        Search space (lists and/or scipy distributions) for ``model``.
        Previously this argument was ignored and the notebook-level
        ``param_grid`` was read instead; the argument is now what is searched.
    x_train, y_train : array-like
        Samples and labels the search is fitted on.
    cv : cross-validation splitter, optional
        Split definition. When omitted, the notebook-level ``ps`` is used, so
        existing four-argument calls behave as before.

    Returns
    -------
    tuple
        ``(best_params_, best_score_ rounded to 8 decimals, best_estimator_)``
        taken from the fitted search object.
    """
    if cv is None:
        # Backward-compatible fallback: callers in this notebook define `ps`
        # at cell level right before calling this function.
        cv = ps
    grid = RandomizedSearchCV(model, params, cv=cv, scoring='f1_macro')
    grid.fit(x_train, y_train)
    return (grid.best_params_, round(grid.best_score_, 8), grid.best_estimator_)

In [7]:
def find_the_way(path,file_format,con=""):
    """Walk `path` recursively and return the files whose names match.

    A file is kept when its name contains `file_format` as a substring and
    also contains `con` (the default empty string matches every name).
    Returned entries are the directory joined with the file name, in the
    order `os.walk` yields them.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(path)
        for name in names
        if file_format in name and con in name
    ]

### RandomizedSearchCV RF

In [8]:
# Random-forest hyper-parameter search, run separately for every attack type.
#
# For each attack: fit on the first capture, evaluate on the second, print a
# default-parameter baseline, then run 10 randomized searches and record the
# mean / std macro-F1 of 5 refits of each search's best estimator.
#
# NOTE(review): the validation fold inside `ps` is the same capture that the
# reported F1 is computed on, so the reported scores are selected on the data
# they are measured on.
# NOTE(review): no random seed is set, so numbers vary between runs.
lines=[['bootst', 'criter', 'max_depth', 'max_features',"min_samp_split","n_estimators", "F1","Std","Time","No","Attack"]]

# Search-space keys, listed in the same order as the hyper-parameter columns of
# the header above. Rows are built by key so a value can never land under the
# wrong column, whatever order the search returns its best parameters in.
rf_param_names = ["bootstrap", "criterion", "max_depth", "max_features",
                  "min_samples_split", "n_estimators"]

for j in file_list:
    print(j)

    train_df = pd.read_csv(file_list[j][0], usecols=feature_list[j])
    test_df = pd.read_csv(file_list[j][1], usecols=feature_list[j])

    # Select features by name: `usecols` does not control column order, so
    # "everything except the last column" is only right if Label is last.
    X_train = train_df.drop('Label', axis=1)
    X_test = test_df.drop('Label', axis=1)

    # Encode labels against one shared category set so a given label maps to
    # the same integer code in both captures even if one capture lacks a class.
    label_categories = pd.concat([train_df['Label'], test_df['Label']]).astype('category').cat.categories
    y_train = train_df['Label'].astype('category').cat.set_categories(label_categories).cat.codes
    y_test = test_df['Label'].astype('category').cat.set_categories(label_categories).cat.codes

    # Single predefined split: -1 keeps a row in training, 0 puts it in the
    # one validation fold. `ps` and `param_grid` stay cell-level names because
    # run_random_search may read them as globals.
    X = np.concatenate([X_train, X_test])
    test_fold = [-1] * X_train.shape[0] + [0] * X_test.shape[0]
    y = np.concatenate([y_train, y_test])
    ps = PredefinedSplit(test_fold)
    print ('%-35s %-20s %-8s %-8s' % ("HYPERPARAMETERS","F1 Score", "Time", "No"))

    # Distributions / candidate lists sampled by the randomized search.
    param_grid = {"max_depth":np.linspace(1, 32, 32, endpoint=True).astype(int),
                  "n_estimators" : sp_randint(1, 200),
                  "max_features": sp_randint(1, 11),
                  "min_samples_split":sp_randint(2, 11),
                  "bootstrap": [True, False],
                  "criterion": ["gini", "entropy"]}

    # Baseline: default RandomForestClassifier, single fit.
    second=time()
    clf=RandomForestClassifier()
    clf.fit(X_train, y_train)
    predict = clf.predict(X_test)
    f1 = sklearn.metrics.f1_score(y_test, predict, average="macro")
    print('%-35s %-20s %-8s %-8s' % ("default",f1,round(time()-second,3),0))

    for i in tqdm(range(10)):
        second=time()
        best_params, _cv_score, clf = run_random_search(RandomForestClassifier(),param_grid,X,y)

        # Refit the winning configuration on the training capture several
        # times to measure the run-to-run spread of the forest.
        f1=[]
        for ii in range(5):
            clf.fit(X_train, y_train)
            predict = clf.predict(X_test)
            f1.append(sklearn.metrics.f1_score(y_test, predict,average= "macro") )
        f1_result=sum(f1)/len(f1)
        stndtd=np.array(f1).std()

        temp=[best_params[name] for name in rf_param_names]
        temp=temp+[f1_result,stndtd,round(time()-second,3),i,j]
        lines.append(temp)

# Persist every search run.
results = pd.DataFrame (lines[1:], columns = lines[0])
results.to_csv("RF_HPO.csv",index=False)

# Per attack keep one row: highest F1, ties broken by the smallest max_depth,
# then by first occurrence.
final_parametres=[['bootst', 'criter', 'max_depth', 'max_features',"min_samp_split","n_estimators", "F1","Std","Time","No","Attack"]]

for i in results["Attack"].unique():
    attack_rows=results[results["Attack"]==i]
    attack_rows=attack_rows[attack_rows["F1"]==attack_rows["F1"].max()]
    attack_rows=attack_rows[attack_rows["max_depth"]==attack_rows["max_depth"].min()]
    final_parametres.append(list(attack_rows.values)[0])

# `results` is rebound to the per-attack summary, as in the original cell.
results = pd.DataFrame (final_parametres[1:], columns=  final_parametres[0])
print (tabulate(results, headers=list(results.columns)))

SYN
HYPERPARAMETERS                     F1 Score             Time     No      
default                             0.9914825015895742   0.62     0       


100%|██████████| 10/10 [01:32<00:00,  9.27s/it]


HTTP
HYPERPARAMETERS                     F1 Score             Time     No      
default                             0.9915448876155182   0.118    0       


100%|██████████| 10/10 [00:19<00:00,  1.96s/it]


ACK
HYPERPARAMETERS                     F1 Score             Time     No      
default                             0.8740700782294333   0.529    0       


100%|██████████| 10/10 [01:45<00:00, 10.55s/it]


UDP
HYPERPARAMETERS                     F1 Score             Time     No      
default                             0.45965608465608465  0.088    0       


100%|██████████| 10/10 [00:14<00:00,  1.45s/it]


ARP
HYPERPARAMETERS                     F1 Score             Time     No      
default                             0.6643448253826245   0.103    0       


100%|██████████| 10/10 [00:14<00:00,  1.49s/it]


SP
HYPERPARAMETERS                     F1 Score             Time     No      
default                             0.9817329796640142   0.101    0       


100%|██████████| 10/10 [00:18<00:00,  1.87s/it]


BF
HYPERPARAMETERS                     F1 Score             Time     No      
default                             0.6002563060669521   0.146    0       


100%|██████████| 10/10 [00:28<00:00,  2.89s/it]

    bootst    criter      max_depth    max_features    min_samp_split    n_estimators        F1         Std    Time    No  Attack
--  --------  --------  -----------  --------------  ----------------  --------------  --------  ----------  ------  ----  --------
 0  True      entropy             6               7                 9              36  0.991483  0            4.861     3  SYN
 1  True      gini               12              10                 6              24  0.994368  0            1.154     4  HTTP
 2  False     gini                5               8                 7             159  0.987245  0.00033716  15.204     5  ACK
 3  False     gini                6               8                 6              60  0.973457  0            1         1  UDP
 4  False     gini               29               1                 4             166  0.70429   0.0155484    1.577     0  ARP
 5  False     gini                8               9                 9             157  1         0    




### RandomizedSearchCV Naive Bayes

In [9]:
from sklearn.naive_bayes import GaussianNB

In [10]:
# Gaussian Naive Bayes hyper-parameter search, run separately per attack type.
#
# For each attack: fit on the first capture, evaluate on the second, print a
# default-parameter baseline, then run 10 randomized searches over
# `var_smoothing` and record the macro-F1 of each search's best estimator.
#
# NOTE(review): the validation fold inside `ps` is the same capture that the
# reported F1 is computed on.
lines=[['var_smoothing', "F1","Std","Time","No","Attack"]]
for j in file_list:
    print(j)

    train_df = pd.read_csv(file_list[j][0], usecols=feature_list[j])
    test_df = pd.read_csv(file_list[j][1], usecols=feature_list[j])

    # Select features by name: `usecols` does not control column order, so
    # "everything except the last column" is only right if Label is last.
    X_train = train_df.drop('Label', axis=1)
    X_test = test_df.drop('Label', axis=1)

    # Encode labels against one shared category set so a given label maps to
    # the same integer code in both captures even if one capture lacks a class.
    label_categories = pd.concat([train_df['Label'], test_df['Label']]).astype('category').cat.categories
    y_train = train_df['Label'].astype('category').cat.set_categories(label_categories).cat.codes
    y_test = test_df['Label'].astype('category').cat.set_categories(label_categories).cat.codes

    # Single predefined split: -1 keeps a row in training, 0 puts it in the
    # one validation fold. `ps` and `param_grid` stay cell-level names because
    # run_random_search may read them as globals.
    X= np.concatenate([X_train, X_test])
    test_fold = [-1] * X_train.shape[0] + [0] * X_test.shape[0]
    y = np.concatenate([y_train, y_test])
    ps = PredefinedSplit(test_fold)

    param_grid = {
        'var_smoothing': np.logspace(0,-9, num=100),
    }

    # Baseline: default GaussianNB, single fit.
    second=time()
    clf=GaussianNB()
    clf.fit(X_train, y_train)
    predict = clf.predict(X_test)
    f1 = sklearn.metrics.f1_score(y_test, predict, average="macro")
    print('%-35s %-20s %-8s %-8s' % ("default",f1,round(time()-second,3),0))

    for i in tqdm(range(10)):
        second=time()
        best_params, _cv_score, clf = run_random_search(GaussianNB(),param_grid,X,y)

        # Same 5-refit protocol as the other model cells so the result tables
        # share the F1 / Std columns.
        f1=[]
        for ii in range(5):
            clf.fit(X_train, y_train)
            predict = clf.predict(X_test)
            f1.append(sklearn.metrics.f1_score(y_test, predict,average= "macro") )
        f1_result=sum(f1)/len(f1)
        stndtd=np.array(f1).std()

        # Read the value by key so the row always matches the header column.
        temp=[best_params['var_smoothing'],f1_result,stndtd,round(time()-second,3),i,j]
        lines.append(temp)


results = pd.DataFrame (lines[1:], columns = lines[0])
results.to_csv("NB_HPO.csv",index=False)
print (tabulate(results, headers=list(results.columns)))
    


SYN
default                             0.8363820089439284   0.016    0       


100%|██████████| 10/10 [00:02<00:00,  3.87it/s]


HTTP
default                             0.8079057975287013   0.004    0       


100%|██████████| 10/10 [00:00<00:00, 27.17it/s]


ACK
default                             0.5361331819901627   0.005    0       


100%|██████████| 10/10 [00:00<00:00, 11.66it/s]


UDP
default                             0.8215863004362961   0.002    0       


100%|██████████| 10/10 [00:00<00:00, 36.20it/s]


ARP
default                             0.41554065669589135  0.002    0       


100%|██████████| 10/10 [00:00<00:00, 31.90it/s]


SP
default                             0.8898049375190491   0.004    0       


100%|██████████| 10/10 [00:00<00:00, 24.63it/s]


BF
default                             0.23954248991067842  0.002    0       


100%|██████████| 10/10 [00:00<00:00, 30.96it/s]

      var_smoothing        F1          Std    Time    No  Attack
--  ---------------  --------  -----------  ------  ----  --------
 0      1.87382e-06  0.982673  0             0.252     0  SYN
 1      1.51991e-05  0.954331  0             0.238     1  SYN
 2      3.51119e-06  0.982673  0             0.233     2  SYN
 3      1.87382e-06  0.982673  0             0.283     3  SYN
 4      1e-06        0.944144  0             0.295     4  SYN
 5      1.87382e-05  0.922734  0             0.28      5  SYN
 6      1.87382e-06  0.982673  0             0.272     6  SYN
 7      2.84804e-06  0.982673  0             0.256     7  SYN
 8      1e-05        0.973555  0             0.235     8  SYN
 9      1.23285e-07  0.888201  1.11022e-16   0.232     9  SYN
10      1.51991e-07  0.684955  0             0.04      0  HTTP
11      1.87382e-09  0.807906  0             0.037     1  HTTP
12      6.57933e-08  0.794142  0             0.032     2  HTTP
13      1.23285e-08  0.794142  0             0.039     3  H




### RandomizedSearchCV SVM

In [11]:
# SVM hyper-parameter search, run separately per attack type.
#
# For each attack: fit on the first capture, evaluate on the second, print a
# default-parameter baseline, then run one randomized search over C / gamma
# and record the macro-F1 of the best estimator refitted on the training
# capture.
#
# NOTE(review): the validation fold inside `ps` is the same capture that the
# reported F1 is computed on.
lines = [["gamma", "C", "F1", "Std", "Time", "No", "Attack"]]

# Search-space keys in the same order as the first two header columns. Rows
# are built by key: relying on the iteration order of the returned parameter
# dict would put C under "gamma" (and vice versa) whenever the search returns
# its keys in sorted order.
svm_param_names = ["gamma", "C"]

for j in file_list:
    print(j)

    train_df = pd.read_csv(file_list[j][0], usecols=feature_list[j])
    test_df = pd.read_csv(file_list[j][1], usecols=feature_list[j])

    # Select features by name: `usecols` does not control column order, so
    # "everything except the last column" is only right if Label is last.
    X_train = train_df.drop("Label", axis=1)
    X_test = test_df.drop("Label", axis=1)

    # Encode labels against one shared category set so a given label maps to
    # the same integer code in both captures even if one capture lacks a class.
    label_categories = (
        pd.concat([train_df["Label"], test_df["Label"]]).astype("category").cat.categories
    )
    y_train = train_df["Label"].astype("category").cat.set_categories(label_categories).cat.codes
    y_test = test_df["Label"].astype("category").cat.set_categories(label_categories).cat.codes

    # Single predefined split: -1 keeps a row in training, 0 puts it in the
    # one validation fold. `ps` and `param_grid` stay cell-level names because
    # run_random_search may read them as globals.
    X = np.concatenate([X_train, X_test])
    test_fold = [-1] * X_train.shape[0] + [0] * X_test.shape[0]
    y = np.concatenate([y_train, y_test])
    ps = PredefinedSplit(test_fold)

    param_grid = {"C": [0.001, 0.01, 0.1, 1, 10], "gamma": [0.001, 0.01, 0.1, 1]}

    # Baseline: default SVC, single fit.
    second = time()
    clf = svm.SVC()
    clf.fit(X_train, y_train)
    predict = clf.predict(X_test)
    f1 = sklearn.metrics.f1_score(y_test, predict, average="macro")
    print("%-35s %-20s %-8s %-8s" % ("default", f1, round(time() - second, 3), 0))

    ######################################################################################################################
    for i in tqdm(range(1)):
        second = time()
        best_params, _cv_score, clf = run_random_search(svm.SVC(), param_grid, X, y)

        f1 = []
        for ii in range(1):
            clf.fit(X_train, y_train)
            predict = clf.predict(X_test)
            f1.append(sklearn.metrics.f1_score(y_test, predict, average="macro"))
        f1_result = sum(f1) / len(f1)
        stndtd = np.array(f1).std()

        # Measure once so the printed time and the stored time are identical.
        elapsed = round(time() - second, 3)
        print("%-90s %-20s %-8s %-8s" % (best_params, f1_result, elapsed, i))

        temp = [best_params[name] for name in svm_param_names]
        temp = temp + [f1_result, stndtd, elapsed, i, j]
        lines.append(temp)

results = pd.DataFrame(lines[1:], columns=lines[0])
results.to_csv("svm_HPO.csv", index=False)


print(tabulate(results, headers=list(results.columns)))


SYN
default                             0.7849428130587054   0.162    0       


100%|██████████| 1/1 [04:08<00:00, 248.33s/it]


{'gamma': 1, 'C': 10}                                                                      0.49871553348176056  248.325  0       
HTTP
default                             0.6062213740458016   0.074    0       


100%|██████████| 1/1 [00:01<00:00,  1.94s/it]


{'gamma': 0.001, 'C': 1}                                                                   0.5676300125313283   1.94     0       
ACK
default                             0.522583994920136    0.075    0       


100%|██████████| 1/1 [00:31<00:00, 31.99s/it]


{'gamma': 1, 'C': 10}                                                                      0.29246917843257625  31.987   0       
UDP
default                             0.28860294117647056  0.005    0       


100%|██████████| 1/1 [00:00<00:00,  3.47it/s]


{'gamma': 1, 'C': 1}                                                                       0.28860294117647056  0.287    0       
ARP
default                             0.5415453756192227   0.01     0       


100%|██████████| 1/1 [00:00<00:00,  4.29it/s]


{'gamma': 0.001, 'C': 0.001}                                                               0.4246987951807229   0.232    0       
SP
default                             0.7494355816773952   0.018    0       


100%|██████████| 1/1 [00:02<00:00,  2.21s/it]


{'gamma': 1, 'C': 10}                                                                      0.48504486540378866  2.206    0       
BF
default                             0.45811177312234397  0.061    0       


100%|██████████| 1/1 [00:01<00:00,  1.31s/it]

{'gamma': 1, 'C': 10}                                                                      0.5408530975260684   1.304    0       
      gamma       C        F1    Std     Time    No  Attack
--  -------  ------  --------  -----  -------  ----  --------
 0    1      10      0.498716      0  248.325     0  SYN
 1    0.001   1      0.56763       0    1.941     0  HTTP
 2    1      10      0.292469      0   31.987     0  ACK
 3    1       1      0.288603      0    0.287     0  UDP
 4    0.001   0.001  0.424699      0    0.232     0  ARP
 5    1      10      0.485045      0    2.206     0  SP
 6    1      10      0.540853      0    1.304     0  BF



