In [1]:
%matplotlib inline
from scipy.stats import randint as sp_randint
from scipy.stats import uniform
from scipy.stats import uniform as sp_randFloat
from sklearn import svm
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from time import time
import numpy as np
import pandas as pd
import sklearn
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm
from tabulate import tabulate

In [2]:
from scipy.stats import randint as sp_randInt

from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn.metrics import make_scorer
from scipy import sparse

# IoTSense

In [3]:
features=['ARP', 'EAPOL', 'IP', 'ICMP', 'ICMP6', 'TCP', 'UDP', 'TCP_w_size',
       'HTTP', 'HTTPS', 'DHCP', 'BOOTP', 'SSDP', 'DNS', 'MDNS', 'NTP',
       'IP_padding', 'IP_ralert', 'payload_l', 'Entropy',
       'Label']

In [4]:
df=pd.read_csv('./HPOTrain.csv',usecols=features) 
X_train = df.iloc[:,0:-1]
df['Label'] = df['Label'].astype('category')
y_train=df['Label'].cat.codes  

df=pd.read_csv( './HPOTest.csv',usecols=features) 
X_test = df.iloc[:,0:-1]
df['Label'] = df['Label'].astype('category')
y_test=df['Label'].cat.codes  

In [5]:
print(X_train.shape,X_test.shape)

(150500, 20) (149333, 20)


# DATASET

In [6]:
X= np.concatenate([X_train, X_test])
test_fold = [-1 for _ in range(X_train.shape[0])] + [0 for _ in range(X_test.shape[0])]
y = np.concatenate([y_train, y_test])
ps = PredefinedSplit(test_fold)

def run_random_search(model, params, x_train, y_train):
    #grid = GridSearchCV(model, params, cv = ps, n_jobs = -1, scoring = score, verbose = 0, refit = False)
    grid =RandomizedSearchCV(model, param_grid, cv=ps,scoring = 'f1_macro')
    grid.fit(x_train, y_train)
    return (grid.best_params_, round(grid.best_score_,8),grid.best_estimator_)



In [7]:
lines=[['criterion', 'max_depth', 'max_features', 'min_samples_split', "F1","Std","Time","No"]]
print ('%-35s %-20s %-8s %-8s' % ("HYPERPARAMETERS","F1 Score", "Time", "No"))

nfolds=10
param_grid = { 'criterion':['gini','entropy'],
                  "max_depth":np.linspace(1, 32, 32, endpoint=True).astype(int),
                 "min_samples_split": sp_randint(2,10),#uniform(0.1,1 ),
                    # "min_samples_leafs" : np.linspace(0.1, 0.5, 5, endpoint=True)
                    "max_features" : sp_randint(1,X_train.shape[1])}

second=time()
f1=[]
clf=DecisionTreeClassifier()
for ii in range(10):
    clf.fit(X, y)
    predict =clf.predict(X_test)
    f1.append(sklearn.metrics.f1_score(y_test, predict,average= "macro") )
f1=sum(f1)/len(f1)   
#if f1>0.76:
print('%-35s %-20s %-8s %-8s' % ("default",f1,round(time()-second,3),ii))
######################################################################################################################
for i in tqdm(range(10)):
    second=time()
    a,b,clf=run_random_search(DecisionTreeClassifier(),param_grid,X,y)
    f1=[]
    for ii in range(5):
        clf.fit(X, y)
        predict =clf.predict(X_test)
        f1.append(sklearn.metrics.f1_score(y_test, predict,average= "macro") )
    f1_result=sum(f1)/len(f1)   
    f1=np.array(f1)
    stndtd=f1.std()
    temp=list(a.values())
    #print('%-90s %-20s %-8s %-8s' % (a,f1_result,round(time()-second,3),i))
    temp=temp+[f1_result,stndtd,round(time()-second,3),i]
    lines.append(temp)

    #if f1>0.76:
results = pd.DataFrame (lines[1:], columns = lines[0])
results.to_csv("DT_sense_HPO.csv",index=False)

final_parametres=[['criterion', 'max_depth', 'max_features', 'min_samples_split', "F1","Std","Time","No"]]

df=results
m=df["F1"].max()
df=df[df["F1"]==m]
m=df["max_depth"].min()
df=df[df["max_depth"]==m]  
final_parametres.append(list(df.values)[0])
results = pd.DataFrame (final_parametres[1:], columns=  final_parametres[0])
print (tabulate(results, headers=list(results.columns)))


HYPERPARAMETERS                     F1 Score             Time     No      
default                             0.904341837350646    4.87     9       


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:48<00:00,  4.89s/it]

    criterion      max_depth    max_features    min_samples_split        F1          Std    Time    No
--  -----------  -----------  --------------  -------------------  --------  -----------  ------  ----
 0  gini                  23              16                    2  0.904342  1.11022e-16    5.18     2





# IoTSentinel

In [11]:
features=['ARP', 'LLC', 'EAPOL', 'IP', 'ICMP', 'ICMP6', 'TCP', 'UDP', 'HTTP',
       'HTTPS', 'DHCP', 'BOOTP', 'SSDP', 'DNS', 'MDNS', 'NTP', 'IP_padding',
       'IP_add_count', 'IP_ralert', 'Portcl_src', 'Portcl_dst', 'Pck_size',
       'Pck_rawdata', 'Label']


In [12]:
df=pd.read_csv('./HPOTrain.csv',usecols=features) 
X_train = df.iloc[:,0:-1]
df['Label'] = df['Label'].astype('category')
y_train=df['Label'].cat.codes  

df=pd.read_csv( './HPOTest.csv',usecols=features) 
X_test = df.iloc[:,0:-1]
df['Label'] = df['Label'].astype('category')
y_test=df['Label'].cat.codes  

In [13]:
print(X_train.shape,X_test.shape)

(150500, 23) (149333, 23)


# DATASET

In [14]:
X= np.concatenate([X_train, X_test])
test_fold = [-1 for _ in range(X_train.shape[0])] + [0 for _ in range(X_test.shape[0])]
y = np.concatenate([y_train, y_test])
ps = PredefinedSplit(test_fold)

def run_random_search(model, params, x_train, y_train):
    #grid = GridSearchCV(model, params, cv = ps, n_jobs = -1, scoring = score, verbose = 0, refit = False)
    grid =RandomizedSearchCV(model, param_grid, cv=ps,scoring = 'f1_macro')
    grid.fit(x_train, y_train)
    return (grid.best_params_, round(grid.best_score_,8),grid.best_estimator_)



In [15]:
lines=[['criterion', 'max_depth', 'max_features', 'min_samples_split', "F1","Std","Time","No"]]
print ('%-35s %-20s %-8s %-8s' % ("HYPERPARAMETERS","F1 Score", "Time", "No"))

nfolds=10
param_grid = { 'criterion':['gini','entropy'],
                  "max_depth":np.linspace(1, 32, 32, endpoint=True).astype(int),
                 "min_samples_split": sp_randint(2,10),#uniform(0.1,1 ),
                    # "min_samples_leafs" : np.linspace(0.1, 0.5, 5, endpoint=True)
                    "max_features" : sp_randint(1,X_train.shape[1])}

second=time()
f1=[]
clf=DecisionTreeClassifier()
for ii in range(10):
    clf.fit(X, y)
    predict =clf.predict(X_test)
    f1.append(sklearn.metrics.f1_score(y_test, predict,average= "macro") )
f1=sum(f1)/len(f1)   
#if f1>0.76:
print('%-35s %-20s %-8s %-8s' % ("default",f1,round(time()-second,3),ii))
######################################################################################################################
for i in tqdm(range(10)):
    second=time()
    a,b,clf=run_random_search(DecisionTreeClassifier(),param_grid,X,y)
    f1=[]
    for ii in range(5):
        clf.fit(X, y)
        predict =clf.predict(X_test)
        f1.append(sklearn.metrics.f1_score(y_test, predict,average= "macro") )
    f1_result=sum(f1)/len(f1)   
    f1=np.array(f1)
    stndtd=f1.std()
    temp=list(a.values())
    #print('%-90s %-20s %-8s %-8s' % (a,f1_result,round(time()-second,3),i))
    temp=temp+[f1_result,stndtd,round(time()-second,3),i]
    lines.append(temp)

    #if f1>0.76:
results = pd.DataFrame (lines[1:], columns = lines[0])
results.to_csv("DT_sentinel_HPO.csv",index=False)

final_parametres=[['criterion', 'max_depth', 'max_features', 'min_samples_split', "F1","Std","Time","No"]]

df=results
m=df["F1"].max()
df=df[df["F1"]==m]
m=df["max_depth"].min()
df=df[df["max_depth"]==m]  
final_parametres.append(list(df.values)[0])
results = pd.DataFrame (final_parametres[1:], columns=  final_parametres[0])
print (tabulate(results, headers=list(results.columns)))


HYPERPARAMETERS                     F1 Score             Time     No      
default                             0.43138752593677243  3.088    9       


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:41<00:00,  4.17s/it]

    criterion      max_depth    max_features    min_samples_split        F1    Std    Time    No
--  -----------  -----------  --------------  -------------------  --------  -----  ------  ----
 0  entropy               31              22                    4  0.431388      0   4.383     6



