## Cross-validation

The aim of this notebook is to validate the accuracy of the trained models using k-fold cross-validation.

In [None]:
import os

import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


# silent the warning from the sklearn library
import warnings
warnings.filterwarnings('ignore')

import random # import random package to specify the random seed

random.seed(10) # ensure reproducibilty
np.random.seed(10)

# to interpret the training result
from sklearn import metrics
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Everything under /content/drive is only readable once Google Drive has been
# mounted. Mount it here when it is not available yet, so that a fresh
# "Restart & Run All" does not fail on the read below.
if not os.path.isdir('/content/drive/MyDrive'):
    from google.colab import drive
    drive.mount('/content/drive')

# Number of leading entries of the feature-selection file that are kept.
N_SELECTED_FEATURES = 20

# Feature names produced by the feature-selection step. squeeze() collapses a
# single-column frame into a Series.
# NOTE(review): the file name suggests 30 features ranked by a random-forest
# classifier, of which the first 20 are used — confirm the ranking order.
selected_features = pd.read_csv(r'/content/drive/MyDrive/Multiclass_Win_Latest/Dataset/Feature selected/CICDDoS2019_Multiclass_RFClassifier_30.csv').squeeze()
selected_features = selected_features[:N_SELECTED_FEATURES]

In [None]:
# Mount Google Drive at /content/drive (Colab only); every dataset path in this
# notebook points below that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the encoded CIC-DDoS2019 dataset from the mounted drive.
CICDDoS2019 = pd.read_csv(r'/content/drive/MyDrive/Multiclass_Win_Latest/Dataset/Encoded Dataset/CIC-DDoS2019_multi.csv')

# Columns to keep: the selected features first, then every target column
# (the ones whose name starts with 'Label_').
columns = selected_features.tolist()
columns.extend(name for name in CICDDoS2019.columns if name.startswith('Label_'))

# Restrict the frame to those columns, in that order.
CICDDoS2019 = CICDDoS2019.loc[:, columns]

CICDDoS2019.shape


(300000, 32)

In [None]:
# Summarise the retained columns: dtype and non-null count per column.
CICDDoS2019.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 32 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Avg Fwd Segment Size         300000 non-null  float64
 1   Min Packet Length            300000 non-null  float64
 2   Fwd Packet Length Max        300000 non-null  float64
 3   Fwd Packet Length Mean       300000 non-null  float64
 4   Average Packet Size          300000 non-null  float64
 5   Packet Length Mean           300000 non-null  float64
 6   Max Packet Length            300000 non-null  float64
 7   Fwd Packet Length Min        300000 non-null  float64
 8   Total Length of Fwd Packets  300000 non-null  float64
 9   Subflow Fwd Bytes            300000 non-null  int64  
 10  Flow Bytes/s                 300000 non-null  float64
 11  ACK Flag Count               300000 non-null  int64  
 12  Fwd IAT Total                300000 non-null  float64
 13 

In [None]:
# Names of the target columns (prefixed with 'Label_').
target_columns = [name for name in CICDDoS2019.columns if name.startswith('Label_')]

# Draw a reproducible 40% random sample of the rows and renumber the index.
CICDDoS2019_sampled = (
    CICDDoS2019
    .sample(frac=0.4, random_state=42)
    .reset_index(drop=True)
)

# Report how many sampled rows fall into each value of every target column.
for column in target_columns:
    print(f'Class distribution for {column}:',
          CICDDoS2019_sampled[column].value_counts(),
          '',
          sep='\n')


Class distribution for Label_BENIGN:
0.0    110041
1.0      9959
Name: Label_BENIGN, dtype: int64

Class distribution for Label_DrDoS_DNS:
0.0    110015
1.0      9985
Name: Label_DrDoS_DNS, dtype: int64

Class distribution for Label_DrDoS_LDAP:
0.0    110001
1.0      9999
Name: Label_DrDoS_LDAP, dtype: int64

Class distribution for Label_DrDoS_MSSQL:
0.0    110038
1.0      9962
Name: Label_DrDoS_MSSQL, dtype: int64

Class distribution for Label_DrDoS_NTP:
0.0    109969
1.0     10031
Name: Label_DrDoS_NTP, dtype: int64

Class distribution for Label_DrDoS_NetBIOS:
0.0    110018
1.0      9982
Name: Label_DrDoS_NetBIOS, dtype: int64

Class distribution for Label_DrDoS_SNMP:
0.0    109894
1.0     10106
Name: Label_DrDoS_SNMP, dtype: int64

Class distribution for Label_DrDoS_SSDP:
0.0    109993
1.0     10007
Name: Label_DrDoS_SSDP, dtype: int64

Class distribution for Label_DrDoS_UDP:
0.0    110009
1.0      9991
Name: Label_DrDoS_UDP, dtype: int64

Class distribution for Label_Syn:
0.0    10

In [None]:

# Separate features and labels.
# The label columns are looked up once and passed to drop() as explicit column
# names, instead of handing drop() a whole DataFrame and relying on it being
# iterated as column names.
label_columns = CICDDoS2019_sampled.filter(regex='^Label_').columns

DDoS2019_X = CICDDoS2019_sampled.drop(columns=label_columns).copy()  # every column without the Label_ prefix
DDoS2019_y = CICDDoS2019_sampled[label_columns].copy()               # the Label_ target columns

# No separate train/test split is made here: the cross-validation cells below
# partition the data into folds themselves.


In [None]:
# Standardisation of the feature matrix.
#
# The scaler is fitted on the complete sampled feature matrix, because no
# train/test split exists at this point.
# NOTE(review): when DDoS2019_X_scaled is fed into cross-validation, the
# statistics learned here include the held-out folds — confirm that is acceptable.
scalar = StandardScaler()
scalar.fit(DDoS2019_X)
DDoS2019_X_scaled = scalar.transform(DDoS2019_X)


### Build the optimised models for validation

In [None]:

# Tuned estimators to validate, keyed by the display name printed in the
# cross-validation reports.
#
# Every stochastic estimator gets an explicit random_state: the global seeds set
# at the top of the notebook do not reach the worker processes started by
# cross_val_score(n_jobs=-1), so without it the scores differ between runs.
# NOTE(review): for the k-NN model, p=1 is presumably ignored because
# metric='euclidean' is given explicitly — confirm and drop one of the two.
models = {
    'k-Nearesr Neighbour': KNeighborsClassifier(
        n_neighbors=5, weights='distance', metric='euclidean', leaf_size=30, p=1),
    'Random Forest': RandomForestClassifier(
        max_depth=30, min_samples_leaf=1e-05, min_samples_split=1e-05,
        n_estimators=200, n_jobs=-1, criterion='gini', random_state=10),
    'Artificial Neural Network': MLPClassifier(
        hidden_layer_sizes=(50,), activation='relu', alpha=0.001, solver='adam',
        max_iter=500, random_state=10),
    'Deep Neural Network': MLPClassifier(
        hidden_layer_sizes=(10, 10, 10, 10), activation='relu', alpha=0.0001,
        solver='adam', max_iter=1000, random_state=10),
    'XGBoost': xgb.XGBClassifier(
        colsample_bytree=0.9, learning_rate=0.4, max_depth=6, min_child_weight=1,
        subsample=0.9, n_estimators=400, random_state=10),
}



### k-fold cross validation, k = 10

In [None]:
# Results of the 10-fold cross-validation, keyed by model name:
# per-fold accuracies, their mean and their standard deviation.
accuracy_scores = {}
accuracy_scores_mean = {}
accuracy_scores_std = {}

# Fixed random_state so the shuffled fold assignment is identical on every run.
cv = KFold(n_splits=10, shuffle=True, random_state=10)

for model, clf in models.items():
    # Scale inside a pipeline so the scaler is re-fitted on the training folds
    # only; scaling the whole matrix beforehand leaks statistics of the
    # held-out fold into training.
    pipeline = make_pipeline(StandardScaler(), clf)

    # NOTE(review): the target is passed as a 2-D matrix of Label_ columns, so
    # 'accuracy' is presumably computed as exact-match (subset) accuracy — confirm.
    accuracy_scores[model] = cross_val_score(pipeline,
                                             DDoS2019_X.values,
                                             DDoS2019_y.values,
                                             cv=cv,
                                             scoring='accuracy',
                                             n_jobs=-1)
    accuracy_scores_mean[model] = np.mean(accuracy_scores[model])
    accuracy_scores_std[model] = np.std(accuracy_scores[model])

    print(f"{'-'*25} {model} {'-'*25}")
    print(f"Accuracy: {accuracy_scores[model]}")
    print(f"mean: {accuracy_scores_mean[model]:.4f}\t\tstd: {accuracy_scores_std[model]:.4f}")

------------------------- k-Nearesr Neighbour -------------------------
Accuracy: [0.73916667 0.73733333 0.77808333 0.7515     0.73941667 0.7555
 0.73975    0.76183333 0.74041667 0.74783333]
mean: 0.7491		std: 0.0124
------------------------- Random Forest -------------------------
Accuracy: [0.684      0.69491667 0.68216667 0.67766667 0.68441667 0.6795
 0.6845     0.68016667 0.68216667 0.68258333]
mean: 0.6832		std: 0.0044
------------------------- Artificial Neural Network -------------------------
Accuracy: [0.612      0.602      0.65425    0.6325     0.58458333 0.67358333
 0.5705     0.61216667 0.59025    0.64633333]
mean: 0.6178		std: 0.0314
------------------------- Deep Neural Network -------------------------
Accuracy: [0.62533333 0.61508333 0.64466667 0.6295     0.60775    0.61566667
 0.56375    0.58966667 0.5885     0.63541667]
mean: 0.6115		std: 0.0235


### k-fold cross validation, k = 5

In [None]:
# Results of the 5-fold cross-validation, keyed by model name:
# per-fold accuracies, their mean and their standard deviation.
accuracy_scores = {}
accuracy_scores_mean = {}
accuracy_scores_std = {}

# Fixed random_state so the shuffled fold assignment is identical on every run.
cv = KFold(n_splits=5, shuffle=True, random_state=10)

for model, clf in models.items():
    # Scale inside a pipeline so the scaler is re-fitted on the training folds
    # only; scaling the whole matrix beforehand leaks statistics of the
    # held-out fold into training.
    pipeline = make_pipeline(StandardScaler(), clf)

    # NOTE(review): the target is passed as a 2-D matrix of Label_ columns, so
    # 'accuracy' is presumably computed as exact-match (subset) accuracy — confirm.
    accuracy_scores[model] = cross_val_score(pipeline,
                                             DDoS2019_X.values,
                                             DDoS2019_y.values,
                                             cv=cv,
                                             scoring='accuracy',
                                             n_jobs=-1)
    accuracy_scores_mean[model] = np.mean(accuracy_scores[model])
    accuracy_scores_std[model] = np.std(accuracy_scores[model])

    print(f"{'-'*25} {model} {'-'*25}")
    print(f"Accuracy: {accuracy_scores[model]}")
    print(f"mean: {accuracy_scores_mean[model]:.4f}\t\tstd: {accuracy_scores_std[model]:.4f}")

------------------------- k-Nearesr Neighbour -------------------------
Accuracy: [0.76633333 0.68266667 0.74591667 0.76241667 0.73466667]
mean: 0.7384		std: 0.0301
------------------------- Random Forest -------------------------
Accuracy: [0.68733333 0.68354167 0.69275    0.6875     0.69079167]
mean: 0.6884		std: 0.0032
------------------------- Artificial Neural Network -------------------------
Accuracy: [0.61995833 0.5885     0.56341667 0.596875   0.62325   ]
mean: 0.5984		std: 0.0219
------------------------- Deep Neural Network -------------------------
Accuracy: [0.61979167 0.59029167 0.629375   0.57554167 0.57770833]
mean: 0.5985		std: 0.0221
------------------------- XGBoost -------------------------
Accuracy: [0.677125   0.68529167 0.68054167 0.68583333 0.68366667]
mean: 0.6825		std: 0.0033


### k-fold cross validation, k = 3

In [None]:
# Results of the 3-fold cross-validation, keyed by model name:
# per-fold accuracies, their mean and their standard deviation.
accuracy_scores = {}
accuracy_scores_mean = {}
accuracy_scores_std = {}

# Fixed random_state so the shuffled fold assignment is identical on every run.
cv = KFold(n_splits=3, shuffle=True, random_state=10)

for model, clf in models.items():
    # Scale inside a pipeline so the scaler is re-fitted on the training folds
    # only; scaling the whole matrix beforehand leaks statistics of the
    # held-out fold into training.
    pipeline = make_pipeline(StandardScaler(), clf)

    # NOTE(review): the target is passed as a 2-D matrix of Label_ columns, so
    # 'accuracy' is presumably computed as exact-match (subset) accuracy — confirm.
    accuracy_scores[model] = cross_val_score(pipeline,
                                             DDoS2019_X.values,
                                             DDoS2019_y.values,
                                             cv=cv,
                                             scoring='accuracy',
                                             n_jobs=-1)
    accuracy_scores_mean[model] = np.mean(accuracy_scores[model])
    accuracy_scores_std[model] = np.std(accuracy_scores[model])

    print(f"{'-'*25} {model} {'-'*25}")
    print(f"Accuracy: {accuracy_scores[model]}")
    print(f"mean: {accuracy_scores_mean[model]:.4f}\t\tstd: {accuracy_scores_std[model]:.4f}")

------------------------- k-Nearesr Neighbour -------------------------
Accuracy: [0.763175 0.7445   0.76105 ]
mean: 0.7562		std: 0.0083
------------------------- Random Forest -------------------------
Accuracy: [0.68935  0.686125 0.6832  ]
mean: 0.6862		std: 0.0025
------------------------- Artificial Neural Network -------------------------
Accuracy: [0.622625 0.580075 0.628875]
mean: 0.6105		std: 0.0217
------------------------- Deep Neural Network -------------------------
Accuracy: [0.5894   0.602425 0.5963  ]
mean: 0.5960		std: 0.0053
------------------------- XGBoost -------------------------
Accuracy: [0.6844   0.676675 0.67645 ]
mean: 0.6792		std: 0.0037
