### Hyperparameter Tuning

In [1]:
#Lib

import pandas as pd
import numpy as np

np.random.seed(42)  # reproducibility: seeds NumPy's global random number generator

import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import accuracy_score


from sklearn.model_selection import GridSearchCV  #tuning
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
# Read the saved top-30 feature list and keep only its first 20 entries.
selected_features = (
    pd.read_csv(r'Dataset/Feature selected/CICDDoS2019_Multiclass_RFClassifier_30.csv')
    .squeeze()
    .iloc[:20]
)


In [3]:
# Read the multiclass-encoded dataset.
CICDDoS2019 = pd.read_csv(r'Dataset/Encoded Dataset/CIC-DDoS2019_multi.csv')

# Columns to keep: the selected features first, followed by every column
# whose name begins with 'Label_'.
columns = [
    *selected_features.tolist(),
    *(name for name in CICDDoS2019.columns if name.startswith('Label_')),
]

# Narrow the frame down to exactly those columns, in that order.
CICDDoS2019 = CICDDoS2019.loc[:, columns]

CICDDoS2019.shape


(300000, 32)

In [4]:
# Show column dtypes and non-null counts of the reduced frame.
CICDDoS2019.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 32 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Avg Fwd Segment Size         300000 non-null  float64
 1   Min Packet Length            300000 non-null  float64
 2   Fwd Packet Length Max        300000 non-null  float64
 3   Fwd Packet Length Mean       300000 non-null  float64
 4   Average Packet Size          300000 non-null  float64
 5   Packet Length Mean           300000 non-null  float64
 6   Max Packet Length            300000 non-null  float64
 7   Fwd Packet Length Min        300000 non-null  float64
 8   Total Length of Fwd Packets  300000 non-null  float64
 9   Subflow Fwd Bytes            300000 non-null  int64  
 10  Flow Bytes/s                 300000 non-null  float64
 11  ACK Flag Count               300000 non-null  int64  
 12  Fwd IAT Total                300000 non-null  float64
 13 

In [5]:

# Names of the label columns (every column prefixed with 'Label_').
target_columns = [col for col in CICDDoS2019.columns if col.startswith('Label_')]

# Sample 40% of the data (frac=0.4, fixed random_state) and reset the index
CICDDoS2019_sampled = CICDDoS2019.sample(frac=0.4, random_state=42).reset_index(drop=True)

# Print class distribution for each target column
for column in target_columns:
    print(f'Class distribution for {column}:')
    print(CICDDoS2019_sampled[column].value_counts())
    print()


Class distribution for Label_BENIGN:
0.0    110041
1.0      9959
Name: Label_BENIGN, dtype: int64

Class distribution for Label_DrDoS_DNS:
0.0    110015
1.0      9985
Name: Label_DrDoS_DNS, dtype: int64

Class distribution for Label_DrDoS_LDAP:
0.0    110001
1.0      9999
Name: Label_DrDoS_LDAP, dtype: int64

Class distribution for Label_DrDoS_MSSQL:
0.0    110038
1.0      9962
Name: Label_DrDoS_MSSQL, dtype: int64

Class distribution for Label_DrDoS_NTP:
0.0    109969
1.0     10031
Name: Label_DrDoS_NTP, dtype: int64

Class distribution for Label_DrDoS_NetBIOS:
0.0    110018
1.0      9982
Name: Label_DrDoS_NetBIOS, dtype: int64

Class distribution for Label_DrDoS_SNMP:
0.0    109894
1.0     10106
Name: Label_DrDoS_SNMP, dtype: int64

Class distribution for Label_DrDoS_SSDP:
0.0    109993
1.0     10007
Name: Label_DrDoS_SSDP, dtype: int64

Class distribution for Label_DrDoS_UDP:
0.0    110009
1.0      9991
Name: Label_DrDoS_UDP, dtype: int64

Class distribution for Label_Syn:
0.0    10

In [6]:
# Separate features and labels.
# Every column prefixed with 'Label_' is a target column; the rest are features.
label_columns = CICDDoS2019_sampled.filter(regex='^Label_').columns

DDoS2019_X = CICDDoS2019_sampled.drop(columns=label_columns).copy()
DDoS2019_y = CICDDoS2019_sampled[label_columns].copy()

# NOTE(review): DDoS2019_y is a 2-D frame with one 0/1 column per class, so
# scikit-learn estimators will presumably treat the task as multilabel and
# accuracy_score will then be exact-match (subset) accuracy. Confirm this is
# intended rather than a single multiclass label.

# Split the dataset into training and testing sets (80% / 20%).
# random_state is pinned so the split does not depend on how much of the global
# NumPy random state was consumed before this cell ran (42 matches the seed set
# at the top of the notebook).
DDoS2019_train_X, DDoS2019_test_X, DDoS2019_train_y, DDoS2019_test_y = train_test_split(
    DDoS2019_X,
    DDoS2019_y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Normalisation: learn per-feature mean/std on the training split only, then
# apply that same fitted transform to both the training and the test features.
scalar = StandardScaler().fit(DDoS2019_train_X)

DDoS2019_train_X_scaled = scalar.transform(DDoS2019_train_X)
DDoS2019_test_X_scaled = scalar.transform(DDoS2019_test_X)

## Defining hyperparameter configuration setting for each model 

### K-Nearest Neighbour

In [8]:
%%time
# Define the parameter space
parameter_space = [
    {
        'n_neighbors': [3, 5, 7],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan'],
        'algorithm': ['auto', 'ball_tree'],
        'leaf_size': [30, 40, 50],
        'p': [1, 2],
    }
]
    
    
# Create the KNN classifier object
knn = KNeighborsClassifier()

# Create the GridSearchCV object
optimal_knn = GridSearchCV(
                        estimator=knn, 
                        param_grid=parameter_space,
                        cv=5, 
                        scoring='accuracy', 
                        n_jobs=-1, 
                        verbose=0
)

# # Perform the grid search
# start_time = time.time()
optimal_knn.fit(DDoS2019_train_X_scaled, DDoS2019_train_y)
# end_time = time.time()

# Retrieve the best parameters
knn_optimal_params = optimal_knn.best_params_
print("Optimum hyperparameters:")
print(knn_optimal_params)
# print("Execution time: {:.2f} seconds".format(end_time - start_time))


Optimum hyperparameters:
{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'euclidean', 'n_neighbors': 5, 'p': 1, 'weights': 'distance'}
CPU times: total: 7.44 s
Wall time: 21min 28s


In [9]:

# Predict the held-out test split with the tuned KNN search object.
y_pred = optimal_knn.predict(DDoS2019_test_X_scaled)

# Compare the predictions against the true test labels.
accuracy = accuracy_score(y_true=DDoS2019_test_y, y_pred=y_pred)

print(f"Accuracy: {accuracy}")

Accuracy: 0.75475


### Random Forest

In [10]:
%%time

parameter_space = [
    {'criterion': ['gini', 'entropy'],
    'max_depth': [10, 20, 30],
    'min_samples_leaf': [3, 0.0001, 0.0005, 0.00001], 
    'min_samples_split': [8, 0.0005, 0.0001, 0.00001], 
    'n_estimators': [100, 200, 350]
    }
]

optimal_rf = GridSearchCV(
                        RandomForestClassifier(),
                        parameter_space, 
                        cv=5,
                        scoring='accuracy',
                        n_jobs=-1,
                        verbose=0
)

optimal_rf.fit(DDoS2019_train_X_scaled, DDoS2019_train_y)
rf_optimal_params = optimal_rf.best_params_
print(f"Optimum hyperparameters: \n{rf_optimal_params}")

Optimum hyperparameters: 
{'criterion': 'gini', 'max_depth': 30, 'min_samples_leaf': 1e-05, 'min_samples_split': 1e-05, 'n_estimators': 200}
CPU times: total: 1min 24s
Wall time: 2h 17min 54s


In [15]:

# Predict the held-out test split with the tuned Random Forest search object.
y_pred = optimal_rf.predict(DDoS2019_test_X_scaled)

# Compare the predictions against the true test labels.
accuracy = accuracy_score(y_true=DDoS2019_test_y, y_pred=y_pred)

print(f"Accuracy: {accuracy}")

Accuracy: 0.6863333333333334


### Artificial Neural Network

In [11]:
%%time

parameter_space = [
    {'hidden_layer_sizes': [(10,), (20,), (30,), (40,), (50,)],
    'activation': ['tanh', 'relu', 'logistic'],
    'solver': ['adam', 'sgd'],
    'alpha': [0.0001, 0.001, 0.00001],
    'max_iter': [500, 1000] }
]

ann = MLPClassifier()

optimal_ann = GridSearchCV(
                        ann,
                        parameter_space, 
                        cv=5,
                        n_jobs=-1,
                        verbose=0
)

optimal_ann.fit(DDoS2019_train_X_scaled, DDoS2019_train_y)
ann_optimal_params = optimal_ann.best_params_
print(f"Optimum hyperparameters: \n{ann_optimal_params}")

Optimum hyperparameters: 
{'activation': 'relu', 'alpha': 0.001, 'hidden_layer_sizes': (50,), 'max_iter': 500, 'solver': 'adam'}
CPU times: total: 52min 35s
Wall time: 3h 15min 3s


In [13]:

# Predict the held-out test split with the tuned ANN search object.
y_pred = optimal_ann.predict(DDoS2019_test_X_scaled)

# Compare the predictions against the true test labels.
accuracy = accuracy_score(y_true=DDoS2019_test_y, y_pred=y_pred)

print(f"Accuracy: {accuracy}")

Accuracy: 0.6159583333333334


### Deep Neural Network

In [18]:
%%time

parameter_space = [
    {'hidden_layer_sizes': [(10,10, 10), (12, 12, 12), (15, 15, 15), (8, 8, 8, 8), (10, 10, 10, 10)],
    'activation': ['tanh', 'relu', 'logistic'],
    'solver': ['adam', 'sgd'],
    'alpha': [0.0001, 0.00001, 0.001],
    'max_iter': [500, 1000, 2000]}
]

dnn = MLPClassifier()

optimal_dnn = GridSearchCV(
                        dnn,
                        parameter_space, 
                        cv=5,
                        n_jobs=-1,
                        verbose=0
)

optimal_dnn.fit(DDoS2019_train_X_scaled, DDoS2019_train_y)
dnn_optimal_params = optimal_dnn.best_params_
print(f"Optimum hyperparameters: \n{dnn_optimal_params}")

Optimum hyperparameters: 
{'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (15, 15, 15), 'max_iter': 2000, 'solver': 'adam'}
CPU times: total: 3min 39s
Wall time: 5h 15min 21s


In [19]:

# Predict the held-out test split with the tuned DNN search object.
y_pred = optimal_dnn.predict(DDoS2019_test_X_scaled)

# Compare the predictions against the true test labels.
accuracy = accuracy_score(y_true=DDoS2019_test_y, y_pred=y_pred)

print(f"Accuracy: {accuracy}")

Accuracy: 0.6648333333333334


### Extreme Gradient Boosting

In [20]:
%%time

# Define the parameter space for grid search
parameter_space = [
    {'n_estimators': [ 200, 300, 400],  # Number of boosting rounds
    'learning_rate': [ 0.2, 0.3, 0.4],  # Learning rate (eta)
    'max_depth': [4, 5, 6],  # Maximum tree depth
    'min_child_weight': [1, 2, 3],  # Minimum sum of instance weight needed in a child
    'subsample': [0.8, 0.9],  # Subsample ratio of the training instances
    'colsample_bytree': [0.8, 0.9]  }
]

# Create the XGBoost classifier
xgb_model = xgb.XGBClassifier()

# Perform grid search with cross-validation
optimal_xgb = GridSearchCV(
    xgb_model,
    parameter_space, 
    cv=5,
    n_jobs=-1,
    verbose=0
)

optimal_xgb.fit(DDoS2019_train_X_scaled, DDoS2019_train_y)
xgb_optimal_params = optimal_xgb.best_params_
print(f"Optimum hyperparameters: \n{xgb_optimal_params}")


Optimum hyperparameters: 
{'colsample_bytree': 0.9, 'learning_rate': 0.4, 'max_depth': 6, 'min_child_weight': 1, 'n_estimators': 400, 'subsample': 0.8}
CPU times: total: 1h 31min 39s
Wall time: 9h 3min 26s


In [21]:

# Predict the held-out test split with the tuned XGBoost search object.
y_pred = optimal_xgb.predict(DDoS2019_test_X_scaled)

# Compare the predictions against the true test labels.
accuracy = accuracy_score(y_true=DDoS2019_test_y, y_pred=y_pred)

print(f"Accuracy: {accuracy}")

Accuracy: 0.6775416666666667
