## Hyperparameter Tuning

In [1]:
#Lib

import pandas as pd
import numpy as np

np.random.seed(42) # reproducibility: seeds NumPy's global random number generator

import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import accuracy_score

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV  #tuning
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
# Load the saved feature ranking and keep only its first 21 entries.
# squeeze() collapses a single-column frame to a Series (the later .tolist() call
# relies on that -- presumably the CSV holds one column of feature names; verify).
selected_features = (
    pd.read_csv(r'Dataset/Features selected/CICDDoS2019_Binary_RFClassifier_30.csv')
    .squeeze()[:21]
)


In [4]:
# Read the binary-encoded CIC-DDoS2019 table from disk, then display (rows, columns)
CICDDoS2019_binary = pd.read_csv(
    filepath_or_buffer=r'Dataset/Encoded Dataset/CIC-DDoS2019_binary.csv'
)
CICDDoS2019_binary.shape



(300000, 69)

In [5]:
# Columns to keep: the selected features followed by every column whose name
# starts with 'Label_' (in their original order).
label_columns = [name for name in CICDDoS2019_binary.columns if name.startswith('Label_')]
columns = selected_features.tolist() + label_columns

# Restrict the frame to exactly those columns and show the resulting dimensions
CICDDoS2019_binary = CICDDoS2019_binary.loc[:, columns]

CICDDoS2019_binary.shape

(300000, 23)

In [6]:
# Print the column names, non-null counts and dtypes of the reduced frame
CICDDoS2019_binary.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 23 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Min Packet Length            300000 non-null  float64
 1   Fwd Packet Length Mean       300000 non-null  float64
 2   Fwd Packet Length Min        300000 non-null  float64
 3   Avg Fwd Segment Size         300000 non-null  float64
 4   Packet Length Mean           300000 non-null  float64
 5   Average Packet Size          300000 non-null  float64
 6   Inbound                      300000 non-null  int64  
 7   Bwd Packets/s                300000 non-null  float64
 8   Fwd Packet Length Max        300000 non-null  float64
 9   Init_Win_bytes_forward       300000 non-null  int64  
 10  URG Flag Count               300000 non-null  int64  
 11  Subflow Bwd Packets          300000 non-null  int64  
 12  Bwd IAT Mean                 300000 non-null  float64
 13 

In [7]:
# Label columns (prefix 'Label_'); these are later split off as the target
target_columns = [col for col in CICDDoS2019_binary.columns if col.startswith('Label_')]

# Sample 35% of the data (fixed random_state so the subset is repeatable) and reset the index
CICDDoS2019_binary_sampled = CICDDoS2019_binary.sample(frac=0.35, random_state=42).reset_index(drop=True)

# Print class distribution for each target column
for column in target_columns:
    print(f'Class distribution for {column}:')
    print(CICDDoS2019_binary_sampled[column].value_counts())
    print()

Class distribution for Label_benign:
0.0    52620
1.0    52380
Name: Label_benign, dtype: int64

Class distribution for Label_attack:
1.0    52620
0.0    52380
Name: Label_attack, dtype: int64



In [8]:
# Separate features and labels.
# NOTE(review): the target keeps every 'Label_*' column (two one-hot columns in the
# sampled data), so the estimators fitted later receive a 2-D target. Confirm this is
# intended rather than a single 0/1 label column.
label_columns = CICDDoS2019_binary_sampled.filter(regex='^Label_').columns
DDoS2019_binary_X = CICDDoS2019_binary_sampled.drop(columns=label_columns).copy()
DDoS2019_binary_y = CICDDoS2019_binary_sampled[label_columns].copy()

# Split the dataset into training (80%) and testing (20%) sets.
# random_state is passed explicitly so the split does not depend on the state of the
# global NumPy RNG (and therefore on which cells were run, and in what order).
DDoS2019_binary_train_X, DDoS2019_binary_test_X, DDoS2019_binary_train_y, DDoS2019_binary_test_y = train_test_split(
    DDoS2019_binary_X,
    DDoS2019_binary_y,
    test_size=0.2,
    random_state=42,
)



In [9]:
# Standardise the features: the scaler is fitted on the training split only,
# and that same fitted scaler is then applied to both splits.
scalar = StandardScaler().fit(DDoS2019_binary_train_X)

train_X_scaled = scalar.transform(DDoS2019_binary_train_X)
test_X_scaled = scalar.transform(DDoS2019_binary_test_X)


## K-Nearest Neighbour

In [10]:
%%time
# Define the parameter space
parameter_space = [
    {'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan'],
    'algorithm': ['auto', 'ball_tree']
    }
]
    
    
# Create the KNN classifier object
knn = KNeighborsClassifier()

# Create the GridSearchCV object
optimal_knn = GridSearchCV(
                        estimator=knn, 
                        param_grid=parameter_space,
                        cv=5, 
                        scoring='accuracy', 
                        n_jobs=-1, 
                        verbose=0
)

# # Perform the grid search
# start_time = time.time()
optimal_knn.fit(train_X_scaled, DDoS2019_binary_train_y)
# end_time = time.time()

# Retrieve the best parameters
knn_optimal_params = optimal_knn.best_params_
print("Optimum hyperparameters:")
print(knn_optimal_params)
# print("Execution time: {:.2f} seconds".format(end_time - start_time))


Optimum hyperparameters:
{'algorithm': 'auto', 'metric': 'manhattan', 'n_neighbors': 5, 'weights': 'distance'}
CPU times: total: 1.08 s
Wall time: 2min 35s


In [20]:

# Predict the held-out test split with the tuned KNN search object
y_pred = optimal_knn.predict(test_X_scaled)

# Compare predictions against the true test labels
accuracy = accuracy_score(y_true=DDoS2019_binary_test_y, y_pred=y_pred)

print(f"Accuracy: {accuracy}")

Accuracy: 0.9995238095238095


## Random Forest

In [11]:
%%time

parameter_space = [
    {'criterion': ['gini', 'entropy'],
    'max_depth': [10, 20, 30],
    'min_samples_leaf': [3, 0.0001, 0.0005, 0.00001], 
    'min_samples_split': [8, 0.0005, 0.0001, 0.00001], 
    'n_estimators': [ 200, 350, 400]
    }
]

optimal_rf = GridSearchCV(
                        RandomForestClassifier(),
                        parameter_space, 
                        cv=5,
                        scoring='accuracy',
                        n_jobs=-1,
                        verbose=0
)

optimal_rf.fit(train_X_scaled, DDoS2019_binary_train_y)
rf_optimal_params = optimal_rf.best_params_
print(f"Optimum hyperparameters: \n{rf_optimal_params}")


Optimum hyperparameters: 
{'criterion': 'entropy', 'max_depth': 30, 'min_samples_leaf': 1e-05, 'min_samples_split': 1e-05, 'n_estimators': 200}
CPU times: total: 28.8 s
Wall time: 37min 54s


In [21]:

# Predict the held-out test split with the tuned random-forest search object
y_pred = optimal_rf.predict(test_X_scaled)

# Compare predictions against the true test labels
accuracy = accuracy_score(y_true=DDoS2019_binary_test_y, y_pred=y_pred)

print(f"Accuracy: {accuracy}")

Accuracy: 0.9997619047619047


## Artificial Neural Network

In [12]:
%%time

parameter_space = [
    {'hidden_layer_sizes': [(10,), (20,), (30,), (40,), (50,)],
    'activation': ['tanh', 'relu', 'logistic'],
    'solver': ['adam', 'sgd'],
    'alpha': [0.0001, 0.001, 0.00001],
    'max_iter': [200, 500, 1000] }
]

ann = MLPClassifier()

optimal_ann = GridSearchCV(
                        ann,
                        parameter_space, 
                        cv=5,
                        n_jobs=-1,
                        verbose=0
)

optimal_ann.fit(train_X_scaled, DDoS2019_binary_train_y)
ann_optimal_params = optimal_ann.best_params_
print(f"Optimum hyperparameters: \n{ann_optimal_params}")

Optimum hyperparameters: 
{'activation': 'tanh', 'alpha': 1e-05, 'hidden_layer_sizes': (50,), 'max_iter': 500, 'solver': 'adam'}
CPU times: total: 27min 41s
Wall time: 1h 1min 10s


In [19]:

# Predict the held-out test split with the tuned single-layer MLP search object
y_pred = optimal_ann.predict(test_X_scaled)

# Compare predictions against the true test labels
accuracy = accuracy_score(y_true=DDoS2019_binary_test_y, y_pred=y_pred)

print(f"Accuracy: {accuracy}")

Accuracy: 0.9957619047619047


## Deep Neural Network

In [13]:
%%time

parameter_space = [
    {'hidden_layer_sizes': [(10,10, 10), (12, 12, 12), (15, 15, 15), (8, 8, 8, 8), (10, 10, 10, 10)],
    'activation': ['tanh', 'relu', 'logistic'],
    'solver': ['adam', 'sgd'],
    'alpha': [0.0001, 0.00001, 0.001],
    'max_iter': [100, 500, 1000]}
]

dnn = MLPClassifier()

optimal_dnn = GridSearchCV(
                        dnn,
                        parameter_space, 
                        cv=5,
                        n_jobs=-1,
                        verbose=0
)

optimal_dnn.fit(train_X_scaled, DDoS2019_binary_train_y)
dnn_optimal_params = optimal_dnn.best_params_
print(f"Optimum hyperparameters: \n{dnn_optimal_params}")

Optimum hyperparameters: 
{'activation': 'tanh', 'alpha': 0.001, 'hidden_layer_sizes': (12, 12, 12), 'max_iter': 1000, 'solver': 'adam'}
CPU times: total: 1min 29s
Wall time: 1h 23min 48s


In [18]:

# Predict the held-out test split with the tuned multi-layer MLP search object
y_pred = optimal_dnn.predict(test_X_scaled)

# Compare predictions against the true test labels
accuracy = accuracy_score(y_true=DDoS2019_binary_test_y, y_pred=y_pred)

print(f"Accuracy: {accuracy}")

Accuracy: 0.9971904761904762


## eXtreme Gradient Boosting

In [14]:
%%time


# Define the parameter space for grid search
parameter_space = [
    {'n_estimators': [300, 400, 500],  # Number of boosting rounds
    'learning_rate': [0.1, 0.2, 0.3],  # Learning rate (eta)
    'max_depth': [3, 4, 5],  # Maximum tree depth
    'min_child_weight': [1, 2, 3],  # Minimum sum of instance weight needed in a child
    'subsample': [0.8, 0.9],  # Subsample ratio of the training instances
    'colsample_bytree': [0.8, 0.9] }
]

# Create the XGBoost classifier
xgb_model = xgb.XGBClassifier()

# Perform grid search with cross-validation
optimal_xgb = GridSearchCV(
                        xgb_model,
                        parameter_space, 
                        cv=5,
                        n_jobs=-1,
                        verbose=0
)

optimal_xgb.fit(train_X_scaled, DDoS2019_binary_train_y)
xgb_optimal_params = optimal_xgb.best_params_
print(f"Optimum hyperparameters: \n{xgb_optimal_params}")


Optimum hyperparameters: 
{'colsample_bytree': 0.9, 'learning_rate': 0.3, 'max_depth': 4, 'min_child_weight': 1, 'n_estimators': 400, 'subsample': 0.9}
CPU times: total: 10min
Wall time: 1h 12min 40s


In [17]:

# Predict the held-out test split with the tuned XGBoost search object
y_pred = optimal_xgb.predict(test_X_scaled)

# Compare predictions against the true test labels
accuracy = accuracy_score(y_true=DDoS2019_binary_test_y, y_pred=y_pred)

print(f"Accuracy: {accuracy}")

Accuracy: 0.9999047619047619


In [None]:
# %%time

# NOTE(review): this whole cell is disabled (every line is commented out). It looks like
# an earlier draft of the XGBoost grid search. It references `DDoS2019_binary`, which is
# not assigned in any visible cell, and its 'n_estimators' list repeats 500.
# Remove it or update it before re-enabling.

# dtrain = xgb.DMatrix(DDoS2019_binary_X, label=DDoS2019_binary['Label'])

# # Define the parameter space for grid search
# parameter_space = {
#     'n_estimators': [200, 500, 500],  # Number of boosting rounds
#     'learning_rate': [0.1, 0.2, 0.3],  # Learning rate (eta)
#     'max_depth': [3, 4, 5],  # Maximum tree depth
#     'min_child_weight': [1, 2, 3],  # Minimum sum of instance weight needed in a child
#     'subsample': [0.8, 0.9],  # Subsample ratio of the training instances
#     'colsample_bytree': [0.8, 0.9]  
# }

# # Create the XGBoost classifier
# xgb_model = xgb.XGBClassifier()

# # Perform grid search with cross-validation
# optimal_xgb = GridSearchCV(
#     xgb_model,
#     parameter_space, 
#     cv=5,
#     n_jobs=-1,
#     verbose=0
# )

# optimal_xgb.fit(train_X_scaled, DDoS2019_binary_train_y)
# xgb_optimal_params = optimal_xgb.best_params_
# print(f"Optimum hyperparameters: \n{xgb_optimal_params}")
