In [1]:
import pandas as pd
import numpy as np
import random
import time
import joblib
import pickle

from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler, LabelEncoder

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import GridSearchCV, ParameterGrid, RandomizedSearchCV

from xgboost import XGBClassifier



In [2]:
TRAIN_DATASET_PATH = 'Dataset/train_data.csv'
TEST_DATASET_PATH = 'Dataset/test_data.csv'
TARGET_COLUMN = 'Label'

In [3]:
df_train = pd.read_csv(TRAIN_DATASET_PATH)
df_test = pd.read_csv(TEST_DATASET_PATH)

In [4]:
columns_to_drop = ['FlowID', 'SourceIP', 'DestinationIP', 'Timestamp']

def preprocess_dataframe(df, columns_to_drop, target_column, desired_sample_size):
    df_processed = df.drop(columns=columns_to_drop, errors='ignore').copy()

    # Handle infinite and missing values
    df_processed.replace([float('inf'), float('-inf')], pd.NA, inplace=True)
    df_processed.fillna(0, inplace=True)

    # Stratified downsampling (unless using full data)
    if isinstance(desired_sample_size, str) and desired_sample_size.lower() == 'all':
        df_sampled = df_processed
    else:
        sample_frac = desired_sample_size / len(df_processed)
        df_sampled, _ = train_test_split(
            df_processed,
            test_size=1 - sample_frac,
            stratify=df_processed[target_column],
            random_state=42
        )

    # Separate features (X) and target (y)
    X = df_sampled.drop(columns=[target_column])
    y = df_sampled[target_column]

    return X, y

In [5]:
X_train, y_train = preprocess_dataframe(df_train, columns_to_drop, TARGET_COLUMN, 'all')
X_test, y_test = preprocess_dataframe(df_test, columns_to_drop, TARGET_COLUMN, 'all')

In [6]:
def simplify_labels(y):
    return y.apply(lambda x: 'BENIGN' if x.upper() == 'BENIGN' else 'ATTACK')

y_train = simplify_labels(y_train)
y_test = simplify_labels(y_test)

label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)


In [7]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
# Initialize the XGBoost classifier
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Train the model on the entire training dataset
xgb_model.fit(X_train_scaled, y_train)

# Predict on the test dataset
y_pred = xgb_model.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy:.7f}")
print(f"Precision: {precision:.7f}")
print(f"Recall: {recall:.7f}")
print(f"F1 Score: {f1:.7f}")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.9998890
Precision: 0.9998890
Recall: 0.9998890
F1 Score: 0.9998890


In [32]:
# Define the parameter grid for XGBoost
param_grid = {
    'max_depth': [4, 5, 6, 7],
    'learning_rate': [0.01, 0.1, 0.15, 0.3],
    'min_child_weight': [1, 2, 3],
    'subsample': [0.5, 0.75, 1],
}

# Initialize the GridSearchCV object
grid_search = GridSearchCV(
    estimator=XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=3,
    verbose=1,
    n_jobs=-1
)

In [33]:
# Perform the grid search on the training data
grid_search.fit(X_train_scaled, y_train)


Fitting 3 folds for each of 144 candidates, totalling 432 fits


3 fits failed out of a total of 432.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\jeffc\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\jeffc\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "C:\Users\jeffc\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_

In [35]:
best_xgb_model = grid_search.best_estimator_

# Get the best estimator


# Print the best parameters
print("Best parameters found: ", grid_search.best_params_)

Best parameters found:  {'learning_rate': 0.3, 'max_depth': 7, 'min_child_weight': 1, 'subsample': 0.75}


In [36]:
# Evaluate the best estimator on the test set
y_pred_best = best_xgb_model.predict(X_test_scaled)

# Calculate evaluation metrics
accuracy_best = accuracy_score(y_test, y_pred_best)
precision_best = precision_score(y_test, y_pred_best, average='weighted')
recall_best = recall_score(y_test, y_pred_best, average='weighted')
f1_best = f1_score(y_test, y_pred_best, average='weighted')

print("Best Parameters:", grid_search.best_params_)
print(f"Accuracy (Best Estimator): {accuracy_best:.7f}")
print(f"Precision (Best Estimator): {precision_best:.7f}")
print(f"Recall (Best Estimator): {recall_best:.7f}")
print(f"F1 Score (Best Estimator): {f1_best:.7f}")

Best Parameters: {'learning_rate': 0.3, 'max_depth': 7, 'min_child_weight': 1, 'subsample': 0.75}
Accuracy (Best Estimator): 0.9998437
Precision (Best Estimator): 0.9998437
Recall (Best Estimator): 0.9998437
F1 Score (Best Estimator): 0.9998437


Using RandomSearch instead of GridSearch to reduce optimization time

In [10]:
from scipy.stats import uniform, randint
param_distributions = {
    'max_depth': randint(4, 8),               # 4, 5, 6, 7
    'learning_rate': uniform(0.01, 0.29),     # 0.01 to 0.3
    'min_child_weight': randint(1, 4),        # 1, 2, 3
    'subsample': uniform(0.5, 0.5)            # 0.5 to 1.0
}

xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, n_jobs=1)

# RandomizedSearchCV for 20 iterations
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_distributions,
    n_iter=20,
    scoring='f1_weighted',
    cv=3,
    verbose=3,
    n_jobs=-1,
    random_state=42
)

# Fit on training data
random_search.fit(X_train_scaled, y_train)

# Best model and parameters
best_model_random = random_search.best_estimator_
print("Best parameters:", random_search.best_params_)

# Evaluate on test set
y_pred = best_model_random.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy:.7f}")
print(f"Precision: {precision:.7f}")
print(f"Recall: {recall:.7f}")
print(f"F1 Score: {f1:.7f}")

Fitting 3 folds for each of 20 candidates, totalling 60 fits


1 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\jeffc\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\jeffc\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "C:\Users\jeffc\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_q

Best parameters: {'learning_rate': 0.2542625846325495, 'max_depth': 7, 'min_child_weight': 2, 'subsample': 0.6975751180009072}
Accuracy: 0.9998890
Precision: 0.9998890
Recall: 0.9998890
F1 Score: 0.9998890
