# Install and Import Libraries

In [17]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import OrdinalEncoder
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
import os 
import sys
# Step 1: Load the training and test data
current_dir = os.path.dirname(os.path.abspath('__file__'))
project_root = current_dir
while not os.path.exists(os.path.join(project_root, 'system_main')):
    project_root = os.path.dirname(project_root)
    if project_root == os.path.dirname(project_root):  # reached the root directory
        raise FileNotFoundError("Could not find the 'system_main' directory.")

# Set up paths
dataset_dir = os.path.join(project_root, 'system_main', 'dataset')
model_dir = os.path.join(project_root, 'system_main', 'model', 'saved_model')

# Ensure the directories exist
os.makedirs(dataset_dir, exist_ok=True)
os.makedirs(model_dir, exist_ok=True)

# Print the paths for verification
print(f"Dataset directory: {dataset_dir}")
print(f"Model directory: {model_dir}")

# Load the data
train_data_path = os.path.join(dataset_dir, 'UNSW_NB15_training-set.csv')
test_data_path = os.path.join(dataset_dir, 'UNSW_NB15_testing-set.csv')

# Check if the files exist
if not os.path.exists(train_data_path):
    raise FileNotFoundError(f"Training dataset not found at {train_data_path}")
if not os.path.exists(test_data_path):
    raise FileNotFoundError(f"Testing dataset not found at {test_data_path}")

# Load the data
train_data_initial = pd.read_csv(train_data_path)
test_data_initial = pd.read_csv(test_data_path)

# Print the shape of the loaded data for verification
print(f"Training data shape: {train_data_initial.shape}")
print(f"Testing data shape: {test_data_initial.shape}")

Dataset directory: c:\Users\kimki\network_anomaly_detect\system_main\dataset
Model directory: c:\Users\kimki\network_anomaly_detect\system_main\model\saved_model
Training data shape: (175341, 45)
Testing data shape: (82332, 45)


# Data Preprocessing


In [18]:
def engineer_features(df):
    print("Available columns:", df.columns.tolist())

    if 'sbytes' in df.columns and 'dbytes' in df.columns:
        df['total_bytes'] = df['sbytes'] + df['dbytes']
        df['byte_ratio'] = df['sbytes'] / (df['dbytes'] + 1)

    if 'sttl' in df.columns and 'dttl' in df.columns:
        df['ttl_diff'] = df['sttl'] - df['dttl']

    if 'sport' in df.columns and 'dport' in df.columns:
        df['high_port'] = ((df['sport'] > 1024) | (df['dport'] > 1024)).astype(int)

    return df

In [19]:
# Data Preprocessing
x_train = train_data_initial.drop(columns=['label', 'service', 'attack_cat'])
y_train = train_data_initial['label']
x_test = test_data_initial.drop(columns=['label', 'service', 'attack_cat'])
y_test = test_data_initial['label']

# Apply feature engineering
x_train = engineer_features(x_train)
x_test = engineer_features(x_test)

categorical_columns = x_train.select_dtypes(include=['object']).columns.tolist()
numeric_columns = x_train.select_dtypes(include=[np.number]).columns.tolist()

# Encode categorical variables
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
x_train[categorical_columns] = encoder.fit_transform(x_train[categorical_columns])
x_test[categorical_columns] = encoder.transform(x_test[categorical_columns])

# Split the training data
x_train_split, x_val_split, y_train_split, y_val_split = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

# Apply SMOTE for oversampling
smote = SMOTE(random_state=42)
x_train_res, y_train_res = smote.fit_resample(x_train_split, y_train_split)

Available columns: ['id', 'dur', 'proto', 'state', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth', 'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_ports']
Available columns: ['id', 'dur', 'proto', 'state', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth', 'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_port

# Train and Evaluate the XGBoost Model

In [20]:
##  Initialising and training the XGBoost model  ##
xgboost_model = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.003,
    random_state=42
)
xgboost_model.fit(x_train_split, y_train_split)

##  Check the class distribution  ##
# print(train_data['label'].value_counts())
# print(test_data['label'].value_counts())

##  Evaluate the model's performance  ##
y_test_pred = xgboost_model.predict(x_test)
print(f"Accuracy: {accuracy_score(y_test, y_test_pred)}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_test_pred)}")
print(f"Classification Report:\n{classification_report(y_test, y_test_pred, zero_division=1)}")

Accuracy: 0.5102633241024146
Confusion Matrix:
[[24045 12955]
 [27366 17966]]
Classification Report:
              precision    recall  f1-score   support

           0       0.47      0.65      0.54     37000
           1       0.58      0.40      0.47     45332

    accuracy                           0.51     82332
   macro avg       0.52      0.52      0.51     82332
weighted avg       0.53      0.51      0.50     82332



# Train the Optimised XGBoost Model

In [21]:
##  Calculate scale_pos_weight as the ratio of negative to positive examples in the original data  ##
neg, pos = y_train.value_counts()
scale_pos_weight = neg / pos

##  Define the XGBoost model  ##
optimised_xgboost_model = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.003,
    max_depth=18,
    subsample=0.8,
    colsample_bytree=0.9,
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    enable_categorical=True  # Enable built-in categorical feature support
)
optimised_xgboost_model.fit(x_train_res, y_train_res)

# Evaluate the model
test_predictions = optimised_xgboost_model.predict(x_test)
print(f"Optimized Model Accuracy: {accuracy_score(y_test, test_predictions)}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, test_predictions)}")
print(f"Test Classification Report:\n{classification_report(y_test, test_predictions)}")

Optimized Model Accuracy: 0.7944541612009911
Confusion Matrix:
[[20077 16923]
 [    0 45332]]
Test Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.54      0.70     37000
           1       0.73      1.00      0.84     45332

    accuracy                           0.79     82332
   macro avg       0.86      0.77      0.77     82332
weighted avg       0.85      0.79      0.78     82332



In [22]:
import joblib

model_path = os.path.join(model_dir, 'xgboost_model.joblib')
joblib.dump(optimised_xgboost_model, model_path)

preprocessor = {
    'encoder': encoder,
    'categorical_columns': categorical_columns,
    'numeric_columns': numeric_columns,
    'smote': smote,
    'scale_pos_weight': scale_pos_weight
}
preprocessor_path = os.path.join(model_dir, 'xgboost_preprocessor.joblib')
joblib.dump(preprocessor, preprocessor_path)

print(f"Model saved to: {model_path}")
print(f"Preprocessor saved to: {preprocessor_path}")

Model saved to: c:\Users\kimki\network_anomaly_detect\system_main\model\saved_model\xgboost_model.joblib
Preprocessor saved to: c:\Users\kimki\network_anomaly_detect\system_main\model\saved_model\xgboost_preprocessor.joblib
