<a href="https://colab.research.google.com/github/isj0/DeepLearning/blob/main/Test_Run_NSL_KDD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Import libraries and load dataset

In [None]:
# Import required libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.ensemble import VotingClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
import time
from scipy.stats import mode
from sklearn.metrics import classification_report



In [None]:
# Fetch the NSL-KDD dataset from the Hugging Face Hub.
# `ds` is indexed by split name ('train' / 'test') in the next cell.

ds = load_dataset(path="Mireu-Lab/NSL-KDD")

In [None]:
# Hugging Face datasets need to be converted to pandas DataFrames for easy handling.

# Use the dataset's own `to_pandas()` conversion rather than `pd.DataFrame(split)`:
# the latter builds the frame by iterating over the split row by row, while
# `to_pandas()` is the documented conversion for a whole split.
train_df = ds['train'].to_pandas()
test_df = ds['test'].to_pandas()

### Explore dataset

In [None]:
# Show the first five rows of the training data (5 is the `head` default).

print(train_df.head(n=5))

In [None]:
# Report (rows, columns) for both splits.

print(f"Training dataset shape: {train_df.shape}")
print(f"Testing dataset shape: {test_df.shape}")

In [None]:
# List the column names of the training split (header line, then the Index).

print("Columns in the dataset:", train_df.columns, sep="\n")


In [None]:
# Show the dtype pandas assigned to every training column.
print("Data types of each column:", train_df.dtypes, sep="\n")

In [None]:
# Count how many training rows fall into each value of the 'class' column.

class_counts = train_df['class'].value_counts()
print("Attack types in training dataset:")
print(class_counts)

### Data Cleaning

In [None]:
# Count fully duplicated rows in each split before cleaning.

n_dup_train = train_df.duplicated().sum()
n_dup_test = test_df.duplicated().sum()
print("Number of duplicate rows in training set:", n_dup_train)
print("Number of duplicate rows in test set:", n_dup_test)


In [None]:
# Drop exact duplicate rows, keeping the first occurrence of each
# (`keep='first'` is the pandas default, spelled out here).

train_df = train_df.drop_duplicates(keep='first')
test_df = test_df.drop_duplicates(keep='first')

# Re-count to confirm that no duplicates remain.
print(f"Number of duplicate rows in training set: {train_df.duplicated().sum()}")
print(f"Number of duplicate rows in test set: {test_df.duplicated().sum()}")


In [None]:
# Per-column count of missing entries in each split (`isna` is an alias of `isnull`).

print("Missing values in training set:\n", train_df.isna().sum())
print("Missing values in test set:\n", test_df.isna().sum())


In [None]:
# Build a binary target: 0 where 'class' equals 'normal', 1 for everything else.
# The element-wise comparison yields the same values as a per-row lambda.

train_df['label'] = (train_df['class'] != 'normal').astype('int64')
test_df['label'] = (test_df['class'] != 'normal').astype('int64')

# The original text column is no longer needed once 'label' exists.
train_df = train_df.drop(columns='class')
test_df = test_df.drop(columns='class')


### Data preprocessing

In [None]:
# Perform one-hot encoding of the categorical features.

categorical_features = ['protocol_type', 'service', 'flag']

# Encode both splits the same way; each split is encoded independently, so the
# resulting column sets can differ until they are aligned afterwards.
train_df_encoded, test_df_encoded = (
    pd.get_dummies(frame, columns=categorical_features)
    for frame in (train_df, test_df)
)


In [None]:
# Give the test frame exactly the training frame's columns: a left join on the
# column axis keeps the training columns, and columns missing from the test
# frame are created and filled with 0.

train_df_encoded, test_df_encoded = train_df_encoded.align(
    test_df_encoded,
    axis=1,
    join='left',
    fill_value=0,
)

# Both shapes should now report the same number of columns.

print(f"Training features shape: {train_df_encoded.shape}")
print(f"Testing features shape: {test_df_encoded.shape}")


In [None]:
# Scale numeric features: first work out which columns to scale.

# Numeric columns are taken from the frame as it was before one-hot encoding,
# so the dummy columns are not part of this list.
numeric_cols = list(train_df.select_dtypes(include=np.number).columns)

# 'label' is the target, not a feature, so it must not be scaled.
numeric_cols.remove('label')

print("Numeric columns to scale:", numeric_cols)


In [None]:
# Standardise the numeric columns.
# The scaler learns its statistics from the training split only ...
scaler = StandardScaler().fit(train_df_encoded[numeric_cols])

# ... and the same fitted scaler is then applied to both splits.
train_df_encoded[numeric_cols] = scaler.transform(train_df_encoded[numeric_cols])
test_df_encoded[numeric_cols] = scaler.transform(test_df_encoded[numeric_cols])

# Print a few scaled rows as a sanity check.
print("Numeric features scaled. Sample data:")
print(train_df_encoded[numeric_cols].head())

In [None]:
# Reduce dimensionality with PCA, keeping as many components as are needed
# to explain 95% of the variance.

# Split each encoded frame into a feature matrix and the target vector.
X_train = train_df_encoded.drop(columns='label')
y_train = train_df_encoded['label']

X_test = test_df_encoded.drop(columns='label')
y_test = test_df_encoded['label']

# A fractional n_components asks PCA for a variance share, not a fixed count.
pca = PCA(n_components=0.95, random_state=42)

# Learn the projection on the training features only, then reuse it for the test features.
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Report the feature count before and after the projection.
print(f"Original number of features: {X_train.shape[1]}")
print(f"Reduced number of features after PCA: {X_train_pca.shape[1]}")


In [None]:
# Helper to compute, print and store the key evaluation metrics of a model.

# Collected results, keyed by model name; filled by evaluation_metrics().
model_results = {}


def evaluation_metrics(y_true, y_pred, model_name="Model"):
    """Compute, print and record accuracy, precision, recall and F1.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        model_name: Heading for the printed report and key under which the
            scores are stored in the module-level ``model_results`` dict.
            An existing entry with the same name is overwritten.

    Returns:
        The dict stored in ``model_results[model_name]``, with the keys
        "Accuracy", "Precision", "Recall" and "F1".
    """
    # All four scores are computed before anything is printed.
    scores = {
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred),
        "Recall": recall_score(y_true, y_pred),
        "F1": f1_score(y_true, y_pred),
    }

    # Labels are left-padded to 9 characters so the colons line up.
    print(f"\n{model_name} Performance:")
    for shown_as, key in (
        ("Accuracy", "Accuracy"),
        ("Precision", "Precision"),
        ("Recall", "Recall"),
        ("F1-score", "F1"),
    ):
        print(f"{shown_as:<9}: {scores[key]:.4f}")

    # Keep the scores for the comparison table built later.
    model_results[model_name] = scores
    return model_results[model_name]





In [None]:
# Helper to draw a model's confusion matrix as a heatmap.

def plot_confusion_matrix(y_true, y_pred, model_name="Model"):
    """Show the confusion matrix of ``y_pred`` against ``y_true`` as a heatmap.

    Args:
        y_true: Ground-truth labels (plotted on the y axis).
        y_pred: Predicted labels (plotted on the x axis).
        model_name: Prefix for the figure title.
    """
    # Count the matrix first so a bad input fails before a figure is opened.
    counts = confusion_matrix(y_true, y_pred)

    plt.figure(figsize=(5, 4))
    # Annotate each cell with its integer count; no colour bar.
    sns.heatmap(counts, cmap='Blues', annot=True, fmt='d', cbar=False)
    plt.xlabel("Predicted Label")
    plt.ylabel("Actual Label")
    plt.title(f"{model_name} Confusion Matrix")
    plt.show()


In [None]:
# Logistic Regression

# Build the model and fit it on the PCA-transformed training data in one step
# (`fit` returns the estimator); max_iter is raised to 1000 to help convergence.
logreg = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_pca, y_train)

# Class predictions for the test split.
y_pred_logreg = logreg.predict(X_test_pca)

# Record the metrics, then draw the confusion matrix.
evaluation_metrics(y_test, y_pred_logreg, "Logistic Regression")
plot_confusion_matrix(y_test, y_pred_logreg, "Logistic Regression")



In [None]:
# Decision Tree

# Build the tree and fit it on the PCA-transformed training data in one step.
dt = DecisionTreeClassifier(random_state=42).fit(X_train_pca, y_train)

# Class predictions for the test split.
y_pred_dt = dt.predict(X_test_pca)

# Record the metrics, then draw the confusion matrix.
evaluation_metrics(y_test, y_pred_dt, "Decision Tree")
plot_confusion_matrix(y_test, y_pred_dt, "Decision Tree")


In [None]:
# Random Forest

# Build the forest (n_jobs=-1 uses all CPU cores) and fit it on the
# PCA-transformed training data in one step.
rf = RandomForestClassifier(n_jobs=-1, random_state=42).fit(X_train_pca, y_train)

# Class predictions for the test split.
y_pred_rf = rf.predict(X_test_pca)

# Record the metrics, then draw the confusion matrix.
evaluation_metrics(y_test, y_pred_rf, "Random Forest")
plot_confusion_matrix(y_test, y_pred_rf, "Random Forest")


In [None]:
# Support Vector Machine

# Build an RBF-kernel SVM and fit it on the PCA-transformed training data in one step.
svm_model = SVC(random_state=42, kernel='rbf').fit(X_train_pca, y_train)

# Class predictions for the test split.
y_pred_svm = svm_model.predict(X_test_pca)

# Record the metrics, then draw the confusion matrix.
evaluation_metrics(y_test, y_pred_svm, "SVM")
plot_confusion_matrix(y_test, y_pred_svm, "SVM")


In [None]:
# Multilayer Perceptron

# One hidden layer of 50 units, at most 200 training iterations; built and
# fitted on the PCA-transformed training data in one step.
mlp_model = MLPClassifier(
    hidden_layer_sizes=(50,),
    max_iter=200,
    random_state=42,
).fit(X_train_pca, y_train)

# Class predictions for the test split.
y_pred_mlp = mlp_model.predict(X_test_pca)

# Record the metrics, then draw the confusion matrix.
evaluation_metrics(y_test, y_pred_mlp, "MLP")
plot_confusion_matrix(y_test, y_pred_mlp, "MLP")


PCA reduces dimensionality

You went from 122 features → 24 features.

PCA keeps ~95% of the variance, but some information is inevitably lost.

This can affect models like Random Forest, which can handle many features and might benefit from the full feature set.

Random Forest doesn’t require PCA

RF is tree-based, not linear, so it can naturally handle correlated or high-dimensional features.

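PCA is more useful for models that are sensitive to many correlated features or whose cost grows with the number of dimensions — for example the linear Logistic Regression, or the kernel-based (non-linear) SVM with an RBF kernel — where too many correlated features can hurt performance or increase computation.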

Compare results

Training RF on PCA features gave you one set of metrics.

Training RF on original features gives a baseline to see if PCA helped or hurt.

You might find that RF on original features has better recall or F1-score, especially for the minority class (anomalies).

In [None]:
# Random Forest hyperparameter tuning using RandomizedSearchCV

# Small search space so the tuning stays fast.
rf_param_grid_fast = {
    "n_estimators": [50, 100, 200],       # num of trees
    "max_depth": [None, 10, 20],          # max depth of each tree
    "min_samples_split": [2, 5, 10],      # min samples to split a node
    "min_samples_leaf": [1, 2, 4],        # min samples at leaf
    "max_features": ["sqrt", 0.5],        # features considered per split
    "bootstrap": [True],                  # use bootstrap samples
}

# Base estimator handed to the search.
# NOTE: this rebinds `rf`, which an earlier cell used for its fitted baseline forest.
rf = RandomForestClassifier(random_state=42)

# Randomized search: samples parameter combinations instead of trying them all.
rf_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=rf_param_grid_fast,
    n_iter=10,          # try only 10 random combinations
    cv=3,               # 3-fold cross validation
    scoring='f1',       # focus on F1 due to class imbalance
    verbose=2,
    random_state=42,
    n_jobs=-1           # use all CPU cores
)

# Run the search and measure how long it takes.
# perf_counter() is a monotonic, high-resolution clock meant for durations;
# time.time() is wall-clock time and can jump if the system clock is adjusted.
start_time = time.perf_counter()
rf_search.fit(X_train_pca, y_train)
end_time = time.perf_counter()

print(f"Random Forest tuning completed in {end_time - start_time:.2f} seconds")

# Show the best hyperparameters
print("\nBest Random Forest Hyperparameters:")
print(rf_search.best_params_)

# Predict on the test data with the best estimator found by the search.
y_pred_rf_best = rf_search.best_estimator_.predict(X_test_pca)

# evaluation metrics and confusion matrix
evaluation_metrics(y_test, y_pred_rf_best, model_name="Random Forest (Tuned simple)")
plot_confusion_matrix(y_test, y_pred_rf_best, model_name="Random Forest (Tuned simple)")


In [None]:
# Random Forest trained on the full feature set (PCA is skipped here)

print("Training Random Forest on original features")

# 300 trees with balanced class weights, built and fitted on the un-projected
# feature matrix X_train in one step.
rf_no_pca = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    max_features='sqrt',
    class_weight='balanced',
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)

# Class predictions for the un-projected test features.
y_pred_rf_no_pca = rf_no_pca.predict(X_test)

# Record the metrics, then draw the confusion matrix.
evaluation_metrics(y_test, y_pred_rf_no_pca, "Random Forest (No PCA)")
plot_confusion_matrix(y_test, y_pred_rf_no_pca, "Random Forest (No PCA)")


In [None]:
# SVM hyperparameter tuning using RandomizedSearchCV:
# we want to find the best SVM settings for our data.

# Candidate SVM settings.
svm_options = {
    'C': [0.1, 1, 10, 50],         # penalty
    'gamma': ['scale', 0.01, 0.1],
    'kernel': ['rbf']              # RBF kernel
}

# Base estimator handed to the search.
# NOTE: this rebinds `svm_model`, which an earlier cell used for its fitted baseline SVM.
svm_model = SVC(random_state=42)

# RandomizedSearchCV will try 10 of the 12 possible combinations with 3-fold CV.

svm_search = RandomizedSearchCV(
    estimator=svm_model,
    param_distributions=svm_options,
    n_iter=10,        # try 10 random combinations
    cv=3,             # 3-fold cross-validation
    scoring='f1',     # focus on F1-score due to class imbalance
    verbose=2,
    random_state=42,
    n_jobs=-1         # use all CPU cores
)

# Fit on the PCA-reduced training data and measure how long it takes.
# perf_counter() is a monotonic, high-resolution clock meant for durations;
# time.time() is wall-clock time and can jump if the system clock is adjusted.
start_time = time.perf_counter()
svm_search.fit(X_train_pca, y_train)
end_time = time.perf_counter()

print(f"SVM tuning completed in {end_time - start_time:.2f} seconds")

# Best estimator and the settings that produced it.
best_svm = svm_search.best_estimator_
print("\nBest SVM Hyperparameters found:")
print(svm_search.best_params_)

# Test the best SVM on our test data.
# NOTE: this overwrites `y_pred_svm` from the untuned SVM cell.
y_pred_svm = best_svm.predict(X_test_pca)

# evaluations
evaluation_metrics(y_test, y_pred_svm, model_name="SVM (Tuned)")
plot_confusion_matrix(y_test, y_pred_svm, model_name="SVM (Tuned)")

In [None]:
# Gradient Boosting Classifier

# 150 depth-3 trees with a learning rate of 0.05, built and fitted on the
# PCA-reduced training data in one step.
gb = GradientBoostingClassifier(
    n_estimators=150,
    learning_rate=0.05,
    max_depth=3,
    random_state=42,
).fit(X_train_pca, y_train)

# Class predictions for the test split.
y_pred_gb = gb.predict(X_test_pca)

# Record the metrics, then draw the confusion matrix.
evaluation_metrics(y_test, y_pred_gb, "Gradient Boosting")
plot_confusion_matrix(y_test, y_pred_gb, "Gradient Boosting")



In [None]:
# Isolation Forest (Unsupervised model)

# Share of attack rows (label == 1) in the training labels, used as the
# expected proportion of outliers.
attack_ratio = float(y_train.mean())

# IsolationForest only accepts a float contamination in (0, 0.5] (or 'auto').
# Cap the ratio at 0.5 and fall back to 'auto' when it is not positive, so the
# cell does not crash on a training split with a different class balance.
if attack_ratio > 0:
    iso_contamination = min(attack_ratio, 0.5)
    if iso_contamination < attack_ratio:
        print(f"Attack ratio {attack_ratio:.4f} exceeds 0.5; contamination capped at 0.5")
else:
    iso_contamination = 'auto'

# create the model
iso = IsolationForest(
    n_estimators=200,                 # num of trees
    contamination=iso_contamination,  # expected ratio of attacks in data
    max_samples='auto',               # samples each tree uses
    random_state=42,
    n_jobs=-1
)

# Train the model using features only - no labels
iso.fit(X_train)

# predict on test data (-1 = anomaly, 1 = normal)
y_pred_iso = iso.predict(X_test)

# Convert the output to 0 (normal) / 1 (attack)
y_pred_iso = np.where(y_pred_iso == -1, 1, 0)

# evaluate performance
evaluation_metrics(y_test, y_pred_iso, model_name="Isolation Forest")
plot_confusion_matrix(y_test, y_pred_iso, model_name="Isolation Forest")



In [None]:
# Hybrid Ensemble: Gradient Boosting + Isolation Forest

# Predictions of the supervised (Gradient Boosting) and the unsupervised
# (Isolation Forest) model, under the names the later ensemble cells reuse.
gb_pred = y_pred_gb
iso_pred = y_pred_iso

# OR rule: a sample is flagged as an attack when at least one model flags it.
ensemble_pred = np.logical_or(gb_pred == 1, iso_pred == 1).astype(int)

# Record the metrics, then draw the confusion matrix.
evaluation_metrics(y_test, ensemble_pred, "Hybrid Ensemble")
plot_confusion_matrix(y_test, ensemble_pred, "Hybrid Ensemble")


In [None]:
# Additional ensemble variation: AND rule.
# A sample is flagged as an attack only when both models flag it.

ensemble_and = np.logical_and(gb_pred == 1, iso_pred == 1).astype(int)

# Record the metrics, then draw the confusion matrix.
evaluation_metrics(y_test, ensemble_and, "Hybrid Ensemble (AND)")
plot_confusion_matrix(y_test, ensemble_and, "Hybrid Ensemble (AND)")


In [None]:
# Majority Voting Ensemble
# Combines the predictions of three models:
#   - Gradient Boosting (supervised)
#   - Isolation Forest (unsupervised)
#   - Random Forest (supervised, no-PCA version)


# One row per sample, one column per model.
preds = np.column_stack((gb_pred, iso_pred, y_pred_rf_no_pca))

# The most frequent value in each row is the majority vote of the three models.
ensemble_majority = mode(preds, axis=1).mode.flatten()

# Record the metrics, then draw the confusion matrix.
evaluation_metrics(y_test, ensemble_majority, "Hybrid Ensemble (Majority Voting)")
plot_confusion_matrix(y_test, ensemble_majority, "Hybrid Ensemble (Majority Voting)")


In [None]:
# Per-class precision / recall / F1 report for the OR-rule ensemble.
# Label 0 is shown as 'Normal' and label 1 as 'Attack'.
print("Classification Report for Hybrid Ensemble (OR-rule)")
or_rule_report = classification_report(
    y_test,
    ensemble_pred,
    target_names=['Normal', 'Attack'],
)
print(or_rule_report)

In [None]:
# Summarize all model metrics in one table.

# One row per model, one column per metric, ordered by F1 with the best model first.
results_df = (
    pd.DataFrame.from_dict(model_results, orient='index')
    .sort_values(by="F1", ascending=False)
)

# Shade the cells by value and show four decimals.
# `display` is provided by the notebook environment.
styled_results = results_df.style.background_gradient(cmap="Blues").format("{:.4f}")
display(styled_results)

In [None]:

# Bar chart of the F1-score per model, in the row order of results_df.
plt.figure(figsize=(10, 6))
plt.bar(x=results_df.index, height=results_df['F1'], color='skyblue')
plt.title('Comparison of Model Performance (F1-score)')
plt.ylabel('F1-score')
plt.xticks(rotation=45)  # tilt the model names so they do not overlap
plt.show()


In [None]:
# Grouped bar chart: one group per model, one bar per metric.
metric_columns = ['Accuracy', 'Precision', 'Recall', 'F1']
ax = results_df[metric_columns].plot(kind='bar', width=0.8, figsize=(12, 6))

# Titles, labels and cosmetics, set on the axes returned by the plot call.
ax.set_title('All Model Metrics Comparison')
ax.set_ylabel('Score')
ax.set_ylim(0, 1)  # all metrics are between 0 and 1
plt.xticks(rotation=45)
ax.legend(loc='lower right')
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()