<a href="https://colab.research.google.com/github/isj0/DeepLearning/blob/main/Test_Run_Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 0. Import required libraries

import pandas as pd
import numpy as np
from datasets import load_dataset
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# 1. Load the NSL-KDD dataset from the HuggingFace Hub
# The repository id lives in a named constant so the data source is easy to spot and swap.
NSL_KDD_REPO_ID = "Mireu-Lab/NSL-KDD"
ds = load_dataset(NSL_KDD_REPO_ID)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train.csv:   0%|          | 0.00/22.5M [00:00<?, ?B/s]

test.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/151165 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/34394 [00:00<?, ? examples/s]

In [17]:
# 2. Convert the HuggingFace splits to pandas DataFrames
train = ds['train']
test = ds['test']

# Dataset.to_pandas() converts the split's underlying table in one step, whereas
# pd.DataFrame(split) makes pandas iterate the split one row at a time.
train_df = train.to_pandas()
test_df = test.to_pandas()

# (rows, columns) of the training split
print("Training set shape:", train_df.shape)

# (rows, columns) of the testing split
print("Testing set shape:", test_df.shape)

Training set shape: (151165, 42)
Testing set shape: (34394, 42)


In [18]:
# 3. Inspect which raw labels occur in each split before any preprocessing

for split_name, split_df in (("train", train_df), ("test", test_df)):
    print(f"Unique classes ({split_name}):", split_df['class'].unique())

Unique classes (train): ['normal' 'anomaly']
Unique classes (test): ['anomaly' 'normal']


In [19]:
# 4. Drop exact duplicate rows from both splits
# ================================================================
dup_train_before = train_df.duplicated().sum()
print("Duplicates before removal (train):", dup_train_before)
dup_test_before = test_df.duplicated().sum()
print("Duplicates before removal (test):", dup_test_before)

# keep="first" (the pandas default) retains the first occurrence of each repeated row.
train_df = train_df.drop_duplicates(keep="first")
test_df = test_df.drop_duplicates(keep="first")

# Sanity check: both counts should now be zero.
dup_train_after = train_df.duplicated().sum()
print("Duplicates after removal (train):", dup_train_after)
dup_test_after = test_df.duplicated().sum()
print("Duplicates after removal (test):", dup_test_after)

Duplicates before removal (train): 25201
Duplicates before removal (test): 11853
Duplicates after removal (train): 0
Duplicates after removal (test): 0


In [20]:
# 5. Convert class column to binary (0 = normal, 1 = attack)
# ================================================================
def convert_class(label):
    """Map a class label to its binary encoding (0 = normal, 1 = attack).

    Parameters
    ----------
    label : str or int
        Either a raw string label ('normal' maps to 0, any other string to 1)
        or a label that has already been encoded as 0 / 1.

    Returns
    -------
    int
        0 for normal traffic, 1 for an attack.
    """
    # Pass already-encoded labels through unchanged. The calling cell overwrites the
    # 'class' column in place, so without this guard a second run of that cell would
    # turn every 0 into 1 (because 0 != 'normal') and silently mark all rows as attacks.
    if not isinstance(label, str) and label in (0, 1):
        return int(label)
    return 0 if label == 'normal' else 1

# Encode the label column of both splits with the same mapping.
train_df['class'] = train_df['class'].map(convert_class)
test_df['class'] = test_df['class'].map(convert_class)

# Show the resulting class balance, training split first.
for encoded_df in (train_df, test_df):
    print(encoded_df['class'].value_counts())

class
0    67343
1    58621
Name: count, dtype: int64
class
1    12830
0     9711
Name: count, dtype: int64


In [21]:
# 6. One-hot encode the categorical columns
# ================================================================
categorical_columns = ['protocol_type', 'service', 'flag']

train_encoded = pd.get_dummies(train_df, columns=categorical_columns)

# The training columns define the feature layout. Reindexing the encoded test frame
# against them adds any dummy column the test split lacks (filled with 0), drops any
# column that exists only in the test split, and puts the columns in the same order.
test_encoded = (
    pd.get_dummies(test_df, columns=categorical_columns)
    .reindex(columns=train_encoded.columns, fill_value=0)
)


In [22]:
# 7. Separate the feature matrix (X) from the label vector (y)
# ================================================================
label_column = 'class'

y_train = train_encoded[label_column]
X_train = train_encoded.drop(columns=[label_column])

y_test = test_encoded[label_column]
X_test = test_encoded.drop(columns=[label_column])

In [23]:
# 8. Standardise the features
# ================================================================
# The scaler is fitted on the training split only; the same fitted scaler is then
# applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [24]:
# 9. Reduce dimensionality with PCA (retain 95% of the variance)
# ================================================================
explained_variance_target = 0.95
pca = PCA(n_components=explained_variance_target)

# The projection is learned on the scaled training data and reused for the test data.
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

n_components_kept = X_train_pca.shape[1]
print("PCA components:", n_components_kept)

PCA components: 89


In [25]:
# 10. Random Forest (supervised model)
# ================================================================
# fit() returns the estimator itself, so construction and training can be chained.
rf = RandomForestClassifier(n_estimators=150, random_state=42).fit(X_train_pca, y_train)

# Predicted labels for the PCA-projected test split.
rf_preds = rf.predict(X_test_pca)

In [26]:
# 11. Train Isolation Forest (Unsupervised Model)
# ================================================================
# Contamination is set to approximate the attack ratio of the training labels.
attack_ratio = y_train.mean()

# scikit-learn only accepts a float contamination in the range (0, 0.5]. Cap the ratio
# at 0.5 and fall back to 'auto' when it is not positive, so this cell does not raise
# when the training labels have a different class balance.
if attack_ratio > 0:
    contamination = min(float(attack_ratio), 0.5)
else:
    contamination = 'auto'

iso = IsolationForest(contamination=contamination, random_state=42)
iso.fit(X_train_pca)

# NOTE: despite the variable name, predict() returns hard labels, not continuous scores.
iso_scores = iso.predict(X_test_pca)
# IsolationForest labels: 1 = normal, -1 = anomaly → convert to binary (1 = attack)
iso_preds = np.where(iso_scores == -1, 1, 0)


In [27]:
# 12. Minimal Ensemble
# ================================================================
# Logical OR of the two detectors: a sample is labelled attack (1) as soon as
# either model flags it, and normal (0) only when both agree it is normal.
rf_flags_attack = rf_preds == 1
iso_flags_attack = iso_preds == 1
ensemble_preds = np.where(rf_flags_attack | iso_flags_attack, 1, 0)


In [28]:
# 13. Evaluation Function
# ================================================================
def evaluate(name, y_true, y_pred):
    """Print a classification report for one model.

    Prints a header containing ``name``, followed by accuracy, precision,
    recall and F1 score, and finally the confusion matrix, all computed from
    ``y_true`` and ``y_pred``. Nothing is returned.

    Parameters
    ----------
    name : str
        Model name shown in the report header.
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Labels predicted by the model.
    """
    # (label, metric function) pairs, in the order they are printed.
    scalar_metrics = (
        ("Accuracy:", accuracy_score),
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("F1 Score:", f1_score),
    )

    print(f"\n=== {name} Evaluation ===")
    for metric_label, metric_fn in scalar_metrics:
        print(metric_label, metric_fn(y_true, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))

In [29]:
# 14. Evaluate Models
# ================================================================
# Every model is scored against the same held-out test labels.
model_predictions = (
    ("Random Forest", rf_preds),
    ("Isolation Forest", iso_preds),
    ("Ensemble", ensemble_preds),
)
for model_name, predictions in model_predictions:
    evaluate(model_name, y_test, predictions)



=== Random Forest Evaluation ===
Accuracy: 0.7946852402289162
Precision: 0.9669779093600547
Recall: 0.6618862042088854
F1 Score: 0.7858597075698686
Confusion Matrix:
 [[9421  290]
 [4338 8492]]

=== Isolation Forest Evaluation ===
Accuracy: 0.699614036644337
Precision: 0.7918312301319719
Recall: 0.6406858924395947
F1 Score: 0.7082848649347292
Confusion Matrix:
 [[7550 2161]
 [4610 8220]]

=== Ensemble Evaluation ===
Accuracy: 0.8441506587995209
Precision: 0.8378417579229821
Recall: 0.9004676539360873
F1 Score: 0.8680265975431083
Confusion Matrix:
 [[ 7475  2236]
 [ 1277 11553]]
