# 🛡️ UNSW-NB15 Intrusion Detection Model Retraining

This notebook retrains a model for intrusion detection using the UNSW-NB15 dataset. The workflow includes:
1. Importing libraries
2. Loading the dataset
3. Data cleaning
4. Encoding categorical features
5. Feature selection using `SelectKBest`
6. Scaling numeric features
7. Model training
8. Evaluation
9. Saving model and preprocessing objects

In [1]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# --- Step 1: Load datasets ---
train_df = pd.read_csv("UNSW_NB15_training-set.csv")
test_df = pd.read_csv("UNSW_NB15_testing-set.csv")

# Drop unnecessary columns
drop_cols = ['id', 'attack_cat']
train_df = train_df.drop(columns=[col for col in drop_cols if col in train_df.columns], errors='ignore')
test_df = test_df.drop(columns=[col for col in drop_cols if col in test_df.columns], errors='ignore')

# Define label column
LABEL_COL = "label"
y_train = train_df[LABEL_COL].apply(lambda x: 0 if x == 0 else 1)
X_train = train_df.drop(columns=[LABEL_COL])
y_test = test_df[LABEL_COL].apply(lambda x: 0 if x == 0 else 1)
X_test = test_df.drop(columns=[LABEL_COL])

# --- Step 2: Encode categorical values ---
categorical_cols = ['proto', 'service', 'state']
encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    # Fit on combined unique values from train and test to ensure consistent encoding
    combined_values = pd.concat([X_train[col], X_test[col]], axis=0).astype(str).unique()
    le.fit(combined_values)
    X_train[col] = le.transform(X_train[col].astype(str))
    X_test[col] = le.transform(X_test[col].astype(str))
    encoders[col] = le

# Save encoders for future use
joblib.dump(encoders, "unsw_encoders.pkl")

# --- Step 3: Handle missing values ---
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

# --- Step 4: Apply SMOTE to handle class imbalance ---
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)
print("SMOTE applied. New training set shape:", X_train.shape)

# --- Step 5: Hyperparameter tuning for Random Forest ---
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'bootstrap': [True, False]
}

# Optional: Use a subset for faster tuning (comment out for full dataset)
X_train_subset, _, y_train_subset, _ = train_test_split(
    X_train, y_train, test_size=0.8, random_state=42, stratify=y_train
)

clf = RandomForestClassifier(random_state=42, n_jobs=-1)
grid_search = GridSearchCV(
    clf, param_grid, cv=3, scoring='f1', n_jobs=-1, verbose=1
)
grid_search.fit(X_train_subset, y_train_subset)  # Use subset for tuning

# Train the best model on the full (SMOTE-resampled) training set
best_clf = grid_search.best_estimator_
best_clf.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best CV F1 score (on subset):", grid_search.best_score_)

# --- Step 6: Evaluate ---
y_pred = best_clf.predict(X_test)
metrics = {
    "accuracy": accuracy_score(y_test, y_pred),
    "precision": precision_score(y_test, y_pred),
    "recall": recall_score(y_test, y_pred),
    "f1": f1_score(y_test, y_pred),
}
print(metrics)

# --- Step 7: Visualize confusion matrix ---
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['Normal', 'Attack'], yticklabels=['Normal', 'Attack'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.savefig('confusion_matrix_rf.png')
plt.close()

# --- Step 8: Save model ---
joblib.dump(best_clf, "unsw_rf_full.pkl")
print("✅ Model and encoders saved successfully!")

SMOTE applied. New training set shape: (238682, 42)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
Best parameters: {'bootstrap': False, 'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}
Best CV F1 score (on subset): 0.9560281758407365
{'accuracy': 0.8837754457562066, 'precision': 0.8389408040639157, 'recall': 0.9763522456542839, 'f1': 0.902445738054216}
✅ Model and encoders saved successfully!
