# 🛡️ UNSW-NB15 Intrusion Detection Model Retraining

This notebook retrains a Random Forest binary classifier (normal vs. attack traffic) for intrusion detection on the UNSW-NB15 dataset. The workflow includes:
1. Importing libraries
2. Loading the dataset
3. Data cleaning
4. Encoding categorical features
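5. Feature selection using `SelectKBest` (not applied in the training cell below, which fits on all remaining columns)
6. Scaling numeric features (likewise not applied in the training cell below)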
7. Model training
8. Evaluation
9. Saving model and preprocessing objects

In [None]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# --- Step 1: Load dataset ---
df = pd.read_csv("UNSW_NB15_training-set.csv")

# Drop columns that are not used as features. `errors='ignore'` already
# tolerates columns that are absent, so no manual filtering is needed.
# NOTE(review): `attack_cat` is presumably dropped because it would leak the
# target into the features -- confirm against the dataset documentation.
drop_cols = ['id', 'attack_cat']
df = df.drop(columns=drop_cols, errors='ignore')

# Binary target: a label of 0 stays 0, every other value becomes 1.
LABEL_COL = "label"
y = (df[LABEL_COL] != 0).astype("int64")
X = df.drop(columns=[LABEL_COL])

# --- Step 2: Encode categorical values ---
# Each categorical column is replaced by its integer category code. The
# category levels are recorded so the exact same string -> code mapping can be
# re-applied to new data at inference time; without them the saved model
# cannot be used on raw records.
cat_cols = ['proto', 'service', 'state']
category_levels = {}
for col in cat_cols:
    if col in X.columns:
        as_category = X[col].astype('category')
        category_levels[col] = as_category.cat.categories.tolist()
        X[col] = as_category.cat.codes  # missing values are encoded as -1

# --- Step 3: Handle missing values ---
X = X.fillna(0)

# --- Step 4: Train/test split ---
# Stratify on the target so both splits keep the same class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- Step 5: Train model ---
clf = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    random_state=42,
    n_jobs=-1
)
clf.fit(X_train, y_train)

# --- Step 6: Evaluate ---
y_pred = clf.predict(X_test)
metrics = {
    "accuracy": accuracy_score(y_test, y_pred),
    "precision": precision_score(y_test, y_pred),
    "recall": recall_score(y_test, y_pred),
    "f1": f1_score(y_test, y_pred),
}
print(metrics)

# --- Step 7: Save model and preprocessing objects ---
joblib.dump(clf, "unsw_rf_full.pkl")
# Persist what is needed to rebuild the feature matrix for this model:
# the column order it was fitted on and the categorical code mappings.
joblib.dump(
    {
        "feature_names": X.columns.tolist(),
        "category_levels": category_levels,
    },
    "unsw_rf_full_preprocessing.pkl",
)
