# 🛡️ UNSW-NB15 Intrusion Detection Model Retraining

This notebook retrains a model for intrusion detection using the UNSW-NB15 dataset. The workflow includes:
1. Importing libraries
2. Loading the dataset
3. Data cleaning
4. Encoding categorical features
5. Feature selection using `SelectKBest`
6. Scaling numeric features
7. Model training
8. Evaluation
9. Saving model and preprocessing objects

In [36]:
# -------------------------------
# 1. Import Libraries
# -------------------------------
import pandas as pd
import numpy as np
import joblib
import time
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import StackingClassifier
from scipy.stats import randint, uniform
import lightgbm as lgb

## 2. Load Dataset

In [33]:
import pandas as pd

df = pd.read_csv("UNSW_NB15_training-set.csv")
df.head()

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,0.121478,tcp,-,FIN,6,4,258,172,74.08749,...,1,1,0,0,0,1,1,0,Normal,0
1,2,0.649902,tcp,-,FIN,14,38,734,42014,78.473372,...,1,2,0,0,0,1,6,0,Normal,0
2,3,1.623129,tcp,-,FIN,8,16,364,13186,14.170161,...,1,3,0,0,0,2,6,0,Normal,0
3,4,1.681642,tcp,ftp,FIN,12,12,628,770,13.677108,...,1,3,1,1,0,2,1,0,Normal,0
4,5,0.449454,tcp,-,FIN,10,6,534,268,33.373826,...,1,40,0,0,0,2,39,0,Normal,0


In [9]:
print(df.columns)

Index(['id', 'dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes',
       'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss',
       'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin',
       'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth',
       'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm',
       'ct_srv_dst', 'is_sm_ips_ports', 'attack_cat', 'label'],
      dtype='object')


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175341 entries, 0 to 175340
Data columns (total 45 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   id                 175341 non-null  int64  
 1   dur                175341 non-null  float64
 2   proto              175341 non-null  object 
 3   service            175341 non-null  object 
 4   state              175341 non-null  object 
 5   spkts              175341 non-null  int64  
 6   dpkts              175341 non-null  int64  
 7   sbytes             175341 non-null  int64  
 8   dbytes             175341 non-null  int64  
 9   rate               175341 non-null  float64
 10  sttl               175341 non-null  int64  
 11  dttl               175341 non-null  int64  
 12  sload              175341 non-null  float64
 13  dload              175341 non-null  float64
 14  sloss              175341 non-null  int64  
 15  dloss              175341 non-null  int64  
 16  si

## 3. Data Cleaning

In [34]:
# Define features and label
FEATURES = ['dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes',
       'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss',
       'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin',
       'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth',
       'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm',
       'ct_srv_dst', 'is_sm_ips_ports']
LABEL_COL = 'label'

# Fill missing values with 0
df = df[FEATURES + [LABEL_COL]].fillna(0)
df.head()

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,label
0,0.121478,tcp,-,FIN,6,4,258,172,74.08749,252,...,1,1,1,0,0,0,1,1,0,0
1,0.649902,tcp,-,FIN,14,38,734,42014,78.473372,62,...,1,1,2,0,0,0,1,6,0,0
2,1.623129,tcp,-,FIN,8,16,364,13186,14.170161,62,...,1,1,3,0,0,0,2,6,0,0
3,1.681642,tcp,ftp,FIN,12,12,628,770,13.677108,62,...,1,1,3,1,1,0,2,1,0,0
4,0.449454,tcp,-,FIN,10,6,534,268,33.373826,254,...,2,1,40,0,0,0,2,39,0,0


## 4. Encode Categorical Features

In [41]:
# Encode categorical features: proto, service, state

categorical_cols = ['proto', 'service', 'state']
encoders = {}  # save encoders for later use
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

joblib.dump(encoders , "unsw_encoders.pkl")

['unsw_encoders.pkl']

## 5. Feature Selection using KBest

In [42]:
X = df[FEATURES].values
y = df[LABEL_COL].values

# Use ANOVA F-value to select top 20 features
selector = SelectKBest(score_func=f_classif, k=30)
X_selected = selector.fit_transform(X, y)
selected_features = [FEATURES[i] for i in selector.get_support(indices=True)]
print('Selected Features:', selected_features)
joblib.dump(selector, "unsw_selectkbest_selector.pkl")




Selected Features: ['dur', 'state', 'spkts', 'dpkts', 'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'dloss', 'sinpkt', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'dmean', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_ports']


['unsw_selectkbest_selector.pkl']

## Option 2

In [37]:
# Assume df is your raw DataFrame
LABEL_COL = "label"
categorical_cols = ["proto", "service", "state"]  # first 3 categorical features
numeric_cols = [col for col in df.columns if col not in categorical_cols + [LABEL_COL, "id"]]

# Separate features and target
X = df[numeric_cols + categorical_cols]
y = df[LABEL_COL].values

# Encode categorical features temporarily for mutual info calculation
from sklearn.preprocessing import OrdinalEncoder
encoder = OrdinalEncoder()
X[categorical_cols] = encoder.fit_transform(X[categorical_cols])

# Feature selection using mutual information
k = 30  # top features
selector = SelectKBest(score_func=mutual_info_classif, k=k)
X_selected = selector.fit_transform(X, y)

# Get selected feature names
selected_features = [X.columns[i] for i in selector.get_support(indices=True)]
print(f"Selected top {k} features:\n{selected_features}")

# Save selector and encoder for later use
joblib.dump(selector, "unsw_mutual_info_selector.pkl")
joblib.dump(encoder, "unsw_temp_encoder.pkl")
print("Selector and encoder saved for later use.")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[categorical_cols] = encoder.fit_transform(X[categorical_cols])


Selected top 30 features:
['dur', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'ct_src_ltm', 'ct_srv_dst', 'proto', 'state']
Selector and encoder saved for later use.


## 6. Split Dataset and Scale

In [43]:
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

joblib.dump(scaler, "unsw_scaler.pkl")



['unsw_scaler.pkl']

In [56]:
feature_list=['dur', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'ct_src_ltm', 'ct_srv_dst', 'proto', 'state']


# Suppose feature_names comes from scaler or your feature list
feature_names = getattr(scaler, "feature_names_in_", feature_list)

j = 0
for row in X_train:  # X_train is now a NumPy array
    print(f"Row {j}:")
    for col_name, value in zip(feature_names, row):
        print(f"{col_name}: {value}")
    print("-" * 40)
    
    j += 1
    if j == 5:
        break


Row 0:
dur: -0.1746689251452038
spkts: -0.40983821307107005
dpkts: -0.07528848652982427
sbytes: -0.09701240630255832
dbytes: -0.09401842285433677
rate: -0.5761709687953193
sttl: 0.7222701365189131
dttl: 1.559692851640966
sload: -0.3905831436818904
dload: -0.26436804317738316
sloss: -0.09128933909233711
dloss: -0.13274177500413978
sinpkt: -0.1348608378389523
dinpkt: 1.0928309581273243
sjit: 1.6036544821268683
djit: 0.5425575835683591
tcprtt: 1.1038920149257545
synack: 0.15955679958067487
ackdat: -0.3339596685500486
smean: 0.6649884805423422
dmean: 0.030753036415489744
ct_state_ttl: -0.7767207925186039
ct_dst_ltm: -0.319566598719986
ct_src_dport_ltm: -0.6459200207469337
ct_dst_sport_ltm: -0.5453648255166359
ct_dst_src_ltm: -0.5545373997340904
ct_src_ltm: -0.7065296923356614
ct_srv_dst: -0.7168480336223853
proto: -0.6607610560254404
state: -0.12628710130032966
----------------------------------------
Row 1:
dur: -0.20994069997712247
spkts: -2.713722548578027
dpkts: -0.13398099902742208
sb

In [44]:
print(X_train.shape)

(140272, 30)


## 7. Train Model

In [45]:
# Define utility function
def evaluate_model(model, X_train, y_train, X_test, y_test):
    start = time.time()
    model.fit(X_train, y_train)
    elapsed = time.time() - start
    preds = model.predict(X_test)
    return {
        "Accuracy": accuracy_score(y_test, preds),
        "Precision": precision_score(y_test, preds, zero_division=0),
        "Recall": recall_score(y_test, preds, zero_division=0),
        "F1-Score": f1_score(y_test, preds, zero_division=0),
        "Training_Time": elapsed
    }, model

# -------------------------------
# Apply SMOTE
# -------------------------------
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)
print(f"Balanced training set shape: {X_train_bal.shape}, {y_train_bal.shape}")

# -------------------------------
# Class weights
# -------------------------------
classes = np.unique(y_train_bal)
weights = compute_class_weight("balanced", classes=classes, y=y_train_bal)
class_weight_dict = dict(zip(classes, weights))

# -------------------------------
# Base Decision Tree
# -------------------------------
dt_base = DecisionTreeClassifier(
    random_state=42,
    class_weight="balanced",
    max_depth=25,
    min_samples_split=10,
    min_samples_leaf=5,
    criterion="gini"
)
dt_metrics, dt_fitted = evaluate_model(dt_base, X_train_bal, y_train_bal, X_test, y_test)

# -------------------------------
# LightGBM Hyperparameter Search
# -------------------------------
param_dist = {
    "num_leaves": randint(31, 255),
    "max_depth": randint(4, 12),
    "learning_rate": uniform(0.01, 0.1),
    "n_estimators": randint(300, 1200),
    "min_child_samples": randint(10, 60),
    "subsample": uniform(0.6, 0.3),
    "colsample_bytree": uniform(0.6, 0.4),
    "reg_alpha": uniform(0.0, 1.0),
    "reg_lambda": uniform(0.0, 1.0),
    "min_data_in_leaf": randint(20, 200),
    "feature_fraction": uniform(0.0, 1.0)
}

lgb_base = lgb.LGBMClassifier(
    objective="binary",
    class_weight=class_weight_dict,
    boosting_type="gbdt",
    random_state=42,
    n_jobs=-1,
    verbose=-1
)

search = RandomizedSearchCV(
    lgb_base,
    param_distributions=param_dist,
    n_iter=20,
    scoring="f1",
    cv=3,
    n_jobs=-1,
    random_state=42
)

search.fit(X_train_bal, y_train_bal)
best_params = search.best_params_
print("Best Hyperparameters:", best_params)

# Train final LightGBM
lgb_final = lgb.LGBMClassifier(
    objective="binary",
    class_weight=class_weight_dict,
    boosting_type="gbdt",
    random_state=42,
    n_jobs=-1,
    verbose=-1,
    **best_params
)
lgb_final.fit(X_train_bal, y_train_bal)
lgb_metrics, lgb_fitted = evaluate_model(lgb_final, X_train_bal, y_train_bal, X_test, y_test)

# -------------------------------
# Stacking Ensemble
# -------------------------------
stack_ensemble = StackingClassifier(
    estimators=[("dt", dt_fitted), ("lgb", lgb_fitted)],
    final_estimator=lgb.LGBMClassifier(
        objective="binary",
        n_estimators=500, learning_rate=0.05, random_state=42
    ),
    cv=3,
    n_jobs=-1
)

ens_metrics, ens_fitted = evaluate_model(stack_ensemble, X_train_bal, y_train_bal, X_test, y_test)

# -------------------------------
# Print Results
# -------------------------------
print("\nDecision Tree Metrics:", dt_metrics)
print("\nLightGBM Metrics:", lgb_metrics)
print("\nStacking Ensemble Metrics:", ens_metrics)

# Classification report & confusion matrix
ens_preds = ens_fitted.predict(X_test)
print("\nClassification Report:\n", classification_report(y_test, ens_preds, zero_division=0))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, ens_preds))

# -------------------------------
# Save final stacking model
# -------------------------------
joblib.dump(lgb_fitted, "unsw_lgb_model.pkl")
print("✅ Stacking Ensemble model saved as 'unsw_stacking_model.pkl'")

Balanced training set shape: (190944, 30), (190944,)
Best Hyperparameters: {'colsample_bytree': np.float64(0.9706635463175177), 'feature_fraction': np.float64(0.7272719958564209), 'learning_rate': np.float64(0.04265407688058354), 'max_depth': 9, 'min_child_samples': 32, 'min_data_in_leaf': 59, 'n_estimators': 1024, 'num_leaves': 238, 'reg_alpha': np.float64(0.7473201101373809), 'reg_lambda': np.float64(0.5396921323890798), 'subsample': np.float64(0.7760253496991545)}





Decision Tree Metrics: {'Accuracy': 0.9407168724514529, 'Precision': 0.9620832979896514, 'Recall': 0.9503540156688592, 'F1-Score': 0.9561826880519316, 'Training_Time': 1.4390654563903809}

LightGBM Metrics: {'Accuracy': 0.9564002395277881, 'Precision': 0.9641402808942076, 'Recall': 0.9720976999455361, 'F1-Score': 0.9681026389902994, 'Training_Time': 4.028558969497681}

Stacking Ensemble Metrics: {'Accuracy': 0.9549174484587527, 'Precision': 0.9596980447157826, 'Recall': 0.9746952113620176, 'F1-Score': 0.9671384922367027, 'Training_Time': 15.233223915100098}

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.91      0.93     11200
           1       0.96      0.97      0.97     23869

    accuracy                           0.95     35069
   macro avg       0.95      0.94      0.95     35069
weighted avg       0.95      0.95      0.95     35069


Confusion Matrix:
 [[10223   977]
 [  604 23265]]
✅ Stacking Ensemble model saved 

