In [19]:
import pandas as pd

# 1. Load UNSW-NB15 CSV
df = pd.read_csv(r"C:\Users\kbc_k\comp3000\UNSW_NB15_training-set1.csv")
df = pd.read_csv(r"C:\Users\kbc_k\comp3000\UNSW_NB15_testing-set2.csv")

# 2. Look at first rows
print(df.head())

# 3. Check shape
print("Shape:", df.shape)

# 4. Check columns
print(df.columns)

# 5. Check target/label column
print(df['label'].value_counts())    # sometimes binary label
# or:
# label often â†’ 0 = normal, 1 = attack


   id       dur proto service state  spkts  dpkts  sbytes  dbytes  \
0   1  0.000011   udp       -   INT      2      0     496       0   
1   2  0.000008   udp       -   INT      2      0    1762       0   
2   3  0.000005   udp       -   INT      2      0    1068       0   
3   4  0.000006   udp       -   INT      2      0     900       0   
4   5  0.000010   udp       -   INT      2      0    2126       0   

          rate  ...  ct_dst_sport_ltm  ct_dst_src_ltm  is_ftp_login  \
0   90909.0902  ...                 1               2             0   
1  125000.0003  ...                 1               2             0   
2  200000.0051  ...                 1               3             0   
3  166666.6608  ...                 1               3             0   
4  100000.0025  ...                 1               3             0   

   ct_ftp_cmd  ct_flw_http_mthd  ct_src_ltm  ct_srv_dst  is_sm_ips_ports  \
0           0                 0           1           2                0   
1     

In [21]:
# Change 'label' to the actual label column name in CSV
label_col = 'label'  

# Drop label from features
X = df.drop(columns=[label_col])
y = df[label_col]

# Check which columns are numeric and which are object (categorical)
numeric_cols = X.select_dtypes(exclude=['object']).columns.tolist()
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

print("Numeric:", numeric_cols)
print("Categorical:", categorical_cols)

Numeric: ['id', 'dur', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth', 'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_ports']
Categorical: ['proto', 'service', 'state', 'attack_cat']


In [23]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import IsolationForest

# Preprocess: encode categorical + scale numeric
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", StandardScaler(), numeric_cols)
    ]
)

# Isolation Forest model
iso = IsolationForest(
    n_estimators=200,
    contamination=0.1,  # guess 10% anomalies (you can adjust)
    random_state=42
)

# Full pipeline: preprocessing + model
clf = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", iso)
])

# Keep only normal rows (assuming 0 = normal)
normal_df = df[df[label_col] == 0]

X_normal = normal_df.drop(columns=[label_col])

# Fit the pipeline
clf.fit(X_normal)
print("Model trained!")


Model trained!


In [25]:
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

X_all = df.drop(columns=[label_col])
y_all = df[label_col]  # 0 normal, 1 attack

# Predict: IsolationForest gives 1 (normal) and -1 (anomaly)
raw_preds = clf.predict(X_all)

# Convert to 0/1 (same as label)
pred_anomaly = np.where(raw_preds == -1, 1, 0)  # 1 = anomaly/attack

print("Confusion Matrix:")
print(confusion_matrix(y_all, pred_anomaly))

print("\nClassification Report:")
print(classification_report(y_all, pred_anomaly))

Confusion Matrix:
[[33300  3700]
 [23998 21334]]

Classification Report:
              precision    recall  f1-score   support

           0       0.58      0.90      0.71     37000
           1       0.85      0.47      0.61     45332

    accuracy                           0.66     82332
   macro avg       0.72      0.69      0.66     82332
weighted avg       0.73      0.66      0.65     82332

