In [2]:
#Load UNSW_NB15
import pandas as pd

# Directory that holds the UNSW-NB15 parquet splits (relative to this notebook).
data_dir = "../data/raw/UNSW_NB15"

# NOTE(review): the variable names and file names are crossed -- `train_path`
# points at the "testing-set" file and `test_path` at the "training-set" file.
# As a result the model is fitted on the smaller split (82,332 rows) and
# evaluated on the larger one (175,341 rows). Confirm this is intentional;
# if it is not, swap the two file names and re-run the whole notebook.
train_path = data_dir + "/UNSW_NB15_testing-set.parquet"
test_path = data_dir + "/UNSW_NB15_training-set.parquet"

In [3]:
# Read both parquet splits into DataFrames.
df_train = pd.read_parquet(train_path)
df_test = pd.read_parquet(test_path)

# Report (rows, columns) for each split as a quick sanity check.
print(f"Train shape: {df_train.shape}")
print(f"Test shape: {df_test.shape}")

# Last expression of the cell: rich preview of the first training rows.
df_train.head()

Train shape: (82332, 36)
Test shape: (175341, 36)


Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sload,...,trans_depth,response_body_len,ct_src_dport_ltm,ct_dst_sport_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,is_sm_ips_ports,attack_cat,label
0,1.1e-05,udp,-,INT,2,0,496,0,90909.09375,180363632.0,...,0,0,1,1,0,0,0,0,Normal,0
1,8e-06,udp,-,INT,2,0,1762,0,125000.0,881000000.0,...,0,0,1,1,0,0,0,0,Normal,0
2,5e-06,udp,-,INT,2,0,1068,0,200000.0,854400000.0,...,0,0,1,1,0,0,0,0,Normal,0
3,6e-06,udp,-,INT,2,0,900,0,166666.65625,600000000.0,...,0,0,2,1,0,0,0,0,Normal,0
4,1e-05,udp,-,INT,2,0,2126,0,100000.0,850400000.0,...,0,0,2,1,0,0,0,0,Normal,0


In [4]:
# Identify the target columns: list every column name of the training frame so
# the label columns can be picked out by name in the following cells.
print(df_train.columns)

Index(['dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes',
       'dbytes', 'rate', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt',
       'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt',
       'synack', 'ackdat', 'smean', 'dmean', 'trans_depth',
       'response_body_len', 'ct_src_dport_ltm', 'ct_dst_sport_ltm',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'is_sm_ips_ports',
       'attack_cat', 'label'],
      dtype='object')


In [5]:
# Class balance check: number of training rows per value of the `label` column.
df_train["label"].value_counts()

label
1    45332
0    37000
Name: count, dtype: int64

In [18]:
# Separate features and target.
# Both `label` (the target used below) and `attack_cat` are removed from the
# feature matrices so neither can be used as a model input.
target_cols = ["label", "attack_cat"]

X_train = df_train.drop(columns=target_cols)
X_test = df_test.drop(columns=target_cols)

y_train = df_train["label"]
y_test = df_test["label"]

In [19]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

# Split the feature columns into two groups by dtype: string-like columns
# (object / category) are treated as categorical, everything else as numeric.
categorical_dtypes = ["object", "category"]

categorical_cols = X_train.select_dtypes(include=categorical_dtypes).columns
numeric_cols = X_train.select_dtypes(exclude=categorical_dtypes).columns

print(f"Categorical: {categorical_cols}")
print(f"Numeric: {numeric_cols}")

Categorical: Index(['proto', 'service', 'state'], dtype='object')
Numeric: Index(['dur', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sload', 'dload',
       'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb',
       'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean',
       'trans_depth', 'response_body_len', 'ct_src_dport_ltm',
       'ct_dst_sport_ltm', 'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd',
       'is_sm_ips_ports'],
      dtype='object')


In [20]:
# Column-wise preprocessing: standardise the numeric columns and one-hot encode
# the categorical ones. `handle_unknown="ignore"` is set on the encoder so that
# categories absent from the fitting data do not raise at transform time.
numeric_step = ("num", StandardScaler(), numeric_cols)
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [21]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with a fixed seed for reproducibility; n_jobs=-1 is passed so
# the forest may use all available cores.
forest = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

# End-to-end pipeline: preprocessing followed by the classifier, so raw feature
# frames can be passed straight to fit / predict / score.
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", forest),
    ]
)

In [22]:
# Fit the preprocessing steps and the classifier on the training split.
# Left as the bare last expression so the notebook displays the fitted
# pipeline's parameter summary.
model.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [23]:
from sklearn.metrics import classification_report, accuracy_score

# Predict once on the test split and reuse the result for every test metric.
y_pred = model.predict(X_test)

# Train accuracy still needs its own prediction pass over X_train.
print("Train Accuracy:", model.score(X_train, y_train))
# Use the predictions already computed above instead of model.score(X_test, ...),
# which would run the whole pipeline over the test split a second time.
# For a classifier, score() is the plain accuracy, so the printed value is unchanged.
print("Test Accuracy:", accuracy_score(y_test, y_pred))

print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))


Train Accuracy: 0.9952509352378176
Test Accuracy: 0.9113156649043863

Classification Report:

              precision    recall  f1-score   support

           0       0.80      0.96      0.87     56000
           1       0.98      0.89      0.93    119341

    accuracy                           0.91    175341
   macro avg       0.89      0.92      0.90    175341
weighted avg       0.92      0.91      0.91    175341



In [17]:
# Show the feature columns currently in X_train.
# NOTE(review): the output recorded under this cell is stale. It still lists
# 'attack_cat', and its execution count (17) is lower than that of the cell
# which builds X_train by dropping 'label' and 'attack_cat' (18). Re-run the
# notebook top to bottom (Restart & Run All) to refresh it.
print(X_train.columns)

Index(['dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes',
       'dbytes', 'rate', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt',
       'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt',
       'synack', 'ackdat', 'smean', 'dmean', 'trans_depth',
       'response_body_len', 'ct_src_dport_ltm', 'ct_dst_sport_ltm',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'is_sm_ips_ports',
       'attack_cat'],
      dtype='object')


In [25]:
import joblib
from pathlib import Path

# Persist the whole fitted pipeline (preprocessing + classifier) in one file.
model_path = Path("../models/unsw_rf_model.pkl")

# joblib.dump does not create missing directories; make sure the target folder
# exists so this cell also works on a fresh checkout.
model_path.parent.mkdir(parents=True, exist_ok=True)

joblib.dump(model, str(model_path))
print("Model saved successfully.")

Model saved successfully.


In [29]:
# Debugging aid: dump every name currently defined in the kernel namespace.
# NOTE(review): the recorded output already contains 'shap' and 'np', although
# the cell that imports them comes after this one -- a sign of out-of-order
# execution. Consider removing this cell from the finished notebook.
print(dir())

['ColumnTransformer', 'In', 'OneHotEncoder', 'Out', 'Pipeline', 'RandomForestClassifier', 'StandardScaler', 'X_test', 'X_train', '_', '_15', '_22', '_3', '_5', '__', '___', '__builtin__', '__builtins__', '__doc__', '__loader__', '__name__', '__package__', '__session__', '__spec__', '_dh', '_i', '_i1', '_i10', '_i11', '_i12', '_i13', '_i14', '_i15', '_i16', '_i17', '_i18', '_i19', '_i2', '_i20', '_i21', '_i22', '_i23', '_i24', '_i25', '_i26', '_i27', '_i28', '_i29', '_i3', '_i4', '_i5', '_i6', '_i7', '_i8', '_i9', '_ih', '_ii', '_iii', '_oh', 'accuracy_score', 'categorical_cols', 'classification_report', 'df_test', 'df_train', 'exit', 'get_ipython', 'joblib', 'model', 'np', 'numeric_cols', 'open', 'pd', 'preprocessor', 'quit', 'shap', 'test_path', 'train_path', 'y_pred', 'y_test', 'y_train']


In [30]:
import shap
import numpy as np

In [32]:
# Inspect the pipeline's steps by name ('preprocessor' and 'classifier') before
# pulling them out individually in the next cell.
print(model.named_steps)

{'preprocessor': ColumnTransformer(transformers=[('num', StandardScaler(),
                                 Index(['dur', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sload', 'dload',
       'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb',
       'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean',
       'trans_depth', 'response_body_len', 'ct_src_dport_ltm',
       'ct_dst_sport_ltm', 'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd',
       'is_sm_ips_ports'],
      dtype='object')),
                                ('cat', OneHotEncoder(handle_unknown='ignore'),
                                 Index(['proto', 'service', 'state'], dtype='object'))]), 'classifier': RandomForestClassifier(n_jobs=-1, random_state=42)}


In [34]:
# Pull the individual steps out of the pipeline so the classifier can be
# explained on its own. Note that this rebinds the notebook-level name
# `preprocessor` to the step stored inside `model`.
pipeline_steps = model.named_steps
rf_model = pipeline_steps["classifier"]
preprocessor = pipeline_steps["preprocessor"]

In [36]:
# Transform data: run both raw feature frames through the preprocessing step
# (scaling + one-hot encoding) to obtain the matrices the classifier sees.
X_train_processed, X_test_processed = map(preprocessor.transform, (X_train, X_test))

print("Transformation complete")


Transformation complete


In [None]:
import numpy as np

# Re-encode the training features with the preprocessing step.
# NOTE(review): this repeats the transform already stored in X_train_processed
# by the previous cell.
X_train_transformed = preprocessor.transform(X_train)

# The transformer may hand back a sparse matrix; densify it when it offers
# `toarray`, otherwise keep the result as it is.
X_train_transformed = (
    X_train_transformed.toarray()
    if hasattr(X_train_transformed, "toarray")
    else X_train_transformed
)

# Force a single numeric dtype for the explainer input.
X_train_transformed = X_train_transformed.astype(np.float32)

# Explain only the first 500 rows to keep the SHAP computation small.
# NOTE(review): this is a head slice, not a random sample -- verify that the
# leading rows are representative of the whole training split.
X_sample = X_train_transformed[:500]

In [None]:
# Create the SHAP explainer for the classifier step taken out of the pipeline.
# X_sample holds preprocessed (scaled / one-hot encoded) rows, so attributions
# refer to the transformed feature columns, not to the raw DataFrame columns.
explainer = shap.TreeExplainer(rf_model)
# NOTE(review): the container returned for a classifier (list per class vs. a
# single array) presumably depends on the installed shap version -- confirm
# before indexing into `shap_values`.
shap_values = explainer.shap_values(X_sample)