In [1]:
import pandas as pd

# Read the training and testing CSV files from the working directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("./UNSW_NB15_training-set.csv", "./UNSW_NB15_testing-set.csv")
)

# Report the (rows, columns) shape of each frame.
for caption, frame in (("Training set shape:", train_df), ("Testing set shape:", test_df)):
    print(caption, frame.shape)


Training set shape: (82332, 45)
Testing set shape: (175341, 45)


In [2]:
# Count missing values per column, then show the five largest counts.
missing_per_column = train_df.isna().sum()
print(missing_per_column.sort_values(ascending=False).head())

id        0
dwin      0
synack    0
ackdat    0
smean     0
dtype: int64


In [3]:
drop_cols = ['id']  # Add more if needed

# Rebind the frames instead of dropping in place, and tolerate columns that
# are already gone, so re-running this cell does not raise KeyError once the
# columns have been removed by a previous run.
train_df = train_df.drop(columns=drop_cols, errors="ignore")
test_df = test_df.drop(columns=drop_cols, errors="ignore")


In [4]:
from sklearn.preprocessing import LabelEncoder

categorical_cols = ['proto', 'service', 'state']

# Build one LabelEncoder per categorical column. Each encoder is fitted on the
# values of that column from BOTH frames, so the same integer mapping can be
# applied to the training and the testing data.
# NOTE(review): fitting on the testing frame as well means the encoders have
# seen test-set categories; confirm this is intended.
encoders = {
    name: LabelEncoder().fit(pd.concat([train_df[name], test_df[name]], axis=0))
    for name in categorical_cols
}

# Replace the original string columns with their integer codes.
for name, encoder in encoders.items():
    train_df[name] = encoder.transform(train_df[name])
    test_df[name] = encoder.transform(test_df[name])



In [5]:
#train_df.head()

In [6]:
# 'label' is the prediction target; 'attack_cat' is also excluded from the
# feature matrix.
non_feature_cols = ['label', 'attack_cat']

X_train, y_train = train_df.drop(columns=non_feature_cols), train_df['label']
X_test, y_test = test_df.drop(columns=non_feature_cols), test_df['label']


In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows for validation; the fixed seed keeps the
# split repeatable across runs.
split_parts = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
X_train_split, X_val, y_train_split, y_val = split_parts


In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Fit a 100-tree random forest (seeded) on the training portion of the split.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_split, y_train_split
)

# Score the held-out validation rows: per-class report first, then the
# confusion matrix.
y_pred = model.predict(X_val)
for metric in (classification_report, confusion_matrix):
    print(metric(y_val, y_pred))


              precision    recall  f1-score   support

           0       0.97      0.98      0.98      7418
           1       0.98      0.97      0.98      9049

    accuracy                           0.98     16467
   macro avg       0.98      0.98      0.98     16467
weighted avg       0.98      0.98      0.98     16467

[[7283  135]
 [ 232 8817]]


In [9]:
from sklearn.metrics import classification_report, confusion_matrix

# Predict on the separate testing frame with the already-fitted model.
y_test_pred = model.predict(X_test)

print("Test Set Evaluation:")
for metric in (classification_report, confusion_matrix):
    print(metric(y_test, y_test_pred))


Test Set Evaluation:
              precision    recall  f1-score   support

           0       0.77      0.98      0.86     56000
           1       0.99      0.87      0.92    119341

    accuracy                           0.90    175341
   macro avg       0.88      0.92      0.89    175341
weighted avg       0.92      0.90      0.90    175341

[[ 54815   1185]
 [ 15994 103347]]


In [10]:
import joblib

# Persist the fitted classifier to the working directory.
model_path = 'network_intrusion_model.pkl'
joblib.dump(model, model_path)

# Persist the per-column label encoders as well (if needed in the web app).
encoders_path = 'label_encoders.pkl'
joblib.dump(encoders, encoders_path)




['label_encoders.pkl']

In [11]:
import pickle

# Record the feature column names, in the order the model was trained on.
feature_names = X_train_split.columns.to_list()

# Serialize the list to a .pkl file in the working directory.
with open("feature_names.pkl", "wb") as out_file:
    pickle.dump(feature_names, out_file)


In [12]:
import shutil
import glob
import os

def move_pickles(src_dir, dst_dir, pattern="*.pkl"):
    """Move every file directly inside ``src_dir`` that matches ``pattern`` into ``dst_dir``.

    Parameters
    ----------
    src_dir : str
        Directory to search (not searched recursively).
    dst_dir : str
        Directory that receives the files; created if it does not exist.
    pattern : str, default "*.pkl"
        Glob pattern applied to file names inside ``src_dir``.

    Returns
    -------
    list of str
        Destination paths of the files that were moved, in sorted order.

    An existing file with the same name in ``dst_dir`` is removed first, so
    the moved file always replaces it.
    """
    os.makedirs(dst_dir, exist_ok=True)

    # glob.escape keeps characters such as '[' or '*' in the directory name
    # literal; only ``pattern`` is interpreted as a glob.
    search = os.path.join(glob.escape(src_dir), pattern)

    moved = []
    for file_path in sorted(glob.glob(search)):
        dest_path = os.path.join(dst_dir, os.path.basename(file_path))

        if os.path.exists(dest_path):
            os.remove(dest_path)  # force overwrite

        shutil.move(file_path, dest_path)
        moved.append(dest_path)
    return moved


# The .pkl artifacts above are written with relative paths, i.e. into the
# current working directory, so use it as the source instead of a hard-coded
# absolute path that only exists on one machine.
src_folder = os.getcwd()
dst_folder = os.path.join(src_folder, "ml_web_app")

# Assigning the result keeps the cell free of output, as before.
moved_paths = move_pickles(src_folder, dst_folder)
