In [40]:
from functools import partial
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight

# Fixed seed to hand to randomized steps so that reruns give the same results.
RANDOM_STATE = 42


In [41]:
# Read the two processed Day-1 splits (train first, then test).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/processed/train_day1.csv",
        "../data/processed/test_day1.csv",
    )
)

# Report (rows, columns) for each split as a quick sanity check.
print("Train Shape:", train.shape)
print("Test Shape:", test.shape)


Train Shape: (125973, 43)
Test Shape: (22544, 43)


In [42]:
# Remove the "difficulty" column from both splits; errors="ignore" makes this
# a no-op when the column is already absent, so the cell is safe to re-run.
train, test = (
    frame.drop(columns=["difficulty"], errors="ignore") for frame in (train, test)
)


In [43]:
categorical_cols = ["protocol_type", "service", "flag"]

# Encode train and test stacked together so both splits end up with the same
# dummy columns, even if a category value occurs in only one of them.
combined = pd.get_dummies(
    pd.concat([train, test], axis=0).reset_index(drop=True),
    columns=categorical_cols,
)

# Train rows were stacked first, so the first n_train_rows rows are the train split.
n_train_rows = len(train)
train = combined.iloc[:n_train_rows].reset_index(drop=True)
test = combined.iloc[n_train_rows:].reset_index(drop=True)

print("After One-Hot Encoding:", train.shape)


After One-Hot Encoding: (125973, 123)


In [44]:
# Persist the exact post-encoding input column order so that other code can
# rebuild the same feature layout. "label" is the target, not an input, so it
# is removed from the saved list.
training_columns = list(train.columns)
training_columns.remove("label")

# This is the first artifact written under ../outputs/models; create the
# directory if it is missing so a fresh checkout does not fail here.
Path("../outputs/models").mkdir(parents=True, exist_ok=True)
joblib.dump(training_columns, "../outputs/models/training_columns.pkl")
print("âœ… Training columns saved for production backend.")

âœ… Training columns saved for production backend.


In [45]:
# Separate the input columns from the "label" target for each split.
X_train, y_train = train.drop(columns=["label"]), train["label"]
X_test, y_test = test.drop(columns=["label"]), test["label"]

# Show the distinct label values and how many input columns exist before selection.
print("Classes:", np.unique(y_train))
print("Feature count before selection:", X_train.shape[1])


Classes: [0 1 2 3]
Feature count before selection: 122


In [46]:
# Keep the 80 features that share the most mutual information with the label.
# mutual_info_classif adds random noise to continuous features, so its scores
# (and therefore the selected columns) can vary between runs unless it is
# seeded. partial() pins the seed and, unlike a lambda or a notebook-local
# function, stays picklable for the joblib.dump below.
selector = SelectKBest(
    score_func=partial(mutual_info_classif, random_state=RANDOM_STATE),
    k=80,  # raised from 40; the original author noted a major gain
)

# Fit on the training split only, then apply the same column mask to the test split.
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

print("Selected feature shape:", X_train_selected.shape)

joblib.dump(selector, "../outputs/models/feature_selector.pkl")
print("Feature selector saved.")

# Sorted unique label values; the class-weight cell that follows reads this name.
classes = np.unique(y_train)

Classes: [0 1 2 3]
Feature count before selection: 122
Selected feature shape: (125973, 80)
Feature selector saved.


In [47]:
# Sorted unique labels, computed here so this cell does not rely on another
# cell having defined `classes`.
classes = np.unique(y_train)

# "balanced" weights are n_samples / (n_classes * count(class)), so rarer
# classes receive proportionally larger weights.
class_weights_array = compute_class_weight(
    class_weight="balanced",
    classes=classes,
    y=y_train,
)

class_weights = dict(zip(classes, class_weights_array))

print("Original weights:", class_weights)

# Manual amplification for the minority classes, applied on top of "balanced".
# Class mapping (per the original notes):
# 0 = Normal
# 1 = DoS
# 2 = Probe
# 3 = Privilege (hardest)
MINORITY_BOOST = {
    2: 1.3,   # mild boost
    3: 1.75,  # strong boost (flagged as the key one by the original author)
}
for class_id, boost in MINORITY_BOOST.items():
    class_weights[class_id] *= boost

print("Amplified weights:", class_weights)

joblib.dump(class_weights, "../outputs/models/class_weights.pkl")



Original weights: {np.int64(0): np.float64(0.46765439615104765), np.int64(1): np.float64(0.685724083872232), np.int64(2): np.float64(2.7018917295813316), np.int64(3): np.float64(30.07951289398281)}
Amplified weights: {np.int64(0): np.float64(0.46765439615104765), np.int64(1): np.float64(0.685724083872232), np.int64(2): np.float64(3.5124592484557313), np.int64(3): np.float64(52.639147564469916)}


['../outputs/models/class_weights.pkl']

In [48]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits.
scaler = StandardScaler().fit(X_train_selected)

X_train_scaled = scaler.transform(X_train_selected)
X_test_scaled = scaler.transform(X_test_selected)

# Save the fitted scaler alongside the other model artifacts.
joblib.dump(scaler, "../outputs/models/scaler.pkl")

print("Scaling completed.")


Scaling completed.


In [49]:
# Write the scaled feature matrices and the label vectors as .npy files.
for npy_path, array in (
    ("../data/processed/X_train.npy", X_train_scaled),
    ("../data/processed/X_test.npy", X_test_scaled),
    ("../data/processed/y_train.npy", y_train.to_numpy()),
    ("../data/processed/y_test.npy", y_test.to_numpy()),
):
    np.save(npy_path, array)

print("\nâœ… Day 2 completed successfully (Optimized IDS version).")



âœ… Day 2 completed successfully (Optimized IDS version).


In [50]:
import json

# Take the first row of the encoded test frame, remove the target, and print
# the remaining column -> value mapping as indented JSON.
encoded_row = test.iloc[0]
sample_dict = encoded_row.drop(labels="label").to_dict()
print(json.dumps(sample_dict, indent=2))

{
  "duration": 0,
  "src_bytes": 0,
  "dst_bytes": 0,
  "land": 0,
  "wrong_fragment": 0,
  "urgent": 0,
  "hot": 0,
  "num_failed_logins": 0,
  "logged_in": 0,
  "num_compromised": 0,
  "root_shell": 0,
  "su_attempted": 0,
  "num_root": 0,
  "num_file_creations": 0,
  "num_shells": 0,
  "num_access_files": 0,
  "num_outbound_cmds": 0,
  "is_host_login": 0,
  "is_guest_login": 0,
  "count": 229,
  "srv_count": 10,
  "serror_rate": 0.0,
  "srv_serror_rate": 0.0,
  "rerror_rate": 1.0,
  "srv_rerror_rate": 1.0,
  "same_srv_rate": 0.04,
  "diff_srv_rate": 0.06,
  "srv_diff_host_rate": 0.0,
  "dst_host_count": 255,
  "dst_host_srv_count": 10,
  "dst_host_same_srv_rate": 0.04,
  "dst_host_diff_srv_rate": 0.06,
  "dst_host_same_src_port_rate": 0.0,
  "dst_host_srv_diff_host_rate": 0.0,
  "dst_host_serror_rate": 0.0,
  "dst_host_srv_serror_rate": 0.0,
  "dst_host_rerror_rate": 1.0,
  "dst_host_srv_rerror_rate": 1.0,
  "protocol_type_icmp": false,
  "protocol_type_tcp": true,
  "protocol_type

In [51]:
import pandas as pd
import json

# Re-read the processed test CSV so the sample keeps its original,
# un-encoded categorical columns.
raw_test = pd.read_csv("../data/processed/test_day1.csv")

# Row 0 is arbitrary: change the position to preview any other record.
# NOTE(review): the recorded output suggests this sample still carries the
# "difficulty" column, which the training path drops; confirm whether the
# sample should exclude it too.
raw_row = raw_test.iloc[0]
sample_dict = raw_row.drop(labels="label").to_dict()
print(json.dumps(sample_dict, indent=2))

{
  "duration": 0,
  "protocol_type": "tcp",
  "service": "private",
  "flag": "REJ",
  "src_bytes": 0,
  "dst_bytes": 0,
  "land": 0,
  "wrong_fragment": 0,
  "urgent": 0,
  "hot": 0,
  "num_failed_logins": 0,
  "logged_in": 0,
  "num_compromised": 0,
  "root_shell": 0,
  "su_attempted": 0,
  "num_root": 0,
  "num_file_creations": 0,
  "num_shells": 0,
  "num_access_files": 0,
  "num_outbound_cmds": 0,
  "is_host_login": 0,
  "is_guest_login": 0,
  "count": 229,
  "srv_count": 10,
  "serror_rate": 0.0,
  "srv_serror_rate": 0.0,
  "rerror_rate": 1.0,
  "srv_rerror_rate": 1.0,
  "same_srv_rate": 0.04,
  "diff_srv_rate": 0.06,
  "srv_diff_host_rate": 0.0,
  "dst_host_count": 255,
  "dst_host_srv_count": 10,
  "dst_host_same_srv_rate": 0.04,
  "dst_host_diff_srv_rate": 0.06,
  "dst_host_same_src_port_rate": 0.0,
  "dst_host_srv_diff_host_rate": 0.0,
  "dst_host_serror_rate": 0.0,
  "dst_host_srv_serror_rate": 0.0,
  "dst_host_rerror_rate": 1.0,
  "dst_host_srv_rerror_rate": 1.0,
  "diffic