In [None]:
# Day 4: Training ML Model from Raw CICIDS2017 Dataset

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample
import joblib
import os

# End-to-end training script: load one CICIDS2017 capture, keep BENIGN/DDoS
# rows, clean and balance them, then train and persist a scaler + Random Forest.
# All intermediate names (df, X, y, scaler, X_train, ...) stay at module level
# so that later notebook cells can keep using them.

# Step 1: Load original raw dataset
# NOTE: the label column is addressed as ' Label' (with a leading space)
# everywhere below, matching the header as it is read from this CSV.
raw_path = "../data/CICIDS2017/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv"
df = pd.read_csv(raw_path)
print("Original label distribution:")
print(df[' Label'].value_counts())

# Step 2: Filter for only BENIGN and DDoS classes
# .copy() makes the filtered frame independent of the original, so the label
# assignment below is not a write into a slice (avoids SettingWithCopyWarning).
df = df[df[' Label'].isin(['BENIGN', 'DDoS'])].copy()
# Numeric encoding used by the rest of the script: BENIGN -> 0, DDoS -> -1.
df[' Label'] = df[' Label'].map({'BENIGN': 0, 'DDoS': -1})
print("Filtered label distribution:")
print(df[' Label'].value_counts())

# Step 3: Drop irrelevant columns
cols_to_drop = ['Flow ID', 'Source IP', 'Destination IP', 'Timestamp', 'Protocol', 'SimillarHTTP', 'Label.1']
# Match on whitespace-stripped names: headers in this file can carry a leading
# space (as ' Label' does), and an exact-name match would silently skip them.
# Columns are only selected for dropping here; nothing is renamed.
df = df.drop(columns=[col for col in df.columns if col.strip() in cols_to_drop])

# Step 4: Drop rows with missing or infinite values
# Infinities are converted to NaN first so a single dropna() removes both.
df = df.replace([np.inf, -np.inf], np.nan)
df = df.dropna()

# Step 5: Balance the classes
count_normal = (df[' Label'] == 0).sum()
count_ddos = (df[' Label'] == -1).sum()

# Fail early with a clear message instead of an obscure error further down
# when one of the two classes has no rows left after cleaning.
if count_normal == 0 or count_ddos == 0:
    raise ValueError(
        f"Both classes are required for training (BENIGN={count_normal}, DDoS={count_ddos})"
    )

# Only rebalance when the class sizes differ by more than 100 rows.
if abs(count_normal - count_ddos) > 100:
    majority_class = 0 if count_normal > count_ddos else -1
    minority_class = -1 if majority_class == 0 else 0

    df_major = df[df[' Label'] == majority_class]
    df_minor = df[df[' Label'] == minority_class]

    # Downsample the majority class (without replacement) to the minority size.
    df_major_down = resample(df_major, replace=False, n_samples=len(df_minor), random_state=42)
    df = pd.concat([df_major_down, df_minor])

    print("Balanced label distribution:")
    print(df[' Label'].value_counts())

# Step 6: Split features and label
X = df.drop(columns=[' Label'])
y = df[' Label']

# Step 7: Train/Test split
# The split happens BEFORE scaling so that the scaler's mean/variance are
# estimated from training rows only; fitting on the full dataset would leak
# test-set statistics into preprocessing and into the saved scaler.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 8: Scale features (fit on the training split only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Full scaled matrix under its original name, for any later cell that uses it.
X_scaled = scaler.transform(X)

# Save the scaler
os.makedirs("../models", exist_ok=True)
joblib.dump(scaler, "../models/scaler.pkl")

# Step 9: Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Step 10: Evaluate model
y_pred = model.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Step 11: Save model
joblib.dump(model, "../models/best_random_forest_model.pkl")
print("\n✅ Model and scaler saved to ../models/")





  


Original label distribution:
 Label
DDoS      128027
BENIGN     97718
Name: count, dtype: int64
Filtered label distribution:
 Label
-1    128027
 0     97718
Name: count, dtype: int64
Balanced label distribution:
 Label
-1    97686
 0    97686
Name: count, dtype: int64
Final training label counts:
 Label
-1    97686
 0    97686
Name: count, dtype: int64

Classification Report:
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00     29182
           0       1.00      1.00      1.00     29430

    accuracy                           1.00     58612
   macro avg       1.00      1.00      1.00     58612
weighted avg       1.00      1.00      1.00     58612


Confusion Matrix:
[[29181     1]
 [    0 29430]]

✅ Model and scaler saved to ../models/
