In [1]:
import sys
sys.path.append('../')

from src.preprocessing.data_processor import NetworkDataProcessor
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import joblib

# ---------------------------------------------------------------------
# Stage 1: load the per-day capture CSVs and clean the combined frame.
# ---------------------------------------------------------------------
print("=" * 70, "MULTI-CLASS IDS - DATA PREPROCESSING", "=" * 70, sep="\n")

# One CSV per capture session (Monday through Friday), relative to the
# notebook's working directory.
file_paths = [
    '../data/Monday-WorkingHours.pcap_ISCX.csv',
    '../data/Tuesday-WorkingHours.pcap_ISCX.csv',
    '../data/Wednesday-workingHours.pcap_ISCX.csv',
    '../data/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv',
    '../data/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv',
    '../data/Friday-WorkingHours-Morning.pcap_ISCX.csv',
    '../data/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv',
    '../data/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv',
]

processor = NetworkDataProcessor()

# NOTE(review): presumably returns all files combined into a single
# DataFrame -- confirm against NetworkDataProcessor.
df = processor.load_multiple_files(file_paths)
print(f"\nTotal flows loaded: {len(df):,}")

df = processor.clean_column_names(df)

# Normalise the class labels: swap the '�' placeholder character for a plain
# hyphen, then trim surrounding whitespace so equal classes compare equal.
df['Label'] = df['Label'].str.replace('�', '-', regex=False).str.strip()

# Infinity/missing handling runs first, then de-duplication.
df = processor.remove_duplicates(processor.handle_infinity_and_missing(df))

print("\nLabel distribution:")
print(df['Label'].value_counts())

# ---------------------------------------------------------------------
# Stage 2: encode the target, select numeric features, split train/test.
# ---------------------------------------------------------------------

# Map each distinct label string to an integer class id.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['Label'])

print("\nEncoded labels:")
for class_id, class_name in enumerate(label_encoder.classes_):
    print(f"  {class_id}: {class_name}")

# Features are every numeric column except the target itself.
X = df.drop(columns=['Label']).select_dtypes(include=[np.number])

# Remember the column order so it can be saved alongside the scaler.
feature_columns = list(X.columns)
print(f"\n Prepared {len(feature_columns)} features")

# 80/20 split, stratified on the encoded label so both partitions keep the
# same class proportions; the fixed seed makes the split reproducible.
X_train_unscaled, X_test_unscaled, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

print(f"\n Train set: {len(X_train_unscaled):,} flows")
print(f" Test set: {len(X_test_unscaled):,} flows")

# ---------------------------------------------------------------------
# Stage 3: standardise the features.
# ---------------------------------------------------------------------
from sklearn.preprocessing import StandardScaler

# The scaler is fitted on the training partition only; the test partition is
# transformed with those same fitted statistics.
scaler = StandardScaler().fit(X_train_unscaled)
X_train_scaled = scaler.transform(X_train_unscaled)
X_test_scaled = scaler.transform(X_test_unscaled)

print(" Features scaled")

# ---------------------------------------------------------------------
# Stage 4: persist the processed arrays and the fitted preprocessor.
# ---------------------------------------------------------------------
import os

processed_dir = '../data/processed_multiclass'
preprocessor_path = '../models/preprocessor_multiclass.pkl'

os.makedirs(processed_dir, exist_ok=True)
# joblib.dump does not create parent directories, so make sure the models
# directory exists as well; otherwise a fresh checkout would fail with
# FileNotFoundError only after all the preprocessing work has been done.
os.makedirs(os.path.dirname(preprocessor_path), exist_ok=True)

# File stem -> array. The unscaled partitions are DataFrames, so they are
# saved via their underlying NumPy values.
arrays_to_save = {
    'X_train_scaled': X_train_scaled,
    'X_test_scaled': X_test_scaled,
    'X_train_unscaled': X_train_unscaled.values,
    'X_test_unscaled': X_test_unscaled.values,
    'y_train': y_train,
    'y_test': y_test,
}
for array_name, array in arrays_to_save.items():
    np.save(f'{processed_dir}/{array_name}.npy', array)

# Save preprocessor with label encoder, so the saved feature order and
# class names travel with the fitted scaler.
joblib.dump({
    'scaler': scaler,
    'feature_columns': feature_columns,
    'label_encoder': label_encoder,
    'classes': label_encoder.classes_.tolist()
}, preprocessor_path)

print("\n Saved all processed data to data/processed_multiclass/")
print(" Saved preprocessor to models/preprocessor_multiclass.pkl")

print("\n" + "="*70)
print("PREPROCESSING COMPLETE")
print("="*70)

MULTI-CLASS IDS - DATA PREPROCESSING
Loading data files...
  Loading Monday-WorkingHours.pcap_ISCX.csv...
  Loading Tuesday-WorkingHours.pcap_ISCX.csv...
  Loading Wednesday-workingHours.pcap_ISCX.csv...
  Loading Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv...
  Loading Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv...
  Loading Friday-WorkingHours-Morning.pcap_ISCX.csv...
  Loading Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv...
  Loading Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv...
✓ Loaded 2,830,743 total rows

Total flows loaded: 2,830,743
✓ Cleaned 79 column names

Cleaning data...
  Found 5,734 missing/infinity values
✓ Cleaned data: 2,827,876 rows remaining
✓ Removed 307,078 duplicate rows

Label distribution:
Label
BENIGN                        2095057
DoS Hulk                       172846
DDoS                           128014
PortScan                        90694
DoS GoldenEye                   10286
FTP-Patator                      5931
