In [3]:
import os
import sys

import joblib
import numpy as np
import pandas as pd

sys.path.append('../')  # Add parent directory to path

from src.preprocessing.data_processor import NetworkDataProcessor

# Reaching this line means every import in this cell resolved without raising.
print("Imports successful")

Imports successful!


In [4]:
# Input CSV files; this list is what the pipeline cell passes as `file_paths`.
file_paths = ['../data/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv']

# Report how many inputs there are, then each one by its bare file name
# (everything after the last '/').
print(f"Files to process: {len(file_paths)}")
for csv_path in file_paths:
    base_name = csv_path.rsplit('/', 1)[-1]
    print(f"  - {base_name}")

Files to process: 1
  - Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv


In [5]:
# Build the preprocessor and run its end-to-end pipeline on the input files.
# full_pipeline returns the train/test split as four objects; the recorded run
# log shows it also writes the fitted preprocessor under save_path.
processor = NetworkDataProcessor()

X_train, X_test, y_train, y_test = processor.full_pipeline(
    file_paths=file_paths,
    save_path='../models/'
)

# Persist the split right where it is produced. The verification cell further
# down np.load()s X_train.npy / y_train.npy from this folder, and no other
# visible cell writes them, so without this step a fresh "Restart & Run All"
# either fails or silently checks files left over from an earlier session.
processed_dir = '../data/processed'
os.makedirs(processed_dir, exist_ok=True)
for split_name, split_data in (
    ('X_train', X_train),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_test', y_test),
):
    # np.asarray handles both plain ndarrays and pandas objects.
    # NOTE(review): assumes the pipeline output is numeric (not object dtype),
    # so the files load back with np.load's defaults — confirm.
    np.save(f'{processed_dir}/{split_name}.npy', np.asarray(split_data))

print("\nFinal shapes:")
print(f"X_train: {X_train.shape}")
print(f"X_test: {X_test.shape}")
print(f"y_train: {y_train.shape}")
print(f"y_test: {y_test.shape}")

NETWORK IDS - DATA PREPROCESSING PIPELINE
Loading data files...
  Loading Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv...
✓ Loaded 225,745 total rows
✓ Cleaned 79 column names

Cleaning data...
  Found 68 missing/infinity values
✓ Cleaned data: 225,711 rows remaining
✓ Removed 2,629 duplicate rows

Label distribution:
  BENIGN: 95,068 (42.6%)
  ATTACK: 128,014 (57.4%)

✓ Prepared 78 features

✓ Train set: 178,465 rows
✓ Test set: 44,617 rows

Scaling features...
✓ Features scaled
✓ Saved preprocessor to ../models/preprocessor.pkl

PREPROCESSING COMPLETE!

Final shapes:
X_train: (178465, 78)
X_test: (44617, 78)
y_train: (178465,)
y_test: (44617,)


In [6]:
import os
from sklearn.model_selection import train_test_split

# Make sure the output folder for the raw-valued arrays exists.
os.makedirs('../data/processed', exist_ok=True)

# Replay the processor's individual stages by hand, stopping before any
# scaling, so the feature values keep their original magnitudes.
df = processor.load_multiple_files(file_paths)
for cleaning_step in (
    processor.clean_column_names,
    processor.handle_infinity_and_missing,
    processor.remove_duplicates,
    processor.create_binary_labels,
):
    df = cleaning_step(df)
X, y = processor.prepare_features(df)

# Stratified 80/20 split with a fixed seed.
# NOTE(review): this rebinds y_train / y_test, which the pipeline cell also
# assigned. They only stay aligned with the scaled X_train / X_test if
# full_pipeline splits with the same parameters — confirm.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train_unscaled, X_test_unscaled, y_train, y_test = train_test_split(
    X, y, **split_options
)

# Write both unscaled feature matrices as plain NumPy arrays.
train_values = X_train_unscaled.values
test_values = X_test_unscaled.values
np.save('../data/processed/X_train_unscaled.npy', train_values)
np.save('../data/processed/X_test_unscaled.npy', test_values)

# Quick eyeball check on the test matrix: shape plus the first five features
# of the first row.
print(" Saved unscaled test data")
print(f"Shape: {X_test_unscaled.shape}")
print(f"Sample values (first 5): {test_values[0][:5]}")
print("These should be real numbers (like 80, 12000, 500), not scaled (-0.4, 0.2, etc.)")

Loading data files...
  Loading Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv...
✓ Loaded 225,745 total rows
✓ Cleaned 79 column names

Cleaning data...
  Found 68 missing/infinity values
✓ Cleaned data: 225,711 rows remaining
✓ Removed 2,629 duplicate rows

Label distribution:
  BENIGN: 95,068 (42.6%)
  ATTACK: 128,014 (57.4%)

✓ Prepared 78 features
✓ Saved unscaled test data
Shape: (44617, 78)
Sample values (first 5): [5.482300e+04 4.916412e+06 1.000000e+00 5.000000e+00 6.000000e+00]
These should be real numbers (like 80, 12000, 500), not scaled (-0.4, 0.2, etc.)


In [13]:
# Read the persisted training arrays back from disk to confirm what was saved.
X_train_check = np.load('../data/processed/X_train.npy')
y_train_check = np.load('../data/processed/y_train.npy')

print("\nVerification:")
print(f"Loaded X_train shape: {X_train_check.shape}")
print(f"Loaded y_train shape: {y_train_check.shape}")

# Per-class row counts: label 0 is reported as BENIGN, any other value as ATTACK.
print("\nClass distribution in training set:")
unique, counts = np.unique(y_train_check, return_counts=True)
n_train_rows = len(y_train_check)
for class_id, class_rows in zip(unique, counts):
    if class_id == 0:
        class_title = "BENIGN"
    else:
        class_title = "ATTACK"
    share_pct = class_rows / n_train_rows * 100
    print(f"  {class_title}: {class_rows:,} ({share_pct:.1f}%)")


Verification:
Loaded X_train shape: (178465, 78)
Loaded y_train shape: (178465,)

Class distribution in training set:
  BENIGN: 76,054 (42.6%)
  ATTACK: 102,411 (57.4%)
