In [1]:
import pandas as pd
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
# --- CONFIG ---
# Directory that receives every artifact this notebook writes (the fitted
# preprocessor and the processed train/test arrays).
# Make sure this matches your folder name (ml_engine/models).
MODEL_PATH = "../models"
# Parquet file that is loaded as the input dataset.
INPUT_FILE = "../data/processed/cleaned_data.parquet"

# Create the output directory up front; exist_ok=True keeps this cell safe to re-run.
os.makedirs(name=MODEL_PATH, exist_ok=True)

In [3]:
# Read the input Parquet file named in the config cell into a DataFrame.
print("Loading Data...")
df = pd.read_parquet(path=INPUT_FILE)

Loading Data...


In [4]:
# Separate Target (y) and Features (X)
# y: the 'Label' column. Binary Target: 0=Normal, 1=Attack
y = df['Label']
# X: every remaining column except 'attack_cat'.
# NOTE(review): 'attack_cat' presumably encodes the attack category and would leak
# the label if kept as a feature - confirm against the dataset schema.
X = df.drop(['Label', 'attack_cat'], axis=1)

In [5]:
# Identify Column Types
# Categorical = text columns that are one-hot encoded (proto, service, state)
# Numeric = every other feature column in X
cat_cols = ['proto', 'service', 'state']

# Fail fast if the input schema changed: a missing categorical column would
# otherwise only surface later, when the ColumnTransformer is fitted.
missing_cat_cols = [c for c in cat_cols if c not in X.columns]
if missing_cat_cols:
    raise KeyError(f"Categorical columns missing from X: {missing_cat_cols}")

num_cols = [c for c in X.columns if c not in cat_cols]

# Everything in num_cols is sent to MinMaxScaler, which needs numeric input.
# Report leftover non-numeric columns here, where the cause is obvious, instead
# of letting the scaler fail on them during fitting.
non_numeric_cols = X[num_cols].select_dtypes(exclude=['number', 'bool']).columns.tolist()
if non_numeric_cols:
    print(f"WARNING: non-numeric columns will be passed to the numeric scaler: {non_numeric_cols}")

In [6]:
# --- BUILD THE PREPROCESSING PIPELINE ---
# A single ColumnTransformer routes each group of columns to its own transformer,
# so numbers and text are handled separately but fitted and saved as one object.

# Numeric columns: rescaled with MinMaxScaler (default settings).
numeric_step = ('num', MinMaxScaler(), num_cols)

# Categorical columns: one-hot encoded into a dense array (sparse_output=False).
# handle_unknown='ignore' PREVENTS CRASHES on new/weird protocols: a category that
# was not seen during fit is encoded as all zeros instead of raising.
categorical_step = (
    'cat',
    OneHotEncoder(handle_unknown='ignore', sparse_output=False),
    cat_cols,
)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [7]:
# --- SPLIT DATA ---
# Hold out 20% of the rows for testing. stratify=y keeps the class proportions of
# the label the same in both splits; the fixed random_state makes the split repeatable.
print("Splitting Data...")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

Splitting Data...


In [8]:
# --- FIT & TRANSFORM ---
print("Fitting Preprocessor...")
# The scaler ranges and the one-hot categories are learned from TRAIN rows only,
# so nothing about the test rows leaks into the fitted preprocessor.
X_train_processed = preprocessor.fit_transform(X=X_train)
# The test rows are only transformed with the parameters learned above.
X_test_processed = preprocessor.transform(X=X_test)

Fitting Preprocessor...


In [9]:
# --- SAVE EVERYTHING ---
# Persist the fitted preprocessor itself. The API will load this to process new alerts.
preprocessor_file = f"{MODEL_PATH}/preprocessor.joblib"
# Kept as the cell's last expression so the notebook displays the written path.
joblib.dump(preprocessor, preprocessor_file)

['../models/preprocessor.joblib']

In [10]:
# Save processed arrays for the next notebook (Training)
# Each file holds one (features, labels) tuple.
train_bundle = (X_train_processed, y_train)
test_bundle = (X_test_processed, y_test)

joblib.dump(train_bundle, f"{MODEL_PATH}/train_data.joblib")
joblib.dump(test_bundle, f"{MODEL_PATH}/test_data.joblib")

print("Feature Engineering Complete. Preprocessor saved.")

Feature Engineering Complete. Preprocessor saved.
