# 02 - Data Preprocessing

**Purpose**: Clean the data, engineer features, encode categoricals, split into train/test, scale (fit on train only), and SAVE the results for the next notebook.

**Outputs**:
- `data/processed/features_v1_train.csv`
- `data/processed/features_v1_test.csv`
- `models/scaler_v1.joblib`
- `models/feature_names_v1.joblib`

In [None]:
import sys; sys.path.append('..')
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split

from src.preprocessing import clean_missing_values, encode_categorical, scale_numerical
from src.features import create_clinical_features

# === SETTINGS (edit these) ===
# Version tag embedded in every saved artifact's filename.
# Bump it whenever the preprocessing changes.
VERSION = 'v1'

# Column holding the label to predict; it is split off from the features
# and used to stratify the train/test split.
TARGET_COL = 'recurrence'

# Hold-out fraction and seed handed to train_test_split.
RANDOM_STATE = 42
TEST_SIZE = 0.2

print("✅ Setup complete")

In [None]:
# TODO: Load your data
# df = pd.read_csv('../data/raw/YOUR_DATA.csv')

# Mock data: a synthetic cohort that stands in until a real dataset is
# loaded above. One row per patient, with the binary label in 'recurrence'.
print("⚠️ Using MOCK DATA")
np.random.seed(42)  # fixed seed so the mock cohort is reproducible
n = 500
df = pd.DataFrame({
    'patient_id': range(1, n+1),
    'age': np.random.normal(62, 10, n).astype(int),
    'grade': np.random.choice([1, 2, 3], n, p=[0.3, 0.5, 0.2]),
    'histology': np.random.choice(['Endometrioid', 'Serous', 'Clear Cell'], n),
    'lvsi': np.random.choice(['Yes', 'No'], n, p=[0.3, 0.7]),
    'myometrial_invasion': np.random.uniform(0, 1, n).round(2),
    'tumor_size': np.random.exponential(3, n).round(1),
    'recurrence': np.random.choice([0, 1], n, p=[0.75, 0.25])
})

# Blank out tumor_size for exactly 20 distinct patients so the cleaning step
# has something to do. replace=False matters: sampling with replacement can
# pick the same row twice and silently produce fewer than 20 missing values.
missing_idx = np.random.choice(df.index, size=20, replace=False)
df.loc[missing_idx, 'tumor_size'] = np.nan
print(f"Raw data: {df.shape}")

In [None]:
# Clean missing values
# NOTE(review): this runs on the full dataset, before the train/test split.
# If clean_missing_values imputes from column statistics, test rows influence
# the fill values -- confirm against src.preprocessing.
df_clean = clean_missing_values(df)

# Feature engineering
df_features = create_clinical_features(df_clean)

# Encode categorical
# The identifier and the target must never be one-hot encoded. With real data
# either may be loaded as a string column; encoding it would create one dummy
# per patient, or remove the TARGET_COL column that the split step reads.
protected_cols = {'patient_id', TARGET_COL}
cat_cols = [
    c for c in df_features.select_dtypes(include=['object', 'category']).columns
    if c not in protected_cols
]
df_encoded, encoders = encode_categorical(df_features, columns=cat_cols, method='onehot')

print(f"After preprocessing: {df_encoded.shape}")

In [None]:
# Train/test split
# Separate the label first, then strip the identifier and the label from the
# feature matrix. Columns that are absent are skipped rather than raising.
y = df_encoded[TARGET_COL]
drop_cols = []
for candidate in ('patient_id', TARGET_COL):
    if candidate in df_encoded.columns:
        drop_cols.append(candidate)
X = df_encoded.drop(columns=drop_cols)

# Stratify on the label so both partitions keep the same class balance.
split_options = dict(test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Train: {len(X_train)}, Test: {len(X_test)}")

In [None]:
# Scale numerical (fit on train only!)
# The scaler comes back from the call on the training frame and is then only
# *applied* to the test frame, never re-fitted on it.
num_cols = list(X_train.select_dtypes(include=[np.number]).columns)
X_train_scaled, scaler = scale_numerical(X_train, columns=num_cols)

# Transform the test columns first, then write them into a copy so that
# X_test itself stays unscaled.
test_scaled_values = scaler.transform(X_test[num_cols])
X_test_scaled = X_test.copy()
X_test_scaled[num_cols] = test_scaled_values

In [None]:
# SAVE for next notebook
from pathlib import Path

# Create the output folders up front: to_csv and joblib.dump both fail if the
# target directory does not exist (e.g. on a fresh clone).
processed_dir = Path('../data/processed')
models_dir = Path('../models')
for out_dir in (processed_dir, models_dir):
    out_dir.mkdir(parents=True, exist_ok=True)

# Re-attach the label to each scaled feature frame. `.values` assigns by
# position, so the labels are not index-aligned against the feature frame.
train_data = X_train_scaled.copy()
train_data[TARGET_COL] = y_train.values
test_data = X_test_scaled.copy()
test_data[TARGET_COL] = y_test.values

train_data.to_csv(processed_dir / f'features_{VERSION}_train.csv', index=False)
test_data.to_csv(processed_dir / f'features_{VERSION}_test.csv', index=False)
joblib.dump(scaler, models_dir / f'scaler_{VERSION}.joblib')
# Persist the feature column order alongside the scaler.
joblib.dump(list(X_train.columns), models_dir / f'feature_names_{VERSION}.joblib')

print(f"\n✅ SAVED:")
print(f"   data/processed/features_{VERSION}_train.csv")
print(f"   data/processed/features_{VERSION}_test.csv")
print(f"   models/scaler_{VERSION}.joblib")
print(f"   models/feature_names_{VERSION}.joblib")

## Next Step
→ Go to `03_model_training.ipynb`