# 01 — Data Preprocessing & EDA

This notebook:
- Loads the UCI Heart Disease dataset from `data/heart_disease.csv`
- Handles missing values
- Encodes categorical variables
- Scales numeric features
- Performs EDA (histograms, correlation)
- Splits data into train/test
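- Saves the raw (untransformed) splits to `data/processed/train.csv` and `data/processed/test.csv`, a short summary to `results/eda_summary.txt`, and the preprocessing object to `models/preprocessor.pkl`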

In [None]:
import os, sys, json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Seed reused by the train/test split further down so results are reproducible.
RANDOM_STATE = 42
# Relative to the notebook's working directory; the leading ".." suggests the
# notebook is run from a subdirectory of the project root (TODO confirm).
DATA_PATH = Path("../data/heart_disease.csv")

# Fail early, and report the exact location that was checked: the relative
# hint alone is ambiguous when the working directory is not what was expected.
# is_file() (rather than exists()) also rejects a directory with that name.
if not DATA_PATH.is_file():
    raise FileNotFoundError(
        "Please place the dataset at data/heart_disease.csv "
        f"(looked for {DATA_PATH.resolve()})"
    )

df = pd.read_csv(DATA_PATH)
print("Shape:", df.shape)
# Last expression of the cell so the notebook renders the first rows.
df.head()

## Basic Info & Missing Values

In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in display() only adds a stray "None" under the report.
df.info()
# Per-column count of missing values; kept as the last expression so the
# notebook renders it.
df.isna().sum()

## Quick EDA

In [None]:
# One histogram figure per numeric column.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
for col in numeric_cols:
    plt.figure()
    # Series.hist draws on the current axes (the figure just created) and
    # returns them, so the labels can be set on that Axes object directly.
    axes = df[col].hist(bins=30)
    axes.set_title(f"Histogram: {col}")
    axes.set_xlabel(col)
    axes.set_ylabel("Count")
    plt.show()

# Pairwise correlation of the numeric columns, drawn as a heatmap.
corr = df[numeric_cols].corr()
tick_positions = range(len(numeric_cols))
plt.figure()
plt.imshow(corr, cmap="viridis", aspect="auto")
plt.colorbar()
plt.title("Correlation Heatmap (numeric columns)")
# Label both axes with the column names; x labels are rotated to avoid overlap.
plt.xticks(tick_positions, numeric_cols, rotation=90)
plt.yticks(tick_positions, numeric_cols)
plt.tight_layout()
plt.show()

## Train/Test Split & Preprocessing Pipeline

In [None]:
# The label column goes by different names in different copies of the dataset;
# the first candidate (in this order) that is present in the data is used.
possible_targets = ["target", "num", "condition", "disease"]
target = next((name for name in possible_targets if name in df.columns), None)
if target is None:
    raise ValueError("Could not find target column. Rename your label column to 'target' or 'num'.")

X = df.drop(columns=[target])
y = df[target]

# Columns are routed by dtype: object/category -> categorical branch,
# numeric -> numeric branch.
# NOTE(review): columns of any other dtype (e.g. bool, datetime) are listed in
# neither branch; with ColumnTransformer's default remainder they would be
# dropped -- confirm the dataset has none.
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()

# Numeric branch: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored instead of raising.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols),
    ]
)

# Stratify only when the target has few distinct values (<= 20), i.e. when it
# looks like a class label rather than a continuous quantity.
stratify_labels = y if y.nunique(dropna=False) <= 20 else None
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=stratify_labels
)

# Fit on the training split only: the medians, scaling statistics and one-hot
# categories are learned without looking at test rows, and the object is
# already fitted (usable for transform) wherever it is reused afterwards.
preprocessor.fit(X_train)

print("Train:", X_train.shape, " Test:", X_test.shape)

## Save train/test splits and EDA summary

In [None]:
# Write the raw (untransformed) train/test splits, features and label side by
# side, so later steps can reload exactly the same partition.
out_dir = Path("../data/processed")
out_dir.mkdir(parents=True, exist_ok=True)
pd.concat([X_train, y_train], axis=1).to_csv(out_dir/"train.csv", index=False)
pd.concat([X_test, y_test], axis=1).to_csv(out_dir/"test.csv", index=False)

# open(..., "w") does not create missing parent directories, so make sure the
# results folder exists before writing the summary.
results_dir = Path("../results")
results_dir.mkdir(parents=True, exist_ok=True)
with open(results_dir/"eda_summary.txt", "w", encoding="utf-8") as f:
    f.write(f"Rows: {len(df)}\nColumns: {len(df.columns)}\nTarget: {target}\n")
    f.write(f"Numeric: {len(num_cols)}; Categorical: {len(cat_cols)}\n")

## Export preprocessing object (for reuse)

In [None]:
import joblib

# joblib.dump does not create missing parent directories, so make sure the
# models folder exists before serialising the preprocessing object.
models_dir = Path("../models")
models_dir.mkdir(parents=True, exist_ok=True)
model_path = models_dir / "preprocessor.pkl"
joblib.dump(preprocessor, model_path)
# as_posix() keeps the printed path identical on every platform.
print(f"Saved {model_path.as_posix()}")