In [14]:
import os
import json
import pandas as pd
from sklearn.model_selection import train_test_split

# Import helper functions from model_utils.py
from model_utils import (
    load_data,
    preprocess,
    train_baselines,
    run_grid_search,
    evaluate_model,
    save_artifacts
)

# --- Run configuration (everything a reader might want to tune) ---

# Split / reproducibility settings, passed to train_test_split further down.
RANDOM_STATE = 42
TEST_SIZE = 0.2

# Filesystem locations: where artifacts are written and where the CSV is read.
ARTIFACTS_DIR = "artifacts"
DATA_PATH = "heart_disease_dataset.csv"   # Update if needed


In [15]:
# Load the dataset from DATA_PATH and run it through the project's
# preprocessing helper, which hands back three values unpacked here as the
# features (X), the target (y) and a scaler that is later saved with the model.
print("📂 Loading data...")
df = load_data(DATA_PATH)

print("⚙️ Preprocessing...")
# NOTE(review): preprocess() runs on the full dataset, before the train/test
# split in the next cell. If the returned scaler is fitted inside preprocess(),
# test rows influence the scaling statistics (data leakage) — confirm in
# model_utils.py.
X, y, scaler = preprocess(df, target_col="heart_disease")

# Quick sanity check on the shapes produced by preprocessing.
print(f"✅ Data shape after preprocessing: {X.shape}, Target shape: {y.shape}")


📂 Loading data...
⚙️ Preprocessing...
✅ Data shape after preprocessing: (400, 13), Target shape: (400,)


In [16]:
# Hold out a test partition. Stratifying on y keeps the class proportions
# comparable between the two partitions, and the fixed RANDOM_STATE makes the
# split repeatable across runs.
print("✂️ Splitting into train/test...")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

# Report the resulting partition sizes.
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")


✂️ Splitting into train/test...
Train shape: (320, 13), Test shape: (80, 13)


In [17]:
# Fit the baseline models on the training partition only.
# NOTE(review): `baselines` is not referenced by any later cell visible in this
# notebook — either evaluate/report the baselines or drop this step.
print("🏗️ Training baseline models...")
baselines = train_baselines(X_train, y_train)
print("✅ Baseline models trained")


🏗️ Training baseline models...
✅ Baseline models trained


In [None]:
# Hyperparameter search on the training partition. The result is consumed
# below as a mapping of model name -> dict with the keys "best_estimator",
# "best_params" and "best_score".
# The recorded output shows 450 + 1620 = 2070 fits, so this is the slow cell.
# NOTE(review): consider caching the result so a full Restart & Run All
# stays feasible.
print("🔍 Running grid search for hyperparameter optimization...")
best_models = run_grid_search(X_train, y_train)
print("✅ Grid search complete")


🔍 Running grid search for hyperparameter optimization...
🔍 Grid searching: decision_tree
Fitting 5 folds for each of 90 candidates, totalling 450 fits
🔍 Grid searching: random_forest
Fitting 5 folds for each of 324 candidates, totalling 1620 fits


In [None]:
print("📊 Evaluating optimized models on test set...")

# For every tuned model, score its best estimator on the held-out test
# partition and keep that next to the search's own best params / CV score.
evaluations = {}
for model_name, search_info in best_models.items():
    test_metrics = evaluate_model(search_info["best_estimator"], X_test, y_test)
    evaluations[model_name] = {
        "best_params": search_info["best_params"],
        "cv_score": search_info["best_score"],
        "test_eval": test_metrics,
    }

print("✅ Evaluation complete")


In [None]:
# Choose the best model by its cross-validated score, not by test ROC-AUC.
# Picking the winner on the test set lets the held-out data drive model
# selection, which makes the reported test metrics optimistically biased.
# The test-set evaluations are still stored in the metadata for reporting.
# NOTE(review): assumes run_grid_search scores every model with the same
# metric, so the "cv_score" values are comparable — confirm in model_utils.py.
if not evaluations:
    # max() would raise ValueError on an empty dict anyway; fail with a
    # message that says what to do instead.
    raise ValueError(
        "No evaluated models to choose from; run the grid search and evaluation cells first."
    )

best_name = max(evaluations, key=lambda n: evaluations[n]["cv_score"])
chosen_estimator = best_models[best_name]["best_estimator"]

# Metadata saved alongside the model: which model won, by what criterion,
# and the full per-model evaluation results.
metadata = {
    "chosen_model": best_name,
    "selection_metric": "cv_score",
    "evaluations": evaluations
}

# Persist the chosen estimator together with the scaler from preprocessing.
save_artifacts(chosen_estimator, scaler, metadata, artifacts_dir=ARTIFACTS_DIR)
print(f"✅ Saved best model: {best_name} to {ARTIFACTS_DIR}")


In [None]:
# Condensed report: one test-set ROC-AUC value per evaluated model,
# rendered as indented JSON.
print("📊 Summary of evaluations (ROC-AUC on test set):")
print(
    json.dumps(
        {
            model_name: {"roc_auc": results["test_eval"]["roc_auc"]}
            for model_name, results in evaluations.items()
        },
        indent=2,
    )
)
