In [1]:
# Heart Disease UCI - Initial Data Inspection
# -------------------------------------------
# Step 1: Load libraries and dataset

import numpy as np
import pandas as pd
import os

os.makedirs("figures", exist_ok=True)

file_path = (
    "/Users/jeremygoetschy/Projects/Heart_Disease/Raw/heart_cleveland_upload.csv"
)
df = pd.read_csv(file_path)

# Dataset dimensions
print(df.shape)
# ðŸ”Ž Observations:
# - The dataset contains 297 rows and 14 columns (small dataset).

# Dataset preview
print(df.head())
# ðŸ”Ž Observations:
# - Features include demographics, clinical measures, and test results.
# - Target column: "condition" â†’ binary classification (heart disease or not).

# Dataset info (dtypes, nulls)
print(df.info())
# ðŸ”Ž Observations:
# - Numerical features: age, trestbps (resting BP), chol (cholesterol),
#   thalach (max HR), oldpeak (ST depression).
# - Categorical features: sex, cp (chest pain), fbs, restecg, exang,
#   slope, ca, thal.
# - Target: "condition" (0 = no disease, 1 = disease).

# Descriptive statistics
print(df.describe())
# ðŸ”Ž Observations:
# - Values are within medical ranges (e.g., age ~29â€“77).
# - Continuous variables show variability, no obvious invalid values.

# Check missing values
print(df.isna().sum())
# ðŸ”Ž Observations:
# - No missing values across any column.

# Check duplicates
print(df.duplicated().sum())
# ðŸ”Ž Observations:
# - No duplicate rows found in the dataset.

# Class balance
print(df["condition"].value_counts(normalize=True))
# ðŸ”Ž Observations:
# - Condition: ~46% positive (heart disease) vs 54% negative â†’ mild imbalance.

# Sex distribution
print(df["sex"].value_counts(normalize=True))
# ðŸ”Ž Observations:
# - Sex distribution is skewed: ~67% male, 33% female.

(297, 14)
   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   69    1   0       160   234    1        2      131      0      0.1      1   
1   69    0   0       140   239    0        0      151      0      1.8      0   
2   66    0   0       150   226    0        0      114      0      2.6      2   
3   65    1   0       138   282    1        2      174      0      1.4      1   
4   64    1   0       110   211    0        2      144      1      1.8      1   

   ca  thal  condition  
0   1     0          0  
1   2     0          0  
2   0     0          0  
3   1     0          1  
4   0     0          0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 297 entries, 0 to 296
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        297 non-null    int64  
 1   sex        297 non-null    int64  
 2   cp         297 non-null    int64  
 3   trestbps   297 non-null    int64  
 4   cho

In [2]:
# Heart Disease UCI - Exploratory Data Analysis (EDA)
# ---------------------------------------------------
# Goal: Explore feature distributions, relationship with the target,
#       and check for correlations & multicollinearity.

import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid")

numerical = ["age", "trestbps", "chol", "thalach", "oldpeak"]
categorical = ["sex", "cp", "fbs", "restecg", "exang", "slope", "ca", "thal"]

# ============================
# 1. Numerical Features - Histograms & Boxplots
# ============================
fig, axes = plt.subplots(len(numerical), 2, figsize=(12, 5 * len(numerical)))
for i, col in enumerate(numerical):
    sns.histplot(df[col], kde=True, ax=axes[i, 0])
    sns.boxplot(x=df[col], ax=axes[i, 1])
    axes[i, 0].set_title(f"Distribution of {col}")
    axes[i, 1].set_title(f"Boxplot of {col}")
plt.tight_layout()
plt.savefig("figures/numerical_features_distribution.png")
plt.close()

# ðŸ”Ž Observations:
# - trestbps (resting BP): right-skewed, outliers >170.
# - chol (cholesterol): slightly right-skewed, outliers >400.
# - thalach (max heart rate): slightly left-skewed, outliers <80.
# - oldpeak (ST depression): heavily right-skewed, many zeros, outliers >4.


# ============================
# 2. Categorical Features - Countplots
# ============================
fig, axes = plt.subplots(len(categorical), 1, figsize=(8, 5 * len(categorical)))
for i, col in enumerate(categorical):
    sns.countplot(x=df[col], ax=axes[i])
    axes[i].set_title(f"Count of {col}")
plt.tight_layout()
plt.savefig("figures/categorical_features_count.png")
plt.close()

# ðŸ”Ž Observations:
# - sex: more males than females.
# - cp (chest pain): increasing trend with higher values.
# - fbs (fasting blood sugar): highly imbalanced, very few positives.
# - restecg: category "1" is rare.
# - slope: category "2" is rare.
# - ca: distribution decreases as value increases.
# - thal: category "1" is rare.


# ============================
# 3. Numerical vs Target (Boxplots)
# ============================
fig, axes = plt.subplots(len(numerical), 1, figsize=(8, 5 * len(numerical)))
for i, col in enumerate(numerical):
    sns.boxplot(x=df["condition"], y=df[col], ax=axes[i])
    axes[i].set_title(f"Boxplot of {col} by Condition")
plt.tight_layout()
plt.savefig("figures/numerical_vs_target_boxplots.png")
plt.close()

# ðŸ”Ž Observations:
# - age: patients with condition (1) slightly older (~60 vs ~53).
# - thalach: significantly lower for condition (1) (~140 vs ~160).
# - oldpeak: higher for condition (1) (avg +1).


# ============================
# 4. Categorical vs Target (Countplots)
# ============================
fig, axes = plt.subplots(len(categorical), 1, figsize=(8, 5 * len(categorical)))
for i, col in enumerate(categorical):
    sns.countplot(x=df[col], hue=df["condition"], ax=axes[i])
    axes[i].set_title(f"{col} by Condition")
plt.tight_layout()
plt.savefig("figures/categorical_vs_target_countplots.png")
plt.close()

# ðŸ”Ž Observations:
# - sex: males more affected by condition.
# - cp: type 3 strongly associated with disease.
# - restecg: category 2 more frequent in disease cases.
# - exang: 1 strongly associated with disease.
# - slope: slope=1 more frequent in disease.
# - ca: higher values reduce probability of no disease.
# - thal: category 2 strongly associated with disease.


# ============================
# 5. Multivariate Analysis - Pairplot
# ============================
sns.pairplot(df, hue="condition", vars=numerical)
plt.savefig("figures/pairplot_numerical_features.png")
plt.close()

# ðŸ”Ž Observations:
# - Considerable overlap between classes in most pairs.
# - thalach and oldpeak show separation, but no perfect discriminators.


# ============================
# 6. Correlation Heatmap (Spearman)
# ============================
plt.figure(figsize=(12, 8))
sns.heatmap(
    df.corr(method="spearman"),
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    center=0,
    vmin=-1,
    vmax=1,
)
plt.title("Correlation Heatmap")
plt.savefig("figures/correlation_heatmap.png")
plt.close()

# ðŸ”Ž Observations:
# - Target correlated with:
#   - thal (0.52), ca (0.49), cp (0.46), thalach (-0.43).
# - Multicollinearity:
#   - oldpeak & slope (0.59).
#   - thalach, slope, oldpeak, exang all correlated (~-0.40).

In [3]:
# Heart Disease UCI - Featureâ€“Target Correlation Analysis
# -------------------------------------------------------
# Goal: Quantify the strength of association between features and target (condition).
# - Numerical features â†’ Point-Biserial Correlation
# - Categorical features â†’ CramÃ©râ€™s V

from scipy.stats import pointbiserialr, chi2_contingency


# Function to compute CramÃ©râ€™s V for categorical vs binary target
def cramers_v(x, y):
    confusion_matrix = pd.crosstab(x, y)
    chi2 = chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)
    return np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1)))


# Compute correlations
correlations = {}
for col in numerical:
    correlations[col] = pointbiserialr(df[col], df["condition"])[0]

for col in categorical:
    correlations[col] = cramers_v(df[col], df["condition"])

# Convert to DataFrame
corr_df = pd.DataFrame.from_dict(correlations, orient="index", columns=["Correlation"])
corr_df = corr_df.sort_values(by="Correlation", ascending=False)

# Heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(corr_df, annot=True, cmap="coolwarm", center=0, vmin=-1, vmax=1)
plt.title("Correlation Heatmap (Point-Biserial + CramÃ©râ€™s V)")
plt.savefig("figures/feature_target_correlation_heatmap.png")
plt.close()

# Barplot
plt.figure(figsize=(12, 8))
sns.barplot(x=corr_df["Correlation"], y=corr_df.index)
plt.title("Featureâ€“Target Correlation Strength")
plt.xlabel("Correlation")
plt.ylabel("Feature")
plt.savefig("figures/feature_target_correlation_barplot.png")
plt.close()

In [4]:
# Heart Disease UCI - Feature Engineering
# ---------------------------------------
# Goal: Create new features and transformations to enhance predictive power.

# ============================
# 1. Log Transformation
# ============================
# oldpeak is heavily right-skewed â†’ log transform improves distribution
df["oldpeak_log"] = np.log1p(df["oldpeak"])

# ============================
# 2. Interaction Features
# ============================
# trestbps (resting BP) and age both weakly correlated with condition
# Hypothesis: older patients with higher BP may have higher risk
df["trestbps_age"] = df["trestbps"] * df["age"]

# ============================
# 3. Composite Stress Feature
# ============================
# oldpeak (ST depression) and slope are moderately correlated with condition
# Combining them may capture stress-test severity better
df["stress_score"] = df["oldpeak_log"] * df["slope"]

In [5]:
# Heart Disease UCI - Data Preprocessing
# --------------------------------------
# Goal: Scale numerical features, one-hot encode categorical features,
#       and prepare dataset for machine learning models.

from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer

# ============================
# 1. Define Feature Groups
# ============================
numerical = [
    "age",
    "thalach",
    "trestbps",
    "oldpeak_log",
    "trestbps_age",
    "stress_score",
]

categorical = ["sex", "cp", "restecg", "exang", "ca", "thal", "slope"]

# ============================
# 2. Build Preprocessor
# ============================
preprocessor = ColumnTransformer(
    transformers=[
        ("num", RobustScaler(), numerical),  # robust scaling for skewed distributions
        ("cat", OneHotEncoder(drop="first", sparse_output=False), categorical),  # OHE
    ]
)

# ============================
# 3. Apply Transformation
# ============================
X = df.drop(columns=["condition"], axis=1)
y = df["condition"]

X_preprocessed = preprocessor.fit_transform(X)

# Retrieve feature names after preprocessing
feature_names = preprocessor.get_feature_names_out()

# Convert back to DataFrame
X_preprocessed_df = pd.DataFrame(X_preprocessed, columns=feature_names)

# Clean column names
X_preprocessed_df.columns = [
    col.replace("cat__", "").replace("num__", "") for col in X_preprocessed_df.columns
]

print(X_preprocessed.shape)  # Expected: (297, 20)
print(X_preprocessed_df.head())

(297, 20)
        age   thalach  trestbps  oldpeak_log  trestbps_age  stress_score  \
0  1.000000 -0.666667       1.5    -0.515406      1.547619      0.099748   
1  1.000000 -0.060606       0.5     0.462404      1.000000      0.000000   
2  0.769231 -1.181818       1.0     0.725420      1.095238      2.681148   
3  0.692308  0.636364       0.4     0.301077      0.726190      0.916231   
4  0.615385 -0.272727      -1.0     0.462404     -0.039683      1.077558   

   sex_1  cp_1  cp_2  cp_3  restecg_1  restecg_2  exang_1  ca_1  ca_2  ca_3  \
0    1.0   0.0   0.0   0.0        0.0        1.0      0.0   1.0   0.0   0.0   
1    0.0   0.0   0.0   0.0        0.0        0.0      0.0   0.0   1.0   0.0   
2    0.0   0.0   0.0   0.0        0.0        0.0      0.0   0.0   0.0   0.0   
3    1.0   0.0   0.0   0.0        0.0        1.0      0.0   1.0   0.0   0.0   
4    1.0   0.0   0.0   0.0        0.0        1.0      1.0   0.0   0.0   0.0   

   thal_1  thal_2  slope_1  slope_2  
0     0.0     0.0   

In [6]:
# Heart Disease UCI - Baseline Models
# -----------------------------------
# Goal: Benchmark Logistic Regression against tree-based methods (RF, XGBoost, LightGBM)
#       and SVM, using cross-validation with multiple metrics.

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC

# ============================
# 1. Define Models
# ============================
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42),
    "RandomForest": RandomForestClassifier(random_state=42, class_weight="balanced"),
    "XGBoost": XGBClassifier(eval_metric="logloss", random_state=42),
    "LightGBM": LGBMClassifier(random_state=42),
    "SVC": SVC(probability=True, random_state=42),
}

# ============================
# 2. Cross-Validation Setup
# ============================
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = {"roc_auc": "roc_auc", "average_precision": "average_precision", "f1": "f1"}

# ============================
# 3. Evaluate All Models
# ============================
results = {}
for model_name, model in models.items():
    pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", model)])

    scores = cross_validate(
        pipeline, X, y, cv=cv, scoring=scoring, return_train_score=False
    )

    results[model_name] = {
        "ROC AUC": np.mean(scores["test_roc_auc"]),
        "Average Precision": np.mean(scores["test_average_precision"]),
        "F1 Score": np.mean(scores["test_f1"]),
    }

df_results = pd.DataFrame(results).T
print(df_results)

[LightGBM] [Info] Number of positive: 109, number of negative: 128
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000476 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 267
[LightGBM] [Info] Number of data points in the train set: 237, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.459916 -> initscore=-0.160682
[LightGBM] [Info] Start training from score -0.160682
[LightGBM] [Info] Number of positive: 109, number of negative: 128
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000365 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 268
[LightGBM] [Info] Number of data points in the train set: 237, number of used features: 16
[LightGBM] [Info] [binary:BoostF



[LightGBM] [Info] Number of positive: 110, number of negative: 128
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000413 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 267
[LightGBM] [Info] Number of data points in the train set: 238, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.462185 -> initscore=-0.151550
[LightGBM] [Info] Start training from score -0.151550




[LightGBM] [Info] Number of positive: 110, number of negative: 128
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000737 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 270
[LightGBM] [Info] Number of data points in the train set: 238, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.462185 -> initscore=-0.151550
[LightGBM] [Info] Start training from score -0.151550
[LightGBM] [Info] Number of positive: 110, number of negative: 128
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000390 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 264
[LightGBM] [Info] Number of data points in the train set: 238, number of used features: 16
[LightGBM] [Info] [binary:BoostF



In [8]:
# Heart Disease UCI - Logistic Regression with Optuna
# ---------------------------------------------------
# Goal: Tune hyperparameters of Logistic Regression using Optuna,
#       evaluate performance, and interpret results with SHAP.

import optuna
import shap
from sklearn.model_selection import train_test_split


# ============================
# 1. Define Objective Function
# ============================
def objective(trial):
    C = trial.suggest_float("C", 1e-5, 1e2, log=True)
    solver = trial.suggest_categorical("solver", ["lbfgs", "newton-cg", "saga"])

    model = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            (
                "classifier",
                LogisticRegression(
                    C=C, penalty="l2", solver=solver, max_iter=2000, random_state=42
                ),
            ),
        ]
    )
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_validate(model, X, y, cv=cv, scoring="roc_auc")
    return np.mean(scores["test_score"])


# ============================
# 2. Run Optuna Optimization
# ============================
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

print("Best hyperparameters: ", study.best_params)
print("Best ROC AUC: ", study.best_value)

# ============================
# 3. Train-Test Split
# ============================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Build final model with best params
best_params = study.best_params
final_model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        (
            "classifier",
            LogisticRegression(**best_params, max_iter=1000, random_state=42),
        ),
    ]
)

final_model.fit(X_train, y_train)

# ============================
# 4. Evaluation
# ============================
y_pred = final_model.predict(X_test)
y_proba = final_model.predict_proba(X_test)[:, 1]

print("\nFinal Model Results:")
print(f"ROC AUC:           {roc_auc_score(y_test, y_proba):.3f}")
print(f"Average Precision: {average_precision_score(y_test, y_proba):.3f}")
print(f"F1 Score:          {f1_score(y_test, y_pred):.3f}")

# ============================
# 5. SHAP Analysis
# ============================
X_test_preprocessed = final_model.named_steps["preprocessor"].transform(X_test)
explainer = shap.Explainer(final_model.named_steps["classifier"], X_test_preprocessed)
shap_values = explainer(X_test_preprocessed)

# Global importance
shap.summary_plot(
    shap_values,
    features=X_test_preprocessed,
    feature_names=X_preprocessed_df.columns,
    show=False,
)
plt.savefig("figures/shap_summary_plot.png")
plt.close()

import joblib

# Save trained pipeline
joblib.dump(final_model, "heart_disease_model.pkl")

# Load and use model
loaded_model = joblib.load("heart_disease_model.pkl")
preds = loaded_model.predict(X)

# ðŸ”Ž Observations:
# Logistic Regression: Strong baseline (ROC AUC â‰ˆ 0.92), improved further with feature engineering and Optuna tuning.
# Tree-based models (RF, XGBoost, LightGBM): Competitive but slightly underperformed Logistic Regression, likely due to small dataset size.
# SVC: Performed well (ROC AUC â‰ˆ 0.90), but without surpassing Logistic Regression.

# Conclusion:
# Key predictors: Chest pain type (cp â†‘ risk), number of major vessels (ca â†‘ risk), thalassemia results (thal â†‘ risk), stress score (oldpeak Ã— slope â†‘ risk), and maximum heart rate achieved (thalach â†“ risk).
# Best model: Logistic Regression, balancing high predictive performance with interpretability, ideal for clinical use.
# Lesson learned: With smaller, structured medical datasets, feature engineering + interpretable models can outperform more complex methods while remaining transparent for healthcare decision-making.

  from .autonotebook import tqdm as notebook_tqdm
[I 2025-09-10 14:02:18,956] A new study created in memory with name: no-name-1f8a380f-3624-415c-9a7b-71861276859d
[I 2025-09-10 14:02:19,027] Trial 0 finished with value: 0.8904513888888888 and parameters: {'C': 0.01695594733547261, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.8904513888888888.
[I 2025-09-10 14:02:19,156] Trial 1 finished with value: 0.9148478835978835 and parameters: {'C': 9.642987769043083, 'solver': 'saga'}. Best is trial 1 with value: 0.9148478835978835.
[I 2025-09-10 14:02:19,201] Trial 2 finished with value: 0.8689814814814815 and parameters: {'C': 0.0004039194808940517, 'solver': 'saga'}. Best is trial 1 with value: 0.9148478835978835.
[I 2025-09-10 14:02:19,267] Trial 3 finished with value: 0.8710565476190476 and parameters: {'C': 0.0011753414641545864, 'solver': 'lbfgs'}. Best is trial 1 with value: 0.9148478835978835.
[I 2025-09-10 14:02:19,324] Trial 4 finished with value: 0.9138310185185186 and paramete

Best hyperparameters:  {'C': 0.8753203594322335, 'solver': 'saga'}
Best ROC AUC:  0.9193287037037038

Final Model Results:
ROC AUC:           0.975
Average Precision: 0.979
F1 Score:          0.902
