In [1]:
import sys, sysconfig, platform
# Record which interpreter binary and Python version executed this notebook,
# one value per line.
print(sys.executable, platform.python_version(), sep="\n")


c:\Users\kevin\project-code\clinical-omics\.venv\Scripts\python.exe
3.11.9


In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from ucimlrepo import fetch_ucirepo

# Locations relative to the notebook's working directory.
DATA_DIR, REPORTS_DIR = Path("../data"), Path("../reports")

# Make sure the reports folder exists before later cells write into it;
# a no-op when the folder is already present.
REPORTS_DIR.mkdir(parents=True, exist_ok=True)


In [3]:
# Fetch dataset id=45 (UCI Heart Disease, Cleveland) and take private copies of
# the feature and target tables so later edits never touch the fetched object.
heart = fetch_ucirepo(id=45)
X, y = (table.copy() for table in (heart.data.features, heart.data.targets))

# Quick sanity check on the dimensions, then preview the first feature rows.
print(X.shape, y.shape)
X.head()


(303, 13) (303, 1)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0
1,67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0
2,67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0
3,37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0
4,41,0,2,130,204,0,2,172,0,1.4,1,0.0,3.0


In [4]:
# Put features and target side by side; the target is the last column.
df = pd.concat([X, y], axis=1)
target_name = df.columns[-1]

# Normalise the target's name to 'num' (0 = no disease, 1-4 = disease), collapse
# it into a binary 'disease' flag, and drop the original graded column.
df = (
    df.rename(columns={target_name: "num"})
    .assign(disease=lambda frame: (frame["num"] > 0).astype(int))
    .drop(columns=["num"])
)
df.head()


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,disease
0,63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0,1
3,37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0.0,3.0,0


In [5]:
def _to_numeric_if_possible(col):
    """Return `col` converted to a numeric dtype, or `col` unchanged if it cannot be parsed.

    Replaces the deprecated ``pd.to_numeric(..., errors="ignore")`` with the
    explicit try/except that pandas recommends; the result is the same.
    """
    try:
        return pd.to_numeric(col)
    except (ValueError, TypeError):
        return col


def _fill_numeric_with_median(col):
    """Fill missing values with the column median; non-numeric columns pass through as-is."""
    if pd.api.types.is_numeric_dtype(col):
        return col.fillna(col.median())
    return col


# Treat the literal '?' placeholder as a missing value.
df = df.replace("?", np.nan)

# Coerce each column to a numeric dtype where every value parses.
for c in df.columns:
    df[c] = _to_numeric_if_possible(df[c])

# NOTE(review): medians are computed over the full frame, i.e. before the
# train/test split performed in a later cell, so held-out rows influence the
# imputed values. Consider imputing inside the model pipeline instead.
# NOTE(review): 'ca' and 'thal' look like coded categories; confirm that a
# median is an appropriate fill value for them.
df[df.columns] = df[df.columns].apply(_fill_numeric_with_median)

# Total count of remaining missing cells (expected to be 0 for numeric data).
df.isna().sum().sum()


  df[c] = pd.to_numeric(df[c], errors="ignore")


np.int64(0)

In [6]:
# Share of each outcome class (proportions, not counts).
class_share = df["disease"].value_counts(normalize=True)
print("Class balance (0=no disease, 1=disease):")
print(class_share)

# Descriptive statistics with one row per column.
display(df.describe().T)


Class balance (0=no disease, 1=disease):
disease
0    0.541254
1    0.458746
Name: proportion, dtype: float64


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,303.0,54.438944,9.038662,29.0,48.0,56.0,61.0,77.0
sex,303.0,0.679868,0.467299,0.0,0.0,1.0,1.0,1.0
cp,303.0,3.158416,0.960126,1.0,3.0,3.0,4.0,4.0
trestbps,303.0,131.689769,17.599748,94.0,120.0,130.0,140.0,200.0
chol,303.0,246.693069,51.776918,126.0,211.0,241.0,275.0,564.0
fbs,303.0,0.148515,0.356198,0.0,0.0,0.0,0.0,1.0
restecg,303.0,0.990099,0.994971,0.0,0.0,1.0,2.0,2.0
thalach,303.0,149.607261,22.875003,71.0,133.5,153.0,166.0,202.0
exang,303.0,0.326733,0.469794,0.0,0.0,0.0,1.0,1.0
oldpeak,303.0,1.039604,1.161075,0.0,0.0,0.8,1.6,6.2


In [7]:
def _save_and_close(fig, filename):
    """Tighten the layout, write `fig` to REPORTS_DIR / filename at 150 dpi, then close it."""
    fig.tight_layout()
    fig.savefig(REPORTS_DIR / filename, dpi=150)
    # Close right away so no figure lingers or renders inline.
    plt.close(fig)


# Each plot gets its own explicit figure/axes and is handed to pandas via
# `ax=`. Without `ax`, DataFrame.boxplot(by=...) and DataFrame.plot open a new
# figure of their own, so a preceding plt.figure() only leaves an empty figure
# behind and saving depends on whichever figure happens to be current.

# Age
fig, ax = plt.subplots()
df["age"].hist(bins=20, ax=ax)
ax.set_title("Age distribution")
ax.set_xlabel("Age")
ax.set_ylabel("Count")
_save_and_close(fig, "clinical_age_hist.png")

# Cholesterol by disease
fig, ax = plt.subplots()
df.boxplot(column="chol", by="disease", ax=ax)
ax.set_title("Cholesterol by Disease Status")
fig.suptitle("")  # drop the automatic "Boxplot grouped by ..." super-title
ax.set_xlabel("Disease (0/1)")
ax.set_ylabel("Chol (mg/dL)")
_save_and_close(fig, "clinical_chol_by_disease.png")

# Chest pain vs disease: each row (cp value) is normalised to sum to 1.
ct = pd.crosstab(df["cp"], df["disease"], normalize="index")
fig, ax = plt.subplots()
ct.plot(kind="bar", rot=0, title="Chest Pain Type vs Disease (row-normalized)", ax=ax)
ax.set_xlabel("cp")
ax.set_ylabel("Proportion")
_save_and_close(fig, "clinical_cp_vs_disease.png")


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

# Separate the binary label from the predictor columns.
y_ = df["disease"]
X_ = df.drop(columns=["disease"])

# Hold out 25% of the rows for evaluation; the split is stratified on the
# label and seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_,
    y_,
    stratify=y_,
    random_state=42,
    test_size=0.25,
)

# Baseline model: standardise the features, then fit a logistic regression.
pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(max_iter=1000)),
    ]
).fit(X_train, y_train)

# Hard class predictions and the predicted probability of the positive class.
pred = pipe.predict(X_test)
proba = pipe.predict_proba(X_test)[:, 1]

# Accuracy uses the hard labels; ROC-AUC uses the probabilities.
acc, auc = accuracy_score(y_test, pred), roc_auc_score(y_test, proba)
print("Accuracy:", acc)
print("ROC-AUC:", auc)
print(classification_report(y_test, pred))


Accuracy: 0.868421052631579
ROC-AUC: 0.9310104529616725
              precision    recall  f1-score   support

           0       0.90      0.85      0.88        41
           1       0.84      0.89      0.86        35

    accuracy                           0.87        76
   macro avg       0.87      0.87      0.87        76
weighted avg       0.87      0.87      0.87        76



In [9]:
# Markdown report: dataset size, class balance, and the baseline model metrics
# computed in the modelling cell (`acc`, `auc`).
summary = f"""# Clinical EDA (UCI Heart Disease — Cleveland)

- Rows: {df.shape[0]}, Cols: {df.shape[1]}
- Class balance (disease=1): {df['disease'].mean():.2%}
- Baseline logistic regression:
  - Accuracy: {acc:.3f}
  - ROC-AUC: {auc:.3f}

Artifacts saved:
- reports/clinical_age_hist.png
- reports/clinical_chol_by_disease.png
- reports/clinical_cp_vs_disease.png
"""

# Write through REPORTS_DIR (the same folder the figures are saved to) and pin
# the encoding: the title contains an em dash, and without an explicit encoding
# write_text falls back to the platform's locale encoding, which is not UTF-8
# on every system.
(REPORTS_DIR / "clinical_summary.md").write_text(summary, encoding="utf-8")
summary


'# Clinical EDA (UCI Heart Disease — Cleveland)\n\n- Rows: 303, Cols: 14\n- Class balance (disease=1): 45.87%\n- Baseline logistic regression:\n  - Accuracy: 0.868\n  - ROC-AUC: 0.931\n\nArtifacts saved:\n- reports/clinical_age_hist.png\n- reports/clinical_chol_by_disease.png\n- reports/clinical_cp_vs_disease.png\n'