# 05 — Unsupervised Learning

- KMeans with elbow method
- Agglomerative (hierarchical) clustering + dendrogram
- Compare clusters vs. true labels (optional)

In [None]:
import numpy as np, pandas as pd, joblib
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import KMeans, AgglomerativeClustering

# Shared seed so KMeans initialisation and the subsampling below are reproducible.
RANDOM_STATE = 42

train = pd.read_csv("../data/processed/train.csv")

# The label column is looked up under several candidate names; the first one
# present in the CSV wins.
target = next(
    (t for t in ["target", "num", "condition", "disease"] if t in train.columns),
    None,
)
if target is None:
    # Without this guard, drop(columns=[None]) fails with an opaque KeyError.
    raise ValueError(
        "No target column found in train.csv; expected one of: "
        "'target', 'num', 'condition', 'disease'. "
        f"Available columns: {list(train.columns)}"
    )

# Features only for clustering; y_train is kept for optional comparison later.
X_train, y_train = train.drop(columns=[target]), train[target]
preprocessor = joblib.load("../models/preprocessor.pkl")

# NOTE(review): fit_transform re-fits the loaded preprocessor on X_train. If the
# pickle already holds a fitted preprocessor, transform() alone may be what is
# intended — confirm against the notebook that saved it.
Xtr = preprocessor.fit_transform(X_train)

# Some transformers return a sparse matrix; convert it to a dense array so the
# following steps can index and cluster it directly.
if hasattr(Xtr, "toarray"):
    Xtr = Xtr.toarray()

# Elbow method: fit KMeans for k = 1..10 and record the within-cluster
# sum of squared distances (inertia) for each k.
K_range = range(1, 11)
sse = []
for k in K_range:
    km = KMeans(n_clusters=k, random_state=RANDOM_STATE, n_init=10)
    km.fit(Xtr)
    sse.append(km.inertia_)

# Plot inertia against k; the "elbow" is where the curve starts to flatten.
fig, ax = plt.subplots()
ax.plot(list(K_range), sse, marker="o")
ax.set_xlabel("k")
ax.set_ylabel("SSE (Inertia)")
ax.set_title("Elbow Method (KMeans)")
plt.show()

# Example choice of k=2, mirroring the two-way split of the heart-disease
# problem (disease vs. no disease).
kmeans = KMeans(n_clusters=2, random_state=RANDOM_STATE, n_init=10)
kmeans.fit(Xtr)
labels_km = kmeans.labels_

# Number of samples assigned to each cluster id.
cluster_counts = np.bincount(labels_km)
print("KMeans cluster counts:", cluster_counts)

# Hierarchical clustering on a random subset (at most 150 rows) so the
# linkage is quick to compute and the dendrogram stays readable.
rng = np.random.RandomState(RANDOM_STATE)
n_rows = Xtr.shape[0]
subset_size = min(150, n_rows)
idx = rng.choice(n_rows, size=subset_size, replace=False)  # sample without repeats
X_subset = Xtr[idx]

# Ward linkage over the sampled rows.
Z = linkage(X_subset, method="ward")

plt.figure()
dendrogram(Z)
plt.title("Hierarchical Clustering Dendrogram (subset)")
plt.xlabel("Samples")
plt.ylabel("Distance")
plt.show()