# 03 — Evaluation & Interpretation


In [None]:
import gdown # need to run '!pip install gdown' to use this
import pandas as pd
import numpy as np

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
from tabulate import tabulate

# ML and statistics
from scipy.stats import chi2_contingency
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import calinski_harabasz_score

# Seed constant, presumably intended for the stochastic steps in this notebook.
# NOTE(review): the KMeans calls in the evaluation cells pass random_state=0
# directly instead of this constant — confirm which seed is intended.
RANDOM_STATE = 42

## Load clustering artifacts
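Run `02_clustering.ipynb` first: this notebook reads the artifacts saved under `data/processed/` and stops with an error if any of them is missing.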


In [None]:
# Load the clustering artifacts this notebook evaluates: the labeled recalls
# table, the design matrix, and the chosen number of clusters.
# pandas (pd) and numpy (np) are already imported in the first cell.
from pathlib import Path

labeled_path = Path("data/processed/recalls_labeled.csv")
X_path = Path("data/processed/design_matrix.npy")
k_path = Path("data/processed/best_k.txt")

# Report every missing artifact at once rather than failing on the first one.
missing = [p for p in (labeled_path, X_path, k_path) if not p.exists()]
if missing:
    raise FileNotFoundError("Missing artifacts: " + ", ".join(str(p) for p in missing))

df_labeled = pd.read_csv(labeled_path)
X = np.load(X_path)
best_k = int(k_path.read_text().strip())

# Fail fast with a clear message if the artifacts are out of sync; the metric
# cells below index df_labeled["cluster"] and pair it row-for-row with X.
if "cluster" not in df_labeled.columns:
    raise KeyError(f"'cluster' column not found in {labeled_path}")
if len(df_labeled) != X.shape[0]:
    raise ValueError(
        f"Row mismatch: {labeled_path} has {len(df_labeled)} rows "
        f"but {X_path} has {X.shape[0]}"
    )

print("Loaded:")
print("-", labeled_path.resolve(), "| rows:", len(df_labeled))
print("-", X_path.resolve(), "| shape:", X.shape)
print("-", k_path.resolve(), "| best_k:", best_k)


## Clustering Evaluation

In [None]:
# Cluster evaluation

# Cluster assignments stored in df_labeled; the Davies-Bouldin and silhouette
# scores below are both computed on this saved partition.
saved_labels = df_labeled["cluster"].values

# Davies-Bouldin score of the saved partition
DB_score = davies_bouldin_score(X, saved_labels)
print(f"Davies–Bouldin score for (k={best_k}): {DB_score:.3f}")

# Calinski-Harabasz score for each candidate k (2 through 5), refitting KMeans per k
KVals = range(2, 6)
kmeans_settings = {"n_init": 10, "max_iter": 300, "algorithm": "elkan", "random_state": 0}
CH_scores = []

for n_clusters in KVals:
    model = KMeans(n_clusters=n_clusters, **kmeans_settings)
    score = calinski_harabasz_score(X, model.fit_predict(X))
    CH_scores.append(score)
    print(f"K={n_clusters}, CH Score={score:.2f}")

# Silhouette score of the saved partition
Sil_score = silhouette_score(X, saved_labels)
print(f"Silhouette score for (k={best_k}): {Sil_score:.3f}")

# Interpretation:
# - A DB score of 1.297 and a silhouette score of 0.266 indicate moderate cluster
#   separation with some overlap.
# - Taken together with other evidence, such as the distribution of size by recall
#   cluster, these scores can be read as meaningful clustering in a messy
#   real-world dataset.
# - The CH score is highest at K=3 among the k-values tried, which gives additional
#   support for choosing K=3.

Davies–Bouldin score for (k=3): 1.297
K=2, CH Score=9219.25
K=3, CH Score=10583.52
K=4, CH Score=8611.12
K=5, CH Score=7528.96
Silhouette score for (k=3): 0.266


In [None]:
# Print the evaluation results as a table.
#
# best_k is the value read from best_k.txt in the artifact-loading cell. It is
# deliberately not reassigned here, so the silhouette label below always matches
# the clustering that produced df_labeled["cluster"].

KVals = range(2, 7)

DB_scores = []
CH_scores = []

for k in KVals:
    # Refit KMeans for each candidate k and score the resulting partition.
    km = KMeans(n_clusters=k, n_init=10, max_iter=300, algorithm="elkan", random_state=0)
    labels = km.fit_predict(X)

    # Compute metrics
    ch = calinski_harabasz_score(X, labels)
    db = davies_bouldin_score(X, labels)

    CH_scores.append(ch)
    DB_scores.append(db)

# One row per candidate k
results_df = pd.DataFrame({
    "K": list(KVals),
    "Calinski-Harabasz": CH_scores,
    "Davies-Bouldin": DB_scores})

print(tabulate(results_df, headers='keys', tablefmt='fancy_grid', showindex=False))

# Silhouette score of the saved cluster assignments (not of the refits above)
Sil_score = silhouette_score(
    X,
    df_labeled["cluster"].values
)

print(f"Silhouette score for (k={best_k}): {Sil_score:.3f}")

╒═════╤═════════════════════╤══════════════════╕
│   K │   Calinski-Harabasz │   Davies-Bouldin │
╞═════╪═════════════════════╪══════════════════╡
│   2 │             9219.25 │          1.51434 │
├─────┼─────────────────────┼──────────────────┤
│   3 │            10583.5  │          1.29764 │
├─────┼─────────────────────┼──────────────────┤
│   4 │             8611.12 │          1.6522  │
├─────┼─────────────────────┼──────────────────┤
│   5 │             7528.96 │          1.71692 │
├─────┼─────────────────────┼──────────────────┤
│   6 │             6782.87 │          1.66665 │
╘═════╧═════════════════════╧══════════════════╛
Silhouette score for (k=3): 0.266
