# Imports

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import sys
import os

# Put the parent of the current working directory at the front of sys.path.
# The membership check keeps re-runs of this cell from inserting duplicates.
# NOTE(review): presumably this is what lets the `utils` imports below resolve
# when the notebook is started from a sub-folder of the project; confirm layout.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)
    
from utils.data import load_breast_cancer_kagglehub, standardize_fit_transform
from utils.internal_metrics import silhouette_score, best_k_by_gap, gap_statistic
from utils.internal_metrics import davies_bouldin_index, calinski_harabasz_index
from utils.external_metrics import adjusted_rand_index, normalized_mutual_info, purity_score, confusion_matrix

from KMeans import KMeans


# Load and standardize dataset

### Internal + External metrics (evaluation only)
We compute internal metrics (Silhouette / DBI / CH / WCSS) and external metrics (ARI / NMI / Purity / Confusion Matrix).
True labels y are used only for evaluation, not training.

In [None]:
# The loader hands back a 3-tuple: features, reference labels, feature names.
# The labels `y` are kept for evaluation only; they are never passed to a fit.
dataset = load_breast_cancer_kagglehub()
X, y, feature_names = dataset

# `Xs` is the transformed copy of X that every clustering run below consumes;
# the second return value is kept as `scaler`.
Xs, scaler = standardize_fit_transform(X)


# Elbow & Silhouette analysis

In [None]:
ks = list(range(2, 11))  # typical range
seed = 42

# Keys shared by the per-run metrics dict and the per-init history dicts.
METRIC_KEYS = ("inertia", "iters", "sil", "dbi", "ch", "ari", "nmi", "pur")


def _fit_and_score(X, y_true, k, init_mode, random_state):
    """Fit one KMeans configuration and compute every metric tracked in this sweep.

    Both initialisation strategies go through this single code path, so they are
    always scored identically.

    Parameters
    ----------
    X : data handed to ``fit_predict`` and to the internal metrics.
    y_true : reference labels; passed only to the external metrics, never to the fit.
    k : number of clusters (``n_clusters``).
    init_mode : value forwarded as ``init`` ("kmeans++" or "random" in this notebook).
    random_state : seed forwarded to KMeans.

    Returns
    -------
    (model, labels, metrics) : the fitted estimator, the labels returned by
        ``fit_predict``, and a dict keyed by ``METRIC_KEYS``.
    """
    model = KMeans(n_clusters=k, init=init_mode, max_iter=300, tol=1e-4, random_state=random_state)
    labels = model.fit_predict(X)
    metrics = {
        # read from the fitted estimator
        "inertia": model.inertia_,
        "iters": model.n_iter_,
        # internal metrics (data + predicted labels)
        "sil": silhouette_score(X, labels),
        "dbi": davies_bouldin_index(X, labels),
        "ch": calinski_harabasz_index(X, labels),
        # external metrics (evaluation only)
        "ari": adjusted_rand_index(y_true, labels),
        "nmi": normalized_mutual_info(y_true, labels),
        "pur": purity_score(y_true, labels),
    }
    return model, labels, metrics


# One list per metric and per init strategy, rebuilt on every run of this cell.
history_pp = {key: [] for key in METRIC_KEYS}
history_r = {key: [] for key in METRIC_KEYS}

# Flat aliases (same list objects) under the names the later cells read.
inertia_pp, inertia_r = history_pp["inertia"], history_r["inertia"]
iters_pp, iters_r = history_pp["iters"], history_r["iters"]
sil_pp, sil_r = history_pp["sil"], history_r["sil"]

# internal extra
dbi_pp, dbi_r = history_pp["dbi"], history_r["dbi"]
ch_pp, ch_r = history_pp["ch"], history_r["ch"]

# external (evaluation only)
ari_pp, ari_r = history_pp["ari"], history_r["ari"]
nmi_pp, nmi_r = history_pp["nmi"], history_r["nmi"]
pur_pp, pur_r = history_pp["pur"], history_r["pur"]


for k in ks:
    km_pp, labels_pp, metrics_pp = _fit_and_score(Xs, y, k, "kmeans++", seed)
    km_r, labels_r, metrics_r = _fit_and_score(Xs, y, k, "random", seed)

    for key in METRIC_KEYS:
        history_pp[key].append(metrics_pp[key])
        history_r[key].append(metrics_r[key])

    print(
        f"k={k}: "
        f"sil++={sil_pp[-1]:.4f} dbi++={dbi_pp[-1]:.3f} ch++={ch_pp[-1]:.1f} "
        f"ari++={ari_pp[-1]:.4f} nmi++={nmi_pp[-1]:.4f} pur++={pur_pp[-1]:.4f} | "
        f"silR={sil_r[-1]:.4f} dbiR={dbi_r[-1]:.3f} chR={ch_r[-1]:.1f} "
        f"ariR={ari_r[-1]:.4f} nmiR={nmi_r[-1]:.4f} purR={pur_r[-1]:.4f}"
    )


# Elbow plot

In [None]:
# Inertia (WCSS) against k, one curve per initialisation strategy.
fig, ax = plt.subplots()
ax.plot(ks, inertia_pp, marker="o", label="K-Means++")
ax.plot(ks, inertia_r, marker="o", label="Random init")
ax.set_xlabel("k")
ax.set_ylabel("Inertia (WCSS)")
ax.set_title("Elbow Method (Inertia) on Original Data")
ax.grid(True)
ax.legend()
plt.show()

# Silhouette plot

In [None]:
# Silhouette score against k, one curve per initialisation strategy.
fig, ax = plt.subplots()
ax.plot(ks, sil_pp, marker="o", label="K-Means++")
ax.plot(ks, sil_r, marker="o", label="Random init")
ax.set_xlabel("k")
ax.set_ylabel("Silhouette score")
ax.set_title("Silhouette score vs k (Original Data)")
ax.grid(True)
ax.legend()
plt.show()

# Position of the highest silhouette in each curve, mapped back to its k.
best_pos_sil_pp = int(np.argmax(sil_pp))
best_pos_sil_r = int(np.argmax(sil_r))
best_k_sil_pp = ks[best_pos_sil_pp]
best_k_sil_r = ks[best_pos_sil_r]
print("Best k by silhouette (K-Means++):", best_k_sil_pp)
print("Best k by silhouette (Random):", best_k_sil_r)

# External metrics vs k


In [10]:
# One figure per external metric; each overlays the two initialisation strategies.
external_curves = (
    ("ARI", ari_pp, ari_r),
    ("NMI", nmi_pp, nmi_r),
    ("Purity", pur_pp, pur_r),
)

for metric_name, scores_pp, scores_r in external_curves:
    fig, ax = plt.subplots()
    ax.plot(ks, scores_pp, marker="o", label=f"{metric_name} (K-Means++)")
    ax.plot(ks, scores_r, marker="o", label=f"{metric_name} (Random)")
    ax.set_xlabel("k")
    ax.set_ylabel(metric_name)
    ax.set_title(f"{metric_name} vs k (Original Data)")
    ax.grid(True)
    ax.legend()
    plt.show()


NameError: name 'ari_pp' is not defined

<Figure size 640x480 with 0 Axes>

# Gap statistic
Gap Statistic Implementation

In [None]:
def clusterer_factory(init_mode):
    """Return a ``(k, seed) -> KMeans`` constructor bound to one init strategy.

    Every call of the returned function builds a new KMeans with
    ``init=init_mode``, ``max_iter=300`` and ``tol=1e-4``; ``k`` is passed as
    ``n_clusters`` and ``seed`` as ``random_state``.
    """
    # Settings that do not vary between calls of the returned constructor.
    fixed_settings = {"init": init_mode, "max_iter": 300, "tol": 1e-4}

    def _factory(k, seed):
        return KMeans(n_clusters=k, random_state=seed, **fixed_settings)

    return _factory


# Determining Optimal k Using the Gap Statistic

In [None]:
ks = list(range(2, 11))
B = 10  # forwarded as `B` to gap_statistic; presumably the number of reference datasets (confirm)

# Per-k results, one pair of lists for each initialisation strategy.
gaps_pp, sks_pp = [], []
gaps_r, sks_r = [], []

factory_pp = clusterer_factory("kmeans++")
factory_r  = clusterer_factory("random")

for k in ks:
    # gap_statistic returns a pair, stored here as (gap, s_k) for this k.
    g, s = gap_statistic(Xs, factory_pp, k, B=B, random_state=42)
    gaps_pp.append(g)
    sks_pp.append(s)

    g, s = gap_statistic(Xs, factory_r, k, B=B, random_state=42)
    gaps_r.append(g)
    sks_r.append(s)

    # No trailing backslash after this call: a line continuation here would
    # glue the print onto whatever statement follows the loop.
    print(f"k={k}: gap++={gaps_pp[-1]:.4f} sk++={sks_pp[-1]:.4f} | "
          f"gapR={gaps_r[-1]:.4f} skR={sks_r[-1]:.4f}")

# Selection of k is delegated to best_k_by_gap, which gets the gaps, the s_k values and ks.
best_k_gap_pp = best_k_by_gap(gaps_pp, sks_pp, ks)
best_k_gap_r  = best_k_by_gap(gaps_r, sks_r, ks)
print("Best k by gap rule (++):", best_k_gap_pp)
print("Best k by gap rule (random):", best_k_gap_r)



In [None]:
# Gap statistic against k, one curve per initialisation strategy.
fig, ax = plt.subplots()
ax.plot(ks, gaps_pp, marker="o", label="Gap (K-Means++)")
ax.plot(ks, gaps_r, marker="o", label="Gap (Random)")
ax.set_xlabel("k")
ax.set_ylabel("Gap statistic")
ax.set_title("Gap statistic vs k (Original Data)")
ax.grid(True)
ax.legend()
plt.show()

# Convergence speed comparison (iterations)

In [None]:
# Iteration counts recorded from each fit (n_iter_), plotted against k.
fig, ax = plt.subplots()
ax.plot(ks, iters_pp, marker="o", label="iters (K-Means++)")
ax.plot(ks, iters_r, marker="o", label="iters (Random)")
ax.set_xlabel("k")
ax.set_ylabel("Iterations to converge")
ax.set_title("Convergence speed vs k")
ax.grid(True)
ax.legend()
plt.show()

# Confusion matrix for best-performing setting
The best setting is chosen by ARI among the K-Means++ runs (silhouette could be used as an alternative criterion).

In [11]:
# Pick the k whose K-Means++ run scored the highest ARI in the sweep.
best_idx = int(np.argmax(ari_pp))
best_k = ks[best_idx]
print("Best k by ARI (K-Means++):", best_k)

# Refit with the same settings as the sweep to recover the labels for that k.
best_settings = {"init": "kmeans++", "max_iter": 300, "tol": 1e-4, "random_state": seed}
km_best = KMeans(n_clusters=best_k, **best_settings)
best_labels = km_best.fit_predict(Xs)

# confusion_matrix returns the matrix together with the two label lists printed below.
cm, true_classes, pred_clusters = confusion_matrix(y, best_labels)
print("True classes:", true_classes)
print("Pred clusters:", pred_clusters)
print("Confusion Matrix:\n", cm)

# Heat-map of the matrix with a colour scale; grid lines are switched off.
fig, ax = plt.subplots(figsize=(4,3))
heatmap = ax.imshow(cm)
ax.set_title(f"Confusion Matrix (k={best_k}, KMeans++)")
ax.set_xlabel("Pred cluster")
ax.set_ylabel("True class")
fig.colorbar(heatmap, ax=ax)
ax.grid(False)
plt.show()


NameError: name 'ari_pp' is not defined