# Imports

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import sys
import os

# Put the parent of the current working directory at the front of sys.path so
# the project-local imports that follow (`utils`, `KMeans`) can be resolved.
# NOTE(review): presumably the notebook is launched from a sub-folder one level
# below the project root — confirm against the repository layout.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
already_on_path = PROJECT_ROOT in sys.path
if not already_on_path:
    sys.path.insert(0, PROJECT_ROOT)
    
from utils.data import load_breast_cancer_kagglehub, standardize_fit_transform
from utils.internal_metrics import silhouette_score, best_k_by_gap, gap_statistic
from KMeans import KMeans


# Load and standardize dataset

In [None]:
# Load the dataset through the project helper and unpack its three results.
# NOTE(review): presumably (feature matrix, labels, column names) — confirm
# against utils.data.load_breast_cancer_kagglehub.
dataset = load_breast_cancer_kagglehub()
X, y, feature_names = dataset

# Standardize X; the helper hands back the transformed data together with a
# second object, kept here as `scaler`.
standardized = standardize_fit_transform(X)
Xs, scaler = standardized


# Elbow & Silhouette analysis

In [None]:
# Sweep k over a typical range and fit K-Means twice per k — once with
# "kmeans++" initialisation and once with "random" — recording inertia,
# iteration count and silhouette score for each run.
seed = 42
ks = list(range(2, 11))  # typical range

# Per-k results, one list per metric and initialisation scheme.
inertia_pp, iters_pp, sil_pp = [], [], []
inertia_r, iters_r, sil_r = [], [], []

# Hyper-parameters shared by both variants; only `init` differs between them.
shared_kwargs = {"max_iter": 300, "tol": 1e-4, "random_state": seed}

for k in ks:
    km_pp = KMeans(n_clusters=k, init="kmeans++", **shared_kwargs)
    labels_pp = km_pp.fit_predict(Xs)

    km_r = KMeans(n_clusters=k, init="random", **shared_kwargs)
    labels_r = km_r.fit_predict(Xs)

    # Metrics for the K-Means++ run.
    inertia_pp.append(km_pp.inertia_)
    iters_pp.append(km_pp.n_iter_)
    sil_pp.append(silhouette_score(Xs, labels_pp))

    # Metrics for the random-initialisation run.
    inertia_r.append(km_r.inertia_)
    iters_r.append(km_r.n_iter_)
    sil_r.append(silhouette_score(Xs, labels_r))

    # One summary line per k, K-Means++ on the left and random init on the right.
    report_pp = f"inertia++={km_pp.inertia_:.2f} iters++={km_pp.n_iter_} sil++={sil_pp[-1]:.4f}"
    report_r = f"inertiaR={km_r.inertia_:.2f} itersR={km_r.n_iter_} silR={sil_r[-1]:.4f}"
    print(f"k={k}: {report_pp} | {report_r}")

# Elbow plot

In [None]:
# Elbow curve: inertia against k, one line per initialisation scheme.
fig, ax = plt.subplots()
for inertias, series_label in (
    (inertia_pp, "K-Means++"),
    (inertia_r, "Random init"),
):
    ax.plot(ks, inertias, marker="o", label=series_label)
ax.set_xlabel("k")
ax.set_ylabel("Inertia (WCSS)")
ax.set_title("Elbow Method (Inertia) on Original Data")
ax.grid(True)
ax.legend()
plt.show()

# Silhouette plot

In [None]:
# Silhouette score against k, one line per initialisation scheme.
fig, ax = plt.subplots()
for scores, series_label in (
    (sil_pp, "K-Means++"),
    (sil_r, "Random init"),
):
    ax.plot(ks, scores, marker="o", label=series_label)
ax.set_xlabel("k")
ax.set_ylabel("Silhouette score")
ax.set_title("Silhouette score vs k (Original Data)")
ax.grid(True)
ax.legend()
plt.show()

# Select, for each initialisation scheme, the k whose silhouette score is
# highest. np.argmax reports the first maximum, so ties go to the earliest
# entry of `ks`.
idx_best_pp = int(np.argmax(sil_pp))
idx_best_r = int(np.argmax(sil_r))
best_k_sil_pp = ks[idx_best_pp]
best_k_sil_r = ks[idx_best_r]

for _caption, _best in (
    ("Best k by silhouette (K-Means++):", best_k_sil_pp),
    ("Best k by silhouette (Random):", best_k_sil_r),
):
    print(_caption, _best)

# Gap statistic
Clusterer factory passed to `gap_statistic` (the statistic itself is imported from `utils.internal_metrics`)

In [None]:
def clusterer_factory(init_mode):
    """Return a ``(k, seed)`` builder of KMeans objects for one init mode.

    Parameters
    ----------
    init_mode
        Value forwarded as the ``init`` argument of ``KMeans`` (this notebook
        passes ``"kmeans++"`` and ``"random"``).

    Returns
    -------
    callable
        ``_factory(k, seed)``, which constructs ``KMeans`` with
        ``n_clusters=k``, ``random_state=seed`` and the fixed settings
        ``max_iter=300`` and ``tol=1e-4``.
    """
    # Settings that do not vary between calls of the returned builder.
    fixed_params = {"init": init_mode, "max_iter": 300, "tol": 1e-4}

    def _factory(k, seed):
        return KMeans(n_clusters=k, random_state=seed, **fixed_params)

    return _factory


# Determining Optimal k Using the Gap Statistic

In [None]:
# Evaluate the gap statistic for every candidate k, once per initialisation
# scheme, collecting the two values returned by `gap_statistic` for each run.
ks = list(range(2, 11))
B = 10  # forwarded as `B=` to gap_statistic; presumably the number of reference datasets — confirm in utils.internal_metrics

# Per-k results: first and second return value of gap_statistic, per scheme.
gaps_pp, sks_pp = [], []
gaps_r, sks_r = [], []

# Builders that gap_statistic receives to create its clusterers.
factory_pp = clusterer_factory("kmeans++")
factory_r = clusterer_factory("random")

for k in ks:
    g, s = gap_statistic(Xs, factory_pp, k, B=B, random_state=42)
    gaps_pp.append(g)
    sks_pp.append(s)

    g, s = gap_statistic(Xs, factory_r, k, B=B, random_state=42)
    gaps_r.append(g)
    sks_r.append(s)

    # No trailing backslash here: a line continuation after the closing
    # parenthesis would splice whatever follows into this statement.
    print(f"k={k}: gap++={gaps_pp[-1]:.4f} sk++={sks_pp[-1]:.4f} | "
          f"gapR={gaps_r[-1]:.4f} skR={sks_r[-1]:.4f}")

# Apply the project's gap-based selection rule to each scheme's results, then
# report both choices.
best_k_gap_pp = best_k_by_gap(gaps_pp, sks_pp, ks)
best_k_gap_r = best_k_by_gap(gaps_r, sks_r, ks)

for _caption, _best in (
    ("Best k by gap rule (++):", best_k_gap_pp),
    ("Best k by gap rule (random):", best_k_gap_r),
):
    print(_caption, _best)



In [None]:
# Gap statistic against k, one line per initialisation scheme.
fig, ax = plt.subplots()
for gap_values, series_label in (
    (gaps_pp, "Gap (K-Means++)"),
    (gaps_r, "Gap (Random)"),
):
    ax.plot(ks, gap_values, marker="o", label=series_label)
ax.set_xlabel("k")
ax.set_ylabel("Gap statistic")
ax.set_title("Gap statistic vs k (Original Data)")
ax.grid(True)
ax.legend()
plt.show()

# Convergence speed comparison (iterations)

In [None]:
# Iteration counts recorded during the k sweep, one line per initialisation
# scheme, to compare how quickly each variant converged.
fig, ax = plt.subplots()
for iteration_counts, series_label in (
    (iters_pp, "iters (K-Means++)"),
    (iters_r, "iters (Random)"),
):
    ax.plot(ks, iteration_counts, marker="o", label=series_label)
ax.set_xlabel("k")
ax.set_ylabel("Iterations to converge")
ax.set_title("Convergence speed vs k")
ax.grid(True)
ax.legend()
plt.show()