<a href="https://colab.research.google.com/github/inderpreetsingh01/ml_machine_coding/blob/main/Kmeans.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Handles empty clusters gracefully (retains old centroid)
# Supports convergence tolerance and max iterations
# Easy to extend to k-means++ or mini-batch k-means

In [None]:
import numpy as np  # KMeans needs NumPy; importing here keeps this cell runnable on its own


class KMeans:
    """K-means clustering (Lloyd's algorithm) with random initialisation.

    Initial centroids are ``k`` distinct rows of the training data chosen
    at random. Each iteration assigns every sample to its nearest centroid
    (Euclidean distance) and moves each centroid to the mean of its
    assigned samples. A centroid that receives no samples keeps its
    previous position.

    Parameters:
        k: Number of clusters.
        max_iters: Upper bound on assignment/update iterations.
        tol: Iteration stops once the norm of the total centroid movement
            in one iteration drops below this value.
        random_state: Seed for the centroid initialisation; ``None`` draws
            a fresh seed.

    Attributes set by ``fit``:
        centroids: Array of shape ``(k, n_features)``.
        labels: Cluster index of every training sample.
    """

    def __init__(self, k=3, max_iters=100, tol=1e-4, random_state=None):
        self.k = k
        self.max_iters = max_iters
        self.tol = tol
        self.random_state = random_state

    def fit(self, X):
        """Cluster ``X`` and store ``centroids``, ``labels`` and the inertia.

        Parameters:
            X: Array-like of shape ``(n_samples, n_features)``.

        Returns:
            The fitted estimator itself, so calls can be chained.

        Raises:
            ValueError: If ``X`` is not two-dimensional or ``k`` is not in
                ``1..n_samples``.
        """
        X = np.asarray(X)
        if X.ndim != 2:
            raise ValueError(
                f"X must be 2-D (n_samples, n_features), got {X.ndim} dimension(s)"
            )
        n_samples = X.shape[0]
        if not 1 <= self.k <= n_samples:
            raise ValueError(
                f"k must be between 1 and n_samples={n_samples}, got k={self.k}"
            )

        # Step 1: Initialize centroids.
        # A private RandomState yields the same draws as seeding the global
        # generator would, without clobbering the caller's np.random state.
        rng = np.random.RandomState(self.random_state)
        random_indices = rng.choice(n_samples, self.k, replace=False)
        self.centroids = X[random_indices]

        for _ in range(self.max_iters):
            # Step 2: Assign to nearest centroid
            self.labels = self.predict(X)

            # Step 3: Update centroids (an empty cluster keeps its old centroid)
            new_centroids = np.array([
                X[self.labels == i].mean(axis=0)
                if np.any(self.labels == i)
                else self.centroids[i]
                for i in range(self.k)
            ])

            # Step 4: Check for convergence on the overall centroid movement
            shift = np.linalg.norm(self.centroids - new_centroids)
            self.centroids = new_centroids
            if shift < self.tol:
                break

        # The labels computed inside the loop belong to the centroids from
        # *before* the last update. Re-assign once so that labels, inertia
        # and predict(X) all refer to the final centroids (this also covers
        # max_iters=0, where the loop body never runs).
        self.labels = self.predict(X)

        # Store final inertia
        self._inertia = self._compute_inertia(X)
        return self

    def _compute_distances(self, X, centroids):
        """Return the ``(n_samples, n_centroids)`` Euclidean distance matrix."""
        return np.linalg.norm(X[:, np.newaxis] - centroids, axis=2)

    def _compute_inertia(self, X):
        """Return the sum of squared distances of samples to their centroid."""
        residuals = X - self.centroids[self.labels]
        return np.sum(residuals ** 2)

    def inertia(self):
        """Return the inertia stored by the last call to ``fit``."""
        return self._inertia

    def predict(self, X):
        """Return the index of the nearest centroid for every row of ``X``."""
        distances = self._compute_distances(np.asarray(X), self.centroids)
        return np.argmin(distances, axis=1)

In [None]:
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt

# Synthetic data: 300 samples drawn around 4 blob centres.
X, y_true = make_blobs(
    n_samples=300,
    centers=4,
    cluster_std=0.60,
    random_state=0,
)

# Fit the hand-written KMeans and report the final inertia.
model = KMeans(k=4, random_state=42)
model.fit(X)

print("Inertia:", model.inertia())

# Samples coloured by assigned cluster (first two feature columns) ...
plt.scatter(
    x=X[:, 0],
    y=X[:, 1],
    c=model.labels,
    cmap='viridis',
)
# ... with the fitted centroids drawn on top as large red crosses.
plt.scatter(
    x=model.centroids[:, 0],
    y=model.centroids[:, 1],
    c='red',
    s=200,
    marker='X',
)
plt.title("K-Means Clustering")
plt.show()

In [None]:
import numpy as np

# This version is O(n²) due to pairwise distance matrix — fine for ≤10k points.
def silhouette_score(X, labels):
    """Return the mean silhouette coefficient of a clustering.

    For each sample ``i`` the coefficient is ``(b - a) / max(a, b)`` where
    ``a`` is the mean Euclidean distance to the other members of its own
    cluster and ``b`` is the smallest mean distance to the members of any
    other cluster. Samples that are alone in their cluster score 0, as do
    samples for which ``max(a, b)`` is 0.

    The full pairwise distance matrix is materialised, so time and memory
    grow quadratically with the number of samples.

    Parameters:
        X: Array-like of shape ``(n_samples, n_features)``.
        labels: Array-like with one cluster label per sample.

    Returns:
        The mean coefficient, or 0 when there is only one cluster or every
        sample has its own cluster (the score is not meaningful there).
    """
    X = np.asarray(X)
    labels = np.asarray(labels)  # element-wise == below needs an ndarray
    n = len(X)
    unique_labels = np.unique(labels)
    k = len(unique_labels)
    if k == 1 or k == n:
        return 0.0  # Not meaningful

    # Precompute pairwise distances
    dist_matrix = np.linalg.norm(X[:, np.newaxis] - X, axis=2)

    # One boolean membership mask per cluster, built once instead of being
    # recomputed for every sample.
    masks = {label: labels == label for label in unique_labels}

    scores = np.zeros(n)

    for i in range(n):
        row = dist_matrix[i]
        own_label = labels[i]

        # Members of i's cluster other than i itself.
        peers = masks[own_label].copy()
        peers[i] = False
        if not peers.any():
            # Singleton cluster: a(i) is undefined, the coefficient is 0 by
            # convention. Skipping avoids a mean over an empty slice.
            continue

        # a(i): mean intra-cluster distance
        a = row[peers].mean()

        # b(i): mean nearest-cluster distance
        b = min(
            row[mask].mean()
            for label, mask in masks.items()
            if label != own_label
        )

        # Leave the score at 0 when both a and b are zero
        denom = max(a, b)
        if denom > 0:
            scores[i] = (b - a) / denom

    return scores.mean()

In [None]:
if __name__ == "__main__":
    # Demo: cluster synthetic blobs and report the silhouette score.
    from sklearn.datasets import make_blobs
    import matplotlib.pyplot as plt

    # 300 samples around 4 centres; the ground-truth labels are discarded.
    X, _ = make_blobs(
        n_samples=300,
        centers=4,
        cluster_std=0.6,
        random_state=0,
    )

    model = KMeans(k=4, random_state=42)
    model.fit(X)

    # Score the clustering using the labels stored by fit().
    score = silhouette_score(X, model.labels)
    print(f"Silhouette Score: {score:.4f}")

In [None]:
# The current implementation of silhouette_score is clear and correct, but it's not optimized — it's O(n²) in time and space because it builds the full pairwise distance matrix.
# We’ll optimize in two main ways:
# ✅ 1. Avoid storing the full distance matrix
# We compute only the distances we need on the fly: from each point to the members of each cluster.
# But this makes each step slower individually, so we need smarter grouping.
# ✅ 2. Pre-group points by cluster for efficiency
# Create a dictionary mapping each label to its members' indices
# Reuse these to compute intra-cluster and inter-cluster distances more efficiently

# ⚡ Complexity Comparison:
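# | Version                | Time Complexity  | Space Complexity | Notes               |
# |------------------------|------------------|------------------|---------------------|
# | Original (full matrix) | O(n²)            | O(n²)            | Fastest for small n |
# | Optimized (on-the-fly) | O(n²) worst-case | O(n)             | Better space usage  |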

# ⚡ Further Optimizations (if needed):
# Replace inner loops with scipy.spatial.cKDTree or BallTree for faster distance queries.
# Parallelize using joblib or numba.
# Use mini-batches or sampling for approximate silhouette scores in very large datasets.

In [None]:
def silhouette_score_optimized(X, labels):
    """Return the mean silhouette coefficient without an n-by-n matrix.

    For each sample ``i`` the coefficient is ``(b - a) / max(a, b)`` where
    ``a`` is the mean Euclidean distance to the other members of its own
    cluster and ``b`` is the smallest mean distance to the members of any
    other cluster. Samples that are alone in their cluster score 0, as do
    samples for which ``max(a, b)`` is 0.

    Distances are computed on the fly, one cluster at a time, so only the
    per-cluster index arrays are kept in memory. The amount of distance
    work is still quadratic in the number of samples.

    Parameters:
        X: Array-like with one sample per row.
        labels: Array-like with one cluster label per sample.

    Returns:
        The mean coefficient, or 0 when there is only one cluster or every
        sample has its own cluster (the score is not meaningful there).
    """
    X = np.asarray(X)
    labels = np.asarray(labels)  # element-wise == below needs an ndarray
    n = len(X)
    unique_labels = np.unique(labels)
    k = len(unique_labels)
    if k == 1 or k == n:
        return 0.0  # Not meaningful

    # Treat every sample as a flat vector so one norm call per cluster
    # works for any sample shape.
    points = X.reshape(n, -1)

    # Group points by cluster
    clusters = {label: np.where(labels == label)[0] for label in unique_labels}

    scores = np.zeros(n)

    for i in range(n):
        xi = points[i]
        label_i = labels[i]
        same_cluster = clusters[label_i]

        if len(same_cluster) == 1:
            # Only point in its cluster: a(i) is undefined and the
            # coefficient is 0 by convention (not 1, which a = 0 would give).
            continue

        # Intra-cluster distance (a): mean distance to the other members
        peers = same_cluster[same_cluster != i]
        a = np.linalg.norm(points[peers] - xi, axis=1).mean()

        # Inter-cluster distance (b): find min avg dist to other clusters
        b = min(
            np.linalg.norm(points[members] - xi, axis=1).mean()
            for other_label, members in clusters.items()
            if other_label != label_i
        )

        # Compute silhouette score for point i (stays 0 when a = b = 0)
        denom = max(a, b)
        if denom > 0:
            scores[i] = (b - a) / denom

    return scores.mean()

In [None]:
from sklearn.datasets import make_blobs

# Larger synthetic set (500 samples, 4 centres) for the on-the-fly variant.
X, _ = make_blobs(
    n_samples=500,
    centers=4,
    cluster_std=0.6,
    random_state=0,
)

model = KMeans(k=4, random_state=42)
model.fit(X)

# Score the clustering using the labels stored by fit().
score = silhouette_score_optimized(X, model.labels)
print(f"Optimized Silhouette Score: {score:.4f}")