## Question
Implement K-Means clustering: partition the data points into K clusters, assigning each point to the cluster whose centroid is closest to it.

---

**Clarifications**

Assumptions:
- distance: Euclidean distance
- iteration: number of iterations is predefined by the user
- X: feature matrix of the data input

- K: determined in advance
- clusters: named 0 to K-1


In [None]:
'''
centroid initialization:
- randomly select k data points

for each iteration:
1. for each point: 
    - calculate the distance to each centroid
    - find the centroid with the smallest distance
    - label the point with that centroid's cluster
2. update centroids' coordinates:
    - average all the data points assigned to each centroid
    - if no data point is assigned to a centroid -> leave it unchanged

'''

In [1]:
import numpy as np

In [49]:
class KMeans:
    """K-Means clustering with random initialisation and Euclidean distance.

    Centroids are seeded with ``k`` distinct rows drawn at random from the
    training data, then refined by alternating an assignment step (every
    point joins its nearest centroid) and an update step (every centroid
    moves to the mean of its points) until the centroids stop moving or
    ``n_iter`` rounds have run.

    Attributes:
        n_clusters: Number of clusters ``k``; clusters are labelled 0..k-1.
        n_iterations: Upper bound on the number of assignment/update rounds.
        centroids: Array of shape (k, n_features) once ``fit`` has run,
            ``None`` before.
        labels_: Cluster index of every training sample once ``fit`` has
            run, ``None`` before. Always consistent with ``centroids``.
    """

    def __init__(self, k: int, n_iter: int = 100):
        """Store the cluster count and the iteration budget."""
        self.n_clusters = k
        self.n_iterations = n_iter
        self.centroids = None
        self.labels_ = None

    def distance(self, x: np.ndarray, y: np.ndarray) -> float:
        """Return the squared Euclidean distance between ``x`` and ``y``.

        The square root is deliberately skipped: it is monotonic, so the
        nearest centroid is the same with or without it.
        """
        return np.sum((x - y) ** 2)

    def _nearest_centroids(self, X: np.ndarray) -> np.ndarray:
        """Return, for each row of ``X``, the index of its closest centroid."""
        points = np.asarray(X)
        # Broadcast (n, 1, d) against (1, k, d) to get all point-centroid
        # differences at once instead of looping in Python.
        diffs = points[:, np.newaxis, :] - self.centroids[np.newaxis, :, :]
        squared_distances = np.sum(diffs ** 2, axis=2)
        return np.argmin(squared_distances, axis=1)

    def fit(self, X: np.ndarray) -> None:
        """Cluster the rows of ``X`` and store ``centroids`` and ``labels_``.

        Args:
            X: Feature matrix of shape (n_samples, n_features).

        Raises:
            ValueError: If ``X`` is not two-dimensional, or if ``k`` exceeds
                the number of samples (raised by ``np.random.choice``).
        """
        n_samples, _ = X.shape

        # Seed the centroids with k distinct training points.
        seed_idx = np.random.choice(n_samples, self.n_clusters, replace=False)
        self.centroids = X[seed_idx]

        for _ in range(self.n_iterations):
            labels = self._nearest_centroids(X)
            # Publish the assignment *before* the convergence test so an
            # early exit never leaves stale (or never-set) labels behind.
            self.labels_ = labels

            updated = []
            for i in range(self.n_clusters):
                members = X[labels == i]
                # An empty cluster keeps its previous centroid.
                updated.append(
                    members.mean(axis=0) if len(members) else self.centroids[i]
                )
            new_centroids = np.array(updated)

            if np.allclose(new_centroids, self.centroids):
                break
            self.centroids = new_centroids
        else:
            # Iteration budget exhausted (or zero): the centroids moved after
            # the last assignment, so re-assign to keep labels_ in sync.
            self.labels_ = self._nearest_centroids(X)

    def predict(self, X_new: np.ndarray) -> np.ndarray:
        """Return the index of the nearest fitted centroid for each row.

        Args:
            X_new: Array-like of shape (n_points, n_features).

        Returns:
            Integer array of shape (n_points,) with values in 0..k-1.
        """
        return self._nearest_centroids(X_new)

In [50]:
# Smoke test on synthetic data.
# Three blobs of 50 standard-normal points each, shifted to (0, 0), (5, 5)
# and (0, 5) and stacked row-wise into one (150, 2) feature matrix.
X = np.concatenate(
    [np.random.randn(50, 2) + np.array(center) for center in ([0, 0], [5, 5], [0, 5])],
    axis=0,
)

# Fit a 3-cluster model and report what it learned.
kmeans = KMeans(k=3)
kmeans.fit(X)

print("Centroids:\n", kmeans.centroids)
print("Labels:", kmeans.labels_[:10])  # only the first 10 labels

Centroids:
 [[-0.06548217  5.16748621]
 [-0.13916157  0.12684626]
 [ 4.7004148   4.68490916]]
Labels: [1 1 1 1 1 1 1 1 1 1]


In [51]:
# kmeans.labels_.size
# Show the cluster label that fit() stored for every training sample.
kmeans.labels_

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0,
       0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [52]:
# Unseen points to assign to the fitted clusters, in order:
#   [0, 1]     -> near the blob around (0, 0)
#   [4.5, 4.8] -> near the blob around (5, 5)
#   [-1, 4.5]  -> near the blob around (0, 5)
#   [2, 2]     -> in between (could go to (0, 0) or (5, 5))
X_new = np.array([[0, 1], [4.5, 4.8], [-1, 4.5], [2, 2]])

labels_new = kmeans.predict(X_new)
print("Predicted cluster for new samples:", labels_new)


Predicted cluster for new samples: [1 2 0 1]
