In [27]:
import typing as t
from scipy.spatial.distance import cdist
import numpy as np
import numpy.ma as ma
import pandas as pd


In [3]:
def generate_dataset(
        size: int = 20,
        dim: int = 2,
        min: int = -10,
        max: int = 10,
        seed: int = 0,
) -> np.ndarray:
    return (max - min) * np.random.default_rng(seed).random((size, dim)) + min

In [54]:
class KMeans:
    def __init__(self, n_clusters: int = 8, max_iter: int = 300):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        
        self.history = []
        self.centroids = None
        self.labels = None
    
    def fit(self, X: Array):
        data = X
        self.centroids = self._initialize_clusters(data=data, n_clusters=self.n_clusters)
        self.history = [centroids]
        
        for _ in range(self.max_iter):
            self.labels = self._assign_points(data=data, centroids=centroids)
            self.centroids = KMeans._update_centroids(data=data, assignations=assignations, n_clusters=2)
            self.history.append(self.centroids)
        return self

    @staticmethod
    def _initialize_clusters(data: np.ndarray, n_clusters: int, seed: int = 0) -> np.ndarray:
        if len(data.shape) != 2:
            raise ValueError(f"Expected data of shape 2, shape={data.shape}")

        min = data.min()
        max = data.max()
        dim = data.shape[1]
        return (max - min) * np.random.default_rng(seed).random((n_clusters, dim)) + min

    @staticmethod
    def _assign_points(data: np.ndarray, centroids: np.ndarray) -> np.ndarray:
        # e.g. [ [0.71, 12.6] , [12.3, 0.40] ... ]
        distances = cdist(data, centroids)
        # e.g. array([0, 1, ... ])
        return np.argmin(distances, axis=1)

    @staticmethod
    def _update_centroids(data: np.ndarray, assignations: np.ndarray, n_clusters: int) -> np.ndarray:
        new_centroids = []
        for i in range(n_clusters):
            mask = assignations == i
            data_in_cluster = data[mask]
            # WARNING: if no points are within the cluster, we should probably reset it?
            new_centroid = data_in_cluster.mean(axis=0)
            new_centroids.append(new_centroid)
        return np.array(new_centroids)

In [55]:
data = generate_dataset(size=5)
print(data.shape)
data

(5, 2)


array([[ 2.73923375, -4.60426572],
       [-9.18052952, -9.66944729],
       [ 6.26540478,  8.25511155],
       [ 2.13271552,  4.58993122],
       [ 0.87249983,  8.70144848]])

In [56]:
kmeans = KMeans(n_clusters=3, max_iter=5)

In [57]:
kmeans.fit(X=data)

<__main__.KMeans at 0x140a70490>

In [58]:
kmeans.history

[array([[ 2.03210947, -4.71322369],
        [-8.91672695, -9.36581982]]),
 array([[ 3.00246347,  4.23555638],
        [-9.18052952, -9.66944729]]),
 array([[ 3.00246347,  4.23555638],
        [-9.18052952, -9.66944729]]),
 array([[ 3.00246347,  4.23555638],
        [-9.18052952, -9.66944729]]),
 array([[ 3.00246347,  4.23555638],
        [-9.18052952, -9.66944729]]),
 array([[ 3.00246347,  4.23555638],
        [-9.18052952, -9.66944729]])]

In [59]:
kmeans.labels

array([0, 1, 0, 0, 0])

In [51]:
centroids = KMeans._initialize_clusters(data=data, n_clusters=2)
print(centroids.shape)
centroids

(2, 2)


array([[ 2.03210947, -4.71322369],
       [-8.91672695, -9.36581982]])

In [52]:
assignations = KMeans._assign_points(data=data, centroids=centroids)
print(assignations.shape)
assignations

(5,)


array([0, 1, 0, 0, 0])

In [53]:
new_centroids = KMeans._update_centroids(data=data, assignations=assignations, n_clusters=2)
print(new_centroids.shape)
new_centroids

(2, 2)


array([[ 3.00246347,  4.23555638],
       [-9.18052952, -9.66944729]])

In [47]:
def update_centroids(data: np.ndarray, assignations: np.ndarray, n_clusters: int) -> np.ndarray:
    new_centroids = []
    for i in range(n_clusters):
        mask = assignations == i
        data_in_cluster = data[mask]
        new_centroid = data_in_cluster.mean(axis=0)
        new_centroids.append(new_centroid)
    return np.array(new_centroids)

In [48]:
new_centroids = update_centroids(data=data, assignations=assignations, n_clusters=2)
print(new_centroids.shape)
new_centroids

(2, 2)


array([[ 3.00246347,  4.23555638],
       [-9.18052952, -9.66944729]])

In [35]:
assignations == 1

array([False,  True, False, False, False])

In [43]:
data[assignations == 1]

array([[-9.18052952, -9.66944729]])

In [46]:
data[assignations == 1].mean(axis=0)

array([-9.18052952, -9.66944729])

In [None]:
def assign_points(data: np.ndarray, centroids: np.ndarray) -> np.numpy:
    # e.g. [ [0.71, 12.6] , [12.3, 0.40] ... ]
    distances = cdist(data, centroids)
    # e.g. array([0, 1, ... ])
    return np.argmin(distances, axis=1)

In [None]:
distances = assign_points(data=dataset, centroids=centroids)
distances


In [None]:
distances.shape

In [None]:
distances[0]