### reference https://realpython.com/numpy-array-programming/#clustering-algorithms

In [1]:
import numpy as np

In [46]:
class KMeansClustering:
    def __init__(self):
        self.clusters = None
        self.diffs = []
    def fit(self, data, k, iterations=1000):
        cluster_choice = np.random.choice(data.shape[0], k, replace=False)
        cluster_centers = data[cluster_choice]
        diffs = []
        
        for _ in range(iterations):
            # find the euclidean distance and assign the values as per the cluster
            # assign points to the nearest cluster center
            relative_distance = (data - cluster_centers[:, None])
            euclidean = np.linalg.norm(relative_distance, axis=2)
            assign = np.argmin(euclidean, axis = 0)
            
            # update cluster centers: set center to the mean of each cluster.
            new_cluster_centers = np.empty(cluster_centers.shape)
            for i in range(k):
                new_cluster_centers[i] = X[assign==i].mean(axis=0)
                
            # get the change in the clusters and see what is the update
            change = np.linalg.norm(cluster_centers - new_cluster_centers)
            if change < 0.001:
                print("model converged")
            diffs.append(change)
            
            # update cluster center for the next iteration
            cluster_centers = new_cluster_centers
            
        self.clusters = cluster_centers
        self.diffs = diffs

In [3]:
X = np.repeat([[5, 5], [10, 10]], [5, 5], axis=0)
X = X + np.random.randn(*X.shape)  # 2 distinct "blobs"
centroids = np.array([[5, 5], [10, 10]])

assign = np.argmin(np.linalg.norm(X - centroids[:, None], axis=2), axis=0)

In [4]:
assign

array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])

In [5]:
X[assign==0]

array([[5.59897502, 5.03447691],
       [5.43869613, 5.22017741],
       [4.90206887, 3.43353889],
       [2.80471097, 3.42742263],
       [5.21470916, 3.44943913]])

In [6]:
X[assign==0].mean(axis=0)

array([4.79183203, 4.11301099])

In [7]:
centroids - X[assign==0].mean(axis=0)

array([[0.20816797, 0.88698901],
       [5.20816797, 5.88698901]])

In [9]:
(centroids - X[assign==0].mean(axis=0))

array([[0.20816797, 0.88698901],
       [5.20816797, 5.88698901]])

In [13]:
((centroids - X[assign==0].mean(axis=0)).abs()).max()

AttributeError: 'numpy.ndarray' object has no attribute 'abs'

In [14]:
np.abs(centroids - X[assign==0].mean(axis=0)).max()

5.886989007372473

In [15]:
sum([4.19077743, 4.28422161, 4.82180068, 3.6263979, 5.07714605]) / 5

4.4000687339999995

In [16]:
X

array([[ 5.59897502,  5.03447691],
       [ 5.43869613,  5.22017741],
       [ 4.90206887,  3.43353889],
       [ 2.80471097,  3.42742263],
       [ 5.21470916,  3.44943913],
       [ 8.88418161,  9.16975435],
       [ 9.32081253, 11.95916267],
       [ 9.40306806, 11.3840304 ],
       [10.09684978,  9.12497013],
       [ 8.85503756, 12.12938513]])

In [17]:
X[assign==1]

array([[ 8.88418161,  9.16975435],
       [ 9.32081253, 11.95916267],
       [ 9.40306806, 11.3840304 ],
       [10.09684978,  9.12497013],
       [ 8.85503756, 12.12938513]])

In [41]:
model = KMeansClustering()
model.fit(X, 3, 10)

model converged
model converged
model converged
model converged
model converged
model converged
model converged
model converged
model converged


In [42]:
cluster_centers = np.array([[5.59897502, 5.03447691],
       [4.90206887, 3.43353889],
       [5.43869613, 5.22017741]])
new_cluster_centers = np.array([[5.59897502, 5.03447691],
       [4.307163  , 3.43680022],
       [8.66644095, 9.83124668]])
np.linalg.norm(cluster_centers - new_cluster_centers)

5.659878096408878

In [43]:
model.clusters

array([[ 5.28861229,  4.28440808],
       [ 9.31198991, 10.75346054],
       [ 2.80471097,  3.42742263]])

In [44]:
model.diffs

[1.9926416747894133, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

In [45]:
X

array([[ 5.59897502,  5.03447691],
       [ 5.43869613,  5.22017741],
       [ 4.90206887,  3.43353889],
       [ 2.80471097,  3.42742263],
       [ 5.21470916,  3.44943913],
       [ 8.88418161,  9.16975435],
       [ 9.32081253, 11.95916267],
       [ 9.40306806, 11.3840304 ],
       [10.09684978,  9.12497013],
       [ 8.85503756, 12.12938513]])

In [121]:
data - cluster_centers[:, None]

NameError: name 'data' is not defined