Modify the scratch code of K-means clustering in our lecture:
- Modify so it print out the total within-cluster variation.  Then try to run several k and identify which k is best.
- Since k-means can be slow due to its pairwise computations, let's implement a mini-batch k-means in which the cluster is create using only partial subset of samples.
- Put everything into a class

Mini-Batch will rarely converge, thus it is important to add a max_iteration or some tolerance.  Last, theoretically speaking, Mini-Batch will never perform better in terms of accuracy when compare to K-means, but it is very close to optimal but will almost always beat K-means in terms of time given large dataset and a modest tolerance parameter.

In [1]:
from sklearn.datasets import make_blobs
from sklearn.metrics import pairwise_distances_argmin
import numpy as np

In [2]:
X, y = make_blobs(n_samples= 1500, centers = 4, cluster_std= 0.60, random_state= 27)
print(X.shape)
print(y.shape)

(1500, 2)
(1500,)


In [11]:
class Kmeans:
    def __init__(self, k, replacement = True, batch_size = 100, max_iter = 100):
        self.k = k
        self.replacement = replacement
        self.batch_size = batch_size
        self.max_iter = max_iter
        
    def fit(self, X):
        m, n = X.shape
        
        rng = np.random.RandomState(27)
        i = rng.permutation(m)[:self.k]
        self.centers = X[i]
        
        for ix in np.arange(self.max_iter):
            random = rng.randint(m)
            X_batch = X[random:random+self.batch_size]
            labels = pairwise_distances_argmin(X_batch, self.centers)
            new_centers = []
            for i in range(self.k):
                new_centers.append(X_batch[labels == i].mean(axis=0))

            new_centers = np.array(new_centers)

            if(np.allclose(self.centers, new_centers, rtol = 0.1)):
                break
            else:
                self.centers = new_centers
        print(f'iterations stop at {ix}')

        total_with_variation_score = 0
        labels = pairwise_distances_argmin(X, self.centers)
        for i in range(self.k):
            cluster_mean = X[labels == i].mean(axis =0)
            total_with_variation_score += ((X[labels == i] - cluster_mean)** 2).sum()
        
        print(f'Total with the variation score = {total_with_variation_score}')
        
    def predict(self, X):
        return pairwise_distances_argmin(X, self.centers)

In [12]:
for k in range(2, 7):
    print(f'k = {k}')
    model = Kmeans(k)
    model.fit(X)
    preds = model.predict(X)

k = 2
iterations stop at 8
Total with the variation score = 14164.269175554673
k = 3
iterations stop at 6
Total with the variation score = 3270.7473517399058
k = 4
iterations stop at 3
Total with the variation score = 1055.7563573135321
k = 5
iterations stop at 4
Total with the variation score = 967.5433377147152
k = 6
iterations stop at 4
Total with the variation score = 893.0925365790056
