In [16]:
import scipy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from sklearn.cluster import KMeans

In [17]:
N, p = 100, 5
X = np.random.randn(N, p)
X.shape

(100, 5)

In [18]:
X

array([[ 0.52915321, -1.35089799, -1.66379361, -0.05020703,  0.86475472],
       [ 0.27141522, -0.63543185,  1.21500036,  0.77117788,  0.44154891],
       [-0.64059424, -1.75092642,  0.51513213, -0.78196484,  0.25089112],
       [ 1.49833674,  1.09697512, -0.85294632, -0.16677745, -0.61041446],
       [-0.19000486,  1.37300145,  0.16204768, -0.77276072, -1.93658647],
       [-1.57243078, -0.16683623,  0.04320556, -0.90927715, -0.08698336],
       [-0.0094222 , -0.94417359,  0.49623608,  0.18565057, -0.8395366 ],
       [-1.4393985 ,  0.93079091, -1.32787065, -0.52228488, -0.02255664],
       [-0.0089822 , -2.05953189,  0.34591242,  0.14613821, -0.74503782],
       [ 0.05104445,  0.42307827,  1.01226337, -0.14975985,  0.95537198],
       [ 0.59852582,  1.2807477 ,  0.24934363,  0.96281446, -0.37286588],
       [-2.51092923,  1.40540411, -0.30783389,  0.54697972,  0.63612013],
       [-0.54687261,  0.31754821,  0.94331499,  1.72957222,  0.23193003],
       [-0.41673406, -1.0470967 , -1.2

In [19]:
def initialize_centroids(X,k):
    ids = np.random.permutation(X.shape[0])
    centroids = X[ids[:k]]
    return centroids

In [20]:
centroids = initialize_centroids(X, k=3)

In [21]:
from scipy.spatial import distance

dist = distance.cdist(X, centroids, 'euclidean')
dist

array([[2.25305446, 2.1610313 , 4.79269645],
       [2.53703153, 2.36836848, 2.98459538],
       [3.01657962, 3.11627907, 3.86538037],
       [2.52851053, 2.03107534, 3.79942105],
       [3.4548117 , 3.65493384, 2.9751404 ],
       [2.42643226, 3.70492282, 2.66485384],
       [2.78239525, 2.17462021, 3.35731478],
       [1.94165128, 3.83829324, 2.92198139],
       [3.40603483, 2.47298994, 4.32665204],
       [1.91274988, 2.86439493, 2.16925328],
       [2.22411122, 2.47914823, 2.48136215],
       [2.54175083, 4.84272195, 2.08244457],
       [2.45994564, 3.18388035, 2.25762866],
       [2.96094703, 2.6464286 , 4.32175829],
       [1.69228794, 2.03066972, 3.39672188],
       [2.27695672, 2.26296717, 2.52537594],
       [3.49135441, 2.69430851, 3.75987249],
       [3.21259114, 3.11628383, 2.71085394],
       [2.11561044, 3.19705715, 1.85154278],
       [1.59008511, 2.80736173, 2.58013718],
       [2.10843827, 2.83587022, 2.85733636],
       [2.15233328, 2.58110236, 2.76046871],
       [2.

In [22]:
labels = np.argmin(dist, axis=1)

In [30]:
labels

array([1, 1, 0, 1, 2, 0, 1, 0, 1, 0, 0, 2, 2, 1, 0, 1, 1, 2, 2, 0, 0, 0,
       2, 1, 0, 0, 2, 1, 2, 0, 0, 0, 2, 1, 2, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 2, 2, 1, 1, 0, 0, 2, 0,
       0, 2, 0, 0, 2, 0, 1, 1, 0, 0, 1, 1, 0, 2, 2, 0, 0, 1, 1, 2, 1, 1,
       0, 0, 0, 2, 0, 0, 0, 2, 1, 1, 0, 2], dtype=int64)

In [23]:
K = 3
def update_centroids(X, labels):
    return np.array([X[labels==k].mean(axis=0) for k in range(K)])

In [24]:
update_centroids(X, labels)

array([[-0.51688289,  0.05628857, -0.21218061,  0.2412033 ,  0.40878255],
       [ 0.58751586, -0.77031556, -0.26667624,  0.25052716, -0.54665775],
       [-0.87647573,  0.80429737,  0.811754  , -0.30123552, -0.01708233]])

In [25]:
def update_centroids_v2(X, labels):
    tmp = []
    for k in range(K):
        X_cluster = X[labels==k]
        tmp.append(X_cluster.mean(axis=0))
    return np.array(tmp)

def compute_wcv(X, centroids, labels):
    return np.sum([np.sum(np.linalg.norm(X[labels==k] - centroids[k], axis=1)**2)\
                  for k in range(K)])

def has_convert(labels, prev_labels):
    return (labels == prev_labels).all()


In [26]:
update_centroids_v2(X, labels)

array([[-0.51688289,  0.05628857, -0.21218061,  0.2412033 ,  0.40878255],
       [ 0.58751586, -0.77031556, -0.26667624,  0.25052716, -0.54665775],
       [-0.87647573,  0.80429737,  0.811754  , -0.30123552, -0.01708233]])

In [27]:
compute_wcv(X, centroids, labels)

551.365554701605

In [28]:
X[labels==0].shape

(50, 5)

In [29]:
data = pd.read_csv("kmeans/CustomerData.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'kmeans/CustomerData.csv'

In [None]:
data.head()

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
0,2,3,12669,9656,7561,214,2674,1338
1,2,3,7057,9810,9568,1762,3293,1776
2,2,3,6353,8808,7684,2405,3516,7844
3,1,3,13265,1196,4221,6404,507,1788
4,2,3,22615,5410,7198,3915,1777,5185


In [None]:
def kmean_clustering(X: np.array, K: int=3):
    # init centroids
    centroids = initialize_centroids(X, k=K)
    # initialize labels
    N = X.shape[0]
    pre_labels = np.zeros((N, 1))
    # training
    it = 0
    max_iter = 10_000
    while True:
        # compute distances from X_i to centroids
        distances = distance.cdist(X, centroids) # shape: (N, K)
        # assign new labels
        labels = np.argmin(distances, axis=1)
        # check convergence
        if has_convert(pre_labels, labels) or it >= max_iter:
            break
        # compute total within cluster variance
        wcv = compute_wcv(X, centroids, labels)
        print(f">> iters: {it} - wcv: {wcv}")
        # update centroids
        centroids = update_centroids(X, labels)
        # update old labels
        pre_labels = labels
        # increase iteration count
        it += 1

    return labels

In [None]:
labels = kmean_clustering(X=data.to_numpy(), K=10)

>> iters: 0 - wcv: 8718694470.0
>> iters: 1 - wcv: 131355103003.66766
>> iters: 2 - wcv: 106307987579.01207
>> iters: 3 - wcv: 95271186283.8492
>> iters: 4 - wcv: 88010771562.67642
>> iters: 5 - wcv: 84204302063.01915
>> iters: 6 - wcv: 82790644253.67743
>> iters: 7 - wcv: 82254179232.01402
>> iters: 8 - wcv: 82087759002.23119
>> iters: 9 - wcv: 82002333043.53404
>> iters: 10 - wcv: 81909937022.34155
>> iters: 11 - wcv: 81781068121.00833
>> iters: 12 - wcv: 81627388793.87651
>> iters: 13 - wcv: 81560504664.06033
>> iters: 14 - wcv: 81554225129.534


In [None]:
labels

array([2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2,
       1, 0, 1, 2, 2, 2, 0, 1, 2, 2, 1, 1, 2, 2, 1, 2, 2, 1, 1, 2, 2, 2,
       2, 0, 2, 0, 2, 0, 2, 2, 1, 2, 1, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 0,
       2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 0, 2, 2, 2, 2, 2, 1, 2, 0, 0, 1,
       2, 1, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 0,
       2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 1, 2, 1, 2, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 0, 2, 2, 2, 1, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2,
       1, 2, 2, 2, 2, 1, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2,
       2, 2, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 2,
       2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1,
       2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2,