**k-Means example with Iris data**

In [1]:
from sklearn.datasets import load_iris
# Feature matrix X and target vector y of the Iris dataset.
X, y = load_iris(return_X_y=True)

# n = number of samples (rows of X), d = number of features (columns of X).
n, d = X.shape

**k-Means by sklearn**

A single run with random initialization is not stable, which is why sklearn provides arguments for smarter initialization (`init='k-means++'`) and for multiple restarts (`n_init`).

In [162]:
from sklearn.cluster import KMeans
import numpy as np
from itertools import permutations

# One run with purely random initial centroids (no k-means++ seeding, no
# restarts), so the result varies from execution to execution.
kmeans = KMeans(n_clusters=3, init='random', n_init=1)
kmeans.fit(X)
y_hat = kmeans.predict(X)

def eval(y_hat, y_true=None, n_clusters=3):
    """Print the best accuracy of a clustering over all cluster relabellings.

    Cluster ids produced by k-means are arbitrary, so every permutation of the
    ids is applied to ``y_hat`` and the highest agreement with the reference
    labels is reported.

    NOTE: this name shadows the built-in ``eval``; it is kept because other
    cells of the notebook call it.

    Parameters
    ----------
    y_hat : array-like of int
        Predicted cluster id per sample, each in ``range(n_clusters)``.
    y_true : array-like of int, optional
        Reference labels, same length as ``y_hat``. Defaults to the
        notebook-level ``y``.
    n_clusters : int, default 3
        Number of distinct cluster ids to permute.

    Returns
    -------
    None
        The result is printed as ``accuracy: <value>``.
    """
    if y_true is None:
        y_true = y
    y_hat = np.asarray(y_hat)
    y_true = np.asarray(y_true)

    # mapping[c] is the label assigned to cluster id c; indexing the mapping
    # array with y_hat relabels every sample in one vectorized step.
    accuracies = [
        np.mean(np.asarray(mapping)[y_hat] == y_true)
        for mapping in permutations(range(n_clusters))
    ]
    print(f'accuracy: {max(accuracies)}')

# Score the sklearn cluster assignment against y.
eval(y_hat)

accuracy: 0.5733333333333334


**k-Means from scratch**

In [152]:
max_iter = 300

# Start from three *distinct* rows of X. Sampling without replacement matters:
# a repeated index would create two identical centroids, and since argmin
# resolves ties in favour of the first one, the duplicate's cluster would be
# left permanently empty.
i_centroids = np.random.choice(n, size=3, replace=False)
centroids = X[i_centroids]

def predict(X, centroids):
    """Assign every sample to its nearest centroid (Euclidean distance).

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Samples to label.
    centroids : array-like, shape (n_clusters, n_features)
        Current cluster centres.

    Returns
    -------
    numpy.ndarray, shape (n_samples,)
        Index of the closest centroid for each row of ``X``. On ties the
        lowest centroid index wins (``np.argmin`` semantics).
    """
    X = np.asarray(X, dtype=float)
    centroids = np.asarray(centroids, dtype=float)

    # Broadcast to (n_samples, n_clusters, n_features) so all pairwise
    # differences are computed in one vectorized step instead of a Python
    # loop per row and per centroid.
    diffs = X[:, np.newaxis, :] - centroids[np.newaxis, :, :]
    all_distances = np.linalg.norm(diffs, axis=2)
    return np.argmin(all_distances, axis=1)

# Lloyd's algorithm: alternate between assigning samples to the nearest
# centroid and moving each centroid to the mean of its members, until the
# centroids stop changing or max_iter iterations have run.
for i in range(1, max_iter+1):
    # Assignment step: row indices of the members of each cluster.
    prediction = predict(X, centroids)
    assignment = [np.flatnonzero(prediction == cluster)
                  for cluster in range(len(centroids))]

    # Update step. A cluster without members keeps its previous centroid:
    # averaging an empty selection would yield NaN, which would poison every
    # later distance computation and make the convergence check below
    # unreachable (NaN never compares equal).
    new_centroids = np.array([
        X[members].mean(axis=0) if len(members) else centroids[cluster]
        for cluster, members in enumerate(assignment)
    ])

    if np.array_equal(new_centroids, centroids):
        print(f'Early stop at iteration {i}')
        break

    centroids = new_centroids


# Final assignment with the converged (or last computed) centroids.
y_hat = predict(X, centroids)
eval(y_hat)

Early stop at iteration 14
accuracy: 0.8866666666666667
