In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Cluster Algorithms

## k-means
1) Implement k-means with a maximum iteration setting and a convergence criterion.

In [None]:
def kmeans(x: np.ndarray, k: int,
           maxiter: int = 1000, rtol: float = 1e-3, atol: float = 1e-7):
    """Cluster the rows of ``x`` into ``k`` groups with Lloyd's k-means.

    The initial centroids are ``k`` distinct rows of ``x`` drawn with NumPy's
    global random generator, so call ``np.random.seed`` beforehand for
    reproducible results.

    Parameters
    ----------
    x : np.ndarray
        Samples, shape ``(n_samples, n_features)``. Any number of features
        is accepted.
    k : int
        Number of clusters, ``1 <= k <= n_samples``.
    maxiter : int
        Maximum number of update steps.
    rtol, atol : float
        Relative / absolute tolerances (as in ``np.allclose``) used to decide
        that the centroids stopped moving.

    Returns
    -------
    means : np.ndarray
        Centroids, shape ``(k, n_features)``.
    labels : np.ndarray
        For each sample, the index of its nearest centroid in ``means``;
        shape ``(n_samples,)``.

    Raises
    ------
    ValueError
        If ``x`` is not two-dimensional or ``k`` is out of range.
    """
    x = np.asarray(x)
    # Validate the argument itself, not a notebook-level global.
    if x.ndim != 2:
        raise ValueError(
            f"x must have shape (n_samples, n_features), got {x.shape}")
    if not 1 <= k <= len(x):
        raise ValueError(f"k must be between 1 and {len(x)}, got {k}")

    def assign(centers):
        # Pairwise distances with shape (k, n_samples); pick the closest
        # centroid for every sample.
        dist = np.linalg.norm(x[None, ...] - centers[:, None, :], axis=-1)
        return np.argmin(dist, axis=0)

    inds = np.random.choice(len(x), size=k, replace=False)
    means = x[inds]
    # Assign before the loop so labels exist even when maxiter == 0.
    labels = assign(means)
    for _ in range(maxiter):
        # Update: a cluster that lost all its members keeps its previous
        # centroid instead of turning into NaN (mean of an empty slice).
        new_means = np.asarray([
            x[labels == i].mean(axis=0) if np.any(labels == i) else means[i]
            for i in range(k)
        ])
        # Convergence: stop once no centroid coordinate moved beyond tolerance.
        if np.allclose(new_means, means, rtol=rtol, atol=atol):
            break
        means = new_means
        # Re-assign right away so the returned labels always correspond to
        # the returned means, also when maxiter runs out.
        labels = assign(means)
    return means, labels

In [None]:
# Load the sample points to cluster from p12.npy (path relative to the working directory).
data = np.load("p12.npy")
# The plotting cells below unpack every point into two coordinates, so fail
# early with a clear message if the file has any other layout.
assert data.ndim == 2 and data.shape[1] == 2, \
    f"expected an (n, 2) array, got shape {data.shape}"
data.shape

In [None]:
# Quick smoke test: the cell output is the (means, labels) tuple for two clusters.
kmeans(x=data, k=2)

2) Plot the clustering. Note that the results are not exactly reproducible unless the random seed is set: the initial centroids are chosen at random, and this k-means algorithm does not necessarily find the global optimum.

In [None]:
# Single-letter matplotlib colour codes (red, cyan, green, magenta), one per cluster index.
colors = list("rcgm")

In [None]:
# Seed NumPy's global RNG so kmeans draws the same initial centroids on every run.
np.random.seed(0)
k = 2
means, labels = kmeans(data, k)

fig, ax = plt.subplots()
for i, c in enumerate(colors[:k]):
    # One translucent scatter per cluster so overlapping points stay visible.
    ax.scatter(*data[labels == i].T, marker='o', c=c, alpha=0.25)
# Draw all centroids last so that no cluster plotted later can cover them.
ax.scatter(*means.T, s=100, marker='*', c='k')
ax.set(title=f"k-means clustering (k={k}, seed=0)",
       xlabel="data[:, 0]", ylabel="data[:, 1]")
# plt.show() works with the inline backend; fig.show() only warns there.
plt.show()

In [None]:
# Same clustering as the previous cell but with a different seed, to show that
# the result depends on the random initialisation.
np.random.seed(1)
k = 2
means, labels = kmeans(data, k)

fig, ax = plt.subplots()
for i, c in enumerate(colors[:k]):
    # One translucent scatter per cluster so overlapping points stay visible.
    ax.scatter(*data[labels == i].T, marker='o', c=c, alpha=0.25)
# Draw all centroids last so that no cluster plotted later can cover them.
ax.scatter(*means.T, s=100, marker='*', c='k')
ax.set(title=f"k-means clustering (k={k}, seed=1)",
       xlabel="data[:, 0]", ylabel="data[:, 1]")
# plt.show() works with the inline backend; fig.show() only warns there.
plt.show()

In [None]:
# No seed is set here, so the initial centroids depend on the RNG state left
# behind by whichever cell ran before this one.
k = 3
means, labels = kmeans(data, k)

fig, ax = plt.subplots()
# Reuse the shared colour list instead of repeating the literal.
for i, c in enumerate(colors[:k]):
    ax.scatter(*data[labels == i].T, marker='o', c=c, alpha=0.25)
# Draw all centroids last so that no cluster plotted later can cover them.
ax.scatter(*means.T, s=100, marker='*', c='k')
ax.set(title=f"k-means clustering (k={k})",
       xlabel="data[:, 0]", ylabel="data[:, 1]")
# plt.show() works with the inline backend; fig.show() only warns there.
plt.show()

In [None]:
# No seed is set here, so the initial centroids depend on the RNG state left
# behind by whichever cell ran before this one.
k = 4
means, labels = kmeans(data, k)

fig, ax = plt.subplots()
# Reuse the shared colour list instead of repeating the literal.
for i, c in enumerate(colors[:k]):
    ax.scatter(*data[labels == i].T, marker='o', c=c, alpha=0.25)
# Draw all centroids last so that no cluster plotted later can cover them.
ax.scatter(*means.T, s=100, marker='*', c='k')
ax.set(title=f"k-means clustering (k={k})",
       xlabel="data[:, 0]", ylabel="data[:, 1]")
# plt.show() works with the inline backend; fig.show() only warns there.
plt.show()