In [None]:
import numpy as np
import scipy.linalg
import matplotlib.pyplot as plt
from ipywidgets import interact, interactive, fixed, interact_manual, IntSlider

In [None]:
def assign_clusters(data, means):
  """Assign every data point to the index of its nearest mean.

  Args:
    data: array of shape (n, d), one point per row.
    means: array of shape (k, d), one cluster centre per row.

  Returns:
    Array of length n (float dtype, as produced by np.zeros) whose i-th entry
    is the row index of the mean closest to data[i] in Euclidean distance.
    Ties are resolved in favour of the lowest index (np.argmin semantics).

  Raises:
    AssertionError: if data and means do not share the same dimensionality d.
  """
  n, d = data.shape
  assert d == means.shape[1]
  out = np.zeros(n)
  for i, x in enumerate(data):
    # Broadcasting subtracts x from every row of means; axis=1 then yields one
    # Euclidean distance per candidate mean.
    distances = np.linalg.norm(means - x, axis=1)
    out[i] = np.argmin(distances)
  return out

def update_means(data, clusters):
  """Compute the mean of the points assigned to each cluster label.

  Args:
    data: array of shape (n, d), one point per row.
    clusters: length-n sequence of cluster labels, one per row of data.

  Returns:
    Array with one row per distinct label present in `clusters`, ordered by
    ascending label; each row is the mean of the points carrying that label.
    When the labels are exactly 0..k-1 the row index equals the label.

  Raises:
    AssertionError: if `clusters` does not have one entry per data point.
  """
  n, d = data.shape
  assert len(clusters) == n
  # Accept plain lists as well as arrays so the boolean mask below is elementwise.
  clusters = np.asarray(clusters)
  cluster_means = []
  # Iterate over the labels that actually occur rather than range(len(set(...))):
  # if some label has no points, counting distinct labels would both average an
  # empty selection (NaN) and skip the highest label.
  for label in np.unique(clusters):
    cluster_mean = data[clusters == label].mean(axis=0)
    cluster_means.append(cluster_mean)
  return np.array(cluster_means)

def cost(data, clusters, means):
  """Return the k-means objective: within-cluster sum of squared distances.

  Args:
    data: array of shape (n, d), one point per row.
    clusters: length-n sequence of labels; label i refers to means[i].
    means: array of shape (k, d), one cluster centre per row.

  Returns:
    Sum over all points of the squared Euclidean distance to the mean of the
    cluster the point is assigned to (0 when there are no means).

  Raises:
    AssertionError: if the dimensionality of means differs from data, or if
      `clusters` does not have one entry per data point.
  """
  out = 0
  n, d = data.shape
  k = means.shape[0]
  assert means.shape[1] == d
  assert len(clusters) == n
  # Accept plain lists as well as arrays so the boolean mask below is elementwise.
  clusters = np.asarray(clusters)
  for i in range(k):
    residuals = data[clusters == i] - means[i]
    # Sum the squared residuals directly. np.linalg.norm on this 2-D array
    # would return the Frobenius norm, i.e. the square root of this per-cluster
    # sum, and adding square roots is not the quantity k-means minimises.
    out += np.sum(residuals ** 2)
  return out

def k_means_cluster(data, k):
  """Run k-means on data, yielding the state after every iteration.

  The initial means are k distinct rows of data chosen at random
  (np.random.choice without replacement, so k must not exceed the number of
  rows).

  Args:
    data: array of shape (n, d), one point per row.
    k: number of clusters to start with.

  Yields:
    (means, assignments) tuples: first the random initial state, then one
    tuple per update step. When an update leaves the assignments unchanged,
    the refreshed means are yielded once more with those assignments, the
    final cost is printed, and the generator stops.
  """
  n, _ = data.shape
  means = data[np.random.choice(n, k, replace=False)]
  assignments = assign_clusters(data, means)
  while True:
    yield means, assignments
    means = update_means(data, assignments)
    new_assignments = assign_clusters(data, means)
    if np.all(assignments == new_assignments):
      # Converged: emit the final means together with the stable assignments.
      yield means, assignments
      print("Final cost = {}".format(cost(data, assignments, means)))
      break
    assignments = new_assignments

In [None]:
def final_k_means_cluster(data, k):
  """Run k_means_cluster to completion and return only its last state.

  Args:
    data: array of points passed through to k_means_cluster.
    k: number of clusters passed through to k_means_cluster.

  Returns:
    The last (means, assignments) tuple yielded by k_means_cluster.
  """
  # Drain the generator fully; the final element is the converged clustering.
  states = [state for state in k_means_cluster(data, k)]
  return states[len(states) - 1]

def plot_clustering(data, means, assignments, title="Predicted Clusters"):
  """Scatter-plot the points of each cluster and mark the cluster means.

  Args:
    data: array of points, one per row.
    means: array of cluster means, one per row; its length sets how many
      labels (0..len(means)-1) are plotted.
    assignments: array of per-point labels compared against each label.
    title: text shown above the figure.
  """
  n_clusters = len(means)
  for label in range(n_clusters):
    # One scatter call per cluster; the columns of the selected rows are
    # unpacked as the positional coordinate arguments.
    members = data[assignments == label]
    plt.scatter(*members.T)
  # Means are drawn last as large black crosses.
  centre_coords = means.T
  plt.scatter(*centre_coords, marker="x", s=240, c="black")
  plt.title(title)
  plt.show()

def interact_clustering(data, logger):
  """Show an integer slider that steps through recorded clustering states.

  Args:
    data: array of points handed to plot_clustering for every state.
    logger: iterable of (means, assignments) tuples, e.g. the generator
      returned by k_means_cluster; it is consumed completely up front.

  Raises:
    IndexError: if logger yields no states at all.
  """
  history = list(logger)
  if not history:
    # Fail early with a clear message instead of building a slider whose
    # maximum is below its minimum.
    raise IndexError("logger produced no clustering states to display")

  def plotter(i):
    # history[i] is a (means, assignments) pair, unpacked into plot_clustering.
    plot_clustering(data, *history[i])

  interact(plotter, i=IntSlider(min=0, max=len(history)-1, continuous_update=False))

def demo(classes, history=False):
  """Plot ground-truth classes, then cluster their union with k-means.

  Args:
    classes: sequence of point arrays, one per ground-truth class; the number
      of classes is used as k.
    history: when true, show the slider over every k-means iteration;
      otherwise plot only the final clustering.
  """
  # Ground truth: one scatter call per class.
  for blob in classes:
    coords = blob.T
    plt.scatter(*coords)
  plt.title("Ground Truth Clusters")
  plt.show()

  points = np.vstack(classes)
  n_clusters = len(classes)

  if not history:
    means, assignments = final_k_means_cluster(points, n_clusters)
    plot_clustering(points, means, assignments)
    return

  interact_clustering(points, k_means_cluster(points, n_clusters))


In [None]:
def gen_gaussian_points(n, mean, sigma):
  """Draw n two-dimensional points from a normal distribution.

  Args:
    n: number of points (rows) to draw.
    mean: location passed to np.random.normal.
    sigma: scale (standard deviation) passed to np.random.normal.

  Returns:
    Array of shape (n, 2) of samples from the global NumPy random state.
  """
  shape = (n, 2)
  return np.random.normal(loc=mean, scale=sigma, size=shape)

# Number of points drawn for each full-size synthetic class.
N = 100

# Two classes with standard deviation 1 on both axes, centred at (-1, 0) and (1, 0).
class_a = gen_gaussian_points(N, [-1, 0], [1, 1])
class_b = gen_gaussian_points(N, [1, 0], [1, 1])

# Stack both classes row-wise into the single array of points to be clustered.
points = np.vstack([class_a, class_b])

# One scatter call per class so the two ground-truth groups are drawn separately.
plt.scatter(*class_a.T)
plt.scatter(*class_b.T)


In [None]:
# Step through every k-means iteration on `points` with k=3 using the slider.
interact_clustering(points, k_means_cluster(points, 3))

In [None]:
# Three classes of N points each, plus a fourth class of only 10 points
# centred far from the others at (0, 15).
class_a = gen_gaussian_points(N, [-3, 0], [1, 1])
class_b = gen_gaussian_points(N, [3, 0], [1, 1])
class_c = gen_gaussian_points(N, [0, 3], [1, 1])
class_d = gen_gaussian_points(10, [0, 15], [1, 1])

# history=False: plot the ground truth and only the final clustering (k = 4 classes).
demo([class_a, class_b, class_c, class_d], history=False)

In [None]:
# Three classes with standard deviation 1 whose centres are 2 apart along x.
class_a = gen_gaussian_points(N, [-2, 0], [1, 1])
class_b = gen_gaussian_points(N, [0, 0], [1, 1])
class_c = gen_gaussian_points(N, [2, 0], [1, 1])

points = np.vstack([class_a, class_b, class_c])

# Ground-truth means and labels, built in the same class order used for `points`
# so that label j refers to rows j*N .. (j+1)*N - 1.
gt_means = [np.mean(cluster, axis = 0) for cluster in [class_a, class_b, class_c]]
gt_means = np.stack(gt_means)
gt_assignments = np.array([0] * N + [1] * N + [2] * N)
plot_clustering(points, gt_means, gt_assignments, title="Ground Truth Clusters")

# Cluster the same points with k=3 and plot the result for comparison.
means, assignments = final_k_means_cluster(points, 3)
plot_clustering(points, means, assignments, "Predicted Clusters")

# Print both sets of means for a numeric comparison.
print("Ground Truth Means", gt_means)
print("Predicted Means", means)

In [None]:
# NOTE(review): RESCALE_DATA is not read anywhere in the visible cells —
# confirm whether a later cell uses it or whether rescaling was left unimplemented.
RESCALE_DATA = False

# Two classes centred at x = -3 and x = 3, with standard deviation 1 along x
# but 10 along y.
class_a = gen_gaussian_points(N, [-3, 0], [1, 10])
class_b = gen_gaussian_points(N, [3, 0], [1, 10])
demo([class_a, class_b])