In [1]:
import numpy as np

In [2]:
class Kmeans:
  """K-means clustering with Euclidean distance.

  Centroids are seeded with the first ``n_clusters`` rows of the training
  data. Each iteration assigns every point to its nearest centroid and then
  moves each centroid to the mean of the points assigned to it.

  Parameters
  ----------
  n_clusters : int
    Number of clusters to form.
  max_iters : int
    Upper bound on the number of assign/update iterations.
  max_tolerance : float
    Convergence threshold in percent. Fitting stops as soon as no centroid
    moved by more than this amount (see ``_percent_shift``).

  Attributes set by ``fit``
  -------------------------
  centroids : dict mapping cluster index -> centroid coordinates (ndarray)
  classes   : dict mapping cluster index -> list of the training points
              assigned to that cluster in the last iteration
  n_iter_   : number of iterations actually performed
  """

  def __init__(self, n_clusters = 5, max_iters = 100, max_tolerance = 0.1):
    self.n_clusters = n_clusters
    self.max_iters = max_iters
    self.max_tolerance = max_tolerance

  def euclidean_distance(self, point1, point2):
    """Return the Euclidean norm of ``point1 - point2`` taken along axis 0."""
    return np.linalg.norm(point1 - point2, axis = 0)

  def _percent_shift(self, previous, current):
    """Return the summed absolute per-coordinate movement of a centroid, in percent.

    Each coordinate's movement is measured relative to its previous magnitude.
    Where the previous coordinate is exactly 0 the movement is measured
    relative to 1 instead, so the ratio never divides by zero.
    """
    delta = np.abs(current - previous)
    scale = np.abs(previous)
    scale = np.where(scale == 0, 1.0, scale)
    return np.sum(delta / scale * 100.0)

  def predict(self,data):
    """Return the index of the centroid nearest to ``data``.

    ``data`` is treated as ONE point: every entry of ``data - centroid``
    contributes to a single distance per centroid, so pass a point of shape
    ``(n_features,)`` or ``(1, n_features)``. Ties go to the lowest index.
    Requires ``fit`` to have been called (it reads ``self.centroids``).
    """
    data = np.asarray(data)
    distances = [np.linalg.norm(data - self.centroids[centroid]) for centroid in self.centroids]
    classification = distances.index(min(distances))
    return classification

  def fit(self, X):
    """Cluster the rows of ``X`` and store the result on the instance.

    Parameters
    ----------
    X : array-like, one point per row.

    Raises
    ------
    IndexError
      If ``X`` has fewer than ``n_clusters`` rows (seeding indexes ``X[i]``).
    """
    X = np.asarray(X, dtype = float)

    # Seed with the first n_clusters rows; copy so centroids never alias X.
    self.centroids = {}
    for i in range(self.n_clusters):
      self.centroids[i] = X[i].copy()

    self.n_iter_ = 0
    for _ in range(self.max_iters):
      self.n_iter_ += 1
      self.classes = {i: [] for i in range(self.n_clusters)}

      # Assignment step: compare each point with the centroid COORDINATES
      # (not the dictionary key); ties go to the lowest cluster index.
      for point in X:
        distances = [self.euclidean_distance(self.centroids[i], point) for i in self.centroids]
        cluster_point = distances.index(min(distances))
        self.classes[cluster_point].append(point)

      # Update step: a cluster that received no points keeps its previous
      # centroid, because averaging an empty list would yield NaN.
      previous = dict(self.centroids)
      for i in range(self.n_clusters):
        if self.classes[i]:
          self.centroids[i] = np.average(self.classes[i],axis = 0)

      # Stop iterating once every centroid is within tolerance.
      isOptimal = all(
        self._percent_shift(previous[i], self.centroids[i]) <= self.max_tolerance
        for i in self.centroids
      )
      if isOptimal:
        break


In [3]:
# Fixed seed so that Restart & Run All regenerates exactly the same points.
SEED = 42
rng = np.random.default_rng(SEED)

center_1 = np.array([1,1])
center_2 = np.array([5,5])
center_3 = np.array([8,1])

# Draw 100 standard-normal 2-D points per cluster and shift each batch onto its center
cluster_1 = rng.standard_normal((100, 2)) + center_1
cluster_2 = rng.standard_normal((100, 2)) + center_2
cluster_3 = rng.standard_normal((100, 2)) + center_3

# Stack the three clusters into one (300, 2) array, cluster_1 rows first
data = np.concatenate((cluster_1, cluster_2, cluster_3), axis = 0)

In [4]:
# Build a three-cluster model (one per generated blob) and train it on the
# stacked synthetic points.
km = Kmeans(
  n_clusters=3,
)
km.fit(X=data)

In [7]:
# Single query point as a (1, 2) row vector; the nested list already has that shape.
Y = np.array([[-8, 1]])

In [8]:
# Index of the learned centroid closest to the query point; left as a bare
# expression so the notebook displays it as the cell output.
km.predict(data=Y)

0