## Algorithm for KMeans

1. Let $X = \{x_1, x_2, \ldots, x_n\}$ be the set of data points and $V = \{v_1, v_2, \ldots, v_c\}$ be the set of cluster centers, initially chosen at random from the data points.
2. Calculate the distance between each data point and each cluster center.
3. Assign each data point to the cluster center that is closest to it, i.e. the one at minimum distance among all cluster centers.
4. Recalculate the new cluster center using:
$$ v_i = \frac{1}{c_i} \sum_{j=1}^{c_i} x_j $$ where $c_i$ is the number of data points in the $i$-th cluster and $x_j$ ranges over the data points assigned to that cluster.
5. Recalculate the distance between each data point and new obtained cluster centers.
6. If no data point was reassigned then stop, otherwise repeat from step 3.

In [9]:
import numpy as np
import scipy as sp

In [2]:
import sys
# Extend the module search path with a local checkout; presumably this is
# where the `ml` package imported below lives. The absolute path is
# machine-specific -- TODO confirm / make configurable.
sys.path.append("/Users/lality/projects/personal/ML_Algo/")

In [4]:
from ml.tools import *

In [11]:
class KMeans:
    """K-means clustering (Lloyd's algorithm) with random initialisation.

    Args:
        k: Number of clusters to form.
        max_iterations: Upper bound on the number of centroid updates
            performed by ``predict``.
    """

    def __init__(self, k=2, max_iterations=500):
        self.k = k
        self.max_iterations = max_iterations

    @staticmethod
    def initalize_kplus(X, k):
        """Pick ``k`` initial centroids from ``X`` using k-means++ seeding.

        The first centroid is drawn uniformly at random; every further one
        is drawn with probability proportional to the squared distance to
        the nearest centroid chosen so far.

        Args:
            X: Array-like of shape (n_samples, n_features).
            k: Number of centroids to pick (at least 1).

        Returns:
            Float array of shape (k, n_features) whose rows are samples
            of ``X``.

        Raises:
            ValueError: If ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError("k must be at least 1")
        X = np.asarray(X, dtype=float)
        n_samples, n_features = np.shape(X)
        centroids = np.zeros((k, n_features))
        centroids[0] = X[np.random.randint(n_samples)]
        # Squared distance from each sample to its nearest chosen centroid;
        # refreshed incrementally with the most recently added centroid.
        nearest_sq = np.full(n_samples, np.inf)
        for i in range(1, k):
            deltas = X - centroids[i - 1]
            nearest_sq = np.minimum(nearest_sq, (deltas ** 2).sum(axis=1))
            total = nearest_sq.sum()
            if total > 0:
                next_i = np.random.choice(n_samples, p=nearest_sq / total)
            else:
                # Every sample coincides with a chosen centroid, so the
                # weights are all zero: fall back to a uniform draw.
                next_i = np.random.randint(n_samples)
            centroids[i] = X[next_i]
        return centroids

    def _init_random_centroids(self, X):
        """Return ``self.k`` samples of ``X`` chosen at random as centroids.

        Samples are drawn without replacement so that two centroids do not
        start on the same row; replacement is only used when ``self.k``
        exceeds the number of samples.

        Returns:
            Float array of shape (k, n_features).
        """
        X = np.asarray(X)
        n_samples, n_features = np.shape(X)
        chosen = np.random.choice(
            n_samples, size=self.k, replace=self.k > n_samples
        )
        return X[chosen].astype(float)

    def _closest_centroid(self, sample, centroids):
        """Return the index of the centroid nearest to ``sample``.

        Ties are resolved in favour of the lowest index.
        """
        deltas = np.asarray(centroids) - np.asarray(sample)
        # Squared Euclidean distance gives the same nearest-centroid
        # ordering as the Euclidean distance and avoids the square root.
        squared = (deltas ** 2).sum(axis=1)
        # A NaN distance must never win the comparison.
        squared = np.where(np.isnan(squared), np.inf, squared)
        return int(np.argmin(squared))

    def _create_cluster(self, centroids, X):
        """Group sample indices of ``X`` by their nearest centroid.

        Returns:
            A list of ``self.k`` lists; entry ``i`` holds the indices of
            the samples assigned to centroid ``i``.
        """
        clusters = [[] for _ in range(self.k)]
        for sample_i, sample in enumerate(X):
            centroid_i = self._closest_centroid(sample, centroids)
            clusters[centroid_i].append(sample_i)
        return clusters

    def _calculate_centroids(self, clusters, X, prev_centroids=None):
        """Compute each centroid as the mean of the samples in its cluster.

        Args:
            clusters: List of ``self.k`` lists of sample indices.
            X: Array-like of shape (n_samples, n_features).
            prev_centroids: Optional centroids from the previous round. An
                empty cluster keeps its previous centroid when this is
                given; otherwise it is re-seeded with a random sample.

        Returns:
            Float array of shape (k, n_features).
        """
        X = np.asarray(X)
        n_samples, n_features = np.shape(X)
        centroids = np.zeros((self.k, n_features))
        for i, members in enumerate(clusters):
            if len(members) > 0:
                centroids[i] = np.mean(X[members], axis=0)
            elif prev_centroids is not None:
                # The mean of an empty cluster is NaN, which would poison
                # every later distance computation; keep the old position.
                centroids[i] = prev_centroids[i]
            else:
                centroids[i] = X[np.random.randint(n_samples)]
        return centroids

    def _get_cluster_labels(self, clusters, X):
        """Return, for every sample of ``X``, the index of its cluster.

        Returns:
            Float array of shape (n_samples,).
        """
        n_samples, n_features = np.shape(X)
        y_pred = np.zeros(n_samples)
        for cluster_i, cluster in enumerate(clusters):
            for sample_i in cluster:
                y_pred[sample_i] = cluster_i
        return y_pred

    def predict(self, X):
        """Cluster ``X`` and return the cluster index of every sample.

        Alternates between assigning samples to their nearest centroid and
        moving each centroid to the mean of its samples, until the
        centroids stop changing or ``max_iterations`` updates were made.

        Args:
            X: Array-like of shape (n_samples, n_features).

        Returns:
            Float array of shape (n_samples,) holding cluster indices.
        """
        X = np.asarray(X)
        centroids = self._init_random_centroids(X)
        # The assignment is computed before the loop and refreshed after
        # every centroid move, so it always matches the latest centroids
        # and exists even when max_iterations is 0.
        clusters = self._create_cluster(centroids, X)
        for _ in range(self.max_iterations):
            new_centroids = self._calculate_centroids(clusters, X, centroids)
            converged = np.array_equal(new_centroids, centroids)
            centroids = new_centroids
            if converged:
                break
            clusters = self._create_cluster(centroids, X)
        return self._get_cluster_labels(clusters, X)