# K Means

## Jaeho Kim (kjh3690@unist.ac.kr)

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import sys

In [5]:
df= pd.read_csv('./faithful.csv',index_col=0)

In [6]:
df.head()

Unnamed: 0,eruptions,waiting
1,3.6,79
2,1.8,54
3,3.333,74
4,2.283,62
5,4.533,85


In [27]:
class Kmeans:
    def __init__(self,K):
        """
        k = Number of clusters
        n = Number of data points
        d = Number of features
        mu = k x d array of Mu
        """
        self.k= K
        self.n= 0
        self.d= 0
        self.mu= np.array([K,self.d])

        
    def load_data(self,csv_path):
        df= pd.read_csv(csv_path,index_col=0)
        self.n,self.d= df.shape
        return np.array(df)
    
    def calculate_euclidean(self,a,b):
        return np.linalg.norm(a-b,axis=1)
    
    def distortion_measure(self,np_cluster,data,iteration):
        """Cost function of K means
        """
        cost_j=0
        for i in range(self.k):
            cluster_i = np.where(np_cluster == i)
            cost_j+=np.sum(self.calculate_euclidean(data[cluster_i],self.mu[i]))
        print("COST at iteration {} is {} ".format(iteration,cost_j))
        
        return cost_j

    def initialize_mu(self,data):
        """ 
        """
        mu= np.empty([self.k, self.d])
 
        for i in range(self.k):
            random_row= np.random.randint(self.n)
            mu[i]= data[random_row][:]
        self.mu= mu
            
    def update_cluster(self, data):
        np_cluster= np.empty(self.n)
        for i in range(self.n):
            np_cluster[i]= int(np.argmin(self.calculate_euclidean(self.mu,data[i])))
        return np_cluster
            
    def update_mu(self,np_cluster,data):
        for i in range(self.k):
            cluster_i = np.where(np_cluster == i)
            self.mu[i]= np.sum(data[cluster_i],axis=0)/len(data[cluster_i])
    
    def classify(self,array_x):
        assert array_x.shape[1] == self.d
        predict_class=[]
        for i in range(array_x.shape[0]):
            predict_class.append(np.argmin(self.calculate_euclidean(self.mu, array_x[i])))
        
        print(predict_class)
        return predict_class
        

    def train(self, train_path):
        
        data= self.load_data(train_path)
        self.initialize_mu(data)
        
        cost_decreased= True
        cost_list= []
        i=0
        
        while cost_decreased ==True:
            
            cluster = self.update_cluster(data)
            mu = self.update_mu(cluster,data)
            cost_list.append(self.distortion_measure(cluster,data,i))
            
            if i > 2:
                if cost_list[i]-cost_list[i-1] <0.5:
                    cost_decreased= False
                    print("No more decrease in cost")
            i+=1
        print(" ")
        print("="*40)
        print("FINAL Mu is ")
        print(self.mu)
        print("="*40)
        print("")
        print(cluster)
        
        
    

In [28]:
model= Kmeans(3)

In [29]:
model.train('./faithful.csv')

COST at iteration 0 is 996.270616412667 
COST at iteration 1 is 959.963736977936 
COST at iteration 2 is 961.5598665568355 
COST at iteration 3 is 961.5598665568355 
No more decrease in cost
 
FINAL Mu is 
[[ 2.05673404 54.05319149]
 [ 4.37731522 84.48913043]
 [ 4.10036047 74.76744186]]

[2. 0. 2. 0. 1. 0. 1. 1. 0. 1. 0. 1. 2. 0. 1. 0. 0. 1. 0. 2. 0. 0. 2. 2.
 2. 1. 0. 2. 2. 2. 2. 2. 2. 1. 2. 0. 0. 1. 0. 1. 1. 0. 1. 0. 2. 1. 0. 0.
 1. 0. 2. 1. 0. 1. 0. 1. 2. 0. 2. 1. 0. 1. 0. 1. 0. 1. 2. 2. 2. 2. 1. 0.
 2. 2. 0. 2. 0. 2. 2. 1. 2. 1. 2. 2. 2. 1. 2. 1. 0. 1. 0. 1. 0. 2. 0. 2.
 1. 2. 0. 1. 0. 1. 0. 1. 1. 0. 1. 0. 1. 1. 2. 0. 1. 2. 0. 1. 0. 1. 0. 1.
 0. 2. 2. 0. 1. 1. 0. 1. 0. 1. 0. 1. 0. 1. 0. 1. 0. 1. 0. 2. 1. 0. 1. 2.
 2. 0. 1. 0. 1. 0. 2. 2. 2. 1. 2. 2. 1. 1. 0. 1. 0. 1. 0. 2. 2. 2. 0. 1.
 0. 1. 0. 0. 2. 2. 1. 1. 2. 0. 1. 2. 0. 2. 1. 1. 0. 2. 1. 0. 1. 0. 1. 0.
 2. 1. 2. 1. 1. 2. 0. 2. 0. 1. 1. 0. 2. 0. 2. 1. 0. 1. 2. 1. 0. 2. 0. 2.
 0. 1. 0. 2. 0. 1. 0. 2. 2. 2. 2. 2. 2. 2. 2. 0. 1. 0.

In [30]:
model.classify(np.array([[3.6,70]]))

[2]


[2]