In [1]:
from sklearn import datasets
import math
import numpy as np

In [2]:
# Keep only the feature matrix (`.data`) returned by scikit-learn's load_iris();
# the rest of the returned object is discarded here.
data = datasets.load_iris().data

# K-Medoids Clustering Class

In [3]:
import numpy as np
import math
import random

class KMedoid:
    '''
    K-medoids style clustering based on the Manhattan (L1) distance.

    Only the medoid of one cluster (``randomize_cluster``) is ever swapped;
    the other medoids stay where initialisation put them. On every iteration
    the first not-yet-tried member of that cluster is proposed as its new
    medoid, and the swap is kept only if it strictly lowers the total distance
    of all points to their nearest medoid.

    Parameters
    ----------
    n_clusters : int
        Number of clusters; must be greater than 0.
    init : str
        'random' draws the initial medoids from the data, 'manual' uses
        ``init_val``.
    init_val : sequence
        Initial medoids for ``init='manual'``; its length must equal
        ``n_clusters``.
    max_iter : int
        Upper bound on the number of swap attempts.
    randomize_cluster : int
        Index (0 .. n_clusters-1) of the cluster whose medoid gets swapped.

    Raises
    ------
    Exception
        If any of the constraints above is violated.
    '''

    n_clusters = 2
    init = 'random'
    max_iter = 300
    init_val = []
    randomize_cluster = 0
    # Class-level default kept for backward compatibility only; every instance
    # gets its own list in __init__ and it is reset on each fit_predict call.
    choosen_cluster = []

    available_init = ['random', 'manual']

    def __init__(self, n_clusters=n_clusters, init=init, init_val=init_val, max_iter=max_iter, randomize_cluster=randomize_cluster):
        if n_clusters <= 0:
            raise Exception('n_clusters must be higher than 0')
        if init not in self.available_init:
            raise Exception('No init method \'' + str(init) + '\'. Available init methods'+ str(self.available_init))
        if (init == 'manual' and len(init_val) != n_clusters):
            raise Exception('init_val length doesn\'t match with n_clusters '+ str(n_clusters))
        if (n_clusters-1 < randomize_cluster) or (randomize_cluster < 0):
            raise Exception('randomize_cluster must be between 0 and n_clusters-1')
        self.n_clusters = n_clusters
        self.init = init
        self.max_iter = max_iter
        self.init_val = init_val
        self.randomize_cluster = randomize_cluster
        # Per-instance history of medoids already proposed, so that state is
        # never shared between different KMedoid objects.
        self.choosen_cluster = []

    def __is_in_array(self, data, arr):
        '''
        Return True if some row of ``arr`` matches ``data`` element by element
        over the first ``len(data)`` positions.
        '''
        width = len(data)
        return any(
            all(data[i] == row[i] for i in range(width))
            for row in arr
        )

    def __manhattan_distance(self, data1, data2):
        '''
        Compute the Manhattan distance between two vectors of equal length.

        Raises Exception if the lengths differ.
        '''
        if len(data1) != len(data2):
            raise Exception('Length doesn\'t match')
        total = 0
        for x1, x2 in zip(data1, data2):
            total += abs(x1 - x2)
        return total

    def __get_distance(self, data1, data2):
        '''Distance used by the algorithm (currently Manhattan).'''
        return self.__manhattan_distance(data1, data2)

    def __calculate_distance_matrix(self, data, centroids):
        '''
        Return a list of lists where entry [i][j] is the distance from
        centroid i to data row j.
        '''
        return [
            [self.__get_distance(centroid, row) for row in data]
            for centroid in centroids
        ]

    def __assign_data_to_cluster(self, dist_matrix):
        '''
        For every data column of ``dist_matrix`` return the index of the
        nearest centroid; ties go to the lowest centroid index.
        '''
        n_centroids = len(dist_matrix)
        n_points = len(dist_matrix[0])
        # min() keeps the first minimal element, which matches a strict '<' scan.
        return [
            min(range(n_centroids), key=lambda i, j=j: dist_matrix[i][j])
            for j in range(n_points)
        ]

    def __get_centroids(self, data, cluster_of_data, centroids):
        '''
        Propose a new set of centroids in which the medoid of
        ``randomize_cluster`` is replaced by the first member of that cluster
        that is neither the current medoid nor a previously proposed one.

        If no such member exists (empty cluster or all members already
        tried), an unchanged copy of ``centroids`` is returned, which makes
        the caller's error delta zero and ends the search.
        '''
        target = self.randomize_cluster
        current_medoid = centroids[target]

        candidate = None
        for point, label in zip(data, cluster_of_data):
            if label != target:
                continue
            if self.__get_distance(point, current_medoid) == 0:
                continue
            if self.__is_in_array(point, self.choosen_cluster):
                continue
            candidate = np.copy(point)
            break

        new_centroids = np.copy(centroids)
        if candidate is None:
            return new_centroids

        self.choosen_cluster.append(candidate)
        new_centroids[target] = candidate
        return new_centroids

    def __calculate_error(self, data, cluster_of_data, new_cluster_of_data, centroids, new_centroids):
        '''
        Return (new total distance) - (old total distance), where each total
        is the sum of distances from every point to its assigned centroid.
        A negative value means the proposed centroids are better.
        '''
        old_error = 0
        new_error = 0
        for idx, val in enumerate(data):
            old_error += self.__get_distance(val, centroids[cluster_of_data[idx]])
            new_error += self.__get_distance(val, new_centroids[new_cluster_of_data[idx]])
        return new_error - old_error

    def __init_random_centroids(self, data):
        '''
        Pick ``n_clusters`` initial medoids from ``data`` at random.

        Distinct rows are preferred; duplicates are only used when the data
        holds fewer distinct rows than ``n_clusters``.
        '''
        n_rows = len(data)

        # Uniqueness check: scan until n_clusters distinct rows are found.
        unique_idx = []
        unique_rows = []
        for i in range(n_rows):
            if len(unique_idx) >= self.n_clusters:
                break
            if not self.__is_in_array(data[i], unique_rows):
                unique_idx.append(i)
                unique_rows.append(data[i])

        centroids = []
        if len(unique_idx) < self.n_clusters:
            # Fewer distinct rows than clusters: take every distinct row, then
            # fill the remainder from the rows not taken yet.
            centroids = [np.copy(data[i]) for i in unique_idx]
            taken = set(unique_idx)
            pool = [i for i in range(n_rows) if i not in taken]
            if not pool:
                # Every row is already a medoid; reuse rows instead of looping forever.
                pool = list(range(n_rows))
            for _ in range(self.n_clusters - len(unique_idx)):
                centroids.append(np.copy(data[random.choice(pool)]))
        else:
            # Enough distinct rows exist, so rejection sampling terminates.
            # randrange gives a uniform index in [0, n_rows); randint(-1, ...)
            # would draw the last row twice as often.
            while len(centroids) < self.n_clusters:
                candidate = np.copy(data[random.randrange(n_rows)])
                if not self.__is_in_array(candidate, centroids):
                    centroids.append(candidate)
        return centroids

    def fit_predict(self, data):
        '''
        Cluster ``data`` and return the cluster index of every row.

        Parameters
        ----------
        data : sequence of equal-length numeric vectors

        Returns
        -------
        numpy.ndarray
            Cluster index for each row of ``data``.

        Raises
        ------
        ValueError
            If ``data`` is empty.
        '''
        if len(data) == 0:
            raise ValueError('data must not be empty')

        # Start every fit with a clean proposal history so results do not
        # depend on earlier calls.
        self.choosen_cluster = []

        # Initialise centroids.
        if self.init == 'random':
            centroids = self.__init_random_centroids(data)
        else:
            # self.init == 'manual'
            centroids = self.init_val

        cluster_of_data = []
        iteration = 1
        is_convergen = False

        while (not is_convergen and iteration <= self.max_iter):
            # Assign every point to its nearest current centroid.
            dist_matrix = self.__calculate_distance_matrix(data, centroids)
            cluster_of_data = self.__assign_data_to_cluster(dist_matrix)
            # Propose a swap and assign every point under the proposal.
            new_centroids = self.__get_centroids(data, cluster_of_data, centroids)
            new_dist_matrix = self.__calculate_distance_matrix(data, new_centroids)
            new_cluster_of_data = self.__assign_data_to_cluster(new_dist_matrix)
            # Converged as soon as the proposal does not strictly improve.
            delta = self.__calculate_error(data, cluster_of_data, new_cluster_of_data, centroids, new_centroids)
            if delta >= 0:
                is_convergen = True
            else:
                # Accept the proposal for the next iteration.
                cluster_of_data = np.copy(new_cluster_of_data)
                centroids = np.copy(new_centroids)
                iteration += 1

        return np.array(cluster_of_data)

## Experiment

In [4]:
# KMedoid's 'random' init draws from the stdlib `random` module; seed it so
# this cell gives the same labels on every Restart & Run All.
# The exact labels depend on the seed value.
random.seed(42)
kmedoid = KMedoid(n_clusters=4)
pred_a = kmedoid.fit_predict(data[:50])
pred_a

array([0, 2, 2, 2, 0, 1, 0, 0, 2, 2, 1, 3, 2, 2, 1, 1, 1, 0, 1, 0, 1, 0,
       0, 0, 3, 2, 0, 0, 0, 2, 2, 1, 1, 1, 2, 0, 0, 2, 2, 0, 0, 2, 2, 0,
       1, 2, 0, 2, 0, 0])