#### Importing modules

In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import cluster
from sklearn.metrics import accuracy_score
from itertools import permutations

#### KMeans class

In [2]:
class KMeans:
    """K-means style clustering with a selectable distance metric.

    Centroids are always updated as the arithmetic mean of their members; the
    chosen metric is only used to assign samples to the nearest centroid and
    to score a run (sum of every sample's distance to its nearest centroid).

    Initial centroids are drawn with ``np.random.choice``; call
    ``np.random.seed`` beforehand if reproducible results are required.

    Parameters
    ----------
    num_clusters : int
        Number of clusters to form.
    num_centroid_seeds : int
        Number of random initialisations; the run with the lowest inertia wins.
    metric : {'l1', 'euclid', 'chebyshev'}
        Distance used for assignment and inertia. Unknown names raise KeyError.
    max_iter : int
        Maximum number of centroid updates per initialisation.
    """

    # Constructor
    def __init__(self, num_clusters=8, num_centroid_seeds=10, metric='l1', max_iter=300):
        self.num_clusters = num_clusters  # Number of clusters
        self.num_centroid_seeds = num_centroid_seeds  # Number of random initialisations of the starting centres
        self.clusters = list(range(num_clusters))  # Cluster labels 0..num_clusters-1
        self.metric = {'l1': self.l1, 'euclid': self.euclid, 'chebyshev': self.chebyshev}[metric]  # Distance function
        self.max_iter = max_iter  # Maximum number of iterations
        self.cluster_centers = None  # Set by fit(): array of shape (num_clusters, n_features)
        self._fit_data = None  # Data of the last fit(); fallback for adjust_centroids(x_clusters)

    @staticmethod
    def _pairwise_differences(samples, points):
        """Return ``sample - point`` for every pair, shape (n_samples, n_points, n_features).

        Note: the intermediate array holds n_samples * n_points * n_features values.
        """
        samples = np.atleast_2d(np.asarray(samples))
        # reshape keeps an empty list of points valid: it yields an (n_samples, 0) distance matrix
        points = np.asarray(points).reshape(-1, samples.shape[-1])
        return samples[:, np.newaxis, :] - points[np.newaxis, :, :]

    # l1 metric
    def l1(self, samples, points):
        """Manhattan distance between every sample and every point, shape (n_samples, n_points)."""
        return np.abs(self._pairwise_differences(samples, points)).sum(axis=2)

    # l2 metric
    def euclid(self, samples, points):
        """Euclidean distance between every sample and every point, shape (n_samples, n_points)."""
        return np.sqrt((self._pairwise_differences(samples, points) ** 2).sum(axis=2))

    # l_infinity metric
    def chebyshev(self, samples, points):
        """Chebyshev (max-coordinate) distance between every sample and every point."""
        return np.abs(self._pairwise_differences(samples, points)).max(axis=2)

    # Fit method
    def fit(self, x):
        """Cluster ``x`` and store the best centroids in ``self.cluster_centers``.

        Parameters
        ----------
        x : array-like of shape (n_samples, n_features)

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``num_centroid_seeds`` is smaller than 1, or (from
            ``np.random.choice``) if there are fewer samples than clusters.
        """
        if self.num_centroid_seeds < 1:
            raise ValueError('num_centroid_seeds must be at least 1')
        x = np.asarray(x, dtype=float)
        self._fit_data = x

        best_cluster_centers = None
        best_inertia = np.inf
        for _ in range(self.num_centroid_seeds):
            # Start from distinct samples so every cluster initially owns at least its seed.
            seed_rows = np.random.choice(x.shape[0], self.num_clusters, replace=False)
            self.cluster_centers = x[seed_rows]
            x_clusters = np.argmin(self.metric(x, self.cluster_centers), axis=1)

            current_cluster_centers, current_inertia = self.adjust_centroids(x_clusters, x)
            # "is None" guards against a NaN inertia never comparing as smaller.
            if best_cluster_centers is None or current_inertia < best_inertia:
                best_inertia = current_inertia
                best_cluster_centers = current_cluster_centers

        self.cluster_centers = best_cluster_centers
        return self

    # Shift clusters center points
    def adjust_centroids(self, x_clusters, x=None):
        """Iterate centroid updates until they stop moving or ``max_iter`` is hit.

        Parameters
        ----------
        x_clusters : array-like of shape (n_samples,)
            Initial cluster label of every sample.
        x : array-like of shape (n_samples, n_features), optional
            Data being clustered. Defaults to the data given to the last ``fit``.

        Returns
        -------
        (cluster_centers, inertia) : (ndarray, float)
            Final centroids and the sum of distances to the nearest centroid.

        Raises
        ------
        ValueError
            If no data is available, or if there are neither iterations nor
            existing centroids to score.
        """
        if x is None:
            x = self._fit_data
        if x is None:
            raise ValueError('no data available: pass x or call fit() first')
        x = np.asarray(x, dtype=float)
        labels = np.asarray(x_clusters)

        centers = None
        if self.cluster_centers is not None:
            centers = np.array(self.cluster_centers, dtype=float)

        clusters_dist = None
        for _ in range(self.max_iter):
            updated = []
            for cluster in self.clusters:
                members = x[labels == cluster]
                if members.shape[0] > 0:
                    updated.append(members.mean(axis=0))
                elif centers is not None and cluster < centers.shape[0]:
                    # Empty cluster: keep its previous position so label indices stay stable.
                    updated.append(centers[cluster])
                else:
                    # No previous position known: re-seed from a random sample.
                    updated.append(x[np.random.randint(x.shape[0])])
            updated = np.array(updated)

            clusters_dist = self.metric(x, updated)
            labels = np.argmin(clusters_dist, axis=1)
            converged = centers is not None and np.array_equal(centers, updated)
            centers = updated
            if converged:
                break

        if clusters_dist is None:
            # max_iter == 0: score the centroids that are already in place.
            if centers is None:
                raise ValueError('no iterations were run and no centroids are set')
            clusters_dist = self.metric(x, centers)

        self.cluster_centers = centers
        inertia = float(np.sum(np.min(clusters_dist, axis=1)))
        return (self.cluster_centers, inertia)

    # Predict method
    def predict(self, x):
        """Return the index of the nearest centroid for every row of ``x``.

        Raises
        ------
        AttributeError
            If the model has not been fitted yet.
        """
        if self.cluster_centers is None:
            raise AttributeError('cluster_centers is not set; call fit() first')
        return np.argmin(self.metric(x, self.cluster_centers), axis=1)

In [3]:
# Evaluate clustering accuracy
def accuracy(y_pred, y_real):
    """Return the best achievable accuracy (in percent) of a clustering.

    Cluster ids are arbitrary, so every one-to-one assignment of predicted
    clusters to true labels is tried and the highest accuracy is returned.
    Labels may be any values ``np.unique`` can sort; they do not have to be
    contiguous or start at zero. When there are more clusters than true
    labels, the surplus clusters are counted as misclassified.

    The search is factorial in the number of clusters, so this is only
    suitable for a handful of clusters.

    Parameters
    ----------
    y_pred : array-like of shape (n_samples,)
        Predicted cluster id of every sample.
    y_real : array-like of shape (n_samples,)
        True label of every sample.

    Returns
    -------
    float
        Accuracy in the range [0, 100]; 0.0 for empty input.

    Raises
    ------
    ValueError
        If the two inputs differ in length.
    """
    y_pred = np.asarray(y_pred).ravel()
    y_real = np.asarray(y_real).ravel()
    if y_pred.shape[0] != y_real.shape[0]:
        raise ValueError('y_pred and y_real must have the same length')
    if y_pred.shape[0] == 0:
        return 0.0

    pred_labels, pred_idx = np.unique(y_pred, return_inverse=True)
    real_labels, real_idx = np.unique(y_real, return_inverse=True)
    n_pred = len(pred_labels)
    # Pad with all-zero columns so every cluster can receive a distinct target
    # even when there are fewer true labels than clusters.
    n_targets = max(n_pred, len(real_labels))

    # contingency[i, j] = number of samples in cluster i whose true label is j
    contingency = np.zeros((n_pred, n_targets), dtype=np.int64)
    np.add.at(contingency, (pred_idx.ravel(), real_idx.ravel()), 1)

    rows = np.arange(n_pred)
    best_hits = max(
        int(contingency[rows, list(assignment)].sum())
        for assignment in permutations(range(n_targets), n_pred)
    )
    return best_hits / y_pred.shape[0] * 100

In [4]:
# Iris: compare the hand-written KMeans with scikit-learn's two solvers.
iris = datasets.load_iris()
x, y = iris['data'], iris['target']

# Hand-written implementation: Euclidean metric, two random restarts.
my_kmeans = KMeans(
    num_clusters=3,
    num_centroid_seeds=2,
    metric='euclid',
    max_iter=300,
)
my_kmeans.fit(x)
y_pred = my_kmeans.predict(x)
print(f'iris:\nmanual: accuracy = {accuracy(y_pred, y)}%')

# scikit-learn with algorithm='lloyd'.
sklearn_kmeans_full = cluster.KMeans(
    n_clusters=3,
    n_init=10,
    max_iter=300,
    algorithm='lloyd',
)
y_pred = sklearn_kmeans_full.fit(x).predict(x)
print(f'\nsklearn, full: accuracy = {accuracy(y_pred, y)}%')

# scikit-learn with algorithm='elkan'.
sklearn_kmeans_elkan = cluster.KMeans(
    n_clusters=3,
    n_init=10,
    max_iter=300,
    algorithm='elkan',
)
y_pred = sklearn_kmeans_elkan.fit(x).predict(x)
print(f'\nsklearn, elkan: accuracy = {accuracy(y_pred, y)}%')

iris:
manual: accuracy = 89.33333333333333%

sklearn, full: accuracy = 89.33333333333333%

sklearn, elkan: accuracy = 89.33333333333333%


In [5]:
# Wine: compare the hand-written KMeans with scikit-learn's two solvers.
wine = datasets.load_wine()
x, y = wine['data'], wine['target']

# Hand-written implementation: l1 metric, a single random initialisation.
my_kmeans = KMeans(
    num_clusters=3,
    num_centroid_seeds=1,
    metric='l1',
    max_iter=300,
)
my_kmeans.fit(x)
y_pred = my_kmeans.predict(x)
print(f'wine:\nmanual: accuracy = {accuracy(y_pred, y)}%')

# scikit-learn with algorithm='lloyd'.
sklearn_kmeans_full = cluster.KMeans(
    n_clusters=3,
    n_init=10,
    max_iter=300,
    algorithm='lloyd',
)
y_pred = sklearn_kmeans_full.fit(x).predict(x)
print(f'\nsklearn, full: accuracy = {accuracy(y_pred, y)}%')

# scikit-learn with algorithm='elkan'.
sklearn_kmeans_elkan = cluster.KMeans(
    n_clusters=3,
    n_init=10,
    max_iter=300,
    algorithm='elkan',
)
y_pred = sklearn_kmeans_elkan.fit(x).predict(x)
print(f'\nsklearn, elkan: accuracy = {accuracy(y_pred, y)}%')

wine:
manual: accuracy = 70.78651685393258%

sklearn, full: accuracy = 70.2247191011236%

sklearn, elkan: accuracy = 70.2247191011236%


In [6]:
# Breast cancer: compare the hand-written KMeans with scikit-learn's two solvers.
cancer = datasets.load_breast_cancer()
x = cancer['data']
y = cancer['target']

# Hand-written implementation: Chebyshev metric, ten random restarts.
my_kmeans = KMeans(num_clusters=2, num_centroid_seeds=10, metric='chebyshev', max_iter=300)
my_kmeans.fit(x)
y_pred = my_kmeans.predict(x)
print(f'cancer:\nmanual: accuracy = {accuracy(y_pred, y)}%')

# scikit-learn with algorithm='lloyd'.
sklearn_kmeans_full = cluster.KMeans(n_clusters=2, n_init=10, max_iter=300, algorithm='lloyd')
sklearn_kmeans_full.fit(x)
y_pred = sklearn_kmeans_full.predict(x)
print(f'\nsklearn, full: accuracy = {accuracy(y_pred, y)}%')

# scikit-learn with algorithm='elkan'.
# Fit and predict with the elkan model itself; the lloyd model was previously
# reused here, so the reported "elkan" accuracy was really the lloyd result.
sklearn_kmeans_elkan = cluster.KMeans(n_clusters=2, n_init=10, max_iter=300, algorithm='elkan')
sklearn_kmeans_elkan.fit(x)
y_pred = sklearn_kmeans_elkan.predict(x)
print(f'\nsklearn, elkan: accuracy = {accuracy(y_pred, y)}%')

cancer:
manual: accuracy = 85.41300527240774%

sklearn, full: accuracy = 85.41300527240774%

sklearn, elkan: accuracy = 85.41300527240774%
