# In [0]:
# %% [code]
import math
import random
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets


iris_dataset = datasets.load_iris()

# %% [code]
# Append the labels as an extra last column so that one in-place row shuffle
# keeps every feature row paired with its own label.
iris_data = iris_dataset["data"]
iris_target = iris_dataset["target"]
label_column = iris_target.reshape(-1, 1)
iris_dataset = np.concatenate((iris_data, label_column), axis=1)
np.random.shuffle(iris_dataset)

# %% [code]
# Split the shuffled table back apart: every column but the last holds the
# features, the last column holds the labels.
iris_data, iris_target = iris_dataset[:, :-1], iris_dataset[:, -1]

# %% [markdown]
# ### Non-Processed Data

# %% [code]
# Independent copy of the shuffled features with no preprocessing applied.
iris_data_non = iris_data.copy(order="C")

# %% [markdown]
# ### Standard Score

# %% [code]
iris_data_standard = iris_data.copy()

# %% [code]
# Standard score (z-score), one feature column at a time: subtract the
# column's mean and divide by its standard deviation.
for col in range(iris_data_standard.shape[1]):
    column = iris_data_standard[:, col]
    column_mean = column.mean()
    column_std = column.std()
    iris_data_standard[:, col] = (column - column_mean) / column_std

# %% [markdown]
# ### Scaling

# %% [code]
iris_data_scaling = iris_data.copy()

# %% [code]
# Min-max scaling, one feature column at a time: shift by the column minimum
# and divide by the column's range, mapping each feature onto [0, 1].
for col in range(iris_data_scaling.shape[1]):
    column = iris_data_scaling[:, col]
    lowest = column.min()
    highest = column.max()
    iris_data_scaling[:, col] = (column - lowest) / (highest - lowest)

# %% [markdown]
# ## K-Means Algorithm

# %% [code]
def kmeans(sample, K, maxiter):
    """Cluster the rows of ``sample`` into ``K`` groups with k-means.

    The initial centers are ``K`` distinct rows of ``sample`` drawn with
    ``random.sample``, so results are reproducible via ``random.seed``.
    Each iteration assigns every row to its nearest center (squared
    Euclidean distance) and then moves each center to the mean of the rows
    assigned to it. Iteration stops when the assignment no longer changes
    or after ``maxiter`` iterations.

    Parameters
    ----------
    sample : array-like, shape (num_sample, dim_sample)
        One observation per row. It is converted to a float array, so
        integer input is handled without truncating the centers, and the
        caller's data is never modified.
    K : int
        Number of clusters.
    maxiter : int
        Maximum number of assignment/update iterations.

    Returns
    -------
    center_pos : numpy.ndarray, shape (K, dim_sample), float
        Final cluster centers.
    label_sample : numpy.ndarray, shape (num_sample,), int
        Index of the cluster each row was assigned to. If ``maxiter <= 0``
        no assignment step runs and every label is ``-1``.

    Raises
    ------
    ValueError
        Raised by ``random.sample`` when ``K`` exceeds the number of rows
        (or is negative).
    """
    # Work in floating point: with an integer array the centers would
    # inherit the integer dtype and every updated mean would be truncated.
    sample = np.asarray(sample, dtype=float)
    num_sample = sample.shape[0]

    # -1 never equals a real cluster index, so the first convergence check
    # always fails and at least one center update is performed.
    label_sample = np.full(num_sample, -1)

    # Squared distance between each sample (row) and each center (column).
    dist = np.empty((num_sample, K))

    # Pick K distinct rows as the initial centers. Indexing with a list
    # yields a copy, so updating the centers leaves ``sample`` untouched.
    center_idx = random.sample(range(num_sample), K)
    center_pos = sample[center_idx, :]

    iteration = 0
    while iteration < maxiter:
        # Assignment step; broadcasting subtracts the center from every row.
        for i in range(K):
            dist[:, i] = np.sum((center_pos[i, :] - sample) ** 2, axis=1)

        new_label_sample = np.argmin(dist, axis=1)

        # Converged: no sample changed cluster since the previous iteration.
        if np.array_equal(new_label_sample, label_sample):
            break

        label_sample = new_label_sample

        # Update step; a cluster that lost all members keeps its old center.
        for i in range(K):
            members = sample[new_label_sample == i]
            if len(members) > 0:
                center_pos[i, :] = members.mean(axis=0)

        iteration += 1

    return center_pos, label_sample

# %% [markdown]
# ## Test

# %% [code]
# Every one-to-one mapping of the three cluster indices onto the three class
# labels, stored as [cluster, target] pairs. Cluster numbering is arbitrary,
# so the evaluation cells try each mapping. (The variable name keeps its
# original spelling because the evaluation cells refer to it.)
all_cominations = [
    [[cluster, target] for cluster, target in enumerate(targets)]
    for targets in (
        (0, 1, 2),
        (0, 2, 1),
        (1, 0, 2),
        (1, 2, 0),
        (2, 1, 0),
        (2, 0, 1),
    )
]

# %% [markdown]
# ### Non-Processed Data

# %% [code]
center_pos, label_sample = kmeans(iris_data_non, 3, 100)

# %% [code]
# Cluster indices carry no meaning of their own, so score every
# cluster -> class mapping and keep the smallest disagreement.
# NOTE: a misassigned sample disagrees in two of the three membership
# comparisons, so the printed value is twice the number of misassigned samples.
min_error = 1000

for combination in all_cominations:
    error = sum(
        ((label_sample == cluster) != (iris_target == target)).sum()
        for cluster, target in combination
    )
    min_error = min(min_error, error)

print("Error: ", min_error)

# %% [markdown]
# ### Standard Score

# %% [code]
center_pos, label_sample = kmeans(iris_data_standard, 3, 100)

# %% [code]
# Cluster indices carry no meaning of their own, so score every
# cluster -> class mapping and keep the smallest disagreement.
# NOTE: a misassigned sample disagrees in two of the three membership
# comparisons, so the printed value is twice the number of misassigned samples.
min_error = 1000

for combination in all_cominations:
    error = sum(
        ((label_sample == cluster) != (iris_target == target)).sum()
        for cluster, target in combination
    )
    min_error = min(min_error, error)

print("Error: ", min_error)

# %% [markdown]
# ### Scaling

# %% [code]
center_pos, label_sample = kmeans(iris_data_scaling, 3, 100)

# %% [code]
# Cluster indices carry no meaning of their own, so score every
# cluster -> class mapping and keep the smallest disagreement.
# NOTE: a misassigned sample disagrees in two of the three membership
# comparisons, so the printed value is twice the number of misassigned samples.
min_error = 1000

for combination in all_cominations:
    error = sum(
        ((label_sample == cluster) != (iris_target == target)).sum()
        for cluster, target in combination
    )
    min_error = min(min_error, error)

print("Error: ", min_error)