In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [None]:
# Load the dataset; the first two columns are used as the coordinates of each point.
data = pd.read_csv("cm_dataset.csv")
x = np.array(data.iloc[:, 0].values)
y = np.array(data.iloc[:, 1].values)
points = np.array([[x_val, y_val] for x_val, y_val in zip(x, y)])

num_points = len(x)
print(num_points)

# Experiment settings
dimensions = 2                        # number of coordinates per point
iterations = 20                       # number of Lloyd iterations run for each k
k_vals = np.array([2, 3, 4, 5])       # cluster counts to try
distant_point = np.array([1e9, 1e9])  # stand-in mean intended for clusters with no points

In [None]:
def calculate_distance(point1, point2):
    """Return the Euclidean distance between two points.

    Parameters
    ----------
    point1, point2 : array-like
        Coordinates of the two points. Any array-like (ndarray, list, tuple)
        is accepted; inputs are converted to float arrays and follow NumPy
        broadcasting rules.

    Returns
    -------
    numpy.float64
        Square root of the sum of squared coordinate differences.
    """
    # Converting to float arrays lets plain lists/tuples be passed and avoids
    # wrap-around when the inputs are unsigned integer arrays.
    diff = np.asarray(point1, dtype=float) - np.asarray(point2, dtype=float)
    return np.sqrt(np.sum(diff**2))

# Reproducible random initialisation: every point gets a random starting cluster.
# A seeded generator makes "Restart Kernel -> Run All" produce the same clustering
# every time, and the sample size follows the dataset instead of a hard-coded 1000.
SEED = 0
rng = np.random.default_rng(SEED)

# Not used by the code below; kept so that anything else reading `z` still finds it.
z = rng.integers(0, 4, size=num_points)

k_updated_means = []  # per k: the cluster means recorded at each iteration

# k_z[i, j] is the cluster index (0 .. k-1) of point j when clustering with k = k_vals[i].
k_z = np.array([rng.integers(0, k, size=num_points) for k in k_vals])

def _cluster_means(points, labels, k):
    """Return the (k, n_dims) array holding the mean of the points in each cluster.

    A cluster with no members has no defined mean; its row is set to
    `distant_point`, which is meant to lie far from the data so the empty
    cluster does not attract points in the next assignment step.
    """
    means = np.empty((k, points.shape[1]))
    for c in range(k):
        members = points[labels == c]
        # Write through the index: rebinding a loop variable would leave the array unchanged.
        means[c] = members.mean(axis=0) if len(members) else distant_point
    return means


def _split_by_cluster(points, labels, k):
    """Return one entry per cluster, each a list of per-dimension coordinate lists ([xs, ys])."""
    return [
        [points[labels == c, d].tolist() for d in range(points.shape[1])]
        for c in range(k)
    ]


# k_clusters[0][i] holds the initial clusters and k_clusters[1][i] the final
# clusters for k = k_vals[i].
k_clusters = [[], []]

for i, k in enumerate(k_vals):  # run k-means once for every value of k
    # View into k_z: writing to labels[:] updates the stored assignments in place.
    labels = k_z[i, :len(points)]

    # Snapshot of the random starting partition.
    k_clusters[0].append(_split_by_cluster(points, labels, k))

    # Means of the initial clusters.
    means = _cluster_means(points, labels, k)

    # Lloyd's algorithm: alternate between assigning every point to its nearest
    # mean and recomputing each mean from the points assigned to it.
    updated_means = []
    for _ in range(iterations):
        # Record the means used for this iteration's assignment step, so the
        # history has exactly `iterations` entries and its last entry is the
        # set of means that produced the final clusters.
        updated_means.append(means)

        # Squared distance from every point to every mean, shape (n_points, k).
        # Squared distances give the same nearest mean as true distances.
        sq_dists = ((points[:, None, :] - means[None, :, :]) ** 2).sum(axis=2)

        # Nearest mean per point; ties go to the lowest cluster index.
        labels[:] = sq_dists.argmin(axis=1)

        # Recompute the means from the actual membership after reassignment.
        means = _cluster_means(points, labels, k)

    k_updated_means.append(updated_means)

    # Snapshot of the partition after the last assignment step.
    k_clusters[1].append(_split_by_cluster(points, labels, k))

In [None]:
# Scatter plots of the clusters before (top row) and after (bottom row) k-means,
# with one column per value of k.
row_labels = ('Before K means, k = ', 'After K means, k = ')

# squeeze=False keeps `axes` two-dimensional even if k_vals has a single entry.
fig, axes = plt.subplots(2, len(k_vals), figsize=(16, 6), squeeze=False)

for row, label in enumerate(row_labels):
    for col, k in enumerate(k_vals):
        ax = axes[row][col]
        # Each cluster is stored as [x coordinates, y coordinates]; one scatter
        # call per cluster gives every cluster its own colour.
        for cluster_x, cluster_y in k_clusters[row][col]:
            ax.scatter(cluster_x, cluster_y, s=7)
        ax.set_xlabel(label + str(k))
        ax.tick_params(left=False, bottom=False, labelleft=False, labelbottom=False)

plt.show()

In [None]:
# Trajectory of the cluster means over the iterations, one panel per value of k.
# Red stars: every recorded position; blue star: the last recorded position.
fig, axes = plt.subplots(1, len(k_vals), figsize=(16, 4), squeeze=False)
axes = axes.ravel()  # squeeze=False returns a 2-D array even for a single row

for ax, k, history in zip(axes, k_vals, k_updated_means):
    # Stack the recorded means; indexed below as [iteration, cluster, coordinate].
    trajectory = np.asarray(history)
    for c in range(k):
        ax.plot(trajectory[:, c, 0], trajectory[:, c, 1], 'r*')
        ax.plot(trajectory[-1, c, 0], trajectory[-1, c, 1], 'b*')
    ax.set_xlabel('Position of Means with iteration, k = ' + str(k))

plt.show()