In [1]:
import numpy as np
import matplotlib.pyplot as plt

import pandas as pd
from sklearn.datasets import load_iris

from dataclasses import dataclass

## Exercise 1: Implement K-means using numpy

In [31]:
@dataclass
class KMeans:
    """K-means clustering (Lloyd's algorithm) implemented with NumPy.

    Parameters
    ----------
    k : int
        Number of clusters.
    iterations : int, default 100
        Maximum number of assignment/update rounds.
    tol : float, default 1e-4
        Stop as soon as the centroids move by at most this much (Frobenius
        norm of the centroid displacement) between two consecutive rounds.

    Attributes (set by ``fit``)
    ---------------------------
    centroids : np.ndarray, shape (k, n_features)
        Final cluster centres.
    closet : np.ndarray of int, shape (n_samples,)
        Index of the closest centroid for every sample (historic attribute
        name, kept so existing callers keep working).
    inertia : np.ndarray, shape (k,)
        Sum of squared distances of the members of each cluster to its centroid.
    inertia_ : float
        Total within-cluster sum of squared distances (``inertia.sum()``).
    """
    k: int
    iterations: int = 100
    tol: float = 1e-4

    def fit(self, x: np.ndarray) -> None:
        """Cluster the rows of ``x`` into ``k`` groups.

        Parameters
        ----------
        x : array-like, shape (n_samples, n_features)
            Data to cluster. It is converted to a float array so that
            centroid means are not truncated for integer input.

        Raises
        ------
        ValueError
            If ``x`` is not two-dimensional or ``k`` is not in
            ``1..n_samples``.
        """
        x = np.asarray(x, dtype=float)
        if x.ndim != 2:
            raise ValueError("x must be a 2-D array of shape (n_samples, n_features)")
        num_samples, _ = x.shape
        if not 1 <= self.k <= num_samples:
            raise ValueError("k must be between 1 and the number of samples")

        # Initial centroids: k distinct rows of the data (fancy indexing copies,
        # so updating the centroids never writes back into x).
        seeds = np.random.choice(num_samples, size=self.k, replace=False)
        self.centroids = x[seeds]
        self.closet = np.zeros(num_samples, dtype=int)

        for _ in range(self.iterations):
            # Distance from every sample to every centroid -> (n_samples, k).
            distances = np.linalg.norm(x[:, None, :] - self.centroids[None, :, :], axis=2)
            self.closet = np.argmin(distances, axis=1)

            previous = self.centroids.copy()
            for idx in range(self.k):
                members = x[self.closet == idx]
                # An empty cluster (possible with duplicate rows) keeps its old
                # centroid; taking the mean of nothing would turn it into NaN.
                if len(members):
                    self.centroids[idx] = members.mean(axis=0)

            # Converged once the centroids have (almost) stopped moving.
            # Unchanged assignments give a shift of exactly 0, so this also
            # covers the "labels did not change" case.
            if np.linalg.norm(self.centroids - previous) <= self.tol:
                break

        # Within-cluster sum of squared distances, per cluster and in total.
        squared = np.sum((x - self.centroids[self.closet]) ** 2, axis=1)
        self.inertia = np.bincount(self.closet, weights=squared, minlength=self.k)
        self.inertia_ = float(self.inertia.sum())

In [None]:
# Smoke test: cluster uniformly random 2-D points and draw the raw scatter
# next to the same points coloured by their assigned cluster.

np.random.seed(8)
plt.style.use('bmh')
features = np.random.rand(1000, 2)

kmeans = KMeans(k=8, iterations=16, tol=1e-4)
kmeans.fit(features)

fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(14,7))
xs, ys = features[:, 0], features[:, 1]

# (panel title, per-point colour values); None keeps matplotlib's default colour.
panels = (
    ('Before clustering', None),
    ('After clustering', kmeans.closet),
)
for ax, (title, colours) in zip(axs, panels):
    ax.scatter(xs, ys, c=colours)
    ax.set_title(title, fontsize=20)

plt.show()

## Exercise 2: Apply k-means to at least 3 toy datasets. Try different numbers of clusters

### Iris dataset

In [15]:
# Load scikit-learn's bundled Iris dataset; the cells below cluster
# `iris.data` and inspect `iris.target`.
iris = load_iris()

 #### k=2

In [32]:
# Split the iris measurements into two clusters, then show the learned
# centroids followed by the cluster index assigned to each sample.
kmeans = KMeans(k=2, iterations=16, tol=1e-4)
kmeans.fit(iris.data)
print(kmeans.centroids, kmeans.closet, sep='\n')

[[5.00566038 3.36981132 1.56037736 0.29056604]
 [6.30103093 2.88659794 4.95876289 1.69587629]]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1]


#### k=3

In [5]:
# Split the iris measurements into three clusters, then show the learned
# centroids followed by the cluster index assigned to each sample.
kmeans = KMeans(k=3, iterations=16, tol=1e-4)
kmeans.fit(iris.data)
print(kmeans.centroids, kmeans.closet, sep='\n')

[[5.88360656 2.74098361 4.38852459 1.43442623]
 [6.85384615 3.07692308 5.71538462 2.05384615]
 [5.006      3.428      1.462      0.246     ]]
[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1 1 0 1 1 1 1
 1 1 0 0 1 1 1 1 0 1 0 1 0 1 1 0 0 1 1 1 1 1 0 1 1 1 1 0 1 1 1 0 1 1 1 0 1
 1 0]


In [10]:
# Display the dataset's own class labels for comparison with the k=3 cluster
# assignments printed above (cluster ids are arbitrary, so compare the
# grouping, not the numbers).
iris.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

#### k=4

In [11]:
# Split the iris measurements into four clusters, then show the learned
# centroids followed by the cluster index assigned to each sample.
kmeans = KMeans(k=4, iterations=16, tol=1e-4)
kmeans.fit(iris.data)
print(kmeans.centroids, kmeans.closet, sep='\n')

[[5.006      3.428      1.462      0.246     ]
 [5.58       2.63333333 3.98666667 1.23333333]
 [7.08695652 3.12608696 6.01304348 2.14347826]
 [6.29361702 2.9        4.95106383 1.72978723]]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 3 3 3 1 3 1 3 1 3 1 1 1 1 3 1 3 1 1 3 1 3 1 3 3
 3 3 3 3 3 1 1 1 1 3 1 3 3 1 1 1 1 3 1 1 1 1 1 1 1 1 2 3 2 3 2 2 1 2 2 2 3
 3 2 3 3 3 3 2 2 3 2 3 2 3 2 2 3 3 3 2 2 2 3 3 3 2 2 3 3 2 2 3 3 2 2 3 3 3
 3 3]


#### k=5

In [13]:
# Split the iris measurements into five clusters, then show the learned
# centroids followed by the cluster index assigned to each sample.
kmeans = KMeans(k=5, iterations=16, tol=1e-4)
kmeans.fit(iris.data)
print(kmeans.centroids, kmeans.closet, sep='\n')

[[7.12272727 3.11363636 6.03181818 2.13181818]
 [6.42380952 2.91904762 4.6047619  1.43809524]
 [6.19655172 2.88275862 5.18275862 1.93448276]
 [5.53214286 2.63571429 3.96071429 1.22857143]
 [5.006      3.428      1.462      0.246     ]]
[4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
 4 4 4 4 4 4 4 4 4 4 4 4 4 1 1 1 3 1 3 1 3 1 3 3 3 3 1 3 1 3 3 1 3 2 3 1 1
 1 1 1 1 1 3 3 3 3 2 3 1 1 1 3 3 3 1 3 3 3 3 3 1 3 3 0 2 0 2 0 0 3 0 0 0 2
 2 0 2 2 2 2 0 0 2 0 2 0 2 0 0 2 2 2 0 0 0 2 2 2 0 2 2 2 0 0 2 2 0 0 2 2 2
 2 2]


## Exercise 3

In [None]:
# Elbow method: fit k-means for k = 1..9 on the iris features and plot the
# within-cluster sum of squared distances (inertia) against k.
x = iris.data
sum_of_squared_distances = []
k = range(1, 10)
for num_clusters in k:
    kmeans = KMeans(k=num_clusters, iterations=16, tol=1e-4)
    kmeans.fit(x)
    # Inertia is computed here from the fitted centroids and assignments:
    # squared distance of every sample to the centroid it was assigned to.
    residuals = x - kmeans.centroids[kmeans.closet]
    sum_of_squared_distances.append(float(np.sum(residuals ** 2)))

plt.plot(k, sum_of_squared_distances, 'bx-')
plt.xlabel('Values of K')
plt.ylabel('Sum of squared distances/Inertia')
plt.title('Elbow Method For Optimal k')
plt.show()