### In this lab, we will go through the K-means algorithm shown in the figure below using only the NumPy library. We will test the implementation on the Iris dataset.

![k-means algorithm](k-means.png)

### Loading the Iris dataset

In [1]:
import numpy as np

In [3]:
# Read the comma-separated Iris table into a 2-D float array,
# then show its dimensions followed by its contents
iris = np.loadtxt('iris_proc.data', delimiter=',')
print(np.shape(iris))
print(iris)

(150, 5)
[[5.1 3.5 1.4 0.2 0. ]
 [4.9 3.  1.4 0.2 0. ]
 [4.7 3.2 1.3 0.2 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]
 [5.4 3.9 1.7 0.4 0. ]
 [4.6 3.4 1.4 0.3 0. ]
 [5.  3.4 1.5 0.2 0. ]
 [4.4 2.9 1.4 0.2 0. ]
 [4.9 3.1 1.5 0.1 0. ]
 [5.4 3.7 1.5 0.2 0. ]
 [4.8 3.4 1.6 0.2 0. ]
 [4.8 3.  1.4 0.1 0. ]
 [4.3 3.  1.1 0.1 0. ]
 [5.8 4.  1.2 0.2 0. ]
 [5.7 4.4 1.5 0.4 0. ]
 [5.4 3.9 1.3 0.4 0. ]
 [5.1 3.5 1.4 0.3 0. ]
 [5.7 3.8 1.7 0.3 0. ]
 [5.1 3.8 1.5 0.3 0. ]
 [5.4 3.4 1.7 0.2 0. ]
 [5.1 3.7 1.5 0.4 0. ]
 [4.6 3.6 1.  0.2 0. ]
 [5.1 3.3 1.7 0.5 0. ]
 [4.8 3.4 1.9 0.2 0. ]
 [5.  3.  1.6 0.2 0. ]
 [5.  3.4 1.6 0.4 0. ]
 [5.2 3.5 1.5 0.2 0. ]
 [5.2 3.4 1.4 0.2 0. ]
 [4.7 3.2 1.6 0.2 0. ]
 [4.8 3.1 1.6 0.2 0. ]
 [5.4 3.4 1.5 0.4 0. ]
 [5.2 4.1 1.5 0.1 0. ]
 [5.5 4.2 1.4 0.2 0. ]
 [4.9 3.1 1.5 0.1 0. ]
 [5.  3.2 1.2 0.2 0. ]
 [5.5 3.5 1.3 0.2 0. ]
 [4.9 3.1 1.5 0.1 0. ]
 [4.4 3.  1.3 0.2 0. ]
 [5.1 3.4 1.5 0.2 0. ]
 [5.  3.5 1.3 0.3 0. ]
 [4.5 2.3 1.3 0.3 0. ]
 [4.4 3.2 1.3 0.2 0. ]
 [

In [6]:
# Shuffle the rows in place so the train/test split below does not simply
# follow the order of the rows in the file.
# The global NumPy generator is seeded first so that this shuffle, and the
# random centre initialisation further down (which also draws from np.random),
# give the same result on every Restart & Run All.
np.random.seed(0)
np.random.shuffle(iris)

In [7]:
# Separate the feature columns from the label column, then hold out the
# tail of the (shuffled) rows for testing
num_train = 120  # rows used for training; every remaining row is used for testing
train_rows = slice(None, num_train)
test_rows = slice(num_train, None)

X, y = iris[:, :4], iris[:, 4:]  # the 4: slice keeps y two-dimensional (one column)
Xtrain, ytrain = X[train_rows], y[train_rows]
Xtest, ytest = X[test_rows], y[test_rows]

### K-means algorithm 

In [8]:
# Step 1 : initialisation

# Xtrain is indexed as (sample, feature): rows are samples, columns are features
nData, nDim = Xtrain.shape
# the number of clusters is fixed at 3 because the number of classes is already known
k = 3
nData, nDim, k

(120, 4, 3)

In [9]:
# Pick k random starting positions in the input space.
# Uniform samples in [0, 1) are stretched by the per-feature range (maxima - minima)
# and shifted by minima, so every centre starts inside the bounding box of the data.
minima = Xtrain.min(axis=0)
maxima = Xtrain.max(axis=0)
centres = minima + (maxima - minima) * np.random.rand(k, nDim)
# oldCentres will hold a copy of the previous centres during training; here it is
# filled with raw uniform [0, 1) samples (deliberately NOT rescaled to the data range)
oldCentres = np.random.rand(k, nDim)
print(oldCentres)

[[0.20258733 0.99525039 0.78898835 0.46682883]
 [0.85144455 0.60679562 0.0451325  0.98271295]
 [0.903942   0.02029594 0.82332333 0.65637917]]


In [10]:
# Step 2 : Learning

# maximum number of iterations for training (repeated updates of the cluster centres)
maxIterations = 2  # lets start with 2
count = 0  # number of iterations performed so far

# The loop stops when either maxIterations is reached or the cluster centres
# stop moving. The centres are compared element-wise: a test on the SUM of the
# signed differences can evaluate to zero while individual coordinates are
# still changing (positive and negative moves cancel), ending training early.
while not np.array_equal(oldCentres, centres) and count < maxIterations:

    oldCentres = centres.copy()
    count += 1

    # Squared Euclidean distance from every training sample to every centre;
    # one row per centre, so distances has shape (k, nData)
    distances = np.vstack([np.sum((Xtrain - centre)**2, axis=1) for centre in centres])

    # Index of the closest centre for each sample, kept as a float column
    # vector of shape (nData, 1) so it broadcasts against Xtrain below
    cluster = distances.argmin(axis=0).astype(float).reshape(nData, 1)

    # Update the cluster centres
    for j in range(k):
        # 0/1 membership mask for cluster j, shape (nData, 1)
        thisCluster = np.where(cluster == j, 1, 0)
        nMembers = thisCluster.sum()
        # a cluster with no members keeps its current centre (avoids dividing by zero)
        if nMembers > 0:
            # mean of the member samples: non-members are zeroed out by the mask
            centres[j, :] = np.sum(Xtrain * thisCluster, axis=0) / nMembers



In [11]:
print('centres positions are \n', centres)

centres positions are 
 [[5.60991098 3.07207999 6.69968303 0.13158718]
 [5.00952381 3.4047619  1.45238095 0.24047619]
 [6.26025641 2.86923077 4.85384615 1.64230769]]


In [14]:
# Step 3: Usage

num_test = Xtest.shape[0]
# Squared Euclidean distance from every test sample to each learned centre;
# one row per centre, so distances has shape (k, num_test)
distances = np.vstack([np.sum((Xtest - centre)**2, axis=1) for centre in centres])

# Index of the nearest centre for each test sample, stored as a float
# column vector of shape (num_test, 1)
cluster = distances.argmin(axis=0).astype(float).reshape(num_test, 1)

In [15]:
# Show the first ten assigned cluster indices (first line) above the first ten
# true labels (second line), each flattened to one row
for column in (cluster, ytest):
    print(column[:10].ravel())

[2. 2. 2. 2. 2. 2. 1. 1. 1. 1.]
[2. 2. 2. 2. 2. 2. 1. 0. 0. 0.]


In [11]:
# How do we know if the k-means learned to cluster the dataset correctly?
# We can evaluate the model by examining if the model learns to cluster 
# test samples of a single class into the same cluster 

In [12]:
# TODO: Can you train the model for more iterations? Will you get a better result?
# Note: don't run the data split cells again, so that the results stay comparable