### In this lab, we will go through the K-means algorithm shown in the figure below using only the NumPy library. We will test the implementation on the Iris dataset.

In [1]:
import numpy as np

![k-means algorithm](k-means.png)

### Loading the Iris dataset

In [2]:
# Read the comma-separated Iris table into a 2-D NumPy array and report its shape
iris = np.loadtxt('iris_proc.data', delimiter=',')
print(np.shape(iris))

(150, 5)


In [3]:
# Shuffle the rows of the data set in place.
# Seed NumPy's global RNG first so that a fresh "Restart & Run All" always
# produces the same shuffle (and the same random draws in later cells that
# use np.random); change the seed value to get a different split.
np.random.seed(42)
np.random.shuffle(iris)

In [4]:
# Separate features from labels, then cut both into train and test parts.
num_train = 120 # the first 120 rows are used for training, the remaining rows for testing
X, y = iris[:, :4], iris[:, 4:]  # first four columns -> features, remaining column -> label
Xtrain, ytrain = X[:num_train], y[:num_train]
Xtest, ytest = X[num_train:], y[num_train:]

### K-means algorithm 

In [5]:
# Step 1 : initialisation

# Rows of Xtrain are data samples, columns are features
nData, nDim = Xtrain.shape
# We already know the number of classes, so use it as the number of clusters
k = 3
nData, nDim, k

(120, 4, 3)

In [6]:
# Pick k random starting positions in the input space.
# Each uniform [0, 1) draw is scaled by (maxima - minima) and shifted by minima,
# so every centre starts inside the per-feature range of the training data.
minima, maxima = Xtrain.min(axis=0), Xtrain.max(axis=0)
centres = minima + (maxima - minima) * np.random.rand(k, nDim)
# Placeholder for the previous centres: the training loop overwrites it with a
# copy of `centres` on every iteration. It is drawn at random (not rescaled to
# the data range) only so that it differs from `centres` before the first pass.
oldCentres = np.random.rand(k, nDim)

In [7]:
# Step 2 : Learning 

# Maximum number of training iterations (updates of the cluster centres)
maxIterations=2 # lets start with 2
count = 0 # number of iterations performed so far

# The loop stops when either maxIterations is reached or the cluster centres
# stop moving. Movement is tested element-wise: summing the signed differences
# instead could cancel out to exactly zero while centres are still moving,
# which would end training too early.
while not np.array_equal(oldCentres, centres) and count < maxIterations:

    oldCentres = centres.copy()
    count += 1

    # Squared Euclidean distance from every centre to every training sample,
    # one row per centre -> shape (k, nData). Built in one go rather than by
    # repeatedly appending rows (each append copies the whole array).
    distances = np.stack(
        [np.sum((Xtrain - centres[j, :])**2, axis=1) for j in range(k)],
        axis=0,
    )

    # Identify the closest cluster for each sample; stored as a float column
    # vector of shape (nData, 1) so it broadcasts against Xtrain below
    cluster = distances.argmin(axis=0).astype(float).reshape(nData, 1)

    # Update the cluster centres
    for j in range(k):
        # 1 for samples assigned to cluster j, 0 otherwise -> shape (nData, 1)
        thisCluster = np.where(cluster == j, 1, 0)
        members = np.sum(thisCluster)
        # A cluster with no members keeps its previous centre (avoids 0/0)
        if members > 0:
            centres[j, :] = np.sum(Xtrain*thisCluster, axis=0)/members



In [8]:
# Show the learned cluster centres (one row per cluster, one column per feature)
print('centres positions are \n', centres)

centres positions are 
 [[5.01707317 3.4097561  1.50243902 0.26341463]
 [6.65142857 2.99142857 5.64285714 2.04571429]
 [5.96818182 2.75909091 4.29318182 1.33181818]]


In [9]:
# Step 3: Usage

num_test = Xtest.shape[0]
# Squared Euclidean distance of every test sample to every learned centre,
# one row per centre -> shape (k, num_test)
distances = np.stack(
    [np.sum((Xtest - centres[j, :])**2, axis=1) for j in range(k)],
    axis=0,
)

# Assign each test sample to its nearest centre; the result is kept as a
# float column vector of shape (num_test, 1)
cluster = distances.argmin(axis=0).astype(float).reshape(num_test, 1)

In [10]:
# Print the predicted cluster index and the true label of the first 10 test samples.
# Cluster indices are arbitrary, so they need not equal the label values; what
# matters is that samples sharing a label end up in the same cluster.
print(cluster[:10].ravel())
print(ytest[:10].ravel())

[1. 0. 1. 2. 0. 0. 0. 2. 0. 2.]
[2. 0. 2. 1. 0. 0. 0. 1. 0. 1.]


In [11]:
# How do we know if the k-means learned to cluster the dataset correctly?
# We can evaluate the model by examining if the model learns to cluster 
# test samples of a single class into the same cluster 

In [12]:
# TODO: Can you train the model for a larger number of iterations? Will you get a better result?
# Note: for a fair comparison, don't run the data split cells again