K-Means clustering

Author - Kshitij Chhajed

In [1]:
# Imports
!pip install validclust
from tensorflow import keras
from validclust import dunn
from sklearn.metrics import pairwise_distances
from sklearn.metrics import silhouette_score
import numpy as np
import cv2
import copy

Collecting validclust
  Downloading validclust-0.1.1-py2.py3-none-any.whl (8.1 kB)
Installing collected packages: validclust
Successfully installed validclust-0.1.1


In [2]:
#Preprocessing of image: Loading data, converting to grayscale and reshaping
(x_train, y_train), (x_test, y_test) = keras.datasets.cifar10.load_data()
x_test = np.array([cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) for image in x_test])
x_test = x_test.reshape(x_test.shape[0],(x_test.shape[1]*x_test.shape[2]))

#Centroid initialization using random functions
np.random.seed(2)
a=[]  
idx = np.random.choice(len(x_test), 10, replace=False)
a = [x_test[i] for i in idx] 

#Function to calculate cluster contents and predicted labels
def init_clust(m):
  cluster= [[] for i in range(10)]
  y_pred = [ None for i in range(len(x_test))]
  for i in range (len(x_test)):
    dist=[]
    for j in range (len(m)):
      dist.append(np.linalg.norm(x_test[i,:] - m[j]))
    idx = np.argmin(dist)
    cluster[idx].append(x_test[i,:])
    y_pred[i] = idx
  return cluster, y_pred

#Function to re-calculate centroid everytime after clustering
def new_centroid(cluster):
  for i in range (len(a)):
    if(len(cluster[i])==0):
      a[i] = x_test[np.random.choice(len(x_test), 1)]
    else:
      a[i] = np.average(cluster[i], axis=0)
  return a

#Converging clusters until iterations complete or difference between centroids diminish to small value    
for i in range(60):
  old_a = copy.deepcopy(a)
  c, y_pred = init_clust(old_a)
  a = new_centroid(c)
  diff = np.linalg.norm(np.array(a) - np.array(old_a))
  if (diff < 0.00001):
    print("Iteration: ",i, "- Diff:", diff)
    break

score = silhouette_score(x_test, y_pred, metric='euclidean')
print("Silhouette score:", score)
pair_dist = pairwise_distances(x_test)
print ("Dunn's index:" ,dunn(pair_dist, y_pred))

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
Iteration:  55 - Diff: 0.0
Silhouette score: 0.05585021207357926
Dunn's index: 0.09108497591578157
