### K-means: clustering Iris dataset
This dataset is a typical test case for classification techniques.

In [1]:
%matplotlib inline
import matplotlib as plt
import numpy as np
import pandas as pd
from sklearn import datasets
import seaborn as sns; sns.set(style="ticks", color_codes=True)
iris = datasets.load_iris()



In [2]:
# Iris dataset to pandas df
irisDF = pd.DataFrame(data= np.c_[iris['data'], iris['target']],
                     columns= iris['feature_names'] + ['species'])
# columnames
columnNames =  irisDF.columns.tolist()

In [3]:
# Using Kmeans with 3 clusters
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3, random_state=0)

In [4]:
# Fitting the classifier with the training data
kmeansModel= kmeans.fit(irisDF[columnNames[:-1]])

In [5]:
predictions = kmeansModel.labels_
print predictions

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 2 2 2 2 0 2 2 2 2
 2 2 0 0 2 2 2 2 0 2 0 2 0 2 2 0 0 2 2 2 2 2 0 2 2 2 2 0 2 2 2 0 2 2 2 0 2
 2 0]


In [6]:
centers = kmeansModel.cluster_centers_
print centers

[[ 5.9016129   2.7483871   4.39354839  1.43387097]
 [ 5.006       3.418       1.464       0.244     ]
 [ 6.85        3.07368421  5.74210526  2.07105263]]


In [7]:
def conversion(x):
    if x==1:
        return 0
    elif x== 0:
        return 1
    else:
        return 2
predictionsFixed = map(conversion,predictions)
print predictionsFixed

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 1]


In [14]:
actual = irisDF['species'].values
print actual
irisDF['clusters'] = predictionsFixed

[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  2.  2.  2.  2.  2.  2.  2.  2.
  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.
  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.
  2.  2.  2.  2.  2.  2.]


### Clustering metrics
#### Given the knowledge of the ground truth class assignments 

In [15]:
# adjusted_rand_score: measures the similarity of the two assignments
from sklearn.metrics import adjusted_rand_score
adjusted_rand_score(actual,predictions) 

0.73023827228346971

In [16]:
# homogeneity: each cluster contains only members of a single class.
# completeness: all members of a given class are assigned to the same cluster.
# v_measure: mean of homogenity and completness
from sklearn.metrics import homogeneity_completeness_v_measure
homogeneity_completeness_v_measure(actual,predictions)

(0.75148540219883386, 0.76498615144898163, 0.75817568000577862)

In [23]:
# FMI = TP / sqrt((TP+FP)*(TP+FN)) 
from sklearn.metrics import fowlkes_mallows_score
fowlkes_mallows_score(actual,predictions)  

0.82080807291141533

#### When the ground truth labels are not known

In [20]:
# Silhouette Coefficient: Higher => better defined clusters
from sklearn.metrics import silhouette_score
labels = kmeansModel.labels_
silhouette_score(irisDF[columnNames[:-2]], labels, metric='euclidean')


0.55259194452136751

In [21]:
# Calinski-Harabaz Index: higher when clusters are dense and well separated
from sklearn.metrics import calinski_harabaz_score
calinski_harabaz_score(irisDF[columnNames[:-2]], labels) 

560.39992424664024