In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from sklearn import metrics

from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.cluster import MeanShift
from sklearn.cluster import Birch
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import MiniBatchKMeans

In [2]:
iris = pd.read_csv('iris.csv', skiprows = 1,
                  names = ['sepal-length','sepal-width','patel-length','patel-width','class'])

iris.head()

Unnamed: 0,sepal-length,sepal-width,patel-length,patel-width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
iris = iris.sample(frac=1).reset_index(drop=True)
iris.head()

Unnamed: 0,sepal-length,sepal-width,patel-length,patel-width,class
0,5.0,3.6,1.4,0.2,Iris-setosa
1,5.7,2.6,3.5,1.0,Iris-versicolor
2,5.7,3.8,1.7,0.3,Iris-setosa
3,5.2,4.1,1.5,0.1,Iris-setosa
4,5.0,3.2,1.2,0.2,Iris-setosa


In [4]:

from sklearn import preprocessing 

label_encoding = preprocessing.LabelEncoder()

#iris['class'] = preprocessing.LabelEncoder().fit_transform(iris['class'].astype(str))
iris['class'] = label_encoding.fit_transform(iris['class'].astype(str))

iris.head()

Unnamed: 0,sepal-length,sepal-width,patel-length,patel-width,class
0,5.0,3.6,1.4,0.2,0
1,5.7,2.6,3.5,1.0,1
2,5.7,3.8,1.7,0.3,0
3,5.2,4.1,1.5,0.1,0
4,5.0,3.2,1.2,0.2,0


In [5]:
iris_features = iris.drop('class', axis = 1)
iris_features.head()

Unnamed: 0,sepal-length,sepal-width,patel-length,patel-width
0,5.0,3.6,1.4,0.2
1,5.7,2.6,3.5,1.0
2,5.7,3.8,1.7,0.3
3,5.2,4.1,1.5,0.1
4,5.0,3.2,1.2,0.2


In [6]:
iris_labels = iris['class']
iris_labels.sample(5)

57     2
104    2
7      2
128    2
18     0
Name: class, dtype: int64

In [7]:
def build_model(clustering_model, data, labels):
    model = clustering_model(data)
    
    print('homo\tcompl\tv-meas\tARI\tAMI\tsilhouette')
    
    print(50 * '-')
    
    print('%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
        %(metrics.homogeneity_score(labels, model.labels_),
         metrics.completeness_score(labels, model.labels_),
         metrics.v_measure_score(labels, model.labels_),
         metrics.adjusted_rand_score(labels, model.labels_),
         metrics.adjusted_mutual_info_score(labels, model.labels_),
         metrics.silhouette_score(data, model.labels_)))
   

   

In [8]:
def k_means(data, n_clusters=3, max_iter=1000):
    model = KMeans(n_clusters=n_clusters, max_iter=max_iter).fit(data)
    
    return model

In [9]:
build_model(k_means, iris_features, iris_labels)

homo	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.751	0.765	0.758	0.730	0.755	0.553


In [10]:
def agglomerative_fn(data, n_clusters=3):

    model = AgglomerativeClustering(n_clusters=n_clusters).fit(data)

    #Default linkage criterion is ward which minimizes the variances of clusters being merged

    return model

In [11]:
build_model(agglomerative_fn, iris_features, iris_labels)

homo	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.761	0.780	0.770	0.731	0.767	0.554


In [12]:
def dbscan_fn(data, eps=0.45, min_samples=4):
    
#eps=0.45 al points within this maximum distance are considered neighbors|dense region - smaller value preferred
#minimum no of smaples in the neighborhood for the point to be a core point
    model = DBSCAN(eps=eps, min_samples=min_samples).fit(data)
    
    return model

In [13]:
build_model(dbscan_fn, iris_features, iris_labels)

homo	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.577	0.609	0.593	0.508	0.584	0.372


In [14]:
def mean_shift_fn(data, bandwidth=0.85):
    model = MeanShift(bandwidth=bandwidth).fit(data)
    
    return model

In [15]:
build_model(mean_shift_fn, iris_features, iris_labels)

homo	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.760	0.772	0.766	0.744	0.763	0.551


In [16]:
def birch_fn(data, n_clusters=3):
    model = Birch(n_clusters=n_clusters).fit(data)
    
    return model

In [17]:
build_model(birch_fn, iris_features, iris_labels)

homo	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.762	0.793	0.777	0.707	0.774	0.551


In [18]:
def affinity_propagation_fn(data, damping=0.6, max_iter = 1000):

#damping - learning rate
    model = AffinityPropagation(damping=damping, max_iter=max_iter).fit(data)
    
    return model

In [26]:
build_model(affinity_propagation_fn, iris_features, iris_labels)

homo	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.851	0.492	0.623	0.437	0.612	0.349




In [20]:
def mini_batch_fn(data, n_clusters=3, max_iter=100):
    model = MiniBatchKMeans(n_clusters=n_clusters, max_iter=max_iter, batch_size=20).fit(data)
    
    return model

In [21]:
build_model(mini_batch_fn, iris_features, iris_labels)

homo	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.761	0.780	0.770	0.731	0.767	0.554


In [22]:
from sklearn.cluster import SpectralClustering

In [24]:
SS = 1000 #Self Similarity

In [26]:
IS = 10 #Intra Similarity

In [29]:
LS = 0.01 #Low Similarity 

In [30]:
similarity_mat = [[SS, LS, LS, LS, LS, LS, LS, LS, LS],
                 [LS, SS, LS, LS, LS, LS, LS, LS, LS],
                 [LS, LS, SS, LS, LS, LS, LS, LS, LS],
                 [LS, LS, LS, SS, LS, LS, LS, LS, LS],
                 [LS, LS, LS, LS, SS, LS, LS, LS, LS],
                 [LS, LS, LS, LS, LS, SS, LS, LS, LS],
                 [LS, LS, LS, LS, LS, LS, SS, LS, LS],
                 [LS, LS, LS, LS, LS, LS, LS, SS, LS],
                 [LS, LS, LS, LS, LS, LS, LS, LS, SS]]

In [32]:
#When input is similarity matrix affinity values are already available so affinity=precomputed 
spectral_model = SpectralClustering(n_clusters=3, affinity = 'precomputed').fit(similarity_mat)

In [33]:
spectral_model.labels_

array([0, 1, 0, 1, 0, 0, 2, 2, 0], dtype=int32)