In [None]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import silhouette_score
from sklearn import cluster, datasets, mixture
from sklearn.neighbors import kneighbors_graph

# Dataset

In [None]:
# Seed NumPy's global RNG once so that every dataset below is reproducible
# on a fresh "Restart & Run All".
np.random.seed(844)

# Dataset 1: four Gaussian clusters of 1000 two-dimensional points each.
# The first two are isotropic with different spreads (std 2 and 3); the
# last two use an identity covariance around explicit centres.
clust1 = np.random.normal(5, 2, (1000, 2))
clust2 = np.random.normal(15, 3, (1000, 2))
clust3 = np.random.multivariate_normal([17, 3], [[1, 0], [0, 1]], 1000)
clust4 = np.random.multivariate_normal([2, 16], [[1, 0], [0, 1]], 1000)
dataset1 = np.concatenate((clust1, clust2, clust3, clust4))

# Dataset 2: two concentric noisy circles. make_circles returns a
# (points, labels) pair; only the points are kept. No random_state is passed,
# so the draw comes from the global NumPy RNG seeded above.
dataset2 = datasets.make_circles(n_samples=1000, factor=.5, noise=.05)[0]

# Dataset 3: 2000 points drawn uniformly from the unit square, i.e. data
# with no cluster structure for the algorithms to find.
dataset3 = np.random.rand(2000, 2)

In [None]:
# Plot the three datasets side by side, optionally coloured by cluster label.
def cluster_plots(set1, set2, set3, colours1 = 'gray', colours2 = 'gray', colours3 = 'gray',
                  title1 = 'Dataset 1',  title2 = 'Dataset 2', title3 = 'Dataset3'):
    """Draw three scatter plots in a single row and show the figure.

    Parameters
    ----------
    set1, set2, set3 : array indexable as ``a[:, 0]`` / ``a[:, 1]``
        Point sets; column 0 is plotted on x and column 1 on y.
    colours1, colours2, colours3 : colour or sequence, optional
        Passed to ``Axes.scatter`` as ``c`` for the matching panel. Pass an
        array of cluster labels to colour points by cluster. Defaults to
        ``'gray'``.
    title1, title2, title3 : str, optional
        Titles of the three panels.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    fig, axes = plt.subplots(1, 3)
    fig.set_size_inches(10, 4)

    panels = (
        (set1, colours1, title1),
        (set2, colours2, title2),
        (set3, colours3, title3),
    )
    for ax, (points, colours, title) in zip(axes, panels):
        xs, ys = points[:, 0], points[:, 1]
        ax.set_title(title, fontsize=14)
        # Clamp each axis to the data range so the points fill the panel.
        # ndarray.min()/max() avoid iterating the array in pure Python.
        ax.set_xlim(xs.min(), xs.max())
        ax.set_ylim(ys.min(), ys.max())
        ax.scatter(xs, ys, s=8, lw=0, c=colours)

    fig.tight_layout()
    plt.show()

In [None]:
# Show the raw datasets before any clustering (default gray colouring).
cluster_plots(set1=dataset1, set2=dataset2, set3=dataset3)

# K-Means

In [None]:
def _kmeans_labels(points, n_clusters, seed=844):
    """Return k-means cluster labels for ``points``.

    ``random_state`` is pinned so the labels (and therefore the plot colours)
    are the same every time this cell runs, instead of depending on how many
    draws were previously taken from NumPy's global RNG.
    """
    model = cluster.KMeans(n_clusters=n_clusters, init='k-means++',
                           n_init=10, max_iter=300, random_state=seed)
    return model.fit_predict(points)


kmeans_dataset1 = _kmeans_labels(dataset1, n_clusters=4)
kmeans_dataset2 = _kmeans_labels(dataset2, n_clusters=2)
kmeans_dataset3 = _kmeans_labels(dataset3, n_clusters=4)

cluster_plots(dataset1, dataset2, dataset3,
              kmeans_dataset1, kmeans_dataset2, kmeans_dataset3)

# Agglomerative Clustering

In [None]:
# Hierarchical (agglomerative) clustering with Euclidean distances.
# The distance keyword is left at its default ('euclidean') on purpose:
# it was called `affinity` in older scikit-learn releases, was renamed to
# `metric` in 1.2 and `affinity` was removed in 1.4, so omitting it keeps
# this cell working on both old and new versions.
hc_dataset1 = cluster.AgglomerativeClustering(n_clusters=4,
                                              linkage='ward').fit_predict(dataset1)

hc_dataset2 = cluster.AgglomerativeClustering(n_clusters=2,
                                              linkage='average').fit_predict(dataset2)

hc_dataset3 = cluster.AgglomerativeClustering(n_clusters=4,
                                              linkage='average').fit_predict(dataset3)

cluster_plots(dataset1, dataset2, dataset3, hc_dataset1, hc_dataset2, hc_dataset3)


# DBSCAN

In [None]:
# Run DBSCAN on each dataset; eps is chosen per dataset.
dbscan_dataset1 = cluster.DBSCAN(eps=1, min_samples=5, metric='euclidean').fit_predict(dataset1)
dbscan_dataset2 = cluster.DBSCAN(eps=0.1, min_samples=5, metric='euclidean').fit_predict(dataset2)
dbscan_dataset3 = cluster.DBSCAN(eps=1, min_samples=5, metric='euclidean').fit_predict(dataset3)

# DBSCAN labels noise points -1; report how many there are per dataset.
# A single pre-formatted string is passed to print() so the output is the
# same under Python 2 and Python 3 (the bare `print x, y` statement form is
# a SyntaxError on Python 3).
for name, labels in (('Dataset1', dbscan_dataset1),
                     ('Dataset2', dbscan_dataset2),
                     ('Dataset3', dbscan_dataset3)):
    noise_count = int(np.sum(labels == -1))
    print('{}:'.format(name))
    print('Number of Noise Points: {} ({})'.format(noise_count, len(labels)))
    print('')

cluster_plots(dataset1, dataset2, dataset3, dbscan_dataset1, dbscan_dataset2, dbscan_dataset3)
