In [None]:
from __future__ import print_function
import numpy as np
import matplotlib.cm as cm
import matplotlib.pyplot as plt

from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn import metrics

In [1]:
def create_dataset():
    """Generate the synthetic blob dataset, display it, and return it.

    Draws 1000 samples with ``make_blobs`` around five fixed 2-D centres
    (per-centre standard deviations 0.5, 0.5, 1, 1, 1) using a fixed
    ``random_state`` so every run yields the same points, then shows a
    scatter plot of the samples coloured by their generating blob.

    Returns:
        list: ``[X, y]`` exactly as returned by ``make_blobs`` -- the sample
        coordinates and the integer blob label of each sample.
    """
    blob_centers = [[3, 5], [5, 1], [8, 2], [6, 8], [9, 7]]
    blob_spreads = [0.5, 0.5, 1, 1, 1]
    features, labels = make_blobs(
        n_samples=1000,
        centers=blob_centers,
        cluster_std=blob_spreads,
        random_state=3320,
    )

    # Visual sanity check of the raw data before any clustering is run.
    plt.suptitle("Scatter plot")
    plt.scatter(features[:, 0], features[:, 1], c=labels)
    plt.show()
    plt.clf()

    return [features, labels]

In [2]:
def _nearest_centroid(points, centroids):
    """Return, for every row of ``points``, the index of its closest centroid."""
    # Broadcast to an (n_samples, n_clusters, n_features) difference array.
    # Squared Euclidean distance is sufficient: sqrt is monotonic, so it does
    # not change which centroid is the minimum. argmin keeps the first minimum
    # on ties.
    diffs = points[:, np.newaxis, :] - centroids[np.newaxis, :, :]
    return (diffs ** 2).sum(axis=2).argmin(axis=1)


def _recompute_centroids(points, labels, centroids):
    """Return per-cluster means; a cluster with no members keeps its old centroid."""
    updated = centroids.copy()
    for k in range(centroids.shape[0]):
        members = points[labels == k]
        # Guard against empty clusters: averaging zero points would give NaN
        # and the algorithm could then never converge.
        if members.shape[0] > 0:
            updated[k] = members.mean(axis=0)
    return updated


def my_clustering(X, y, n_clusters, max_iter=300, random_state=None):
    """Run a hand-written k-means on ``X``, plot the result and score it.

    Initial centroids are ``n_clusters`` distinct rows sampled from the whole
    of ``X``. The assignment and centroid-update steps are then alternated
    until the cluster assignment stops changing or ``max_iter`` update steps
    have been performed.

    Args:
        X: array-like of shape (n_samples, n_features). The plot uses the
            first two columns, so at least two features are required.
        y: ground-truth label of each sample; used to colour the scatter
            plot and as the reference for the external clustering metrics.
        n_clusters: number of clusters (k) to fit.
        max_iter: upper bound on the number of centroid-update steps, so the
            loop always terminates.
        random_state: ``None`` to draw the initial centroids from NumPy's
            global random state, or an int seed for a reproducible run.

    Returns:
        list: ``[ari_score, mri_score, v_measure_score, silhouette_avg]`` --
        adjusted Rand index, adjusted mutual information (named ``mri_score``
        in this file), V-measure, and mean Euclidean silhouette coefficient.

    Raises:
        ValueError: if ``n_clusters`` is not between 1 and the number of
            samples (distinct seed rows could not be drawn otherwise).
    """
    X = np.asarray(X, dtype=float)
    n_samples = X.shape[0]
    if not 1 <= n_clusters <= n_samples:
        raise ValueError(
            "n_clusters must be between 1 and the number of samples (%d), got %r"
            % (n_samples, n_clusters)
        )

    # Seed the centroids with distinct data points drawn from *all* rows.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    seed_rows = rng.choice(n_samples, size=n_clusters, replace=False)
    centroids = X[seed_rows]

    # Lloyd iterations: stop as soon as a full pass leaves every point in the
    # same cluster, at which point the centroids can no longer move either.
    labels = _nearest_centroid(X, centroids)
    for _ in range(max_iter):
        centroids = _recompute_centroids(X, labels, centroids)
        new_labels = _nearest_centroid(X, centroids)
        if np.array_equal(new_labels, labels):
            break
        labels = new_labels

    # Points are coloured by their ground-truth label; the fitted centroids
    # are overlaid as large translucent black markers.
    plt.scatter(X[:, 0], X[:, 1], c=y)
    plt.suptitle("My k mean with cluster="+str(n_clusters))
    plt.scatter(centroids[:, 0], centroids[:, 1], c='black', s=200, alpha=0.5)
    plt.show()
    plt.clf()

    ari_score = metrics.adjusted_rand_score(y, labels)
    mri_score = metrics.adjusted_mutual_info_score(y, labels)
    v_measure_score = metrics.v_measure_score(y, labels)
    silhouette_avg = metrics.silhouette_score(X, labels, metric='euclidean')

    return [ari_score, mri_score, v_measure_score, silhouette_avg]

In [None]:
def main():
    """Cluster the synthetic dataset for several values of k and plot each metric.

    For every cluster count in ``range_n_clusters`` this runs
    ``my_clustering``, prints the four scores it returns, and finally draws
    one "<metric> vs n_clusters" line plot per metric.
    """
    X, y = create_dataset()
    range_n_clusters = [2, 3, 4, 5, 6, 7]

    # (plot name, console label) for each score, in the order in which
    # my_clustering returns them.
    metric_info = [
        ("ari_score", 'The ARI score is: '),
        ("mri_score", 'The MRI score is: '),
        ("v_measure_score", 'The v-measure score is: '),
        ("silhouette_avg", 'The average silhouette score is: '),
    ]
    # One score series per metric. Values are appended in iteration order, so
    # the cluster counts need not be contiguous or start at any particular value.
    history = [[] for _ in metric_info]

    for n_clusters in range_n_clusters:
        print("Number of clusters is: ", n_clusters)
        results = my_clustering(X, y, n_clusters)
        for (_, label), series, value in zip(metric_info, history, results):
            series.append(value)
            print(label, value)

    for (name, _), series in zip(metric_info, history):
        plt.suptitle(name + " vs n_clusters")
        plt.plot(range_n_clusters, series)
        plt.show()
        # Release the figure so the next metric always starts on a clean one,
        # whatever the backend does (or does not do) in show().
        plt.close()

# Entry point: run the whole experiment when this code is executed directly
# (its module name is then '__main__'), but not when it is imported.
if __name__ == '__main__':
    main()