In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from pyclust import KMedoids

The minimum supported version is 2.4.6



In [7]:
def kMedoids(D, k, tmax=100):
    # determine dimensions of distance matrix D
    m, n = D.shape

    if k > n:
        raise Exception('too many medoids')
    # randomly initialize an array of k medoid indices
    M = np.arange(n)
    np.random.shuffle(M)
    M = np.sort(M[:k])

    # create a copy of the array of medoid indices
    Mnew = np.copy(M)

    # initialize a dictionary to represent clusters
    C = {}
    for t in range(tmax):
        # determine clusters, i. e. arrays of data indices
        J = np.argmin(D[:,M], axis=1)
        for kappa in range(k):
            C[kappa] = np.where(J==kappa)[0]
        # update cluster medoids
        for kappa in range(k):
            J = np.mean(D[np.ix_(C[kappa],C[kappa])],axis=1)
            j = np.argmin(J)
            Mnew[kappa] = C[kappa][j]
        np.sort(Mnew)
        # check for convergence
        if np.array_equal(M, Mnew):
            break
        M = np.copy(Mnew)
    else:
        # final update of cluster memberships
        J = np.argmin(D[:,M], axis=1)
        for kappa in range(k):
            C[kappa] = np.where(J==kappa)[0]

    # return results
    return M, C

In [3]:
X = pd.read_csv('data.csv', header=None)

In [4]:
X.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2199,2200,2201,2202,2203,2204,2205,2206,2207,2208
0,0.077257,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055755,0.0,0.0
2,0.04225,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.093693,0.0,0.099296,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.069105,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.036811,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.041917,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02944,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.018781,0.068565,0.0,0.0,0.0,0.0,0.0,0.01034,0.030426,0.091669
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043129,0.126913,0.0
8,0.0,0.168955,0.0,0.0,0.0,0.0,0.085074,0.0,0.0,0.0,...,0.0,0.122644,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.089189,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Using the elbow method to find the optimal number of clusters

wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters = i, init = 'k-means++', random_state = 42)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)

In [None]:
wcss

In [None]:
# Plotting the Elbow Method

plt.plot(range(1, 11), wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
# Fitting K-Means to the dataset

kmeans = KMeans(n_clusters = 3, init = 'k-means++', random_state = 42)
y_kmeans = kmeans.fit_predict(X)

In [5]:
# Fitting K-Medoids to the dataset

#kmedoids = KMedoids(n_clusters = 3)
#y_kmedoids = kmedoids.fit(X)

from sklearn.metrics.pairwise import pairwise_distances

# distance matrix
D = pairwise_distances(X, metric='euclidean')

In [13]:
D.shape

(19924, 19924)

In [8]:
# split into 3 clusters
M, C = kMedoids(D, 3)

In [11]:
print(M)
print(C)

[4379 9964 6122]
{0: array([    0,     2,     5, ..., 19919, 19920, 19923]), 1: array([    1,     7,     8, ..., 19912, 19921, 19922]), 2: array([    3,     4,    11, ..., 19913, 19914, 19918])}


In [15]:
for point_idx in M:
    print( X[point_idx] )

TypeError: unhashable type: 'slice'

In [None]:
# Getting the Score

sc = silhouette_score(X, y_kmeans, metric='euclidean')
sc

In [None]:
# Reducing the dimension of the data
X_reduced = PCA(n_components = 2, random_state = 42).fit_transform(X)

# Visualising the clusters
plt.scatter(X_reduced[y_kmeans == 0, 0], X_reduced[y_kmeans == 0, 1], s = 100, c = 'red', label = 'Cluster 1')
plt.scatter(X_reduced[y_kmeans == 1, 0], X_reduced[y_kmeans == 1, 1], s = 100, c = 'blue', label = 'Cluster 2')
plt.scatter(X_reduced[y_kmeans == 2, 0], X_reduced[y_kmeans == 2, 1], s = 100, c = 'green', label = 'Cluster 3')
plt.show()