In [1]:
from __future__ import print_function
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist
import random
# Seed NumPy's global generator so the synthetic data set is reproducible.
np.random.seed(18)

# Three 2-D Gaussian clusters that share the identity covariance matrix.
means = [[2, 2], [8, 3], [3, 6]]
cov = [[1, 0], [0, 1]]
N = 500  # number of data points in each cluster

# Draw N samples per cluster, in the order given by `means`. The sample mean of
# each block (np.mean(Xi, axis=0)) lands close to the matching entry of `means`.
X0, X1, X2 = (np.random.multivariate_normal(center, cov, N) for center in means)

In [2]:
K = 3  # number of clusters
# Stack the three clusters row-wise (concatenate joins along axis 0 by default).
X = np.concatenate([X0, X1, X2])
# Ground-truth labels in the same row order as X: N zeros, N ones, then N twos.
original_label = np.asarray([0] * N + [1] * N + [2] * N)

In [56]:
def kmeans_init_centroids(X, k):
    """Pick ``k`` rows of ``X`` at random to serve as the initial centroids.

    Row indices are drawn without replacement from NumPy's global random
    state, so no index is selected twice and the choice is reproducible after
    ``np.random.seed``.
    """
    n_points = X.shape[0]
    chosen_rows = np.random.choice(n_points, k, replace=False)
    return X[chosen_rows]
def kmeans_assign_labels(X, centroids):
    """Label every row of ``X`` with the index of its closest centroid.

    Pairwise distances come from ``cdist`` with its default metric; the
    returned 1-D array holds, for each point, the column index of the
    smallest distance (the first one when several are tied).
    """
    distances = cdist(X, centroids)
    nearest = distances.argmin(axis=1)
    return nearest
def has_converged(centroids, new_centroids):
    """Return True when both centroid collections contain the same points.

    Every centroid is converted to a tuple and the two collections are
    compared as sets, so row order and repeated rows are ignored and the
    coordinates must match exactly (no tolerance is applied).
    """
    previous = {tuple(point) for point in centroids}
    current = {tuple(point) for point in new_centroids}
    return previous == current
def kmeans_update_centroids(X, labels, K):
    """Recompute every centroid as the mean of the points assigned to it.

    Parameters
    ----------
    X : array_like, shape (n_points, n_features)
        Data matrix, one point per row.
    labels : array_like, shape (n_points,)
        Cluster index (0 .. K-1) assigned to each row of ``X``.
    K : int
        Number of clusters.

    Returns
    -------
    centroids : ndarray, shape (K, n_features)
        Row ``k`` is the mean of the rows of ``X`` whose label equals ``k``.

    Notes
    -----
    A cluster that received no points has no mean: ``np.mean`` of an empty
    slice yields NaN (plus a RuntimeWarning), and a NaN centroid then corrupts
    the following assignment step and can never compare equal in a
    convergence check. Such clusters are therefore re-seeded with randomly
    chosen data points, the same strategy used for the initial centroids.
    This consumes NumPy's global random state only when an empty cluster
    actually occurs; otherwise the result is unchanged.
    """
    X = np.asarray(X)
    labels = np.asarray(labels)  # a plain list would break the boolean mask below
    centroids = np.zeros((K, X.shape[1]))
    empty_clusters = []
    for k in range(K):
        # collect all points that are assigned to the k-th centroid
        members = X[labels == k, :]
        if members.shape[0] == 0:
            empty_clusters.append(k)
        else:
            centroids[k, :] = np.mean(members, axis=0)
    if empty_clusters:
        n_points = X.shape[0]
        if n_points > 0:
            # Prefer distinct points; repeat only if there are more empty
            # clusters than data points.
            need_repeats = len(empty_clusters) > n_points
            picks = np.random.choice(n_points, len(empty_clusters), replace=need_repeats)
            centroids[empty_clusters, :] = X[picks]
        else:
            # No data to re-seed from: keep the undefined mean explicit.
            centroids[empty_clusters, :] = np.nan
    return centroids

In [67]:
# Confirm the dimensions of the stacked data matrix.
X.shape

(1500, 2)

In [57]:
def kmeans(X, K, max_iter=300):
    """Cluster the rows of ``X`` into ``K`` groups by alternating assignment and update steps.

    Parameters
    ----------
    X : ndarray, shape (n_points, n_features)
        Data matrix, one point per row.
    K : int
        Number of clusters.
    max_iter : int or None, optional
        Upper bound on the number of centroid updates. The loop used to be
        unbounded, so a run whose convergence test never succeeds (for
        example when a centroid becomes NaN) would hang and grow the history
        lists without limit. Pass ``None`` to restore the unbounded loop.

    Returns
    -------
    (centroids, labels, it) : tuple
        ``centroids`` is the list of centroid arrays, starting with the
        initial ones; ``labels`` is the list of label arrays, where
        ``labels[i]`` is the assignment computed against ``centroids[i]``;
        ``it`` is the number of centroid updates performed. The final state
        is ``centroids[-1]`` / ``labels[-1]``.

    Warns
    -----
    RuntimeWarning
        If ``max_iter`` updates were performed without reaching convergence.
    """
    # choose K data points in X to act as the initial centroids
    centroids = [kmeans_init_centroids(X, K)]
    labels = []
    it = 0
    converged = False
    while max_iter is None or it < max_iter:
        # centroids[-1] has shape (K, n_features); the labels are a 1-D array
        # with one entry per row of X
        labels.append(kmeans_assign_labels(X, centroids[-1]))
        new_centroids = kmeans_update_centroids(X, labels[-1], K)
        if has_converged(centroids[-1], new_centroids):
            converged = True
            break
        centroids.append(new_centroids)
        it += 1
    if not converged:
        import warnings
        # Keep labels[-1] consistent with centroids[-1] even on early stop.
        labels.append(kmeans_assign_labels(X, centroids[-1]))
        warnings.warn(
            'kmeans stopped after max_iter=%s updates without converging' % max_iter,
            RuntimeWarning,
        )
    return (centroids, labels, it)

In [53]:
# Run the from-scratch implementation. `centroids` and `labels` hold the whole
# history, so index [-1] is the final state.
centroids, labels, it = kmeans(X, K)
print('centroids found by out algorithm: \n', centroids[-1])
print('it:\n', it)


shape:  (1500, 2)
l:  [[-1.74918676e-02 -8.22495947e-03]
 [ 2.07417695e+00  5.71528152e+00]
 [ 8.54599772e+00  2.33506316e+00]]


In [71]:
# Assign previously unseen points to the nearest of the final centroids.
new_points = np.array([[2.3, 4.6], [1.5, 8.9], [4.6, 0.]])
new_labels = kmeans_assign_labels(new_points, centroids[-1])
print('predict new data', new_labels)

predict new data [1 1 0]


In [65]:
# Scratch check: reshape(1, -1) turns the 1-D pair into a single-row 2-D array.
np.array([2.3,4.6]).reshape(1,-1).shape

(1, 2)

In [39]:
#use library
from sklearn.cluster import KMeans
# Reference result from scikit-learn on the same data. n_clusters follows K so
# both implementations search for the same number of clusters. n_init is
# pinned to 10 (the historical default) because newer scikit-learn releases
# default to n_init='auto', which can change the fitted centroids.
model = KMeans(n_clusters=K, n_init=10, random_state=0).fit(X)
print('centroids found by sklearn: ')
print(model.cluster_centers_)
pred_label = model.predict(X)
print('predict label', pred_label)

centroids found by sklearn: 
[[8.07476866 3.01494931]
 [3.02702878 5.95686115]
 [1.9834967  1.96588127]]
predict label [1 2 2 ... 1 1 1]


In [38]:
?KMeans

In [6]:
# numpy is already imported in the first cell; repeating the import is harmless.
import numpy as np
# Show NumPy's built-in documentation for the sampler used to generate the clusters.
help(np.random.multivariate_normal)

Help on built-in function multivariate_normal:

multivariate_normal(...) method of mtrand.RandomState instance
    multivariate_normal(mean, cov[, size, check_valid, tol])
    
    Draw random samples from a multivariate normal distribution.
    
    The multivariate normal, multinormal or Gaussian distribution is a
    generalization of the one-dimensional normal distribution to higher
    dimensions.  Such a distribution is specified by its mean and
    covariance matrix.  These parameters are analogous to the mean
    (average or "center") and variance (standard deviation, or "width,"
    squared) of the one-dimensional normal distribution.
    
    Parameters
    ----------
    mean : 1-D array_like, of length N
        Mean of the N-dimensional distribution.
    cov : 2-D array_like, of shape (N, N)
        Covariance matrix of the distribution. It must be symmetric and
        positive-semidefinite for proper sampling.
    size : int or tuple of ints, optional
        Given a sha