In [2]:
%load_ext cython


In [67]:
%%cython -a
import numpy as np
import cython

@cython.boundscheck(False)
@cython.wraparound(False)

def dist_sq(double[:,:] a, double[:,:] b):
    res=np.sum((np.asarray(a)-np.asarray(b))**2)
    return res
#minimum distance square for every point to the centroid
def point_sq(double[:,:] data, double[:,:] centroid):
    res=[]
    for i in range(data.shape[0]):
        res.append(min(dist_sq(data[i],c) for c in centroid))
    return res

#calculate probability
def dist_prob(double[:] Dist):
    res=np.asarray(Dist)/np.sum(np.asarray(Dist))
    return res

In [62]:
%%cython -a
import numpy as np
import cython
from __main__ import dist_sq, point_sq, dist_prob
@cython.boundscheck(False)
@cython.wraparound(False)

def kmeansplusplus(double[:,:] data, int k):
    cdef int n
    #make a copy of the data
    data_copy=data.copy()
    n=data_copy.shape[0]
    #step 1: sample a point uniformly at random from x
    index=int(np.random.choice(n,1))
    centroid=data_copy[index]
    #once the centroid is determined, delete it from the copy 
    data_copy=np.delete(data_copy,index,axis=0)
    n=n-1
    #step 2: while c<k, sample x from X with probability d^2/phi_x(C)
    for number in range(k-1):
        #calculate the square difference for every point in the copy to its nearest center
        distance=point_sq(data_copy,centroid)
        #calculate the probability
        prob=dist_prob(distance).tolist()
        #randomly sample another centroid
        index=int(np.random.choice(n,1,prob))
        #add the new centroid
        centroid=np.vstack([centroid,data_copy[index]])
        #delete the new centroid from the copy
        data_copy=np.delete(data_copy,index,axis=0)
        n=n-1
    return centroid


In [63]:
#test 
data=np.array(DataFrame([np.random.random(10000),np.random.random(10000)]).transpose())

In [68]:
initial= kmeansplusplus(data,8)


ValueError: Buffer has wrong number of dimensions (expected 2, got 1)

In [16]:
#with the initialization of the centroids from the function kmeansplusplus
#plug in the original data(dataSet), initializtions(initial) and the dimension of the data(d)
import numpy as np
import pandas as pd
from pandas import DataFrame


def kmeans(dataSet, initial, d):
    centroids=initial
    k=centroids.shape[0]
    # Initialize book keeping vars.
    iterations = 0
    oldCentroids = np.zeros(initial.shape)
    
    # Run the main k-means algorithm
    while not shouldStop(oldCentroids, centroids, iterations):
        # Save old centroids for convergence test. Book keeping.
        oldCentroids = centroids
        iterations += 1
        
        # Assign labels to each datapoint based on centroids
        l= getLabels(dataSet, centroids)
        
        # Assign centroids based on datapoint labels
        centroids = getCentroids(dataSet, l, k, d)
        
    # We can get the labels too by calling getLabels(dataSet, centroids)
    return centroids, np.array(l)
# Function: Should Stop
# -------------
# Returns True or False if k-means is done. K-means terminates either
# because it has run a maximum number of iterations OR the centroids
# stop changing.
def shouldStop(oldCentroids, centroids, iterations):
    if iterations > 50: return True
    return oldCentroids.all == centroids.all

# Function: Get Labels
# -------------
# Returns a label for each piece of data in the dataset. 
def getLabels(dataSet, centroids, n):
    # For each element in the dataset, chose the closest centroid. 
    # Make that centroid the element's label.
    l=[]
    for i in range(dataSet.shape[0]):
        #arg min as the label
        l.append(np.argmin(list(dist_sq(dataSet[i],c) for c in centroids)))
    return l
# Function: Get Centroids
# -------------
# Returns k random centroids, each of dimension n.
def getCentroids(dataSet, labels, k, d):
    # Each centroid is the arithmetic mean of the points that
    # have that centroid's label. Important: If a centroid is empty (no points have
    # that centroid's label) you should randomly re-initialize it.
    data_new = DataFrame(dataSet.copy())
    data_new['Labels'] = labels
    data_new = np.array(data_new.groupby(['Labels']).mean().iloc[:,:2])
    # if a centroid is empty, reinitialize it 
    if len(np.unique(labels))<k:
        diff=k-len(np.unique(labels))
        data_new=np.vstack([data_new,np.random.random([diff,d])])    
    return data_new
    

In [17]:
import kmeans
kmeans.kmeans(data, initial,2)

(array([[ 0.84292243,  0.82618293],
        [ 0.24854533,  0.13506143],
        [ 0.17856107,  0.80227858],
        [ 0.83552787,  0.47874969],
        [ 0.75604327,  0.15036895],
        [ 0.48407297,  0.44246506],
        [ 0.52259147,  0.82068761],
        [ 0.14839289,  0.43849624]]), array([7, 1, 3, ..., 5, 4, 6]))