In [2]:
import kmeans_combined_revised, simulatedData, time
from kmeans_combined_revised import kmeansplusplus, kmeansparallel, kmeans
from simulatedData import generate_data

In [13]:
#test on simulated data
# Parameters used by the timing cells that follow.
# k: passed to generate_data and as the k argument of the k-means initialisers.
k=5
# R: second argument of generate_data; its meaning is not visible in this notebook -- TODO confirm.
R=1
# d: passed as the dimension argument d; presumably the number of columns in data -- TODO confirm.
d=15
# l: passed as the l argument of kmeansparallel.
l=10
data=generate_data(k,R)
# Display the generated array as the cell output.
data

array([[ -2.86245798e-01,  -2.08826540e-01,  -1.13379169e+00, ...,
         -1.34778264e+00,  -2.39926062e-01,   3.71476148e-01],
       [  5.49136605e-01,   4.75352121e-01,   4.11610443e-01, ...,
         -1.12122868e+00,   4.45350210e-01,   1.23639785e+00],
       [  6.47759705e-01,  -4.75032638e-01,   7.31782244e-01, ...,
          5.53060319e-01,  -5.91701592e-01,   3.05862242e-01],
       ..., 
       [ -1.32779366e+00,  -5.07997976e-01,  -1.28351595e+00, ...,
         -2.53281719e-01,  -2.89509199e-01,  -6.26855534e-02],
       [  4.72296764e-01,   9.86041287e-01,  -1.05034560e+00, ...,
          6.15505928e-01,  -8.27498024e-01,   7.96331519e-01],
       [  1.44367371e+00,   4.55108564e-01,  -5.29888288e-01, ...,
          9.46264489e-04,   1.56215127e+00,   1.48746458e+00]])

In [14]:
# Baseline timing: kmeansplusplus as imported from kmeans_combined_revised, run on the simulated data.
%timeit kmeansplusplus(data,k,d)

1 loop, best of 3: 1.6 s per loop


In [15]:
# Baseline timing: kmeansparallel as imported from kmeans_combined_revised, same data and k, with l and d as set above.
%timeit kmeansparallel(data,k,l,d)

1 loop, best of 3: 7.76 s per loop


Optimization method 1:

Import the `kmeans_Numba_revised` module

In [10]:
import kmeans_Numba_revised

In [16]:
# Time kmeansparallel from the kmeans_Numba_revised module with the same arguments as the baseline cell.
%timeit kmeans_Numba_revised.kmeansparallel(data,k,l,d)

1 loop, best of 3: 2.01 s per loop


Optimization method 2:

Load the Cython extension and compile the k-means functions inline with `%%cython`

In [17]:
%load_ext Cython

In [23]:
%%cython -a
import numpy as np
cimport numpy as np
import pandas as pd
from pandas import DataFrame
import math

def dist_sq_c(a, b):
    """Return the squared Euclidean distance between ``a`` and ``b``.

    The element-wise difference ``a - b`` is squared and summed over all
    elements, so the usual NumPy broadcasting rules apply.
    """
    delta = a - b
    return np.sum(np.square(delta))
def point_sq_c(data,centroid):
    """Return, for every row of ``data``, its smallest squared distance to ``centroid``.

    ``centroid`` is iterated element by element and each element is compared
    with the row through ``dist_sq_c``. The result is a plain Python list with
    one entry per row of ``data``.
    """
    n_points = data.shape[0]
    return [
        min(dist_sq_c(data[row], candidate) for candidate in centroid)
        for row in range(n_points)
    ]
        

def dist_prob_parallel_c(Dist,l):
    """Return the per-point sampling probabilities ``l * Dist / sum(Dist)``.

    Parameters
    ----------
    Dist : array-like of float
        Squared distance of every point to its nearest current centroid.
        May be a Python list (as returned by ``point_sq_c``) or an ndarray.
    l : int or float
        Oversampling factor that scales the normalised distances.

    Returns
    -------
    numpy.ndarray
        Array with the same length as ``Dist``. Values may exceed 1, which
        callers that compare against a uniform draw treat as "always pick".
        When every distance is zero the result is all zeros.
    """
    # Convert first: with a plain list, ``l * Dist`` would repeat the list
    # ``l`` times instead of scaling its values.
    Dist = np.asarray(Dist, dtype=float)
    total = Dist.sum()
    if total == 0:
        # Every point already coincides with a centroid; avoid 0/0 -> NaN.
        return np.zeros_like(Dist)
    return l * Dist / total


def log_cost_c(data_copy,centroid):
    """Step 2: return the number of sampling rounds, ``ceil(log(cost))``.

    ``cost`` is the sum of the values returned by ``point_sq_c`` for
    ``data_copy`` and ``centroid``; the natural logarithm is used.
    """
    total_cost = np.sum(point_sq_c(data_copy, centroid))
    return math.ceil(np.log(total_cost))

    
    
def weight_prob_c(data_copy, centroid):
    """Step 4: count how many rows of ``data_copy`` are closest to each centroid.

    Returns a float array with one entry per row of ``centroid``; entry ``j``
    is the number of rows whose nearest centroid (by ``dist_sq_c``) is ``j``.
    """
    counts = np.zeros(centroid.shape[0])
    for row in range(data_copy.shape[0]):
        distances = [dist_sq_c(data_copy[row], candidate) for candidate in centroid]
        nearest = np.argmin(distances)
        counts[nearest] += 1
    return counts



def reassign_centroids_c(centroid,k,d,w):
    """Step 5: draw ``k`` distinct centroids from the candidates, weighted by ``w``.

    Candidates are sampled one at a time without replacement; each draw uses
    the weights of the candidates that are still available, renormalised to
    sum to one.

    Parameters
    ----------
    centroid : array-like, shape (n_candidates, d)
        Candidate centroids. The caller's array is not modified.
    k : int
        Number of centroids to return.
    d : int
        Number of coordinates per centroid.
    w : array-like, shape (n_candidates,)
        Non-negative weight of every candidate.

    Returns
    -------
    numpy.ndarray, shape (k, d)
        The selected centroids, in the order they were drawn.

    Raises
    ------
    ValueError
        If there are fewer than ``k`` candidates.
    """
    centroid = np.asarray(centroid)
    w = np.asarray(w, dtype=float)
    if centroid.shape[0] < k:
        raise ValueError(
            "cannot draw %d centroids from %d candidates" % (k, centroid.shape[0])
        )

    new_centroid = np.zeros([k, d])
    for cluster in range(k):
        total = w.sum()
        # If all remaining weights are zero, fall back to a uniform draw
        # (p=None) instead of dividing by zero.
        prob_w = w / total if total > 0 else None
        # ``p`` must be passed by keyword: the third positional parameter of
        # np.random.choice is ``replace``, not the probabilities.
        new_index = np.random.choice(centroid.shape[0], p=prob_w)
        new_centroid[cluster] = centroid[new_index]
        # Remove the chosen candidate and its weight so it cannot be drawn again.
        centroid = np.delete(centroid, new_index, axis=0)
        w = np.delete(w, new_index)
    return new_centroid


def kmeansparallel_c(data, k, l, d):
    """Choose ``k`` initial centroids with the oversampling scheme implemented below.

    Parameters
    ----------
    data : numpy.ndarray, shape (n_points, d)
        Data set to initialise on. It is not modified.
    k : int
        Number of centroids to return.
    l : int or float
        Oversampling factor passed to ``dist_prob_parallel_c``.
    d : int
        Number of coordinates per point.

    Returns
    -------
    numpy.ndarray, shape (k, d)
        The centroids selected by ``reassign_centroids_c``.

    Notes
    -----
    ``reassign_centroids_c`` raises ``ValueError`` when fewer than ``k``
    candidates were collected during the sampling rounds.
    """
    #step 1: sample a point uniformly at random from X
    index = np.random.randint(data.shape[0])
    # Keep the candidate set 2-D (one row per candidate) so that iterating
    # over it yields whole points rather than single coordinates.
    centroid = np.array(data[index]).reshape(1, -1)
    # np.delete returns a new array, so ``data`` itself is left untouched.
    data_copy = np.delete(data, index, axis=0)

    #step 2: calculate number of iteration
    iteration = log_cost_c(data_copy, centroid)

    #step 3: Get initial Centroids C
    for number in range(iteration):
        if data_copy.shape[0] == 0:
            # Every point has already been promoted to a candidate.
            break
        #calculate phi_X(C)
        distance = np.asarray(point_sq_c(data_copy, centroid), dtype=float)
        #calculate the probability
        prob = np.asarray(dist_prob_parallel_c(distance, l))
        # Decide for all points at once, then move the selected rows in a
        # single step; deleting rows inside an index loop would shift the
        # remaining indices away from their probabilities.
        chosen = prob > np.random.uniform(size=data_copy.shape[0])
        if chosen.any():
            centroid = np.vstack([centroid, data_copy[chosen]])
            data_copy = data_copy[~chosen]

    #step 4: calculate the weight probability
    w = weight_prob_c(data_copy, centroid)

    #step 5: recluster the weighted points in C into k clusters
    #reinitialize k centroids
    new_centroids = reassign_centroids_c(centroid, k, d, w)

    return new_centroids

    
def kmeans_c(dataSet, initial, k, d):
    """Run the k-means iterations starting from the centroids in ``initial``.

    Parameters
    ----------
    dataSet : numpy.ndarray, shape (n_points, d)
        The original data.
    initial : numpy.ndarray, shape (k, d)
        Starting centroids, e.g. the output of one of the initialisation
        functions.
    k : int
        Number of clusters.
    d : int
        Number of coordinates per point.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The final centroids and an array holding, for every data point, the
        index of its nearest final centroid.
    """
    centroids = initial
    # Initialize book keeping vars.
    iterations = 0
    oldCentroids = np.zeros(initial.shape)

    # Run the main k-means algorithm
    while not shouldStop_c(oldCentroids, centroids, iterations):
        # Save old centroids for convergence test. Book keeping.
        oldCentroids = centroids
        iterations += 1

        # Assign labels to each datapoint based on centroids
        labels = getLabels_c(dataSet, centroids)

        # Assign centroids based on datapoint labels
        centroids = getCentroids_c(dataSet, labels, k, d)

    # Label the points against the centroids that are actually returned.
    # This keeps the two results consistent with each other and guarantees
    # the labels exist even when the loop body never ran.
    labels = getLabels_c(dataSet, centroids)
    return centroids, np.array(labels)
# Function: Should Stop
# -------------
# Returns True or False if k-means is done. K-means terminates either
# because it has run a maximum number of iterations OR the centroids
# stop changing.
def shouldStop_c(oldCentroids, centroids, iterations):
    """Return True when k-means should stop iterating.

    Iteration stops once more than 50 iterations have run, or as soon as
    ``centroids`` holds exactly the same values as ``oldCentroids``.
    """
    if iterations > 50:
        return True
    # Compare the values: ``a.all == b.all`` would compare two bound methods
    # and is never true for distinct arrays.
    return np.array_equal(oldCentroids, centroids)
# Function: Get Labels
# -------------
# Returns a label for each piece of data in the dataset.
def getLabels_c(dataSet, centroids):
    """Return a list with the index of the nearest centroid for every row.

    Distances are measured with ``dist_sq_c``; the label of a row is the
    position (argmin) of the closest entry of ``centroids``.
    """
    n_points = dataSet.shape[0]
    return [
        np.argmin([dist_sq_c(dataSet[row], candidate) for candidate in centroids])
        for row in range(n_points)
    ]
# Function: Get Centroids
# -------------
# Returns the k updated centroids, each of dimension d.
def getCentroids_c(dataSet, labels, k, d):
    """Recompute the ``k`` centroids from the current point labels.

    Parameters
    ----------
    dataSet : array-like, shape (n_points, >= d)
        The data; only the first ``d`` columns are used.
    labels : array-like of int, shape (n_points,)
        Cluster index (``0 .. k-1``) of every data point.
    k : int
        Number of clusters.
    d : int
        Number of coordinates per centroid.

    Returns
    -------
    numpy.ndarray, shape (k, d)
        Row ``j`` is the arithmetic mean of the points labelled ``j``. A
        cluster with no points is re-initialised with uniform random values
        in ``[0, 1)``, written to its own row so that row index and cluster
        label always agree.
    """
    points = np.asarray(dataSet, dtype=float)[:, :d]
    labels = np.asarray(labels)
    centroids = np.empty((k, d))
    for cluster in range(k):
        members = points[labels == cluster]
        if members.shape[0] > 0:
            centroids[cluster] = members.mean(axis=0)
        else:
            # Empty cluster: randomly re-initialise it in place.
            centroids[cluster] = np.random.random(d)
    return centroids

In [21]:
%timeit kmeansparallel_c(data,10,10,15)

NameError: name 'np' is not defined