In [161]:
import numpy as np
import pandas as pd
from pandas import DataFrame
import math
from multiprocessing import Pool, cpu_count
from functools import partial

#distance square
def dist_sq(a, b, axis=0):
    return np.sum((a-b)**2)


def cost(data,centroid):
    return np.sum([min([dist_sq(d,centroid,axis=1)]) for d in data])
        

#calculate probability
def dist_prob_parallel(data,centroid,l):
    phi= cost(data,centroid)
    return np.array([min([dist_sq(d, centroid,axis=1)])*l/phi for d in data])


#step 2: calculate the cost and number of iterations(log(cost))
#def log_cost(data,centroid):
    #pool = Pool(processes=cpu_count())
    #min_dist_sq = partial(point_sq, centroid)
    #cost = np.sum(pool.map(min_dist_sq, data))
    #iteration=math.ceil(np.log(cost))
    #pool.close()
    #pool.terminate()
    #return iteration

    
    
#calculate weights
#step 4: assign the weights
def weight_prob(centroid,data):
    min_c = [np.argmin(dist_sq(centroid, d, axis = 1)) for d in data];
    ## number of points which is closest to each s in c
    num_closest = np.array([min_c.count(i) for i in range(len(centroid))]).astype(float);
    ## return normalized weight
    return num_closest/np.sum(num_closest)



#step 5: recluster the weighted points in C into k clusters
#reinitialize k centroids
def reassign_centroids(data,centroid,k,l,w):
    c_final = data[np.random.choice(range(len(centroid)),size=1,p=w),]
    data_final = centroid
    for i in range(k-1):
        new_prb = dist_prob_parallel(data_final,c_final,l) * w
        c_fin_add = data[np.random.choice(range(len(centroid)),size=1,p=new_prb/np.sum(new_prb)),]
        c_final = np.vstack((c_final,c_fin_add))
    return c_final

    

def kmeansparallel(data, k, l, r):
    #step 1: sample a point uniformly at random from X
    centroid=np.array(data[np.random.choice(range(len(data)),1),])
    
    #step 2: calculate number of iteration
    iteration= np.ceil(np.log(cost(data,centroid))).astype(int)  
    
    #step 3: Get initial Centroids C
    for round in range(r):
        for i in range(iteration):
            centroid_added = data[dist_prob_parallel(data,centroid,l)>np.random.uniform(size = len(data)),]
            centroid = np.vstack((centroid,centroid_added))  
    
    print(centroid)
    
    #step 4: calculate the weight probability
    w=weight_prob(centroid,data)
    
    print(w)
    
    #step 5: recluster the weighted points in C into k clusters
    #reinitialize k centroids
    final_centroids=reassign_centroids(data,centroid,k,l,w)
    
    return final_centroids

    

#with the initialization of the centroids from the function kmeansplusplus
#plug in the original data(dataSet), initializtions(initial) and the dimension of the data(d)
def kmeans(dataSet, initial, k, d):
    centroids=initial
    # Initialize book keeping vars.
    iterations = 0
    oldCentroids = np.zeros(initial.shape)
    
    # Run the main k-means algorithm
    while not shouldStop(oldCentroids, centroids, iterations):
        # Save old centroids for convergence test. Book keeping.
        oldCentroids = centroids
        iterations += 1
        
        # Assign labels to each datapoint based on centroids
        l= getLabels(dataSet, centroids)
        
        # Assign centroids based on datapoint labels
        centroids = getCentroids(dataSet, l, k, d)
        
    # We can get the labels too by calling getLabels(dataSet, centroids)
    return centroids, np.array(l)
# Function: Should Stop
# -------------
# Returns True or False if k-means is done. K-means terminates either
# because it has run a maximum number of iterations OR the centroids
# stop changing.
def shouldStop(oldCentroids, centroids, iterations):
    if iterations > 50: return True
    return oldCentroids.all == centroids.all
# Function: Get Labels
# -------------
# Returns a label for each piece of data in the dataset. 
def getLabels(dataSet, centroids):
    # For each element in the dataset, chose the closest centroid. 
    # Make that centroid the element's label.
    l=[]
    for i in range(dataSet.shape[0]):
        #arg min as the label
        l.append(np.argmin(list(dist_sq(dataSet[i],c) for c in centroids)))
    return l
# Function: Get Centroids
# -------------
# Returns k random centroids, each of dimension n.
def getCentroids(dataSet, labels, k, d):
    # Each centroid is the arithmetic mean of the points that
    # have that centroid's label. Important: If a centroid is empty (no points have
    data_new = DataFrame(dataSet.copy())
    data_new['Labels'] = labels
    data_new = np.array(data_new.groupby(['Labels']).mean().iloc[:,:d])
  
    return data_new
    

In [9]:
import simulatedData
from simulatedData import generate_data

In [162]:
sim_data=generate_data(4,10,2)
k=3
l=2*k
d=2
r=1
centroid=sim_data[np.random.choice(range(100),k),]

In [163]:
kmeansparallel(sim_data, k, l, r)

[[-6.88605923 -0.46102536]
 [-2.41023317  0.4073007 ]
 [ 0.20423863 -6.22064562]
 [ 2.60355633 -2.326925  ]
 [ 1.08738458 -4.17231211]
 [ 2.36820119 -5.36085607]
 [-2.94472194 -1.06849135]
 [-0.52447596 -3.76790967]
 [-2.51625372 -0.39641968]
 [-6.47632756 -1.85614733]
 [-3.51646295 -0.39246455]
 [-2.03316996 -0.30672732]
 [-2.02325758  1.32889932]
 [-6.89483529  0.16144599]
 [-3.14712922  1.51783058]
 [ 1.65506777 -4.21384562]
 [-0.48998038  2.18875639]
 [-5.52132854 -1.38781602]
 [-3.8603341   0.71786869]
 [-6.54565721 -0.96369736]
 [-3.26832223  1.31575341]
 [-5.79935235  0.07717894]
 [-7.54265793 -1.83507491]
 [-5.57038219 -0.02781417]
 [ 1.99092988 -2.86659645]
 [-6.13963641 -1.94841029]
 [-6.6847535  -1.76833252]
 [-5.8253138  -1.34543069]
 [-6.29877532 -1.57361754]
 [-1.80330597 -0.05279039]
 [-0.83817793  1.67138705]
 [ 0.51169587 -3.71022503]
 [-4.74297784 -1.39285099]
 [-4.04421043  0.80311203]
 [-5.52674902  0.02205338]
 [ 0.06145655 -0.12920348]
 [-3.24870311 -0.51487893]
 

array([[-2.28507195, -0.42850533],
       [-2.28507195, -0.42850533],
       [-2.28507195, -0.42850533]])