In [1]:
#distance square
def dist_sq(a, b):
    """Return the sum of squared element-wise differences between a and b."""
    delta = a - b
    return np.sum(delta ** 2)
#minimum distance square for every point to the centroid
def point_sq(data,centroid):
    """Return, for every row of data, the squared distance to its nearest centroid.

    data     -- 2-D array with one point per row
    centroid -- either a single point (1-D) or a 2-D array with one centroid per row

    Returns a list with one squared distance per row of data.
    """
    points = np.asarray(data)
    if points.shape[0] == 0:
        return []
    # a lone centroid arrives as a 1-D vector; promote it to a single row so
    # that we compare against whole centroids and never against bare coordinates
    centers = np.atleast_2d(np.asarray(centroid))
    # broadcast to (n_points, n_centroids, n_features), then reduce over features
    diffs = points[:, np.newaxis, :] - centers[np.newaxis, :, :]
    sq = np.sum(diffs ** 2, axis=2)
    return list(sq.min(axis=1))
        
#calculate probability
def dist_prob(Dist):
    """Normalise non-negative weights into a probability vector.

    Dist -- sequence or array of weights (squared distances)

    Returns a float array of the same length that sums to 1. If every weight
    is 0 the result is uniform, because dividing by the zero sum would give
    NaN probabilities. Empty input yields an empty array.
    """
    weights = np.asarray(Dist, dtype=float)
    if weights.size == 0:
        return weights
    total = weights.sum()
    if total == 0:
        # every point coincides with a centroid: no point is preferred
        return np.full(weights.shape, 1.0 / weights.size)
    return weights / total

In [2]:

def kmeansplusplus(data, k, d):
    """Choose k initial centroids from the rows of data with k-means++ seeding.

    data -- 2-D array with one point per row (not modified)
    k    -- number of centroids to pick, 1 <= k <= number of rows
    d    -- dimension of the data; kept for interface compatibility, the
            dimension actually used is data.shape[1]

    Returns an array of shape (k, data.shape[1]) whose rows are k distinct
    rows of data. Raises ValueError when k is outside the valid range.
    """
    points = np.asarray(data)
    n_points = points.shape[0]
    if k < 1 or k > n_points:
        raise ValueError("k must be between 1 and the number of data points")
    # distances are computed in floating point; the result keeps data's dtype
    coords = points.astype(float)

    #step 1: sample a point uniformly at random from X
    chosen = [int(np.random.choice(n_points))]
    #squared distance from every point to its nearest chosen centroid
    nearest_sq = np.sum((coords - coords[chosen[0]]) ** 2, axis=1)

    #step 2: while |C| < k, sample x from X with probability D(x)^2 / sum of D^2
    for _ in range(k - 1):
        weights = nearest_sq.copy()
        #a row that is already a centroid must never be drawn again
        weights[chosen] = 0.0
        total = weights.sum()
        if total > 0:
            #p must be passed by keyword: the third positional argument of
            #np.random.choice is `replace`, not the probability vector
            index = int(np.random.choice(n_points, p=weights / total))
        else:
            #every remaining point coincides with a centroid: pick uniformly
            taken = set(chosen)
            remaining = [i for i in range(n_points) if i not in taken]
            index = int(np.random.choice(remaining))
        chosen.append(index)
        #refresh the nearest-centroid distances with the new centroid
        newest_sq = np.sum((coords - coords[index]) ** 2, axis=1)
        nearest_sq = np.minimum(nearest_sq, newest_sq)
    return points[chosen]


In [3]:
#with the initialization of the centroids from the function kmeansplusplus
#plug in the original data (dataSet), the initial centroids (initial) and the dimension of the data (d)
def kmeans(dataSet, initial, d):
    """Run Lloyd's k-means starting from the given initial centroids.

    dataSet -- 2-D array with one point per row
    initial -- array of starting centroids, one per row; its row count is k
    d       -- dimension of the data, forwarded to getCentroids

    Returns (centroids, labels): the final centroid array and an array with
    the index of the nearest returned centroid for every row of dataSet.
    """
    centroids = initial
    k = centroids.shape[0]
    # Initialize book keeping vars.
    iterations = 0
    # NaN never compares equal to anything, so this sentinel cannot be
    # mistaken for a converged state and the loop body runs at least once.
    oldCentroids = np.full(initial.shape, np.nan)

    # Run the main k-means algorithm
    while not shouldStop(oldCentroids, centroids, iterations):
        # Save old centroids for convergence test. Book keeping.
        oldCentroids = centroids
        iterations += 1

        # Assign labels to each datapoint based on centroids
        labels = getLabels(dataSet, centroids)

        # Assign centroids based on datapoint labels
        centroids = getCentroids(dataSet, labels, k, d)

    # Label against the centroids that are actually returned; the labels
    # from the last loop pass belong to the previous centroids.
    labels = getLabels(dataSet, centroids)
    return centroids, np.array(labels)

In [4]:
# Function: Should Stop
# -------------
# Returns True if k-means is done, False otherwise. K-means terminates either
# because it has run a maximum number of iterations OR the centroids
# stop changing.
def shouldStop(oldCentroids, centroids, iterations, max_iterations=50):
    """Return True when k-means should terminate.

    oldCentroids   -- centroids from the previous iteration
    centroids      -- centroids from the current iteration
    iterations     -- number of iterations performed so far
    max_iterations -- iteration cap; stop once iterations exceeds it

    Stops when the cap is exceeded or when the two centroid arrays have the
    same shape and identical values.
    """
    if iterations > max_iterations:
        return True
    # compare the values; `a.all == b.all` would only compare bound methods
    return bool(np.array_equal(oldCentroids, centroids))


In [5]:
# Function: Get Labels
# -------------
# Returns a label for each piece of data in the dataset. 
def getLabels(dataSet, centroids):
    """Return, for every row of dataSet, the index of its closest centroid.

    dataSet   -- 2-D array with one point per row
    centroids -- 2-D array with one centroid per row

    Returns a list with one label per row of dataSet. Ties go to the
    centroid with the lowest index.
    """
    # use the dataSet argument, never a module-level variable
    points = np.asarray(dataSet)
    if points.shape[0] == 0:
        return []
    centers = np.atleast_2d(np.asarray(centroids))
    # squared distances, shape (n_points, n_centroids)
    sq = np.sum((points[:, np.newaxis, :] - centers[np.newaxis, :, :]) ** 2, axis=2)
    #arg min as the label
    return list(np.argmin(sq, axis=1))

            


In [6]:
# Function: Get Centroids
# -------------
# Returns k centroids, each the mean of the points carrying its label.
def getCentroids(dataSet, labels, k, d):
    """Recompute the k centroids as the mean of the points assigned to each.

    dataSet -- 2-D array with one point per row
    labels  -- one cluster index per row of dataSet
    k       -- number of centroids
    d       -- dimension of the data; kept for interface compatibility, the
               dimension actually used is the column count of dataSet

    Returns a float array of shape (k, n_features). Row j always belongs to
    label j, so the labels stay valid as indices into the result. A cluster
    with no points is re-initialized uniformly at random in [0, 1).
    """
    points = np.asarray(dataSet, dtype=float)
    assignments = np.asarray(labels)
    n_features = points.shape[1]
    centroids = np.empty((k, n_features))
    for j in range(k):
        members = points[assignments == j]
        if len(members):
            # Each centroid is the arithmetic mean of the points that
            # have that centroid's label.
            centroids[j] = members.mean(axis=0)
        else:
            # if a centroid is empty, reinitialize it in place so that the
            # remaining centroids keep their row positions
            centroids[j] = np.random.random(n_features)
    return centroids
    

In [7]:
#test 
# 100 random 2-D points in the unit square: two independent uniform draws,
# one per coordinate, placed side by side as columns
data = np.column_stack([np.random.random(100), np.random.random(100)])


In [8]:
# seed 8 centroids for the 2-D test data with k-means++ ...
initial = kmeansplusplus(data, k=8, d=2)
# ... then refine them with k-means; the (centroids, labels) pair is the cell's result
kmeans(dataSet=data, initial=initial, d=2)

(array([[ 0.71918083,  0.86783032],
        [ 0.2309956 ,  0.41364373],
        [ 0.39628784,  0.87422448],
        [ 0.88068577,  0.45512549],
        [ 0.10535794,  0.11895624],
        [ 0.67916171,  0.18225129],
        [ 0.54517364,  0.59593828],
        [ 0.10850523,  0.81978801]]),
 array([6, 6, 4, 4, 1, 2, 6, 2, 1, 2, 7, 1, 1, 0, 5, 5, 7, 7, 6, 6, 2, 0, 1,
        3, 7, 3, 5, 0, 6, 5, 0, 5, 2, 5, 0, 1, 2, 6, 5, 4, 3, 1, 1, 2, 3, 2,
        5, 2, 1, 1, 1, 7, 7, 5, 7, 5, 6, 5, 6, 4, 0, 1, 2, 0, 1, 4, 3, 6, 2,
        1, 6, 6, 5, 7, 7, 1, 5, 4, 0, 1, 2, 0, 3, 5, 7, 7, 6, 1, 4, 6, 5, 3,
        4, 0, 2, 5, 4, 1, 7, 3]))