In [1]:
import pandas as pd
import numpy as np
from random import randint

In [2]:
centroids=pd.DataFrame([(1,1.33,2.5),(2,3.33, 4)], columns=['id','x', 'y'])
centroids

Unnamed: 0,id,x,y
0,1,1.33,2.5
1,2,3.33,4.0


In [3]:
data=pd.DataFrame([(1,1,1.5),
                   (2,1,4.5),
                   (3,2,1.5),
                   (4,2,3.5),
                   (5,3,2.5),
                   (6,5,6)], columns=['id','x', 'y'])
data

Unnamed: 0,id,x,y
0,1,1,1.5
1,2,1,4.5
2,3,2,1.5
3,4,2,3.5
4,5,3,2.5
5,6,5,6.0


In [4]:
class Kmeans:
    def __init__(self, k, data):
        self.k=k
        self.initial_data = data.copy() 
        
        # initialize k centroids with mock values
        self.initial_centroids=pd.DataFrame(
            np.full((self.k,self.initial_data.shape[1]),-1),
            columns=self.initial_data.columns.values)
        self.initial_centroids['id']=np.arange(1,self.k+1)
        
        self.initial_data['k']=-1
        self.initial_data.columns += '_d'
        self.initial_centroids.columns += '_c'
        
        #create key to allow for join to get cartesian product
        self.initial_data['key']=1
        self.initial_centroids['key']=1
        
        self.data=self.initial_data.copy()
        self.centroids = self.initial_centroids.copy()
        
    
    def assign_centroids(self, dataframe, i):
        print('\n\nITER: '+str(i)+'\n\nPrzed assign: \n', self.data,'\n', self.centroids)
            
        # compute euclidan distance for all combinations
        df_tmp = dataframe.merge(self.centroids, how='outer')
        df_tmp['distance'] = df_tmp.apply(lambda x: np.linalg.norm(
            x[self.data.columns.values.tolist()[1:-2]].values
            -x[self.centroids.columns.values.tolist()[1:-1]].values),axis=1)

        # assign centroid that is closest to the record
        df_tmp=df_tmp.sort_values(['id_d','distance']).drop_duplicates(['id_d'],keep='first')
        df_tmp.reset_index(inplace=True)
        df_tmp['k_d']=df_tmp['id_c']
        df_tmp=df_tmp[self.data.columns.values.tolist()].copy()
        return df_tmp
    
    
    def calculate_centroids(self):
        cols=self.centroids.columns
        self.centroids=self.data.drop('id_d', axis=1).groupby(['k_d']).agg(['mean'])
        self.centroids.reset_index(inplace=True) 
        self.centroids.columns=cols
    
    
    def initialize_clusters(self, method='random'):
        #reset data and centroids
        self.data=self.initial_data.copy()
        self.centroids=self.initial_centroids.copy()
        
        if method == 'random':
            while any(init_k not in self.data['k_d'].values for init_k in range(1,self.k+1)): 
                self.data['k_d']=self.data['k_d'].apply(lambda x: randint(1,self.k))
                print(self.data)
                
        #calculate initial centroids       
        self.calculate_centroids()
        
    
    def Lloyd(self, iter):
        for i in range(iter):
            print('\n\nITER: '+str(i)+'\n\nPrzed assign: \n', self.data,'\n', self.centroids)
            
            self.data=self.assign_centroids(self.data, i)
            print('\nPo assign: \nDane\n', self.data)
            
            # update centroids
            self.calculate_centroids()
            print('\nPo calc cntroids: \nCentroidy\n', self.centroids)
            
    
    def McQuin(self):
        for index,row in self.data.iterrows():
            i=0
            # turn row of dataframe with updated point into a dataframe
            row = pd.DataFrame(row).T

            # assign centroid to updated point
            self.data.update(self.assign_centroids(row, i))
            print('\nPo assign: \nDane\n', self.data)
            
            # update centroids
            self.calculate_centroids()
            print('\nPo calc cntroids: \nCentroidy\n', self.centroids)


In [5]:
ex4 = Kmeans(2,data)

In [6]:
ex4.initialize_clusters()

   id_d  x_d  y_d  k_d  key
0     1    1  1.5    2    1
1     2    1  4.5    1    1
2     3    2  1.5    1    1
3     4    2  3.5    2    1
4     5    3  2.5    1    1
5     6    5  6.0    2    1


In [7]:
ex4.Lloyd(5)



ITER: 0

Przed assign: 
    id_d  x_d  y_d  k_d  key
0     1    1  1.5    2    1
1     2    1  4.5    1    1
2     3    2  1.5    1    1
3     4    2  3.5    2    1
4     5    3  2.5    1    1
5     6    5  6.0    2    1 
    id_c       x_c       y_c  key
0     1  2.000000  2.833333    1
1     2  2.666667  3.666667    1


ITER: 0

Przed assign: 
    id_d  x_d  y_d  k_d  key
0     1    1  1.5    2    1
1     2    1  4.5    1    1
2     3    2  1.5    1    1
3     4    2  3.5    2    1
4     5    3  2.5    1    1
5     6    5  6.0    2    1 
    id_c       x_c       y_c  key
0     1  2.000000  2.833333    1
1     2  2.666667  3.666667    1

Po assign: 
Dane
    id_d  x_d  y_d  k_d  key
0     1    1  1.5    1    1
1     2    1  4.5    2    1
2     3    2  1.5    1    1
3     4    2  3.5    1    1
4     5    3  2.5    1    1
5     6    5  6.0    2    1

Po calc cntroids: 
Centroidy
    id_c  x_c   y_c  key
0     1    2  2.25    1
1     2    3  5.25    1


ITER: 1

Przed assign: 
    id_d

In [11]:
ex4 = Kmeans(2,data)

In [12]:
ex4.initialize_clusters()

   id_d  x_d  y_d  k_d  key
0     1    1  1.5    1    1
1     2    1  4.5    1    1
2     3    2  1.5    1    1
3     4    2  3.5    1    1
4     5    3  2.5    1    1
5     6    5  6.0    1    1
   id_d  x_d  y_d  k_d  key
0     1    1  1.5    1    1
1     2    1  4.5    2    1
2     3    2  1.5    2    1
3     4    2  3.5    2    1
4     5    3  2.5    2    1
5     6    5  6.0    1    1


In [13]:
ex4.McQuin()



ITER: 0

Przed assign: 
    id_d  x_d  y_d  k_d  key
0     1    1  1.5    1    1
1     2    1  4.5    2    1
2     3    2  1.5    2    1
3     4    2  3.5    2    1
4     5    3  2.5    2    1
5     6    5  6.0    1    1 
    id_c  x_c   y_c  key
0     1    3  3.75    1
1     2    2  3.00    1

Po assign: 
Dane
    id_d  x_d  y_d  k_d  key
0   1.0  1.0  1.5  2.0  1.0
1   2.0  1.0  4.5  2.0  1.0
2   3.0  2.0  1.5  2.0  1.0
3   4.0  2.0  3.5  2.0  1.0
4   5.0  3.0  2.5  2.0  1.0
5   6.0  5.0  6.0  1.0  1.0

Po calc cntroids: 
Centroidy
    id_c  x_c  y_c  key
0   1.0  5.0  6.0  1.0
1   2.0  1.8  2.7  1.0


ITER: 0

Przed assign: 
    id_d  x_d  y_d  k_d  key
0   1.0  1.0  1.5  2.0  1.0
1   2.0  1.0  4.5  2.0  1.0
2   3.0  2.0  1.5  2.0  1.0
3   4.0  2.0  3.5  2.0  1.0
4   5.0  3.0  2.5  2.0  1.0
5   6.0  5.0  6.0  1.0  1.0 
    id_c  x_c  y_c  key
0   1.0  5.0  6.0  1.0
1   2.0  1.8  2.7  1.0

Po assign: 
Dane
    id_d  x_d  y_d  k_d  key
0   2.0  1.0  4.5  2.0  1.0
1   2.0  1.0  4.5  