In [2]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from scipy.spatial import distance

In [5]:
class myKMC:
    """Minimal k-means clustering (Lloyd's algorithm) over pandas DataFrames.

    Parameters
    ----------
    n_clusters : int, default 5
        Number of clusters / centroids to fit.
    max_iter : int, default 300
        Upper bound on assignment/update rounds. Guards against a loop that
        never settles (possible because empty clusters are re-seeded randomly).
    random_state : int or None, default None
        Seed for centroid initialisation and empty-cluster re-seeding. When
        ``None`` the global ``np.random`` state is used, so a caller-side
        ``np.random.seed(...)`` still controls the result.

    Attributes
    ----------
    centroids_ : ndarray of shape (n_clusters, n_features) or None
        Fitted centroids; ``None`` until ``fit`` has been called.
    centroid_ : list or ndarray
        Legacy alias kept for backward compatibility: an empty list before
        ``fit``, the same array as ``centroids_`` afterwards.
    """

    def __init__(self, n_clusters=5, max_iter=300, random_state=None):
        self.n_clusters_ = n_clusters
        self.max_iter_ = max_iter
        self.random_state_ = random_state
        self.centroids_ = None
        self.centroid_ = []

    def _rng(self):
        """Return the random source: global ``np.random`` or a seeded RandomState."""
        if self.random_state_ is None:
            return np.random
        return np.random.RandomState(self.random_state_)

    def fit(self, x):
        """Cluster the rows of ``x`` and store the resulting centroids.

        Parameters
        ----------
        x : pandas.DataFrame
            Numeric feature columns. A pre-existing ``'cluster'`` column is
            treated as a stale label and ignored.

        Returns
        -------
        pandas.DataFrame
            A new frame holding the feature columns plus a trailing
            ``'cluster'`` column with each row's label from the last
            assignment step. The input frame is not modified.

        Raises
        ------
        ValueError
            If ``n_clusters`` is not in ``[1, number of rows]`` or
            ``max_iter`` is smaller than 1.
        """
        # Strip any stale label column BEFORE touching the data, so centroids
        # and points always share the same dimensionality.
        features = x.drop(columns='cluster', errors='ignore')
        points = features.to_numpy(dtype=float)
        rows = points.shape[0]
        k = self.n_clusters_

        if not 1 <= k <= rows:
            raise ValueError(
                f"n_clusters={k} must be between 1 and the number of rows ({rows})"
            )
        if self.max_iter_ < 1:
            raise ValueError(f"max_iter={self.max_iter_} must be at least 1")

        rng = self._rng()
        # Sample distinct rows: duplicate starting centroids would leave
        # clusters empty from the very first assignment.
        centroids = points[rng.choice(rows, size=k, replace=False)]

        for _ in range(self.max_iter_):
            distances = distance.cdist(points, centroids, metric='euclidean')
            labels = np.argmin(distances, axis=1)

            updated = np.empty_like(centroids)
            for i in range(k):
                members = points[labels == i]
                if len(members) > 0:
                    updated[i] = members.mean(axis=0)
                else:
                    # Empty cluster: re-seed it on a random data point.
                    updated[i] = points[rng.randint(rows)]

            converged = np.allclose(centroids, updated)
            centroids = updated
            if converged:
                break

        self.centroids_ = centroids
        self.centroid_ = centroids  # legacy attribute name
        features['cluster'] = labels
        return features

    def predict(self, x_test):
        """Label each row of ``x_test`` with the index of its nearest centroid.

        Parameters
        ----------
        x_test : pandas.DataFrame
            Same feature columns as used in ``fit``. A pre-existing
            ``'cluster'`` column is ignored and overwritten in the result.

        Returns
        -------
        pandas.DataFrame
            A new frame with the feature columns plus a trailing ``'cluster'``
            column. The input frame is not modified.

        Raises
        ------
        AttributeError
            If the model has not been fitted yet.
        """
        if getattr(self, 'centroids_', None) is None:
            raise AttributeError(
                "myKMC has no fitted centroids; call fit() before predict()"
            )
        result = x_test.drop(columns='cluster', errors='ignore')
        distances = distance.cdist(
            result.to_numpy(dtype=float), self.centroids_, metric='euclidean'
        )
        result['cluster'] = np.argmin(distances, axis=1)
        return result

In [6]:
import pandas as pd
import numpy as np

# Toy training set: 100 rows of two uniform-random features.
# The columns are generated in order (x1 first, then x2).
df = pd.DataFrame({col: np.random.rand(100) for col in ('x1', 'x2')})

# Fit a 3-cluster model; fit() returns a labelled copy of the input frame.
model = myKMC(n_clusters=3)
df_clustered = model.fit(df)

# Five unseen points, labelled by nearest fitted centroid.
x_test = pd.DataFrame({col: np.random.rand(5) for col in ('x1', 'x2')})
x_test_clustered = model.predict(x_test)

print(x_test_clustered)


         x1        x2  cluster
0  0.748456  0.637989        0
1  0.080956  0.445371        2
2  0.641137  0.026645        2
3  0.464747  0.802802        1
4  0.767530  0.862277        1


In [7]:
# Show the training frame returned by model.fit(df): the original feature
# columns plus the 'cluster' label assigned to each row.
print(df_clustered)

          x1        x2  cluster
0   0.587980  0.473112        0
1   0.879784  0.436085        0
2   0.667363  0.532081        0
3   0.843891  0.626494        0
4   0.860840  0.641427        0
..       ...       ...      ...
95  0.732500  0.913380        1
96  0.759968  0.713898        0
97  0.983792  0.206162        0
98  0.713128  0.169179        0
99  0.046449  0.006657        2

[100 rows x 3 columns]
