In [1]:
import random
import numpy as np
import pandas as pd
from math import ceil, inf, sqrt
import plotly.express as px
from sklearn.datasets import make_blobs
from scipy.spatial.distance import cdist

In [3]:
# X, y_true = make_blobs(n_samples = 400, centers = 5, cluster_std = 0.60, random_state = 21, center_box = (2, 6))
# X, y_true = make_blobs(n_samples = 400, centers = 4, cluster_std = 0.60, random_state = 21)
# df = pd.DataFrame(data = {'col1': X[:, 0], 'col2': X[:, 1]})

In [114]:
petals = np.random.normal(4.5, 2, size=(1, 140))[0]
petals_count = np.random.normal(15, 8, size=(1, 140))[0]
y = ['rose' for i in range(140)]

petals = np.append(petals, np.random.normal(4, 1.5, size=(1, 120))[0])
petals_count = np.append(petals_count, np.random.normal(6, 1, size=(1, 120))[0])
y += ['tulpan' for i in range(120)]

petals = np.append(petals, np.random.normal(13, 2, size=(1, 100))[0])
petals_count = np.append(petals_count, np.random.normal(6, 0.5, size=(1, 100))[0])
y += ['gladiolus' for i in range(100)]

petals = np.append(petals, np.random.normal(1.5, 1, size=(1, 120))[0])
petals_count = np.append(petals_count, np.random.normal(7.5, 1.5, size=(1, 120))[0])
y += ['fialk' for i in range(120)]



In [115]:
np.random.normal(3, 2.5, size=(2, 4))

array([[-0.22173957,  6.46893805,  7.38177934,  1.45285973],
       [ 0.79372263,  4.06598   ,  2.71488572,  4.13198408]])

In [116]:
df = pd.DataFrame(data = {'petals': petals, 'petals_count': petals_count})

In [117]:
df.head()

Unnamed: 0,petals,petals_count
0,2.81382,36.235939
1,4.758918,5.332751
2,5.957011,26.40017
3,1.884869,15.914858
4,5.530425,0.995912


In [118]:
class K_means:
    def __init__(self,
                 K: int,
                 data: pd.DataFrame,
                 feat: int,
                 iter_n = None,
                 dist_f = None,
                 power = None) -> np.array:
        self.K = K
        self.data = data.copy()
        self.feat = feat
        self.iter_n = iter_n
        self.dist_f = dist_f
        self.power = power
        
        
    def centroid(self):
        centr = []
        for k in range(self.K):
            centr.append(np.array(self.data.iloc[random.randint(0, self.data.shape[0])].tolist()))        
        return centr    
    
    
    def evclid(self, point):
        dist = inf 
        point = np.array(point)
        for c in self.centr:
            dist_p = sqrt(sum((point-c)**2))
            if dist_p < dist:
                dist = dist_p
                cluster = c
        return np.where(self.centr == cluster)[0][0] 
    
    
    def rootevclid(self, point):
        dist = inf 
        point = np.array(point)
        for c in self.centr:
            dist_p = sum((point-c)**2)
            if dist_p < dist:
                dist = dist_p
                cluster = c
        return np.where(self.centr == cluster)[0][0]  
    
    
    def chebyshev(self, point):
        dist = inf 
        point = np.array(point)
        for c in self.centr:
            dist_p = abs(point - c).max()
            if dist_p < dist:
                dist = dist_p
                cluster = c
        return np.where(self.centr == cluster)[0][0]  
    
    
    def powerful(self, point):
        dist = inf 
        point = np.array(point)
        for c in self.centr:
            dist_p = sqrt(sum((point-c)**self.power))
            if dist_p < dist:
                dist = dist_p
                cluster = c
        return np.where(self.centr == cluster)[0][0] 
    

    def optim_centr(self):
        f = [i for i in range(self.feat)]
        
        if self.dist_f == 'evclid' or self.dist_f is None:
            self.data['cluster'] = self.data.apply(lambda x: self.evclid(x.iloc[f].values.flatten().tolist()), axis=1)
        elif self.dist_f == 'rootevclid':
            self.data['cluster'] = self.data.apply(lambda x: self.rootevclid(x.iloc[f].values.flatten().tolist()), axis=1)
        elif self.dist_f == 'chebyshev':
            self.data['cluster'] = self.data.apply(lambda x: self.chebyshev(x.iloc[f].values.flatten().tolist()), axis=1)
        elif self.dist_f == 'powerful' and self.power:
            self.data['cluster'] = self.data.apply(lambda x: self.powerful(x.iloc[f].values.flatten().tolist()), axis=1)
        else:
            assert("Please write power for function in arg 'power'")
        return self.data 
    
    
    def new_centroid(self):
        new_center = []
        for k in range(self.K):
            new_c = []
            for f in range(self.feat):
                c = self.data[self.data['cluster'] == k].iloc[:,f].mean()
                if c is np.nan:
                    new_c.append(self.data.iloc[random.randint(0, self.data.shape[0]-1), f].tolist()) 
                else:
                    new_c.append(c)
                
            new_center.append(np.array(new_c))
        return np.array(new_center)
        
            
    def fit(self):
        self.centr = self.centroid()
        if self.iter_n:
            for i in range(self.iter_n):
                self.optim_centr()
                self.centr = self.new_centroid()
            return self.centr
        else:
            while True:
                self.optim_centr()
                new_centr = self.new_centroid()
                if np.array_equal(new_centr, self.centr):
                    self.data = self.data.drop(['cluster'], axis=1)
                    return self.centr
                else:
                    self.centr = new_centr
                    
                    
    def plot(self, x, y, y_true):
        fig = px.scatter(self.data, x=x, y=y, color = y_true)
        fig.add_scatter(x=self.centr[:,0], y=self.centr[:,1], mode="markers", marker=dict(size=10))
        fig.show()


In [119]:
distortions = []
for k in range(1, 10):
    # Building and fitting the model
    kmeanModel =  K_means(k, df, 2)
    kmeanModel.fit()
  
    distortions.append(sum(np.min(cdist(df, kmeanModel.centr,
                                        'euclidean'), axis=1)) / df.shape[0])
fi = px.line(x=[i for i in range(1,10)], y=distortions)
fi.show()

In [126]:
k_mean = K_means(4, df, 2)
k_mean.fit();

In [127]:
k_mean.plot('petals', 'petals_count', y)

In [215]:
def centroid(K, feat, data):
    centr = []
    c = []
    for k in range(K):
        for f in range(feat):
            maximum = data.iloc[:,[f]].max()
            minimum = data.iloc[:,[f]].min()
            c.append(random.randint(ceil(minimum), ceil(maximum)))
        centr.append(np.array(c))
        c = []
    return centr

    

In [216]:
def nearest_center(center, point):
    dist = math.inf 
    point = np.array(point)
    for c in center:
        dist_p = sum((point-c)**2)
        if dist_p < dist:
            dist = dist_p
            cluster = c
    return np.where(center == cluster)[0][0]    


        

In [217]:
def optim_centr(center, data):
    f = [i for i in range(2)]
    data['cluster'] = data.apply(lambda x: nearest_center(center, x.iloc[f].values.flatten().tolist()), axis=1)
    
    return data


In [292]:
def new_centroid(data, K, feat):
    new_center = []
    for k in range(K):
        c = []
        for f in range(feat):
            c = df[df['cluster'] == k].iloc[:,f].mean()
            if c:
                c_new.append(c)
            else:
                
        new_center.append(c_new)
    return np.array(new_center)
            
    

IndentationError: expected an indented block after 'else' statement on line 9 (1832634860.py, line 11)