# Library Import

In [1]:
import numpy as np
import pandas as pd
import copy

# Loading the data

In [2]:
# Read the banknote CSV from the working directory and preview its first rows.
data = pd.read_csv(filepath_or_buffer="banknote-dataset.csv")
data.head(5)

Unnamed: 0,variance of Wavelet Transformed image,skewness of Wavelet Transformed image,curtosis of Wavelet Transformed image,entropy of image,class
0,3.6216,8.6661,-2.8073,-0.44699,0
1,4.5459,8.1674,-2.4586,-1.4621,0
2,3.866,-2.6383,1.9242,0.10645,0
3,3.4566,9.5228,-4.0112,-3.5944,0
4,0.32924,-4.4552,4.5718,-0.9888,0


In [3]:
# Positional split: columns 0-3 become the feature matrix, column 4 the target.
# NOTE(review): this assumes the frame holds exactly four feature columns
# followed by the label -- confirm against the CSV layout, since the preview
# above appears to show a leading unnamed index column.
x, y = data.iloc[:, 0:4], data.iloc[:, 4]

# K-means Algorithm from Scratch

In [5]:
class K_means:
    """K-means clustering (Lloyd's algorithm) implemented with NumPy.

    Each iteration assigns every point to its nearest centroid and then
    recomputes each centroid as the mean of its members. Iteration stops
    when the variance of *every* cluster changes by at most ``threshold``
    between two consecutive iterations, or after ``max_iter`` iterations.

    Parameters
    ----------
    dim : int
        Number of features per point. Stored for reference only; it is not
        enforced against the data passed to ``predict``.
    k : int
        Number of clusters.
    threshold : float
        Convergence tolerance on the per-cluster variance change (its
        absolute value is used).
    max_iter : int, optional
        Upper bound on the number of assign/update iterations.

    Attributes
    ----------
    k_info : dict
        ``{cluster_id: {'mean': centroid, 'var': variance}}`` for each cluster.
    data : numpy.ndarray
        Set by ``predict``: the input points with the cluster id appended as
        the last column.
    """

    def __init__(self, dim, k, threshold, max_iter=100000):
        self.dim = dim
        self.k = k
        self.threshold = threshold
        self.max_iter = max_iter
        self.__create_cluster()

    def __create_cluster(self):
        """(Re)initialise the per-cluster statistics to zero."""
        self.k_info = {cluster_id: {'mean': 0, 'var': 0}
                       for cluster_id in range(self.k)}

    def predict(self, data):
        """Cluster ``data`` and return it with the cluster ids appended.

        Parameters
        ----------
        data : pandas.DataFrame or array-like of shape (n_points, n_features)
            Numeric points to cluster. A 1-D input is treated as a single
            feature column.

        Returns
        -------
        numpy.ndarray of shape (n_points, n_features + 1)
            The input points as floats; the last column holds the cluster id
            (stored as a float) assigned to each point.

        Raises
        ------
        ValueError
            If ``k`` is smaller than 1 or larger than the number of points.

        Notes
        -----
        Initial centroids are drawn from NumPy's global random state; call
        ``np.random.seed`` beforehand for reproducible results.
        """
        points = np.asarray(data, dtype=float)
        if points.ndim == 1:
            points = points.reshape(-1, 1)
        n_points = len(points)
        if not 1 <= self.k <= n_points:
            raise ValueError(
                "k must be between 1 and the number of points "
                "(k=%r, points=%d)" % (self.k, n_points)
            )

        # Start from clean statistics so a second predict() call is not
        # compared against variances left over from a previous run.
        self.__create_cluster()
        self.data = np.hstack((points, np.zeros((n_points, 1))))

        # Sample k *distinct* rows as initial centroids; sampling with
        # replacement could start two clusters on the very same point.
        seeds = np.random.choice(n_points, self.k, replace=False)
        centeroids = points[seeds].copy()

        for _ in range(int(self.max_iter)):
            # Assignment step: (n_points, k) distance table, nearest wins.
            dist = self.__get_dist(points[:, None, :], centeroids[None, :, :])
            self.data[:, -1] = np.argmin(dist, axis=1)

            # Update step: means are recomputed once per pass, after every
            # point has been assigned.
            self.__calc_mean(centeroids)
            for cluster_id in range(self.k):
                centeroids[cluster_id] = self.k_info[cluster_id]['mean']

            # Snapshot the previous variances before overwriting them.
            hist = {cluster_id: dict(info)
                    for cluster_id, info in self.k_info.items()}
            self.__update_var()
            if self.__check_var(hist):
                break
        return self.data

    def __calc_mean(self, fallback):
        """Store each cluster's centroid in ``k_info``.

        A cluster that currently has no members keeps ``fallback[cluster_id]``
        (its previous centroid) instead of becoming NaN.
        """
        labels = self.data[:, -1]
        for cluster_id in range(self.k):
            members = self.data[labels == cluster_id, :-1]
            if len(members):
                self.k_info[cluster_id]['mean'] = members.mean(axis=0)
            else:
                self.k_info[cluster_id]['mean'] = fallback[cluster_id].copy()

    def __update_var(self):
        """Store each cluster's variance (over all member values, flattened).

        An empty cluster gets variance 0.0 so the convergence test stays
        well-defined.
        """
        labels = self.data[:, -1]
        for cluster_id in range(self.k):
            members = self.data[labels == cluster_id, :-1]
            self.k_info[cluster_id]['var'] = np.var(members) if len(members) else 0.0

    def __check_var(self, hist):
        """Return True only if every cluster's variance moved by <= threshold."""
        tolerance = np.abs(self.threshold)
        return all(
            np.abs(self.k_info[cluster_id]['var'] - hist[cluster_id]['var']) <= tolerance
            for cluster_id in range(self.k)
        )

    def __get_dist(self, p1, p2):
        """Euclidean distance along the last axis; inputs may broadcast."""
        return np.sqrt(np.sum(np.power((p1 - p2), 2), axis=-1))


# Initialization and predictions

In [7]:
# K_means draws its initial centroids from NumPy's global random state, so
# seed it to make this cell's clustering reproducible across runs.
np.random.seed(0)

predictor = K_means(4,2,1e-10)

# predict() returns the points with the cluster id appended as the last
# column; keep only that column.
res = predictor.predict(x)[:,-1]

In [8]:
from sklearn.metrics import confusion_matrix

# Cross-tabulate the labels in y (rows) against the scratch model's cluster
# ids (columns). Cluster ids are arbitrary, so cluster 0 need not be class 0.
confusion_matrix(y_true=y, y_pred=res)

array([[570, 192],
       [340, 270]])

# Using scikit-learn's KMeans

In [10]:
from sklearn.cluster import KMeans

# Reference clustering with scikit-learn: two clusters, random centroid
# initialisation, and a fixed random_state so the run is repeatable.
km = KMeans(n_clusters=2,
            init='random',
            n_init=10,
            max_iter=300,
            tol=1e-04,
            random_state=0)

# Fit on the feature matrix and get one cluster id per row.
y_km = km.fit_predict(x)

In [11]:
# Same cross-tabulation as for the scratch model, now for scikit-learn's cluster ids.
confusion_matrix(y_true=y, y_pred=y_km)

array([[570, 192],
       [340, 270]])