In [0]:
import pandas as pd
import numpy as np
import math
from scipy.spatial import distance

# Dataset

In [0]:
data=pd.read_csv(r'Absenteeism_at_work.csv')

In [0]:
data.shape

(740, 21)

In [0]:
data

Unnamed: 0,ID,Reason for absence,Month of absence,Day of the week,Seasons,Transportation expense,Distance from Residence to Work,Service time,Age,Work load Average/day,Hit target,Disciplinary failure,Education,Son,Social drinker,Social smoker,Pet,Weight,Height,Body mass index,Absenteeism time in hours
0,11,26,7,3,1,289,36,13,33,239554,97,0,1,2,1,0,1,90,172,30,4
1,36,0,7,3,1,118,13,18,50,239554,97,1,1,1,1,0,0,98,178,31,0
2,3,23,7,4,1,179,51,18,38,239554,97,0,1,0,1,0,0,89,170,31,2
3,7,7,7,5,1,279,5,14,39,239554,97,0,1,2,1,1,0,68,168,24,4
4,11,23,7,5,1,289,36,13,33,239554,97,0,1,2,1,0,1,90,172,30,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
735,11,14,7,3,1,289,36,13,33,264604,93,0,1,2,1,0,1,90,172,30,8
736,1,11,7,3,1,235,11,14,37,264604,93,0,3,1,0,0,1,88,172,29,4
737,4,0,0,3,1,118,14,13,40,271219,95,0,1,1,1,0,8,98,170,34,0
738,8,0,0,4,2,231,35,14,39,271219,95,0,1,2,1,0,2,100,170,35,0


In [0]:
data=data.values

# Initial seed selection processes

In [0]:
def seed_selection(n):
    k=[0]*n
    seed=np.random.randint(data.shape[0], size=1)
    for i in range(n):
        if i==0:
            k[0]=math.floor(seed)
        elif (i==1):
            k[1]=math.floor(k[0]/2)
        else:
            k[i]=math.floor(sum(k)/len(k))
    return(k)

In [0]:
#initial centroids(serial No)
k=seed_selection(2)
k

[677, 338]

In [0]:
dist_mat = np.zeros((len(k),740))

# Distance matrix

In [0]:
def eclid_distance(centroid):
    for i in range(len(centroid)):
        for j in range(data.shape[0]):
              dist_mat[i][j] =np.sqrt(np.sum(np.square(centroid[i]-data[j:j+1,])))       
    return(dist_mat)

# K-mean

In [0]:
def kmean(k):
    centroid=[]
    for i in range(len(k)):
        centroid.append(data[k[i]])
    dist_mat=eclid_distance(centroid)
    for i in range(21):
        x = [[] for _ in range(len(k))]
        for i in range(data.shape[0]):
            for j in range(len(k)):
                if (dist_mat[:,i])[j]==dist_mat[:,i].min():
                    x[j].append(data[i])
        for i in range(len(k)):
            x[i]=np.array(x[i])
            centroid[i]=x[i].mean(axis=0)
    return x,centroid

In [0]:
cls,centroid = kmean(k)

In [0]:
# clusters with their data items belonging to the respective clusters with 4th cluster containing maximum data items.
cls

[array([[ 11,  26,   7, ..., 172,  30,   4],
        [ 36,   0,   7, ..., 178,  31,   0],
        [  3,  23,   7, ..., 170,  31,   2],
        ...,
        [ 17,  25,   5, ..., 170,  22,   8],
        [ 14,  10,   5, ..., 196,  25,   8],
        [ 28,  11,   5, ..., 169,  24,   1]]),
 array([[  3,  23,  11, ..., 170,  31,   1],
        [ 28,  23,  11, ..., 169,  24,   1],
        [  3,  13,  11, ..., 170,  31,   8],
        ...,
        [  4,   0,   0, ..., 170,  34,   0],
        [  8,   0,   0, ..., 170,  35,   0],
        [ 35,   0,   0, ..., 175,  25,   0]])]

In [0]:
len(cls[0][0])

21

In [0]:
#final centroids
centroid

[array([1.67507987e+01, 2.04824281e+01, 5.97124601e+00, 3.84345048e+00,
        2.33865815e+00, 2.19571885e+02, 3.09169329e+01, 1.30191693e+01,
        3.69648562e+01, 2.38557556e+05, 9.54057508e+01, 4.47284345e-02,
        1.32268371e+00, 1.00638978e+00, 6.13418530e-01, 7.66773163e-02,
        6.03833866e-01, 7.96293930e+01, 1.71725240e+02, 2.70095847e+01,
        6.67412141e+00]),
 array([1.89461358e+01, 1.82880562e+01, 6.58313817e+00, 3.96721311e+00,
        2.69555035e+00, 2.22618267e+02, 2.86885246e+01, 1.22131148e+01,
        3.60725995e+01, 2.95630583e+05, 9.39882904e+01, 6.08899297e-02,
        1.26932084e+00, 1.02810304e+00, 5.33957845e-01, 7.02576112e-02,
        8.50117096e-01, 7.85995316e+01, 1.72400468e+02, 2.64332553e+01,
        7.10772834e+00])]

In [0]:
cls[0][0]

array([    11,     26,      7,      3,      1,    289,     36,     13,
           33, 239554,     97,      0,      1,      2,      1,      0,
            1,     90,    172,     30,      4])

In [0]:
#squared error for first cluster
sse_1 = 0
for i in range(len(cls[0])):
    sse_1 = sse_1 + np.square(distance.euclidean(centroid[0],cls[0][i]))

In [0]:
sse_1

50631916320.74126

In [0]:
#sqaured error for second cluster
sse_2 = 0
for i in range(len(cls[1])):
    sse_2 = sse_2 + np.square(distance.euclidean(centroid[1],cls[1][i]))

In [0]:
sse_2

488438799872.1359

In [0]:
#final squared error
sse = sse_1 + sse_2
sse

539070716192.8772

In [0]:

from sklearn.cluster import KMeans
import numpy as np

kmeans = KMeans(n_clusters=2, random_state=0).fit(data)

kmeans.cluster_centers_


array([[1.76805054e+01, 1.93880866e+01, 7.00722022e+00, 3.90613718e+00,
        2.56137184e+00, 2.22146209e+02, 2.99512635e+01, 1.25108303e+01,
        3.65902527e+01, 2.52753984e+05, 9.48176895e+01, 5.59566787e-02,
        1.32490975e+00, 1.00180505e+00, 5.75812274e-01, 6.67870036e-02,
        7.74368231e-01, 7.93122744e+01, 1.71873646e+02, 2.68574007e+01,
        6.74007220e+00],
       [1.90215054e+01, 1.87043011e+01, 4.29032258e+00, 3.94086022e+00,
        2.49462366e+00, 2.18897849e+02, 2.86774194e+01, 1.26827957e+01,
        3.60322581e+01, 3.27296059e+05, 9.39032258e+01, 4.83870968e-02,
        1.19354839e+00, 1.06989247e+00, 5.43010753e-01, 9.13978495e-02,
        6.61290323e-01, 7.82096774e+01, 1.72833333e+02, 2.61397849e+01,
        7.47311828e+00]])

In [0]:
#sum of squared errors
kmeans.inertia_

353637366625.63495