In [None]:
import numpy as np
import pandas as pd

In [None]:
# Make Google Drive available under /content/drive so the CSV can be read from it.
# NOTE(review): force_remount=True presumably re-mounts even when Drive is already mounted — confirm in the Colab docs.
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [None]:
# Load data
csv_path = "/content/drive/My Drive/Colab Notebooks/Machine Learning/excelData/Clustering_Example_data.csv"
df = pd.read_csv(csv_path)

# Feature matrix: every column from 'x' through 'y'.
# np.array(...) makes a copy, so later in-place edits of `data` never touch `df`.
features = df.loc[:, 'x':'y']
data = np.array(features)

# Class label of each row, taken from the 'group' column.
Lebel = np.array(df['group'])

# **Clustering**

In [None]:
# Step 0 : min-max normalize every feature column to the range [0, 1].
# Work on a float copy: writing fractions into an integer array would truncate them to 0 or 1.
data = data.astype(np.float64)
col_min = data.min(axis=0)
col_range = data.max(axis=0) - col_min
# A constant column has range 0; dividing by 1 instead avoids 0/0 and leaves that column at 0.
col_range[col_range == 0] = 1.0
data = (data - col_min) / col_range

In [None]:
# Step 1 : initial centers
k = 3  # number of clusters
numP, dimP = data.shape  # number of points, number of features per point

# Fixed seed so the pseudo-random starting centers are the same on every run.
np.random.seed(3)
# Integers in [1, 999] scaled by 1/1000 give float64 coordinates in [0.001, 0.999],
# i.e. inside the [0, 1] range of the normalized data.
# (The previous float32 zeros array assigned to `center` was overwritten right here, so it is dropped.)
center = np.random.randint(1, 1000, (k, dimP)) / 1000

In [None]:
# Step 2 : clustering
# Alternate between assigning every point to its nearest center and moving each
# center to the mean of its points, until the assignment stops changing.
max_iterations = 100  # safety cap in case the assignment never stabilises

# -1 is never a valid cluster id, so the first assignment cannot be mistaken for
# "unchanged" (an all-zero start would stop early if every point landed in cluster 0).
cluster_prev = np.full(numP, -1, dtype=np.int32)
iterations = 0
while True:
    # Assignment step: Euclidean distance of every point to every center, shape (numP, k).
    diff = data[:, np.newaxis, :] - center[np.newaxis, :, :]
    dist = np.sqrt(np.sum(np.power(diff, 2), axis=2))
    # argmin yields exactly one index per point (the lowest one on ties);
    # int(np.where(dist == min)[0]) raised whenever two centers were equidistant.
    cluster = np.argmin(dist, axis=1).astype(np.int32)

    # Update step: move each center to the mean of its members.
    # A center with no members is left where it is.
    for i in range(k):
        idx_cluster = np.where(cluster == i)[0]
        if idx_cluster.size != 0:
            center[i] = np.mean(data[idx_cluster], axis=0)

    if np.array_equal(cluster_prev, cluster) or (iterations >= max_iterations):
        print('break in loop :', iterations)
        break

    cluster_prev = cluster.copy()
    iterations += 1

print('center : \n', center)
print(cluster)
print(np.bincount(cluster))

break in loop : 2
center : 
 [[0.75384615 0.16934337]
 [0.2        0.01669113]
 [0.86666667 0.85309092]]
[0 2 0 0 1 1 1 1 1 1 1 0 0 0 2 1 0 0 0 1 1 1 2 0 0 0 1 0 1 1]
[13 14  3]


## **One hot encoding**

In [None]:
# One-hot encode the cluster of every training point:
# row r gets a single 1 in column cluster[r].
n_points = cluster.shape[0]
X_cluster = np.zeros((n_points, k))
X_cluster[np.arange(n_points), cluster] = 1

# One-hot encode the class labels into 3 columns: label j goes to column j-1.
# NOTE(review): this presumes the labels are the integers 1..3 — confirm against the CSV.
n_labels = Lebel.shape[0]
Y_class = np.zeros((n_labels, 3))
Y_class[np.arange(n_labels), Lebel - 1] = 1

## **Find Pseudo inverse**

$A = X^{+}\cdot Y$, where $X^{+} = (X^{T}X)^{-1}X^{T}$ is the pseudo-inverse of $X$

In [None]:
# Least-squares map from one-hot clusters to one-hot classes: A = X^+ . Y
# np.linalg.pinv returns the Moore-Penrose pseudo-inverse. When X_cluster has full column
# rank it equals inv(X^T X) X^T, and it still works when a cluster is empty
# (an all-zero column makes X^T X singular, so np.linalg.inv would raise LinAlgError).
X_inv = np.linalg.pinv(X_cluster)    # find pseudo inverse
A = X_inv.dot(Y_class)               # row c holds the class weights for cluster c
A

array([[0.23076923, 0.46153846, 0.30769231],
       [0.42857143, 0.21428571, 0.35714286],
       [0.33333333, 0.33333333, 0.33333333]])

# **Test new Data**

In [None]:
# Raw (un-normalized) training features, needed for their per-column min and max.
# They get their own name so the normalized `data` used for clustering is not overwritten.
data_raw = np.array(df.loc[:, 'x':'y'])
X = np.array([[4, 35]], dtype=np.float32)   # new sample to classify, in the original units

# Normalize the sample with the same min-max scaling that was applied to the training data.
for i in range(X.shape[1]):
    col_min = np.min(data_raw[:, i])
    col_max = np.max(data_raw[:, i])
    X[:, i] = (X[:, i] - col_min)/(col_max - col_min)
print(X)
print(center)

[[0.6        0.04775931]]
[[0.75384615 0.16934337]
 [0.2        0.01669113]
 [0.86666667 0.85309092]]


**K-means clustering: assign the new point to its nearest center**

In [None]:
# Euclidean distance from the single new point X (shape (1, dimP)) to every cluster center.
dist = np.sqrt(np.sum(np.power(X - center, 2), axis=1))

# np.argmin returns exactly one index (the lowest one on ties);
# int(np.where(dist == min)[0]) raised whenever two centers were equidistant.
Xcluster = int(np.argmin(dist))
Xcluster

0

**One hot encoding**

In [None]:
# One-hot row for the new sample: zeros everywhere except a 1 in the column
# of the cluster it was assigned to.
n_samples = X.shape[0]
X_cluster = np.zeros((n_samples, k))
X_cluster[0, Xcluster] = 1
X_cluster

array([[1., 0., 0.]])

**Predict**

In [None]:
# Display A, the product of the pseudo-inverse of the one-hot cluster matrix and the one-hot class matrix.
A

array([[0.23076923, 0.46153846, 0.30769231],
       [0.42857143, 0.21428571, 0.35714286],
       [0.33333333, 0.33333333, 0.33333333]])

In [None]:
# Class scores of the new sample: the one-hot cluster row selects the matching row of A.
y_predict = X_cluster.dot(A)

# One-hot encode the predicted class, i.e. the column with the largest score in each row.
# The earlier np.where(np.max(y_predict)) tested a non-zero scalar and therefore always
# returned index 0, so class 0 was predicted regardless of the scores.
# The width comes from y_predict (number of classes), not from k (number of clusters).
Y_cluster = np.zeros(y_predict.shape)
Y_cluster[np.arange(y_predict.shape[0]), np.argmax(y_predict, axis=1)] = 1
Y_cluster

array([[1., 0., 0.]])