# K-Means Algorithm

This notebook presents a from-scratch implementation of the k-means algorithm, written by my incredible self!

The goal here is not to call scikit-learn's ready-made implementation directly, but rather to learn for myself how k-means works under the hood and which mechanisms drive it.

So now, let's go, baby!

In [68]:
import numpy
import sklearn
from sklearn import datasets

### Initialization ###

# Number of clusters to look for.
k = 3
# Fixed seed so that "Restart Kernel -> Run All" reproduces the same initial
# centroids (and therefore the same run) every time.
SEED = 42
# Retrieve the data: one row per datapoint, one column per feature.
X = datasets.load_iris().data
n_rows = X.shape[0]
# First initialization of mu (the matrix of centroids): k distinct rows of X
# drawn without replacement from a seeded generator.
rng = numpy.random.default_rng(SEED)
random_indices = rng.choice(n_rows, size=k, replace=False)
# Indexing with an integer array returns a copy, so updating mu later never
# modifies X. Shape: (k, number of features).
mu = X[random_indices, :]
# Assignment matrix: A[i, j] is 1 when datapoint i belongs to cluster j.
# It is all zeros until the first assignment step has run.
A = numpy.zeros((n_rows, k))
# distances[i, j] is the *squared* Euclidean distance between datapoint i and
# centroid j (the assignment step never takes the square root).
distances = numpy.zeros((n_rows, k))

def calc_affectation_matrix(debug: bool = False) -> numpy.ndarray:
    """Assign every datapoint to its nearest centroid.

    Reads the module-level ``X``, ``mu`` and ``k``. As a side effect, the
    module-level ``distances`` array is overwritten in place with the squared
    Euclidean distance from every datapoint to every centroid;
    ``loss_function`` reads that array afterwards.

    Args:
        debug: when True, print the distance matrix and the selected cluster
            indices, each followed by its shape.

    Returns:
        A new ``(X.shape[0], k)`` matrix of zeros with a single 1 per row, in
        the column of the closest centroid. Ties go to the lowest column
        index, which is how ``numpy.argmin`` resolves them.
    """
    # Broadcasting (n, 1, d) - (1, k, d) yields every datapoint/centroid
    # difference at once; summing the squares over the feature axis gives the
    # (n, k) squared distances. Slice-assign so the shared array is updated
    # in place instead of rebinding a local name.
    distances[:] = numpy.square(X[:, None, :] - mu[None, :, :]).sum(axis=2)
    # Print that just for debug purpose
    if debug:
        print(distances, distances.shape)

    # Index of the closest centroid for each datapoint.
    nearest = numpy.argmin(distances, axis=1)
    # Print that just for debug purpose
    if debug:
        print(nearest, nearest.shape)

    # One-hot encode the choice: row i gets a 1 in column nearest[i].
    assignment = numpy.zeros((X.shape[0], k))
    assignment[numpy.arange(X.shape[0]), nearest] = 1
    return assignment
    
def calc_centroïds_matrix():
    """Move every centroid to the weighted mean of its cluster.

    Reads the module-level ``A``, ``X``, ``k`` and ``n_rows`` and updates the
    module-level ``mu`` in place, row by row. Returns None.

    A centroid whose column of ``A`` sums to zero (an empty cluster) is left
    where it is. Dividing by that zero would write NaN into ``mu``, and every
    distance and loss computed from it afterwards would be NaN as well.
    """
    for j in range(k):
        # Column j of A holds the membership weight of every datapoint in
        # cluster j (0 or 1 when A is a one-hot assignment matrix).
        weights = A[:n_rows, j]
        cluster_weight = weights.sum()
        if cluster_weight == 0:
            # Empty cluster: keep the previous centroid instead of computing 0 / 0.
            continue
        weighted_sum = (weights[:, None] * X[:n_rows]).sum(axis=0)
        mu[j] = weighted_sum / cluster_weight
        
def loss_function():
    """Return the clustering loss for the current assignment.

    Adds up ``distances[i, j]`` for every pair ``(i, j)`` with ``i`` in
    ``range(n_rows)`` and ``j`` in ``range(k)`` whose ``A[i, j]`` is non-zero.
    All four names are module-level variables; the function only reads them,
    so the result reflects whatever ``distances`` currently holds.

    Returns:
        The accumulated sum, or the integer 0 when no entry of ``A`` is set.
    """
    assigned_distances = (
        distances[row, col]
        for row in range(n_rows)
        for col in range(k)
        if A[row, col]
    )
    # Start from the integer 0 and add left to right, in row-major order.
    return sum(assigned_distances, 0)

def verif():
    """Count the datapoints whose cluster agrees with their iris label.

    Cluster indices are arbitrary, so each cluster is first mapped to the
    label that occurs most often among its members; a datapoint counts as
    correct when its cluster's mapped label equals its own label.

    Reads the module-level assignment matrix ``A`` and reloads the iris
    targets through ``datasets.load_iris()``. The counting table is sized
    from ``A`` and from the labels rather than hard-coded to 3x3, so the
    check keeps working when ``k`` is changed.

    Returns:
        The number of correctly mapped datapoints, as an int.
    """
    predicted_clusters = numpy.argmax(A, axis=1)
    true_labels = datasets.load_iris().target
    n_clusters = A.shape[1]
    n_classes = int(true_labels.max()) + 1

    # votes[c, s]: how many datapoints of label s ended up in cluster c.
    votes = numpy.zeros((n_clusters, n_classes), dtype=int)
    for cluster, label in zip(predicted_clusters, true_labels):
        votes[cluster, label] += 1

    # Majority label of each cluster.
    cluster_to_label = numpy.argmax(votes, axis=1)
    return int(numpy.sum(cluster_to_label[predicted_clusters] == true_labels))
       
# k-means iterations: alternate the assignment step and the centroid update
# until the loss is exactly the same twice in a row.
# MAX_ITERATIONS is a safety net: a NaN loss never compares equal to itself,
# so without a cap a degenerate run would loop forever.
MAX_ITERATIONS = 100
# The loss is a sum of squares, so it can never equal this -1 sentinel; the
# first iteration therefore always runs.
previous_loss = -1
curr_loss = loss_function()
n_iterations = 0
while curr_loss != previous_loss and n_iterations < MAX_ITERATIONS:
    # curr_loss already holds loss_function() for the current state, so there
    # is no need to recompute it just for the printout.
    print(mu, curr_loss)
    print()
    A = calc_affectation_matrix()
    calc_centroïds_matrix()
    previous_loss, curr_loss = curr_loss, loss_function()
    n_iterations += 1

if curr_loss != previous_loss:
    print(f"Stopped after {MAX_ITERATIONS} iterations without convergence")

print(f"{verif()} predictions were correct out of {n_rows} datapoints")


[[4.4 2.9 1.4 0.2]
 [5.7 2.8 4.1 1.3]
 [7.9 3.8 6.4 2. ]] 0

[[5.006      3.428      1.462      0.246     ]
 [6.00394737 2.79210526 4.56842105 1.52631579]
 [7.07916667 3.125      5.975      2.15      ]] 180.8300000000001

[[5.006      3.428      1.462      0.246     ]
 [5.95588235 2.76470588 4.46323529 1.46176471]
 [6.9125     3.1        5.846875   2.13125   ]] 83.86535287780859

[[5.006      3.428      1.462      0.246     ]
 [5.93230769 2.75538462 4.42923077 1.43846154]
 [6.87428571 3.08857143 5.79142857 2.11714286]] 79.86398439527464

[[5.006      3.428      1.462      0.246     ]
 [5.9016129  2.7483871  4.39354839 1.43387097]
 [6.85       3.07368421 5.74210526 2.07105263]] 79.19714263977782

[[5.006      3.428      1.462      0.246     ]
 [5.9016129  2.7483871  4.39354839 1.43387097]
 [6.85       3.07368421 5.74210526 2.07105263]] 78.851441426146

134 predictions were correct out of 150 datapoints


In [51]:
import numpy as np

# Two small integer vectors used to try out the distance computation.
a = np.array([0, 0, 0, 1, 1, 2, 2])
b = np.array([2, 2, 2, 0, 0, 1, 1])

# Euclidean distance: square root of the sum of the squared differences.
delta = a - b
distance = np.sqrt(np.sum(np.square(delta)))

print("Distance euclidienne :", distance)


Distance euclidienne : 4.0
