In [1]:
import numpy as np
import math
import copy

# Week 1: Introduction to Clustering Algorithms

In [2]:
'''
Code Challenge: Implement the FarthestFirstTraversal clustering heuristic.
Input: Integers k and m followed by a set of points Data in m-dimensional space.
Output: A set Centers consisting of k points (centers) resulting from applying FarthestFirstTraversal(Data, k), where the first point from Data is chosen as the first center to initialize the algorithm.
'''

def Euclidean_Distance(point1, point2):
    """Return the Euclidean (straight-line) distance between two points.

    Coordinates are paired positionally with ``zip``, so when the two points
    have different lengths only the leading coordinates they share are used.

    Parameters
    ----------
    point1, point2 : iterables of numbers
        Coordinates of the two points.

    Returns
    -------
    float
        Square root of the summed squared coordinate differences.
    """
    squared_gaps = ((a - b) ** 2 for a, b in zip(point1, point2))
    return math.sqrt(sum(squared_gaps))

def Farthest_First_Traversal(k, m, points_data):
    """Choose k cluster centers with the farthest-first traversal heuristic.

    The first point of the data set is the first center. Every further center
    is the data point whose distance to its nearest already-chosen center is
    largest; on ties the point that appears first in the data wins.

    Parameters
    ----------
    k : int
        Number of centers to return.
    m : int
        Dimension of the points. Kept for compatibility with the challenge
        input format; the value itself is not used.
    points_data : str or sequence of points
        Either text with one point per line and whitespace-separated
        coordinates (blank lines are ignored), or an already-parsed sequence
        of points.

    Returns
    -------
    list
        The chosen centers in selection order. Empty when ``k < 1`` or when
        there are no points. Once every point lies at distance 0 from some
        center, any remaining slots are filled with the first data point.
    """
    if isinstance(points_data, str):
        # split() with no argument tolerates repeated spaces and tabs; the
        # filter drops blank lines such as the one left by a trailing newline.
        points = [
            list(map(float, line.split()))
            for line in points_data.split('\n')
            if line.strip()
        ]
    else:
        # Already-parsed input is used as-is.
        points = points_data

    if k < 1 or len(points) == 0:
        return []

    cluster_centers = [points[0]]

    # nearest[i] caches the distance from points[i] to its closest chosen
    # center, so each round only needs distances to the newest center.
    nearest = [Euclidean_Distance(cluster_centers[0], point) for point in points]

    while len(cluster_centers) < k:
        # max() returns the first index attaining the maximum, which keeps
        # the "earliest point wins ties" rule.
        farthest_index = max(range(len(points)), key=nearest.__getitem__)
        new_center = points[farthest_index]
        cluster_centers.append(new_center)
        nearest = [
            min(known, Euclidean_Distance(new_center, point))
            for known, point in zip(nearest, points)
        ]

    return cluster_centers

# Test: sample input for the challenge above (k = 3 centers requested,
# seven points with m = 2 coordinates each, one point per line).
# The bare call on the last line leaves its return value as the cell output.
k, m = 3,2
points_data = '''0.0 0.0
5.0 5.0
0.0 5.0
1.0 1.0
2.0 2.0
3.0 3.0
1.0 2.0'''

Farthest_First_Traversal(k, m, points_data)

[[0.0, 0.0], [5.0, 5.0], [0.0, 5.0]]

In [3]:
'''
Code Challenge: Solve the Squared Error Distortion Problem.
Input: Integers k and m, followed by a set of centers Centers and a set of points Data.
Output: The squared error distortion Distortion(Data, Centers).
'''

def Squared_Error_Distortion(points_data, centers_data):
    """Compute the squared error distortion of a set of points.

    The distortion is the mean, over all points, of the squared distance from
    each point to its nearest center.

    Parameters
    ----------
    points_data : str or sequence of points
        The data points.
    centers_data : str or sequence of points
        The cluster centers.

        Both arguments accept either text (one point per line with
        whitespace-separated coordinates; blank lines are ignored) or an
        already-parsed sequence of points, which is used as-is.

    Returns
    -------
    float
        The distortion. It is infinite when no center yields a comparable
        distance (for example when there are no centers at all).

    Raises
    ------
    ValueError
        If there are no points, or a coordinate cannot be parsed as a float.
    """
    def _as_points(data):
        # Text input: split() with no argument tolerates repeated spaces and
        # tabs, and the filter drops blank lines (e.g. a trailing newline).
        if isinstance(data, str):
            return [
                list(map(float, line.split()))
                for line in data.split('\n')
                if line.strip()
            ]
        return data

    points = _as_points(points_data)
    centers = _as_points(centers_data)

    if len(points) == 0:
        # Fail with a clear message instead of a bare ZeroDivisionError below.
        raise ValueError('points_data contains no points')

    distortion = 0

    for point in points:
        # Explicit '<' scan on purpose: a NaN distance never compares smaller,
        # so NaN centers are ignored rather than poisoning the minimum.
        min_distance_center_point = float('Inf')
        for center in centers:
            distance = Euclidean_Distance(center, point)
            if distance < min_distance_center_point:
                min_distance_center_point = distance

        distortion = distortion + min_distance_center_point**2

    return distortion / len(points)

# Test: sample input for the challenge above (two centers and ten points,
# two coordinates each, one point per line).
# The bare call on the last line leaves its return value as the cell output.
centers_data = '''2.31 4.55
5.96 9.08'''

points_data = '''3.42 6.03
6.23 8.25
4.76 1.64
4.47 4.33
3.95 7.61
8.93 2.97
9.74 4.03
1.73 1.28
9.72 5.01
7.27 3.77'''

Squared_Error_Distortion(points_data, centers_data)

18.24556

In [4]:
'''
Code Challenge: Implement the Lloyd algorithm for k-means clustering.
Input: Integers k and m followed by a set of points Data in m-dimensional space.
Output: A set Centers consisting of k points (centers) resulting from applying the Lloyd algorithm to Data and Centers, where the first k points from Data are selected as the first k centers.
'''

def Center_of_Gravity(points):
    """Return the coordinate-wise mean of a 2-D array of points.

    Parameters
    ----------
    points : numpy.ndarray
        Array indexed as ``points[row, column]``: one point per row, one
        coordinate per column.

    Returns
    -------
    list
        One entry per column: that column's sum divided by the row count.
    """
    column_count = points.shape[1]
    row_count = points.shape[0]
    return [np.sum(points[:, idx]) / row_count for idx in range(column_count)]

def Lloyd_k_Means(k, m, points_data, tolerance=0.000001):
    """Cluster points with the Lloyd algorithm for k-means.

    The first k points serve as the initial centers. Each round assigns every
    point to its nearest center (the lowest-numbered center wins ties) and
    then moves each center to the center of gravity of its cluster. Iteration
    stops when the squared error distortion changes by less than
    ``tolerance`` between two consecutive rounds.

    Parameters
    ----------
    k : int
        Number of clusters; must be at least 1.
    m : int
        Dimension of the points; the data is reshaped to ``(-1, m)``.
    points_data : str or sequence of points
        Either text with one point per line and whitespace-separated
        coordinates (blank lines are ignored), or an already-parsed sequence
        of points.
    tolerance : float, optional
        Convergence threshold on the change in distortion.

    Returns
    -------
    list of list
        The final centers, one list of m coordinates per cluster.

    Raises
    ------
    ValueError
        If there are no points, if ``k < 1``, or if the data cannot be
        parsed or reshaped into rows of m coordinates.
    """
    if isinstance(points_data, str):
        # split() with no argument tolerates repeated spaces and tabs; the
        # filter drops blank lines such as the one left by a trailing newline.
        points = [
            list(map(float, line.split()))
            for line in points_data.split('\n')
            if line.strip()
        ]
    else:
        points = points_data

    # -1 lets NumPy infer the row count from the data itself instead of
    # relying on the length of whatever points_data currently refers to.
    points = np.array(points, dtype=float).reshape(-1, m)

    if len(points) == 0:
        raise ValueError('points_data contains no points')
    if k < 1:
        raise ValueError('k must be at least 1')

    centers = points[: k]
    error = Squared_Error_Distortion(points, centers)

    while True:
        # Assignment step: index of the nearest center for every point.
        # min() keeps the first index on ties, like a strict '<' scan.
        cluster_belonging = np.array([
            min(range(len(centers)),
                key=lambda j: Euclidean_Distance(centers[j], point))
            for point in points
        ])

        # Update step: move each center to the mean of its members.
        current_centers = []
        for j in range(len(centers)):
            current_cluster_points = points[cluster_belonging == j]
            if len(current_cluster_points) == 0:
                # An empty cluster (e.g. duplicate initial centers) has no
                # mean; keep its previous center instead of producing 0/0 = NaN.
                current_centers.append(list(centers[j]))
            else:
                current_centers.append(Center_of_Gravity(current_cluster_points))

        current_error = Squared_Error_Distortion(points, current_centers)

        if abs(current_error - error) < tolerance:
            return current_centers

        centers = current_centers
        error = current_error
            
# Test: sample input for the challenge above (k = 2 clusters, thirteen
# points with m = 2 coordinates each, one point per line).
# The bare call on the last line leaves its return value as the cell output.
k, m = 2, 2
points_data = '''1.3 1.1
1.3 0.2
0.6 2.8
3.0 3.2
1.2 0.7
1.4 1.6
1.2 1.0
1.2 1.1
0.6 1.5
1.8 2.6
1.2 1.3
1.2 1.0
0.0 1.9'''

Lloyd_k_Means(k, m, points_data)

[[1.8, 2.8666666666666667], [1.0599999999999998, 1.1400000000000001]]