In [1]:
import numpy as np
import math
import copy

# Week 1: Introduction to Clustering Algorithms

In [2]:
'''
Code Challenge: Implement the FarthestFirstTraversal clustering heuristic.
Input: Integers k and m followed by a set of points Data in m-dimensional space.
Output: A set Centers consisting of k points (centers) resulting from applying FarthestFirstTraversal(Data, k), where the first point from Data is chosen as the first center to initialize the algorithm.
'''

def Euclidean_Distance(point1, point2):
    """Return the Euclidean (straight-line) distance between two points.

    Parameters
    ----------
    point1, point2 : sequence of numbers
        Coordinates of the two points. Coordinates are paired with ``zip``,
        so any extra trailing coordinates of the longer point are ignored.

    Returns
    -------
    float
        Square root of the sum of squared coordinate differences.
    """
    squared_gaps = [(a - b) ** 2 for a, b in zip(point1, point2)]

    # Accumulate left to right with plain additions.
    total = 0
    for gap in squared_gaps:
        total = total + gap

    return math.sqrt(total)

def Farthest_First_Traversal(k, m, points_data):
    """Choose k cluster centers with the farthest-first traversal heuristic.

    The first point is the first center. Each following center is the data
    point whose distance to its nearest already-chosen center is largest
    (the first such point wins ties).

    Parameters
    ----------
    k : int
        Number of centers to return; must be at least 1.
    m : int
        Dimension of the points. Accepted to keep the signature of the
        exercise; the computation itself does not use it.
    points_data : str or sequence of points
        Either text with one point per line and whitespace-separated
        coordinates, or an already-parsed sequence of points.

    Returns
    -------
    list
        The k chosen centers, in the order they were selected. If k exceeds
        the number of distinct points, already-chosen points are repeated.

    Raises
    ------
    ValueError
        If k < 1 or no points are supplied.
    """
    if isinstance(points_data, str):
        # split() without arguments tolerates repeated spaces/tabs, and blank
        # lines (e.g. a trailing newline) are skipped instead of crashing.
        points = [[float(token) for token in line.split()]
                  for line in points_data.splitlines() if line.strip()]
    else:
        # Already-parsed input: previously `points` was left undefined here.
        points = points_data

    if k < 1:
        # The old `!= k` loop condition never terminated for k < 1.
        raise ValueError('k must be at least 1')
    if len(points) == 0:
        raise ValueError('points_data must contain at least one point')

    cluster_centers = [points[0]]

    while len(cluster_centers) < k:
        # max() keeps the first maximal element, matching a strict ">" scan.
        farthest_point = max(
            points,
            key=lambda point: min(Euclidean_Distance(center, point)
                                  for center in cluster_centers),
        )
        cluster_centers.append(farthest_point)

    return cluster_centers

# Test: select 3 centers from the seven 2-D sample points below.
k = 3
m = 2
points_data = '''0.0 0.0
5.0 5.0
0.0 5.0
1.0 1.0
2.0 2.0
3.0 3.0
1.0 2.0'''

Farthest_First_Traversal(k, m, points_data)

[[0.0, 0.0], [5.0, 5.0], [0.0, 5.0]]

In [3]:
'''
Code Challenge: Solve the Squared Error Distortion Problem.
Input: Integers k and m, followed by a set of centers Centers and a set of points Data.
Output: The squared error distortion Distortion(Data, Centers).
'''

def Squared_Error_Distortion(points_data, centers_data):
    """Return the mean squared distance from each point to its nearest center.

    Parameters
    ----------
    points_data, centers_data : str or sequence of points
        Either text with one point per line and whitespace-separated
        coordinates, or an already-parsed sequence of points (lists or
        numpy rows).

    Returns
    -------
    float
        Sum over all points of the squared Euclidean distance to the closest
        center, divided by the number of points.

    Raises
    ------
    ZeroDivisionError
        If there are no points.
    """
    def _as_rows(data):
        # Text is parsed line by line; split() without arguments tolerates
        # repeated spaces/tabs, and blank lines (e.g. a trailing newline)
        # are skipped. Anything else is assumed to be parsed already.
        if isinstance(data, str):
            return [[float(token) for token in line.split()]
                    for line in data.splitlines() if line.strip()]
        return data

    points = _as_rows(points_data)
    centers = _as_rows(centers_data)

    distortion = 0.0
    for point in points:
        # Work with squared distances directly: taking a square root only to
        # square it again costs precision (sqrt(2) ** 2 != 2 in floats).
        nearest_squared = float('Inf')
        for center in centers:
            squared = sum((c - p) ** 2 for c, p in zip(center, point))
            if squared < nearest_squared:
                nearest_squared = squared
        distortion += nearest_squared

    return distortion / len(points)

# Test: distortion of ten 2-D sample points against two fixed centers.
points_data = '''3.42 6.03
6.23 8.25
4.76 1.64
4.47 4.33
3.95 7.61
8.93 2.97
9.74 4.03
1.73 1.28
9.72 5.01
7.27 3.77'''

centers_data = '''2.31 4.55
5.96 9.08'''

Squared_Error_Distortion(points_data, centers_data)

18.24556

In [4]:
'''
Code Challenge: Implement the Lloyd algorithm for k-means clustering.
Input: Integers k and m followed by a set of points Data in m-dimensional space.
Output: A set Centers consisting of k points (centers) resulting from applying the Lloyd algorithm to Data and Centers, where the first k points from Data are selected as the first k centers.
'''

def Center_of_Gravity(points):
    """Return the coordinate-wise mean of a 2-D array of points.

    Parameters
    ----------
    points : numpy.ndarray
        Array indexed as ``points[row, column]``: one point per row, one
        coordinate per column.

    Returns
    -------
    list
        One entry per column: the column sum divided by the number of rows.
    """
    row_count, col_count = points.shape[0], points.shape[1]
    return [np.sum(points[:, axis]) / row_count for axis in range(col_count)]

def Lloyd_k_Means(k, m, points_data, tolerance=0.000001):
    """Run the Lloyd algorithm for k-means clustering.

    The first k points are the initial centers. Each iteration assigns every
    point to its nearest center (the lowest-index center wins ties) and then
    moves each center to the center of gravity of its cluster. Iteration
    stops when the squared error distortion changes by less than `tolerance`
    between two consecutive rounds.

    Parameters
    ----------
    k : int
        Number of centers; must be at least 1.
    m : int
        Dimension of the points.
    points_data : str or sequence of points
        Either text with one point per line and whitespace-separated
        coordinates, or an already-parsed sequence of points.
    tolerance : float, optional
        Convergence threshold on the change in distortion.

    Returns
    -------
    list of list
        The final centers, one list of m coordinates per center.

    Raises
    ------
    ValueError
        If k < 1, if no points are supplied, or if the data cannot be
        arranged as rows of m coordinates.
    """
    if isinstance(points_data, str):
        # split() without arguments tolerates repeated spaces/tabs, and blank
        # lines (e.g. a trailing newline) are skipped instead of crashing.
        rows = [[float(token) for token in line.split()]
                for line in points_data.splitlines() if line.strip()]
    else:
        rows = points_data

    if k < 1:
        raise ValueError('k must be at least 1')
    if len(rows) == 0:
        raise ValueError('points_data must contain at least one point')

    # reshape doubles as a check that every point really has m coordinates.
    points = np.array(rows, dtype=float).reshape(len(rows), m)
    centers = points[:k]
    error = Squared_Error_Distortion(points, centers)

    while True:
        # Assignment step: min() keeps the first minimal index, so ties go to
        # the lowest-numbered center.
        cluster_belonging = np.array([
            min(range(len(centers)),
                key=lambda j: Euclidean_Distance(centers[j], point))
            for point in points
        ])

        # Update step.
        current_centers = []
        for j in range(len(centers)):
            current_cluster_points = points[cluster_belonging == j]
            if len(current_cluster_points) == 0:
                # A cluster can end up empty (e.g. duplicated initial
                # centers). Averaging zero points would give a NaN center,
                # so the previous center is kept instead.
                current_centers.append(list(centers[j]))
            else:
                current_centers.append(Center_of_Gravity(current_cluster_points))

        current_error = Squared_Error_Distortion(points, current_centers)

        if abs(current_error - error) < tolerance:
            return current_centers

        centers = current_centers
        error = current_error
            
# Test: cluster thirteen 2-D sample points into 2 groups.
k = 2
m = 2
points_data = '''1.3 1.1
1.3 0.2
0.6 2.8
3.0 3.2
1.2 0.7
1.4 1.6
1.2 1.0
1.2 1.1
0.6 1.5
1.8 2.6
1.2 1.3
1.2 1.0
0.0 1.9'''

Lloyd_k_Means(k, m, points_data)

[[1.8, 2.8666666666666667], [1.0599999999999998, 1.1400000000000001]]

In [5]:
# Five 2-D points and a 5 x 2 matrix whose rows each sum to 1.
# NOTE(review): presumably scratch inputs for the soft k-means steps; no later
# cell visible here reads these two globals -- confirm before removing.
points = np.array([[2, 8], [2, 5], [6, 9], [7, 5], [5, 2]])
hidden_matrix = np.array([
    [0.5, 0.3, 0.8, 0.4, 0.9],
    [0.5, 0.7, 0.2, 0.6, 0.1],
]).T

# Week 2: Advanced Clustering Techniques

In [6]:
'''
Code Challenge: Implement the expectation maximization algorithm for soft k-means clustering.
Input: Integers k and m, followed by a stiffness parameter β, followed by a set of points Data in m-dimensional space.
Output: A set Centers consisting of k points (centers) resulting from applying the expectation maximization algorithm for soft
    k-means clustering. Select the first k points from Data as the first centers for the algorithm and run the algorithm for 100
    E-steps and 100 M-steps.
'''

def E_Steps(centers, points, beta):
    """E-step of soft k-means: compute each center's responsibility per point.

    For point i and center j the unnormalised weight is
    ``exp(-beta * distance(point_i, center_j))``; every row is then divided
    by its sum.

    Parameters
    ----------
    centers : array-like, shape (k, m)
        Current cluster centers.
    points : array-like, shape (n, m)
        Data points.
    beta : float
        Stiffness parameter.

    Returns
    -------
    numpy.ndarray, shape (n, k)
        Responsibility matrix; each row sums to 1.
    """
    centers = np.asarray(centers, dtype=float)
    points = np.asarray(points, dtype=float)

    if centers.shape[0] == 0:
        # No centers: nothing to normalise over.
        return np.zeros((points.shape[0], 0), dtype=float)

    # Pairwise distances via broadcasting: (n, 1, m) - (1, k, m) -> (n, k, m).
    deltas = points[:, np.newaxis, :] - centers[np.newaxis, :, :]
    distances = np.linalg.norm(deltas, axis=2)

    # Shift each row so its largest exponent is 0 before exponentiating.
    # The shift cancels in the normalisation, but without it every weight of
    # a far-away point underflows to 0 and the row becomes 0/0 = NaN.
    exponents = -beta * distances
    exponents -= exponents.max(axis=1, keepdims=True)
    weights = np.exp(exponents)

    return weights / weights.sum(axis=1, keepdims=True)

def M_Steps(hidden_matrix, points):
    """M-step of soft k-means: recompute centers as weighted means of points.

    Parameters
    ----------
    hidden_matrix : numpy.ndarray
        Indexed as ``hidden_matrix[point, center]``: the weight of each point
        for each center.
    points : numpy.ndarray
        Indexed as ``points[point, coordinate]``.

    Returns
    -------
    numpy.ndarray
        Float array with one row per center and one column per coordinate;
        entry (c, d) is the dot product of center c's weights with
        coordinate d of the points, divided by the sum of those weights.
    """
    center_count = hidden_matrix.shape[1]
    dim_count = points.shape[1]
    centers = np.zeros((center_count, dim_count), dtype=float)

    for c in range(center_count):
        weights = hidden_matrix[:, c]
        weight_total = np.sum(weights)
        for d in range(dim_count):
            centers[c, d] = np.dot(weights, points[:, d]) / weight_total

    return centers
            

def Soft_k_Means(k, m, beta, points_data, n_iterations=100):
    """Soft k-means clustering by expectation maximization.

    The first k points are the initial centers. Each iteration runs one
    E-step (``E_Steps``) followed by one M-step (``M_Steps``).

    Parameters
    ----------
    k : int
        Number of centers.
    m : int
        Dimension of the points.
    beta : float
        Stiffness parameter passed to the E-step.
    points_data : str or sequence of points
        Either text with one point per line and whitespace-separated
        coordinates, or an already-parsed sequence of points.
    n_iterations : int, optional
        Number of E-step/M-step rounds; defaults to 100.

    Returns
    -------
    numpy.ndarray
        The centers after the last M-step (the initial centers if
        `n_iterations` is 0).

    Raises
    ------
    ValueError
        If the data cannot be arranged as rows of m coordinates.
    """
    if isinstance(points_data, str):
        # split() without arguments tolerates repeated spaces/tabs, and blank
        # lines (e.g. a trailing newline) are skipped instead of crashing.
        rows = [[float(token) for token in line.split()]
                for line in points_data.splitlines() if line.strip()]
    else:
        rows = points_data

    # reshape doubles as a check that every point really has m coordinates.
    points = np.array(rows, dtype=float).reshape(len(rows), m)
    centers = points[:k]

    for _ in range(n_iterations):
        hidden_matrix = E_Steps(centers, points, beta)
        centers = M_Steps(hidden_matrix, points)

    return centers

# Test: soft k-means with 2 centers and stiffness 2.7 on thirteen 2-D points.
k = 2
m = 2
beta = 2.7
points_data = '''1.3 1.1
1.3 0.2
0.6 2.8
3.0 3.2
1.2 0.7
1.4 1.6
1.2 1.0
1.2 1.1
0.6 1.5
1.8 2.6
1.2 1.3
1.2 1.0
0.0 1.9'''

Soft_k_Means(k, m, beta, points_data)

array([[ 1.66212921,  2.62313009],
       [ 1.07543903,  1.14787873]])

In [7]:
'''
Code Challenge: Implement HierarchicalClustering.
Input: An integer n, followed by an n x n distance matrix.
Output: The result of applying HierarchicalClustering to this distance matrix (using Davg), with each newly created cluster listed on each line.
'''

def Hierarchical_Clustering(n, distance_matrix):
    """Agglomerative clustering with average linkage (Davg).

    Starts with the singleton clusters 1..n, repeatedly merges the two
    closest clusters and prints every newly formed cluster on its own line
    (members separated by single spaces). The distance from a merged cluster
    to any other cluster is the size-weighted average of the two merged
    clusters' distances to it.

    Parameters
    ----------
    n : int
        Number of items.
    distance_matrix : str or array-like
        The n x n distance matrix, either as whitespace-separated text or as
        an array / nested list.

    Returns
    -------
    None
        The merges are reported through ``print`` only.

    Raises
    ------
    ValueError
        If the data cannot be arranged as an n x n matrix, or if no pair of
        clusters has a comparable (non-NaN) distance.
    """
    if isinstance(distance_matrix, str):
        values = [float(token) for token in distance_matrix.split()]
        distance_matrix = np.array(values).reshape(n, n)
    else:
        # Accept nested lists as well as arrays.
        distance_matrix = np.array(distance_matrix, dtype=float).reshape(n, n)

    clusters = [[label] for label in range(1, n + 1)]

    while len(clusters) > 1:
        size = distance_matrix.shape[0]

        # Find the minimum distance over the upper triangle. Only the
        # diagonal is excluded: a zero distance between two distinct
        # clusters is a legitimate (and best possible) candidate.
        # "<=" keeps the last pair among equal minima.
        min_length = float('Inf')
        min_location = None
        for row in range(size):
            for col in range(row + 1, size):
                if distance_matrix[row, col] <= min_length:
                    min_length = distance_matrix[row, col]
                    min_location = (row, col)
        if min_location is None:
            raise ValueError('distance matrix contains no comparable distances')

        i_index, j_index = min_location
        first, second = clusters[i_index], clusters[j_index]

        # Make and print the new cluster.
        new_cluster = first + second
        print(' '.join(map(str, new_cluster)))

        # Make the new column: distances from the merged cluster to every
        # remaining cluster. Rows are selected by index, not by value, so a
        # zero distance elsewhere in the matrix cannot drop an entry.
        first_size, second_size = len(first), len(second)
        new_col = []
        for row in range(size):
            if row == i_index or row == j_index:
                continue
            vi = distance_matrix[row, i_index]
            vj = distance_matrix[row, j_index]
            new_col.append((vi * first_size + vj * second_size) / (first_size + second_size))

        # Update the matrix: drop the two merged clusters, then append the
        # merged cluster as the last row and column (diagonal entry 0).
        distance_matrix = np.delete(distance_matrix, [i_index, j_index], 0)
        distance_matrix = np.delete(distance_matrix, [i_index, j_index], 1)
        distance_matrix = np.vstack((distance_matrix, np.array(new_col).reshape(1, len(new_col))))
        new_col.append(0)
        distance_matrix = np.hstack((distance_matrix, np.array(new_col).reshape(len(new_col), 1)))

        # Update the cluster list in the same order as the matrix.
        clusters = [cluster for index, cluster in enumerate(clusters)
                    if index != i_index and index != j_index]
        clusters.append(new_cluster)

# Test: average-linkage clustering of 7 items from a 7 x 7 distance matrix.
distance_matrix = '''0.00 0.74 0.85 0.54 0.83 0.92 0.89
0.74 0.00 1.59 1.35 1.20 1.48 1.55
0.85 1.59 0.00 0.63 1.13 0.69 0.73
0.54 1.35 0.63 0.00 0.66 0.43 0.88
0.83 1.20 1.13 0.66 0.00 0.72 0.55
0.92 1.48 0.69 0.43 0.72 0.00 0.80
0.89 1.55 0.73 0.88 0.55 0.80 0.00'''
n = 7

Hierarchical_Clustering(n, distance_matrix)

4 6
5 7
3 4 6
1 2
5 7 3 4 6
1 2 5 7 3 4 6
