In [1]:
from sklearn import datasets
import math
import numpy as np

In [2]:
data = datasets.load_iris().data

## Distance Manager

In [3]:
def euclidean_distance(data1, data2):
    sum = 0
    if (len(data1) == len(data2)):
        for x1, x2 in zip(data1, data2):
            sum += (x1 - x2)**2
        dist = math.sqrt(sum)
        return dist
    else:
        raise Exception('Length doesn\'t match')
        
def manhattan_distance(data1, data2):
    sum = 0
    if (len(data1) == len(data2)):
        for x1, x2 in zip(data1, data2):
            sum += abs(x1 - x2)
        return sum
    else:
        raise Exception('Length doesn\'t match')
        
def cosine_distance(data1, data2):
    return np.dot(data1, data2) / (np.linalg.norm(data1) * np.linalg.norm(data2))
        
def get_distance(data1, data2, metrics):
    if (metrics == 'euclidean'):
        dist = euclidean_distance(data1, data2)
    elif (metrics == 'manhattan'):
        dist = manhattan_distance(data1, data2)
    elif (metrics == 'cosine'):
        dist = cosine_distance(data1, data2)
    else:
        raise Exception('Metrics not defined')
    return dist

In [4]:
def calculate_distance_matrix(data, metrics):
    dist_matrix = []
    for idx1, data1 in enumerate(data):
        curr_dist_matrix = []
        for idx2, data2 in enumerate(data):
            if (idx1 > idx2):
                curr_dist_matrix.append(dist_matrix[idx2][idx1])
            else:
                dist = get_distance(data1, data2, metrics)
                curr_dist_matrix.append(dist)
        dist_matrix.append(curr_dist_matrix)
    return dist_matrix

## Linkage Manager

In [5]:
def complete_linkage(cluster1, cluster2, dist_matrix):
    max_dist = 0
    for v1 in cluster1:
        for v2 in cluster2:
            if (max_dist < dist_matrix[v1][v2]):
                max_dist = dist_matrix[v1][v2]
    return max_dist

def single_linkage(cluster1, cluster2, dist_matrix):
    min_dist = None
    for v1 in cluster1:
        for v2 in cluster2:
            if (min_dist is None) or (min_dist > dist_matrix[v1][v2]):
                min_dist = dist_matrix[v1][v2]
    return min_dist

def average_linkage(cluster1, cluster2, dist_matrix):
    sum_dist = 0
    count_dist = 0
    for v1 in cluster1:
        for v2 in cluster2:
            sum_dist += dist_matrix[v1][v2]
            count_dist += 1
    return float(sum_dist)/float(count_dist)

def group_average_linkage(cluster1, cluster2, data, distance):
    data1 = [data[i] for i in cluster1]
    data2 = [data[i] for i in cluster2]
    
    avg1 = np.mean(data1, axis = 0)
    avg2 = np.mean(data2, axis = 0)
    
    return get_distance(avg1, avg2, distance)

## Computing Cluster

In [6]:
metrics_used = 'cosine'
data_used = data[:50]
dist_matrix = calculate_distance_matrix(data_used, metrics_used)

In [7]:
clusters = [[i] for i, c in enumerate(data_used)]
n_clusters = 4
linkage = 'complete'

while(len(clusters) > n_clusters):
    min_dist = None
    merge_pair = (0, 0)
    for idx1, c1 in enumerate(clusters):
        for idx2, c2 in enumerate(clusters[(idx1 + 1) :]):
            if (linkage == 'single'):
                dist = single_linkage(c1, c2, dist_matrix)
            elif (linkage == 'complete'):
                dist = complete_linkage(c1, c2, dist_matrix)
            elif (linkage == 'average'):
                dist = average_linkage(c1, c2, dist_matrix)
            elif (linkage == 'average_group'):
                dist = group_average_linkage(c1, c2, data_used, metrics_used)
            else:
                raise Exception('Linkage not defined')
            if (min_dist == None) or (dist < min_dist):
                min_dist = dist
                merge_pair = (idx1, idx1 + 1 + idx2)
    
    result_cluster = []
    for idx, c in enumerate(clusters):
        if idx not in merge_pair:
            result_cluster.append(c)
    
    result_cluster.append(clusters[merge_pair[0]] + clusters[merge_pair[1]])
    
    clusters = result_cluster

print(clusters)
result_per_item = np.zeros(len(data_used), dtype= np.int8)
for idx, clust in enumerate(clusters):
    result_per_item[clust] = idx

result_per_item

[[7, 31, 37, 48, 9, 14, 44, 3, 40, 5, 28, 23, 33], [2, 11, 1, 15, 38, 21, 34, 13, 29], [27, 16, 30, 35, 46, 10, 19, 20, 18, 4, 12, 43], [17, 25, 32, 8, 6, 36, 24, 22, 41, 47, 49, 0, 26, 39, 42, 45]]


array([3, 1, 1, 0, 2, 0, 3, 0, 3, 0, 2, 1, 2, 1, 0, 1, 2, 3, 2, 2, 2, 1,
       3, 0, 3, 3, 3, 2, 0, 1, 2, 0, 3, 0, 1, 2, 3, 0, 1, 3, 0, 3, 3, 2,
       0, 3, 2, 3, 0, 3], dtype=int8)

## Comparation with sklearn's Agglomerative Clustering

In [8]:
from sklearn.cluster import AgglomerativeClustering

cls = AgglomerativeClustering(n_clusters= 4, linkage='complete', affinity='cosine')
cls.fit_predict(data[:50])

array([0, 1, 0, 2, 0, 0, 0, 2, 2, 1, 0, 2, 1, 0, 0, 0, 0, 0, 2, 0, 1, 0,
       0, 2, 2, 1, 2, 2, 1, 2, 2, 1, 0, 0, 1, 0, 0, 0, 2, 2, 0, 3, 0, 2,
       2, 1, 0, 2, 0, 1])

# Class

In [9]:
import numpy as np
import math

class AgglomerativeClustering:
    
    n_clusters = 2
    linkage = 'complete'
    metrics = 'euclidean'
    
    available_metrics = ['euclidean', 'manhattan', 'cosine']
    available_linkage = ['complete', 'single', 'group_average', 'average']
    
    def __init__(self, n_clusters=n_clusters, linkage=linkage, metrics=metrics):
        
        if n_clusters <= 0:
            raise Exception('n_clusters must be higher than 0')
        if metrics not in self.available_metrics:
            raise Exception('No metrics \'' + str(metrics) + '\'. Available metrics '+ str(self.available_metrics))
        if linkage not in self.available_linkage:
            raise Exception('No linkage \'' + str(linkage) + '\'. Available linkage '+ str(self.available_linkage))
        self.metrics = metrics
        self.n_clusters = n_clusters
        self.linkage = linkage
        
    def __euclidean_distance(self, data1, data2):
        sum = 0
        if (len(data1) == len(data2)):
            for x1, x2 in zip(data1, data2):
                sum += (x1 - x2)**2
            dist = math.sqrt(sum)
            return dist
        else:
            raise Exception('Length doesn\'t match')

    def __manhattan_distance(self, data1, data2):
        sum = 0
        if (len(data1) == len(data2)):
            for x1, x2 in zip(data1, data2):
                sum += abs(x1 - x2)
            return sum
        else:
            raise Exception('Length doesn\'t match')
            
    def __cosine_distance(self, data1, data2):
        return np.dot(data1, data2) / (np.linalg.norm(data1) * np.linalg.norm(data2))

    def __get_distance(self, data1, data2, metrics):
        if (metrics == 'euclidean'):
            dist = self.__euclidean_distance(data1, data2)
        elif (metrics == 'manhattan'):
            dist = self.__manhattan_distance(data1, data2)
        elif (metrics == 'cosine'):
            dist = self.__cosine_distance(data1, data2)
        else:
            raise Exception('Metrics not defined')
        return dist
    
    def __complete_linkage(self, cluster1, cluster2, dist_matrix):
        max_dist = 0
        for v1 in cluster1:
            for v2 in cluster2:
                if (max_dist < dist_matrix[v1][v2]):
                    max_dist = dist_matrix[v1][v2]
        return max_dist

    def __single_linkage(self, cluster1, cluster2, dist_matrix):
        min_dist = None
        for v1 in cluster1:
            for v2 in cluster2:
                if (min_dist is None) or (min_dist > dist_matrix[v1][v2]):
                    min_dist = dist_matrix[v1][v2]
        return min_dist

    def __average_linkage(self, cluster1, cluster2, dist_matrix):
        sum_dist = 0
        count_dist = 0
        for v1 in cluster1:
            for v2 in cluster2:
                sum_dist += dist_matrix[v1][v2]
                count_dist += 1
        return float(sum_dist)/float(count_dist)

    def __group_average_linkage(self, cluster1, cluster2, data, distance):
        data1 = [data[i] for i in cluster1]
        data2 = [data[i] for i in cluster2]

        avg1 = np.mean(data1, axis = 0)
        avg2 = np.mean(data2, axis = 0)

        return self.__get_distance(avg1, avg2, distance)
    
    def __calculate_distance_matrix(self, data, metrics):
        dist_matrix = []
        for idx1, data1 in enumerate(data):
            curr_dist_matrix = []
            for idx2, data2 in enumerate(data):
                if (idx1 > idx2):
                    curr_dist_matrix.append(dist_matrix[idx2][idx1])
                else:
                    dist = self.__get_distance(data1, data2, metrics)
                    curr_dist_matrix.append(dist)
            dist_matrix.append(curr_dist_matrix)
        return dist_matrix
        
    def fit_predict(self, data):
        dist_matrix = self.__calculate_distance_matrix(data, self.metrics)
        clusters = [[i] for i, c in enumerate(data)]

        while(len(clusters) > self.n_clusters):
            min_dist = None
            merge_pair = (0, 0)
            for idx1, c1 in enumerate(clusters):
                for idx2, c2 in enumerate(clusters[(idx1 + 1) :]):
                    if (self.linkage == 'single'):
                        dist = self.__single_linkage(c1, c2, dist_matrix)
                    elif (self.linkage == 'complete'):
                        dist = self.__complete_linkage(c1, c2, dist_matrix)
                    elif (self.linkage == 'average'):
                        dist = self.__average_linkage(c1, c2, dist_matrix)
                    elif (self.linkage == 'average_group'):
                        dist = self.__group_average_linkage(c1, c2, data, self.metrics)
                    else:
                        raise Exception('Linkage not defined')
                    if (min_dist == None) or (dist < min_dist):
                        min_dist = dist
                        merge_pair = (idx1, idx1 + 1 + idx2)

            result_cluster = []
            for idx, c in enumerate(clusters):
                if idx not in merge_pair:
                    result_cluster.append(c)

            result_cluster.append(clusters[merge_pair[0]] + clusters[merge_pair[1]])

            clusters = result_cluster

#         print(clusters)
#         result_per_item = np.zeros(len(data))
        result_per_item = np.full(len(data), 0)
        for idx, clust in enumerate(clusters):
            result_per_item[clust] = idx

        return result_per_item

In [10]:
aglo = AgglomerativeClustering(n_clusters=4, linkage='complete', metrics='euclidean')
pred_a = aglo.fit_predict(data[:50])
pred_a

array([3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 2, 2, 3, 3, 1, 1, 1, 3, 2, 2, 2, 2,
       3, 2, 2, 3, 2, 3, 3, 3, 3, 2, 1, 1, 3, 3, 2, 3, 3, 3, 3, 0, 3, 2,
       2, 3, 2, 3, 2, 3])

In [11]:
from sklearn.cluster import AgglomerativeClustering

cls = AgglomerativeClustering(n_clusters= 4, linkage='complete', affinity='euclidean')
cls.fit_predict(data[:50])

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 2, 2, 2, 0, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 2, 2, 0, 0, 1, 0, 0, 0, 0, 3, 0, 1,
       1, 0, 1, 0, 1, 0])