## Clustering

### K-Means Clustering



In [4]:
import argparse
import pandas as pd
import numpy as np
import pickle
from pathlib import Path
from collections import defaultdict

In [5]:
import math

def euclidean_distance(point1, point2):
    """Return the Euclidean distance between two points.

    All coordinates are taken into account, so the points may have any
    number of dimensions (not just two). Coordinates are paired by position;
    if the points differ in length, the unmatched trailing coordinates of
    the longer point are ignored.

    Args:
        point1: Sequence of numeric coordinates.
        point2: Sequence of numeric coordinates.

    Returns:
        The straight-line distance between the points as a float
        (0.0 for two empty points).
    """
    squared_sum = sum((a - b) ** 2 for a, b in zip(point1, point2))
    return math.sqrt(squared_sum)

def assign_clusters(data, centroids):
    """Group every point of ``data`` with its nearest centroid.

    Args:
        data: Iterable of points.
        centroids: List of centroids; the position of a centroid in this
            list is the index of the cluster it represents.

    Returns:
        A list with one sub-list per centroid, each holding the points whose
        closest centroid (by ``euclidean_distance``) is that one. When two
        centroids are equally close, the one with the lower index wins.

    The initial empty clusters, every distance list and every chosen index
    are printed to stdout as a trace.
    """
    buckets = [[] for _ in range(len(centroids))]
    print(buckets)

    for point in data:
        distances = []
        for centroid in centroids:
            distances.append(euclidean_distance(point, centroid))
        print(f'distances: {distances}')

        # list.index returns the first match, which settles ties in favour
        # of the lowest cluster index.
        shortest = min(distances)
        nearest = distances.index(shortest)
        print(f'closest_centroid: {nearest}')

        buckets[nearest].append(point)

    return buckets

def update_centroids(clusters):
    """Compute the mean point (centroid) of every non-empty cluster.

    The mean is taken coordinate by coordinate, so points may have any
    number of dimensions (not just two).

    Args:
        clusters: List of clusters, each a list of points (sequences of
            numeric coordinates).

    Returns:
        A list of centroids, each a list of floats. Empty clusters have no
        mean and are skipped, so the result can be shorter than ``clusters``
        and positions after a skipped cluster shift down.
    """
    return [
        # zip(*cluster) regroups the points by axis: all first coordinates,
        # then all second coordinates, and so on.
        [sum(axis) / len(cluster) for axis in zip(*cluster)]
        for cluster in clusters
        if cluster
    ]



def k_means_clustering(centroids, dataset, num_iterations=2):
    """Run k-means clustering for a fixed number of iterations.

    Each iteration assigns every point to its nearest centroid and then
    recomputes each centroid as the mean of the points assigned to it.

    Args:
        centroids: List of initial centroids, one list of coordinates per
            cluster. The caller's list is not modified.
        dataset: List of points (lists of coordinates) to cluster.
        num_iterations: Number of assign/update rounds to run (default 2).

    Returns:
        A dictionary keyed by the 1-based iteration number as a string
        ('1', '2', ...). Each value maps 'cluster1' ... 'clusterK' to the
        points assigned to that cluster in the iteration, and 'centroids'
        to the centroids recomputed from that assignment.

    Intermediate clusters, centroids and the running result are printed to
    stdout as a trace.
    """
    cluster_count = len(centroids)

    # Build the whole skeleton up front so the per-iteration trace also shows
    # the (still empty) entries of iterations that have not run yet.
    result = {}
    for number in range(1, num_iterations + 1):
        entry = {'cluster' + str(i): [] for i in range(1, cluster_count + 1)}
        entry['centroids'] = []
        result[str(number)] = entry

    for number in range(1, num_iterations + 1):
        clusters = assign_clusters(dataset, centroids)
        print('Clusters:: '+str(clusters))

        # Recompute one centroid per cluster. A cluster that received no
        # points keeps its previous centroid; dropping it instead would
        # shrink the list and shift every later cluster onto the wrong index.
        centroids = [
            update_centroids([cluster])[0] if cluster else list(previous)
            for cluster, previous in zip(clusters, centroids)
        ]
        print('centroids:: '+str(centroids))

        entry = result[str(number)]
        entry['centroids'] = centroids
        for index, cluster in enumerate(clusters, start=1):
            entry['cluster' + str(index)] = cluster

        print('result:: '+str(result))

    return result


In [6]:
# Load the sample points and the initial centroids, echo both, then cluster.
# NOTE: pickle.load can execute arbitrary code while unpickling; only open
# files from a trusted source.
with open('data/sample_dataset_kmeans.pickle', 'rb') as dataset_file:
    dataset = pickle.load(dataset_file)
print(f'Kmeans Data: {dataset!s}')

with open('./data/sample_centroids_kmeans.pickle', 'rb') as centroids_file:
    centroids = pickle.load(centroids_file)
print(f'Centroid Data: {centroids!s}')

# Kept as the cell's final bare expression so the notebook displays the result.
k_means_clustering(centroids, dataset)

Kmeans Data: [[46, 33], [26, 21], [23, 96], [82, 20], [25, 42], [29, 99], [30, 64], [57, 51], [12, 68], [25, 9]]
Centroid Data: [[12, 68], [46, 33], [25, 42]]
[[], [], []]
distances: [48.79549159502341, 0.0, 22.847319317591726]
closest_centroid: 1
distances: [49.040799340956916, 23.323807579381203, 21.02379604162864]
closest_centroid: 2
distances: [30.083217912982647, 67.06713054842886, 54.037024344425184]
closest_centroid: 0
distances: [84.87638069569178, 38.27531841800928, 61.09828148156051]
closest_centroid: 1
distances: [29.068883707497267, 22.847319317591726, 0.0]
closest_centroid: 2
distances: [35.35533905932738, 68.15423684555495, 57.14017850864661]
closest_centroid: 0
distances: [18.439088914585774, 34.88552708502482, 22.561028345356956]
closest_centroid: 0
distances: [48.104053883222775, 21.095023109728988, 33.24154027718932]
closest_centroid: 1
distances: [0.0, 48.79549159502341, 29.068883707497267]
closest_centroid: 0
distances: [60.41522986797286, 31.89043743820395, 33.0]
c

{'1': {'cluster1': [[23, 96], [29, 99], [30, 64], [12, 68]],
  'cluster2': [[46, 33], [82, 20], [57, 51], [25, 9]],
  'cluster3': [[26, 21], [25, 42]],
  'centroids': [[23.5, 81.75], [52.5, 28.25], [25.5, 31.5]]},
 '2': {'cluster1': [[23, 96], [29, 99], [30, 64], [12, 68]],
  'cluster2': [[46, 33], [82, 20], [57, 51]],
  'cluster3': [[26, 21], [25, 42], [25, 9]],
  'centroids': [[23.5, 81.75],
   [61.666666666666664, 34.666666666666664],
   [25.333333333333332, 24.0]]}}