In [1]:
import numpy as np
import distance_utils

In [2]:
def read_data(file_name, n_features=784):
    """Read a comma-separated text file of labelled rows.

    Every non-blank line is treated as one record: the first comma-separated
    field is the label and the next ``n_features`` fields are the attributes.
    All values are kept as strings; numeric conversion is left to the caller.

    Args:
        file_name: Path of the text file to read.
        n_features: Number of attribute fields to take after the label.
            Defaults to 784, the value that used to be hard-coded here.

    Returns:
        A list with one ``[label, attribs]`` entry per record, where
        ``label`` is a string and ``attribs`` is a list of ``n_features``
        strings.

    Raises:
        IndexError: If a non-blank line has fewer than ``n_features + 1``
            comma-separated fields.
    """
    data_set = []
    with open(file_name, "rt") as f:
        for line_number, line in enumerate(f, start=1):
            line = line.replace("\n", "")
            # A blank line (for example a stray empty line at the end of the
            # file) carries no record; skip it rather than failing below.
            if not line.strip():
                continue
            tokens = line.split(",")
            label = tokens[0]
            # Slice instead of indexing one field at a time, then verify the
            # length so short rows still fail loudly with the same exception
            # type as before, but with a message that points at the line.
            attribs = tokens[1:n_features + 1]
            if len(attribs) != n_features:
                raise IndexError(
                    "line {} of {!r} has {} attribute field(s), expected {}".format(
                        line_number, file_name, len(attribs), n_features
                    )
                )
            data_set.append([label, attribs])
    return data_set


def get_labels(data_set):
    """Return the label of every row in ``data_set`` converted to ``int``.

    Each row must be indexable with the label stored at position 0.
    The result is a plain list in the same order as ``data_set``.
    """
    labels = []
    for record in data_set:
        labels.append(int(record[0]))
    return labels


def get_features(data_set):
    """Return each row's attribute values converted to ``int``.

    Each row must be indexable with an iterable of attribute values at
    position 1. The result is a list of lists, one inner list per row, in
    the same order as ``data_set``.
    """
    features = []
    for record in data_set:
        features.append(list(map(int, record[1])))
    return features


In [3]:
# Load the validation file once, then build separate NumPy arrays for the
# integer labels and the integer feature rows.
validation_set = read_data("valid.csv")
labels = np.asarray(get_labels(validation_set))
features = np.asarray(get_features(validation_set))

In [4]:
features

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [5]:
# Number of clusters (and therefore centroids) to fit.
n_clusters = 10

In [6]:
# Pick n_clusters distinct row indices of `features` to seed the centroids.
# A fixed-seed generator is used so that Restart & Run All reproduces the
# same starting centroids, and therefore the same clustering, on every run.
random_indices = np.random.default_rng(42).choice(
    features.shape[0], n_clusters, replace=False
)

In [7]:
len(random_indices)

10

In [8]:
# Use the randomly selected rows of `features` as the initial centroids.
centroids = features[random_indices]

In [9]:
centroids

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [10]:
# Start from an all-zero array with the same shape and dtype as `centroids`
# so there is something to compare the current centroids against.
previous_centroids = np.zeros_like(centroids)

In [11]:
previous_centroids

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [12]:
np.allclose(centroids, previous_centroids)

False

In [13]:
# Remember the current centroids before they are updated.
# NOTE(review): this binds the same array object rather than copying it. That
# is fine as long as `centroids` is later rebound to a new array and never
# modified in place; switch to centroids.copy() if that ever changes.
previous_centroids = centroids

In [14]:
# Name of the distance metric; "<metric>_distance" is the attribute that gets
# looked up on the distance_utils module.
metric = 'euclidean'

In [15]:
# Resolve the distance function by name: fetch the attribute called
# "<metric>_distance" from the distance_utils module.
distance_metric = getattr(distance_utils, metric + "_distance")

In [17]:
# Distances between the rows of `features` and the centroids. The recorded
# output shows shape (200, 10), presumably one row per sample and one column
# per centroid (confirm against distance_utils).
distances = distance_metric(features, centroids)

In [18]:
distances.shape

(200, 10)

In [19]:
# For each row of `distances`, take the column index of the smallest value;
# that index is used as the row's cluster id.
cluster_assignments = np.argmin(distances, axis=1)

In [20]:
cluster_assignments

array([0, 6, 0, 3, 0, 5, 9, 0, 6, 5, 0, 6, 0, 0, 0, 8, 4, 2, 2, 0, 6, 0,
       0, 6, 0, 6, 3, 0, 8, 0, 0, 6, 6, 5, 0, 3, 0, 1, 8, 9, 5, 0, 8, 8,
       2, 6, 0, 3, 5, 0, 8, 4, 0, 0, 0, 4, 8, 2, 4, 0, 7, 0, 9, 0, 0, 6,
       0, 3, 5, 6, 3, 0, 6, 0, 0, 6, 6, 0, 3, 0, 6, 3, 6, 6, 5, 4, 6, 7,
       5, 0, 3, 0, 4, 8, 0, 0, 7, 0, 8, 1, 6, 4, 8, 0, 0, 6, 7, 1, 4, 0,
       8, 6, 4, 6, 0, 7, 3, 0, 6, 8, 8, 0, 2, 8, 5, 0, 4, 3, 6, 8, 0, 0,
       2, 2, 3, 8, 0, 0, 8, 6, 9, 0, 0, 2, 9, 5, 6, 0, 5, 8, 0, 6, 8, 3,
       4, 0, 0, 4, 4, 0, 0, 5, 6, 6, 0, 5, 0, 8, 4, 0, 9, 0, 8, 0, 0, 0,
       4, 6, 5, 6, 1, 0, 2, 5, 4, 0, 6, 8, 7, 8, 0, 8, 6, 2, 0, 2, 0, 3,
       2, 3])

In [23]:
centroids.shape

(10, 784)

In [24]:
# Move each centroid to the mean of the points currently assigned to it.
# A cluster that received no points would make np.mean return a row of NaN
# (with a RuntimeWarning) and poison every later distance computation, so
# such a cluster keeps its previous centroid instead.
centroids = np.array([
    np.mean(features[cluster_assignments == i], axis=0)
    if np.any(cluster_assignments == i)
    else centroids[i]
    for i in range(n_clusters)
])

In [27]:
centroids.shape

(10, 784)