#### student ID : 20141445

##### Import libraries

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import random
import sys
import collections

##### Load dataset and define constants

In [2]:
# Training set: read every line of the CSV up front. Each line is later split
# on ',' with the first field used as the label and the rest as pixel values.
file_data = "mnist_train3.csv"
with open(file_data, "r") as handle_file:  # closed even if reading fails
    data = handle_file.readlines()

# Test set: read the same way; not used by the code visible in this notebook.
file_data_test = "mnist_test.csv"
with open(file_data_test, "r") as handle_file_test:
    data_test = handle_file_test.readlines()

size_row = 28    # height of the image
size_col = 28    # width of the image

num_image = len(data)
count = 0     # count for the number of images

Ks = [5, 10, 15, 20]  # K for K-mean

##### Define functions <br>
normalize : $$ D' = {D - D_{min} \over D_{max} - D_{min} } $$ <br>
average (based on the L2 norm) : $$ result = normalize\left(\sqrt{\sum_{i=1}^{n} image_i^2}\right) $$

In [3]:
# normalize the values of the input data to be [0, 1]
def normalize(data):
    """Min-max scale *data* so its smallest value maps to 0 and its largest to 1.

    Args:
        data: array-like of numbers (any shape); it is converted with
            ``np.asarray`` so lists are accepted as well.

    Returns:
        An array of the same shape as *data*. When every element is equal
        (the range is zero) an all-zero float array is returned instead of
        the NaNs a 0/0 division would produce.

    Raises:
        ValueError: if *data* is empty (NumPy cannot reduce an empty array).
    """
    data = np.asarray(data)
    lowest = data.min()
    value_range = data.max() - lowest
    if value_range == 0:
        # Constant input (e.g. a blank image): avoid dividing by zero.
        return np.zeros_like(data, dtype=float)
    return (data - lowest) / value_range

# return average of input images
def average(images):
    """Collapse a collection of image vectors into one representative vector.

    The result is the element-wise square root of the summed squares of all
    images, rescaled with ``normalize``. An empty collection yields an
    all-zero vector of length ``size_row * size_col``.
    """
    if len(images) != 0:
        stacked = np.asarray(images)
        sum_of_squares = np.sum(stacked ** 2, axis=0)
        return normalize(np.sqrt(sum_of_squares))
    # no members: fall back to a blank image
    return np.zeros((size_row * size_col), dtype=float)

# return distance of two images
def distance(one, other):
    """Return the Euclidean (L2) distance between *one* and *other*.

    The squared differences are summed along axis 0, so for 1-D vectors the
    result is a single number.
    """
    delta = one - other
    squared = np.square(delta)
    return np.sqrt(np.sum(squared, axis=0))

# return energy
def energy(average_images, labeled_images):
    """Sum the distance from every image to the centroid of its own cluster.

    ``labeled_images[c]`` holds the images assigned to cluster ``c`` and
    ``average_images[c]`` is that cluster's centroid. Returns 0 when no
    cluster has any member.
    """
    total = 0
    for cluster_idx, members in enumerate(labeled_images):
        for member in members:
            total += distance(average_images[cluster_idx], member)
    return total

# return accuracy
def accuracy(matched_clusters, original_labels, cluster_number):
    """Return the fraction of samples that carry their cluster's majority label.

    Args:
        matched_clusters: cluster index assigned to each sample.
        original_labels: true label of each sample, in the same order.
        cluster_number: total number of clusters.

    Returns:
        A float: the sizes of the most frequent label in each cluster, summed
        and divided by the number of samples. Empty clusters contribute
        nothing. Returns 0.0 when there are no samples.
    """
    sample_count = len(matched_clusters)
    if sample_count == 0:
        # nothing to score; avoid dividing by zero
        return 0.0

    # one label histogram per cluster
    label_counts = [collections.Counter() for _ in range(cluster_number)]
    for i, cluster in enumerate(matched_clusters):
        label_counts[cluster][original_labels[i]] += 1

    # The original indexed the per-cluster lists with an undefined name `k`
    # here instead of the loop variable, so it never scored each cluster.
    majority_total = sum(max(counts.values()) for counts in label_counts if counts)
    return float(majority_total) / sample_count

##### Random Number Labeling

In [4]:
def k_mean_clustering(k):
    """Run K-means clustering on the loaded images with *k* clusters.

    This first part parses every line of ``data`` into a normalized image
    column, assigns each image a random initial cluster, and computes the
    initial centroid of each cluster. The remaining steps of the function
    (iteration and plots) are in the following notebook cells and read the
    locals defined here.

    Args:
        k: number of clusters to use.
    """
    # make a matrix each column of which represents an image in a vector form
    list_images = np.empty((size_row * size_col, num_image), dtype=float)
    list_label = np.empty(num_image, dtype=int)

    count = 0
    # Honour the requested k; previously this was pinned to Ks[3], so the
    # argument was ignored and every run used the same cluster count.
    cluster_number = k
    original_labels = []

    for line in data:
        line_data = line.split(',')
        original_labels.append(int(line_data[0]))  # first field is the true label
        # np.asfarray was removed in NumPy 2.0; asarray(dtype=float) is equivalent.
        im_vector = normalize(np.asarray(line_data[1:], dtype=float))
        list_label[count] = random.randrange(cluster_number)  # random initial cluster
        list_images[:, count] = im_vector
        count += 1

    # group the image columns by their initial random cluster
    labeled_images = [[] for _ in range(cluster_number)]
    for idx in range(count):
        labeled_images[list_label[idx]].append(list_images[:, idx])

    # calculate average image for each cluster
    average_images = [average(members) for members in labeled_images]
    energies = []
    accuracies = []

##### K-mean clustering iteration

In [5]:
    # Reassign every image to its nearest centroid and recompute the centroids,
    # repeating until two consecutive passes give the same assignment.
    matched_clusters = [None] * count
    before_matched_clusters = []
    iteration_num = 0
    converged = False
    while not converged:
        iteration_num += 1
        print("iteration#" + str(iteration_num))

        # start each pass with an empty member list per cluster
        labeled_images = [[] for _ in range(cluster_number)]

        for sample in range(count):
            image = list_images[:, sample]
            nearest_distance = sys.maxsize
            matched_clusters[sample] = -1  # stays -1 if no centroid beats the initial bound
            for cluster in range(cluster_number):
                candidate = distance(average_images[cluster], image)
                if candidate < nearest_distance:
                    nearest_distance = candidate
                    matched_clusters[sample] = cluster
            labeled_images[matched_clusters[sample]].append(image)

        # calculate average image for each cluster
        for cluster in range(cluster_number):
            average_images[cluster] = average(labeled_images[cluster])

        # record energy and accuracy of this pass (after the centroid update)
        energies.append(energy(average_images, labeled_images))
        accuracies.append(accuracy(matched_clusters, original_labels, cluster_number))

        if before_matched_clusters == matched_clusters:
            print("K-mean iteration end.")
            converged = True
        else:
            before_matched_clusters = list(matched_clusters)

iteration#1


NameError: name 'cluster_number' is not defined

##### Visualize K centroid images

In [None]:
    # Show every centroid as a size_row x size_col grayscale tile in one row.
    for cluster_idx in range(cluster_number):
        centroid_picture = average_images[cluster_idx].reshape((size_row, size_col))

        plt.subplot(1, cluster_number, cluster_idx + 1)
        plt.title(cluster_idx)
        plt.imshow(centroid_picture, cmap='Greys', interpolation='None')

        # hide the tick marks and labels on both axes of the current tile
        current_axes = plt.gca().axes
        for axis in (current_axes.get_xaxis(), current_axes.get_yaxis()):
            axis.set_visible(False)

    plt.show()

##### Plot the training energy

In [None]:
    # Energy curve: one recorded value per iteration, then the raw numbers.
    plt.figure()
    energy_points = len(energies)
    plt.xlim(0, energy_points)
    plt.plot(energies, label='energy', color='black')
    plt.legend()
    plt.show()
    print("energies" + str(energies))

##### Plot the training accuracy

In [None]:
    # Accuracy curve: one recorded value per iteration, then the raw numbers.
    plt.figure()
    accuracy_points = len(accuracies)
    plt.xlim(0, accuracy_points)
    plt.plot(accuracies, label='accuracy', color='black')
    plt.legend()
    plt.show()
    print("accuracies" + str(accuracies))

In [None]:
# Run the clustering once for every configured cluster count.
for k in Ks:
    print("K-mean clustering for k = " + str(k))
    # The function is defined in lowercase; the capitalised call raised NameError.
    k_mean_clustering(k)