In [1]:
# load mnist data 
import numpy as np
import struct
def load_images(file_name):
    """Read an IDX image file (the MNIST image format) into a 2-D array.

    The file is expected to start with four big-endian unsigned 32-bit
    integers (magic number, image count, rows, columns) followed by one
    unsigned byte per pixel. The magic number is read but not validated.

    Parameters
    ----------
    file_name : str
        Path of the IDX image file.

    Returns
    -------
    numpy.ndarray
        Integer array of shape (num, rows * cols): one flattened image per
        row, holding the raw pixel bytes (0-255).

    Raises
    ------
    struct.error
        If the file is shorter than the 16-byte header.
    ValueError
        If the file holds fewer pixel bytes than the header declares.
    """
    header_format = '>IIII'
    # 'rb' opens the file read-only in binary mode; the with-block closes it
    # even when reading fails.
    with open(file_name, 'rb') as binfile:
        buffers = binfile.read()
    # Header: magic number, number of images, rows and columns per image.
    magic, num, rows, cols = struct.unpack_from(header_format, buffers, 0)
    # Total number of pixel bytes that follow the header.
    bits = num * rows * cols
    # View the pixel bytes directly instead of unpacking them into a tuple of
    # Python ints, then copy to the default integer dtype.
    images = np.frombuffer(buffers, dtype=np.uint8, count=bits,
                           offset=struct.calcsize(header_format))
    return images.astype(int).reshape(num, rows * cols)

def load_labels(file_name):
    """Read an IDX label file (the MNIST label format) into a 1-D array.

    The file is expected to start with two big-endian unsigned 32-bit
    integers (magic number, label count) followed by one unsigned byte per
    label. The magic number is read but not validated.

    Parameters
    ----------
    file_name : str
        Path of the IDX label file.

    Returns
    -------
    numpy.ndarray
        Integer array of shape (num,) with one label per entry.

    Raises
    ------
    struct.error
        If the file is shorter than the 8-byte header.
    ValueError
        If the file holds fewer label bytes than the header declares.
    """
    header_format = '>II'
    # The with-block closes the file even when reading fails.
    with open(file_name, 'rb') as binfile:
        buffers = binfile.read()
    # Header: magic number and number of labels.
    magic, num = struct.unpack_from(header_format, buffers, 0)
    # View the label bytes directly instead of unpacking them into a tuple of
    # Python ints, then copy to the default integer dtype.
    labels = np.frombuffer(buffers, dtype=np.uint8, count=num,
                           offset=struct.calcsize(header_format))
    return labels.astype(int).reshape(num)
# Names of the four IDX files, resolved relative to the working directory.
filename_train_images = 'train-images.idx3-ubyte'
filename_train_labels = 'train-labels.idx1-ubyte'
filename_test_images = 't10k-images.idx3-ubyte'
filename_test_labels = 't10k-labels.idx1-ubyte'

# Load each split; image pixels are divided by 255.0 right after loading.
train_images = load_images(filename_train_images) / 255.0
train_labels = load_labels(filename_train_labels)
test_images = load_images(filename_test_images) / 255.0
test_labels = load_labels(filename_test_labels)

# Report the array shapes.
print("train_images.shape: ", train_images.shape, sep="")
print("train_labels.shape: ", train_labels.shape, sep="")
print("test_images.shape: ", test_images.shape, sep="")
print("test_labels.shape: ", test_labels.shape, sep="")

# Report the container types.
print("type of train_images: ", type(train_images), sep="")
print("type of train_labels: ", type(train_labels), sep="")


train_images.shape: (60000, 784)
train_labels.shape: (60000,)
test_images.shape: (10000, 784)
test_labels.shape: (10000,)
type of train_images: <class 'numpy.ndarray'>
type of train_labels: <class 'numpy.ndarray'>


In [2]:
# pyplot is a submodule of matplotlib and not immediately imported when you import matplotlib
from matplotlib import pyplot as plt
print('image show a train_image')
# Row 17 of train_images is a flat 784-value vector; view it as 28x28 for display.
plt.imshow(np.reshape(train_images[17], (28, 28)))

image show a train_image


<matplotlib.image.AxesImage at 0xb6b74d0>

In [3]:
import random 
def random_select_m(m, rangeLimit=60000):
    '''
    Pick m distinct integers at random from 0 .. rangeLimit-1.

    The whole range is shuffled in place with numpy's global random state and
    the first m entries are returned as a list (in shuffled order).
    m must be an int with 1 <= m <= rangeLimit; this is checked with asserts.
    '''
    assert isinstance(m, int)
    assert 1 <= m <= rangeLimit
    candidates = np.arange(rangeLimit)
    np.random.shuffle(candidates)
    return [index for index in candidates[:m]]

# Quick check: draw 5 distinct indices from 0..9.
print(random_select_m(m=5, rangeLimit=10))


[0, 3, 5, 2, 1]


In [4]:
import time
# Smoke test of the timing pattern used below: two back-to-back
# process_time() readings, reported in milliseconds.
tic = time.process_time()
toc = time.process_time()
print("Computation time = ", 1000*(toc - tic), "ms", sep="")
del tic, toc

Computation time = 0.0ms


In [5]:
from sklearn.neighbors import KNeighborsClassifier

# Toy demonstration of sklearn's KNeighborsClassifier:
# six 2-D points, each carrying its own class label.
features = np.array([[1,2],[3,4],[2,2],[-3,-3],[-4,-5],[-5,-4]])
label = np.array([0,1,2,3,4,5])
queries = [[2,3],[0,8]]

# k-nearest-neighbours vote with k = 1.
# algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}; the default 'auto'
# picks the search structure from the values passed to fit().
model = KNeighborsClassifier(n_neighbors=1).fit(features, label)

# Predicted class and per-class probabilities for the two query points.
prediction = model.predict(queries)
print("prediction: ", prediction, sep="")
print("probability: ", model.predict_proba(queries), sep="")

# Drop the demo objects so they do not linger in the kernel namespace.
del model, prediction, features, label, queries




prediction: [2 1]
probability: [[0. 0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]]


In [6]:
# Training-subset sizes M compared in the experiments below.
M_all = [1000, 5000, 10000]
# Number of rows (images) in the training and test sets.
num_train_image, num_test_image = len(train_images), len(test_images)

In [22]:
# Note: np.where(cond) returns a tuple of index arrays, e.g. for
# a = np.array([1, 2, 3]), np.where(a > 0)[0].tolist() gives [0, 1, 2].

# Baseline experiment: for every subset size M, draw M training images at
# random, fit a 1-nearest-neighbour classifier on them, time the prediction of
# the whole test set, and report the accuracy of each draw plus the average.
num_trials = 5  # random draws per M; used for both the loop and the average

print('randomly select M training image to train 1NN classifier')

for M in M_all:
    sum_accur = 0
    for i in range(num_trials):
        # Sample from the actual training-set size rather than relying on
        # random_select_m's default rangeLimit.
        pick_list = random_select_m(M, num_train_image)
        selected_train_images = train_images[pick_list,:]
        assert selected_train_images.shape == (M,28*28)
        selected_train_label = train_labels[pick_list]
        assert selected_train_label.shape == (M,)
        model = KNeighborsClassifier(n_neighbors=1)
        model.fit(selected_train_images,selected_train_label)
        # Only the prediction is timed (process CPU time), not the fit.
        tic = time.process_time()
        predict_all_Test = model.predict(test_images)
        toc = time.process_time()
        print("M = {} Computation time = ".format(M) + str(1000*(toc - tic)) + "ms")
        correctPrediction = predict_all_Test == test_labels
        # count_nonzero counts the hits in one vectorised pass.
        accuracy = np.count_nonzero(correctPrediction) / num_test_image * 100
        print("M = {} has {}% accuracy".format(M,accuracy))
        sum_accur += accuracy
    print('average accuracy for M={} is {}%'.format(M,sum_accur/num_trials))
    

randomly select M training image to train 1NN classifier
M = 1000 Computation time = 18953.125ms
M = 1000 has 88.46000000000001% accuracy
M = 1000 Computation time = 18859.375ms
M = 1000 has 88.88000000000001% accuracy
M = 1000 Computation time = 18687.5ms
M = 1000 has 88.03% accuracy
M = 1000 Computation time = 19718.75ms
M = 1000 has 88.34% accuracy
M = 1000 Computation time = 20562.5ms
M = 1000 has 88.61% accuracy
average accuracy for M=1000 is 88.46400000000001%
M = 5000 Computation time = 87250.0ms
M = 5000 has 93.30000000000001% accuracy
M = 5000 Computation time = 86687.5ms
M = 5000 has 93.47% accuracy
M = 5000 Computation time = 92093.75ms
M = 5000 has 93.33% accuracy
M = 5000 Computation time = 96187.5ms
M = 5000 has 93.54% accuracy
M = 5000 Computation time = 90921.875ms
M = 5000 has 93.56% accuracy
average accuracy for M=5000 is 93.44000000000001%
M = 10000 Computation time = 197984.375ms
M = 10000 has 94.81% accuracy
M = 10000 Computation time = 187578.125ms
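M = 10000 has 94.65% accuracy
M = 10000 Computation time = 177500.0ms
M = 10000 has 94.69% accuracy
M = 10000 Computation time = 166515.625ms
M = 10000 has 95.08% accuracy
M = 10000 Computation time = 168734.375ms
M = 10000 has 94.57% accuracy
average accuracy for M=10000 is 94.75999999999999%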

In [7]:
# example of using sklearn kmeans
from sklearn.cluster import KMeans
# Six 2-D points, clustered into two groups with a fixed random_state.
data = np.array([[1, 2], [1, 4], [1, 0], [4, 2], [4, 4], [4, 0]])
kmeansModel = KMeans(n_clusters=2, random_state=0).fit(data)

# Fitted centers, the cluster assigned to each input row, and the clusters
# predicted for two new points.
print("cluster_centers: ", kmeansModel.cluster_centers_, sep="")
print("each data classify cluster: ", kmeansModel.labels_, sep="")
print("classify [0,0], [4,4]: ", kmeansModel.predict([[0, 0], [4, 4]]), sep="")

# Drop the demo objects from the kernel namespace.
del data, kmeansModel


cluster_centers: [[1. 2.]
 [4. 2.]]
each data classify cluster: [0 0 0 1 1 1]
classify [0,0], [4,4]: [0 1]


In [8]:
# Group training-sample indices by digit: label_dict['label<d>'] collects, in
# ascending order, every index i for which train_labels[i] formats as <d>.
label_dict = {'label{}'.format(digit): [] for digit in range(10)}
for i in range(num_train_image):
    bucket = 'label{}'.format(train_labels[i])
    label_dict[bucket].append(i)
del bucket

# Show the first ten indices collected for each digit.
for i in range(10):
    print("label{} first 10: ".format(i), label_dict['label{}'.format(i)][:10], sep="")

label0 first 10: [1, 21, 34, 37, 51, 56, 63, 68, 69, 75]
label1 first 10: [3, 6, 8, 14, 23, 24, 40, 59, 67, 70]
label2 first 10: [5, 16, 25, 28, 76, 82, 109, 117, 120, 122]
label3 first 10: [7, 10, 12, 27, 30, 44, 49, 50, 74, 86]
label4 first 10: [2, 9, 20, 26, 53, 58, 60, 61, 64, 89]
label5 first 10: [0, 11, 35, 47, 65, 100, 132, 138, 145, 173]
label6 first 10: [13, 18, 32, 36, 39, 62, 66, 73, 83, 90]
label7 first 10: [15, 29, 38, 42, 52, 71, 79, 84, 91, 96]
label8 first 10: [17, 31, 41, 46, 55, 85, 94, 97, 125, 137]
label9 first 10: [4, 19, 22, 33, 43, 45, 48, 54, 57, 80]


In [26]:
# example of finding nearest to kmeans cluster center
def get_close_center(data,num_clusters,num_closest):
    """Cluster ``data`` with KMeans and return the rows nearest each center.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array with one sample per row.
    num_clusters : int
        Number of KMeans clusters (fitted with random_state=0).
    num_closest : int
        Maximum number of rows to keep from each cluster.

    Returns
    -------
    list
        Rows of ``data`` converted to nested lists. Clusters appear in
        cluster-index order, and within a cluster the rows are ordered from
        the nearest to the center outwards (Euclidean distance).

    The cluster assignment and the centers are printed as a side effect.
    """
    assert isinstance(data,np.ndarray)
    assert isinstance(num_clusters,int)
    assert isinstance(num_closest,int)

    fitted = KMeans(n_clusters=num_clusters,random_state=0)
    fitted.fit(data)
    assignment = fitted.labels_
    print('cluster_labels: ' + str(assignment))
    assert len(assignment) == data.shape[0]
    centroids = fitted.cluster_centers_
    print('centers:' + str(centroids))

    # For every cluster: the row numbers assigned to it and, in the same
    # order, each row's distance to that cluster's center.
    member_rows = [[] for _ in range(num_clusters)]
    member_dists = [[] for _ in range(num_clusters)]
    for row, cluster_id in enumerate(assignment):
        member_rows[cluster_id].append(row)
        member_dists[cluster_id].append(np.linalg.norm(data[row,:]-centroids[cluster_id]))

    # Keep the num_closest smallest-distance rows of each cluster.
    selected_data = []
    for rows, dists in zip(member_rows, member_dists):
        nearest = np.argsort(np.array(dists))[:num_closest]
        selected_data.extend(data[np.array(rows)[nearest],:].tolist())

    return selected_data
        
    
    
# Demo: the values 1..15 as a single-feature column, 3 clusters, 2 nearest rows each.
print(get_close_center(np.arange(1, 16).reshape(-1, 1), 3, 2))
    
    
    
    
        
            
    
    

cluster_labels: [1 1 1 1 1 1 2 2 2 2 0 0 0 0 0]
centers:[[13. ]
 [ 3.5]
 [ 8.5]]
[[13], [12], [3], [4], [8], [9]]


In [14]:
num_cluster = 25   # KMeans clusters fitted per digit class
num_classes = 10   # digits 0-9, matching the 'label0'..'label9' keys of label_dict

# Prototype selection: instead of sampling at random, cluster the training
# images of every digit with KMeans and keep, from each cluster, the images
# closest to the cluster center.
#
# The clustering and the distance ranking do not depend on M, so they are
# computed once per digit here and reused for every M below.
# ranked_clusters gets one entry per (digit, cluster) pair, in digit-major
# order: the training indices of that cluster, sorted nearest-first.
ranked_clusters = []
for i in range(num_classes):
    class_indices = np.array(label_dict['label{}'.format(i)])
    class_images = train_images[class_indices, :]

    kmeansModel = KMeans(n_clusters=num_cluster, random_state=0)
    kmeansModel.fit(class_images)
    cluster_labels = kmeansModel.labels_
    centers = kmeansModel.cluster_centers_

    for z in range(num_cluster):
        # Positions (within this digit's images) assigned to cluster z.
        members = np.flatnonzero(cluster_labels == z)
        # Euclidean distance of each member to the center of cluster z.
        distances = np.array(
            [np.linalg.norm(class_images[k, :] - centers[z]) for k in members])
        ranked_clusters.append(class_indices[members[np.argsort(distances)]])

    del kmeansModel, cluster_labels, class_images

for M in M_all:
    # Equal share per cluster; integer division, so M must be a multiple of
    # num_classes * num_cluster for the shape asserts below to hold.
    per_cluster = M // num_classes // num_cluster
    pick = np.concatenate([ranked[:per_cluster] for ranked in ranked_clusters])

    selected_train_images = train_images[pick, :]
    selected_train_label = train_labels[pick]
    # Also fails when a cluster has fewer than per_cluster members.
    assert selected_train_images.shape == (M,28*28)
    assert selected_train_label.shape == (M,)

    model = KNeighborsClassifier(n_neighbors=1)
    model.fit(selected_train_images,selected_train_label)
    # Only the prediction is timed (process CPU time), not the fit.
    tic = time.process_time()
    predict_all_Test = model.predict(test_images)
    toc = time.process_time()
    print("M = {} Computation time = ".format(M) + str(1000*(toc - tic)) + "ms")
    correctPrediction = predict_all_Test == test_labels
    accuracy = np.count_nonzero(correctPrediction) / num_test_image * 100
    print("M = {} has {}% accuracy".format(M,accuracy))
    del(selected_train_images)
    del(selected_train_label)
        

M = 1000 Computation time = 19359.375ms
M = 1000 has 91.97% accuracy
M = 5000 Computation time = 85531.25ms
M = 5000 has 93.87% accuracy
M = 10000 Computation time = 162046.875ms
M = 10000 has 94.61% accuracy


In [None]:





'''
randomly select M training image to train 1NN classifier
M = 1000 Computation time = 18953.125ms
M = 1000 has 88.46000000000001% accuracy
M = 1000 Computation time = 18859.375ms
M = 1000 has 88.88000000000001% accuracy
M = 1000 Computation time = 18687.5ms
M = 1000 has 88.03% accuracy
M = 1000 Computation time = 19718.75ms
M = 1000 has 88.34% accuracy
M = 1000 Computation time = 20562.5ms
M = 1000 has 88.61% accuracy
average accuracy for M=1000 is 88.46400000000001%
M = 5000 Computation time = 87250.0ms
M = 5000 has 93.30000000000001% accuracy
M = 5000 Computation time = 86687.5ms
M = 5000 has 93.47% accuracy
M = 5000 Computation time = 92093.75ms
M = 5000 has 93.33% accuracy
M = 5000 Computation time = 96187.5ms
M = 5000 has 93.54% accuracy
M = 5000 Computation time = 90921.875ms
M = 5000 has 93.56% accuracy
average accuracy for M=5000 is 93.44000000000001%
M = 10000 Computation time = 197984.375ms
M = 10000 has 94.81% accuracy
M = 10000 Computation time = 187578.125ms
M = 10000 has 94.65% accuracy
M = 10000 Computation time = 177500.0ms
M = 10000 has 94.69% accuracy
M = 10000 Computation time = 166515.625ms
M = 10000 has 95.08% accuracy
M = 10000 Computation time = 168734.375ms
M = 10000 has 94.57% accuracy
average accuracy for M=10000 is 94.75999999999999%

num_clus = 10
M = 1000 Computation time = 18171.875ms
M = 1000 has 91.09% accuracy
M = 5000 Computation time = 74062.5ms
M = 5000 has 93.16% accuracy
M = 10000 Computation time = 136921.875ms
M = 10000 has 94.19999999999999% accuracy

num_clus = 20
M = 1000 Computation time = 19078.125ms
M = 1000 has 91.75999999999999% accuracy
M = 5000 Computation time = 79500.0ms
M = 5000 has 93.83% accuracy
M = 10000 Computation time = 150937.5ms
M = 10000 has 94.8% accuracy

num_clus = 25
M = 1000 Computation time = 19359.375ms
M = 1000 has 91.97% accuracy
M = 5000 Computation time = 85531.25ms
M = 5000 has 93.87% accuracy
M = 10000 Computation time = 162046.875ms
M = 10000 has 94.61% accuracy
'''
