In [1]:
import six.moves.cPickle as pickle
import gzip
import os
import numpy as np
from PIL import Image
from scipy import stats
import matplotlib.pyplot as plt
import scipy.misc

In [2]:
def load_data(dataset):
    ''' Loads the dataset

    :type dataset: string
    :param dataset: the path to the dataset (here MNIST)
    
    copied from http://deeplearning.net/ and revised by hchoi
    '''

    # Download the MNIST dataset if it is not present
    data_dir, data_file = os.path.split(dataset)
    if data_dir == "" and not os.path.isfile(dataset):
        # Check if dataset is in the data directory.
        new_path = os.path.join(
            os.path.split(__file__)[0],
            dataset
        )
        if os.path.isfile(new_path) or data_file == 'mnist.pkl.gz':
            dataset = new_path

    if (not os.path.isfile(dataset)) and data_file == 'mnist.pkl.gz':
        from six.moves import urllib
        origin = (
            'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz'
        )
        print('Downloading data from %s' % origin)
        urllib.request.urlretrieve(origin, dataset)

    print('... loading data')

    # Load the dataset
    with gzip.open(dataset, 'rb') as f:
        try:
            train_set, valid_set, test_set = pickle.load(f, encoding='latin1')
        except:
            train_set, valid_set, test_set = pickle.load(f)
    # train_set, valid_set, test_set format: tuple(input, target)
    # input is a numpy.ndarray of 2 dimensions (a matrix)
    # where each row corresponds to an example. target is a
    # numpy.ndarray of 1 dimension (vector) that has the same length as
    # the number of rows in the input. It should give the target
    # to the example with the same index in the input.

    return train_set, valid_set, test_set

In [3]:
def KNN(train_data, test_data, train_label, test_label, k):
    correct = 0
    
    for i in range(test_data.shape[0]):
        distance = np.linalg.norm(test_data[i] - train_data, axis = 1)
        
        index = np.argsort(distance)
        
        pred = stats.mode(train_label[index[:k]])[0][0]
        
        if test_label[i] == pred:
            correct = correct + 1
        
    print(correct/test_data.shape[0])

In [4]:
train_set, val_set, test_set = load_data('mnist.pkl.gz')

train_x, train_y = train_set
test_x, test_y = test_set

... loading data


In [5]:
KNN(train_x, test_x, train_y, test_y, 1)
KNN(train_x, test_x, train_y, test_y, 5)
KNN(train_x, test_x, train_y, test_y, 10)

0.9666


KeyboardInterrupt: 

In [None]:
cov = np.cov(train_x.T)
eigenvalues, eigenvectors = np.linalg.eig(cov)
eigenvectors_dim2 = eigenvectors.T[:2].T
eigenvectors_dim5 = eigenvectors.T[:5].T
eigenvectors_dim10 = eigenvectors.T[:10].T

projection_train_dim2 = np.dot(train_x, eigenvectors_dim2)
projection_train_dim5 = np.dot(train_x, eigenvectors_dim5)
projection_train_dim10 = np.dot(train_x, eigenvectors_dim10)

projection_test_dim2 = np.dot(test_x, eigenvectors_dim2)
projection_test_dim5 = np.dot(test_x, eigenvectors_dim5)
projection_test_dim10 = np.dot(test_x, eigenvectors_dim10)

In [39]:
KNN(projection_train_dim2, projection_test_dim2, train_y, test_y, 1)
KNN(projection_train_dim2, projection_test_dim2, train_y, test_y, 5)
KNN(projection_train_dim2, projection_test_dim2, train_y, test_y, 10)

0.3765
0.4201
0.4346


In [40]:
KNN(projection_train_dim5, projection_test_dim5, train_y, test_y, 1)
KNN(projection_train_dim5, projection_test_dim5, train_y, test_y, 5)
KNN(projection_train_dim5, projection_test_dim5, train_y, test_y, 10)

0.6885
0.7425
0.7587


In [41]:
KNN(projection_train_dim10, projection_test_dim10, train_y, test_y, 1)
KNN(projection_train_dim10, projection_test_dim10, train_y, test_y, 5)
KNN(projection_train_dim10, projection_test_dim10, train_y, test_y, 10)

0.9111
0.9254
0.929


In [6]:
from sklearn.ensemble import RandomForestClassifier

In [14]:
forest = RandomForestClassifier(n_estimators = 150, max_depth = 50)
forest.fit(train_x, train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=50, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=150,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [15]:
accuracy = np.mean(np.equal(forest.predict(test_x), test_y))
print(accuracy)

0.9701
