In [1]:
import time
from sklearn.metrics import accuracy_score

from product_quantization import ProductQuantizationKNN

# Evaluate on MNIST data set

### Read the data

Borrowed from; https://www.cntk.ai/pythondocs/CNTK_103A_MNIST_DataLoader.html

In [2]:
import os
import gzip
import struct
import numpy as np

try:
    from urllib.request import urlretrieve
except ImportError:
    from urllib import urlretrieve

In [3]:
# Functions to load MNIST images and unpack into train and test set.
# - loadData reads a image and formats it into a 28x28 long array
# - loadLabels reads the corresponding label data, one for each image
# - load packs the downloaded image and label data into a combined format to be read later by
#   the CNTK text reader

def loadData(src, cimg):
    print ('Downloading ' + src)
    gzfname, h = urlretrieve(src, './delete.me')
    print ('Done.')
    try:
        with gzip.open(gzfname) as gz:
            n = struct.unpack('I', gz.read(4))
            # Read magic number.
            if n[0] != 0x3080000:
                raise Exception('Invalid file: unexpected magic number.')
            # Read number of entries.
            n = struct.unpack('>I', gz.read(4))[0]
            if n != cimg:
                raise Exception('Invalid file: expected {0} entries.'.format(cimg))
            crow = struct.unpack('>I', gz.read(4))[0]
            ccol = struct.unpack('>I', gz.read(4))[0]
            if crow != 28 or ccol != 28:
                raise Exception('Invalid file: expected 28 rows/cols per image.')
            # Read data.
            res = np.fromstring(gz.read(cimg * crow * ccol), dtype = np.uint8)
    finally:
        os.remove(gzfname)
    return res.reshape((cimg, crow * ccol))

def loadLabels(src, cimg):
    print ('Downloading ' + src)
    gzfname, h = urlretrieve(src, './delete.me')
    print ('Done.')
    try:
        with gzip.open(gzfname) as gz:
            n = struct.unpack('I', gz.read(4))
            # Read magic number.
            if n[0] != 0x1080000:
                raise Exception('Invalid file: unexpected magic number.')
            # Read number of entries.
            n = struct.unpack('>I', gz.read(4))
            if n[0] != cimg:
                raise Exception('Invalid file: expected {0} rows.'.format(cimg))
            # Read labels.
            res = np.fromstring(gz.read(cimg), dtype = np.uint8)
    finally:
        os.remove(gzfname)
    return res.reshape((cimg, 1))

def try_download(dataSrc, labelsSrc, cimg):
    data = loadData(dataSrc, cimg)
    labels = loadLabels(labelsSrc, cimg)
    return np.hstack((data, labels))


In [4]:
# URLs for the train image and label data
url_train_image = 'http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz'
url_train_labels = 'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz'
num_train_samples = 60000

print("Downloading train data")
train = try_download(url_train_image, url_train_labels, num_train_samples)

# URLs for the test image and label data
url_test_image = 'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz'
url_test_labels = 'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz'
num_test_samples = 10000

print("Downloading test data")
test = try_download(url_test_image, url_test_labels, num_test_samples)

Downloading train data
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Done.


  res = np.fromstring(gz.read(cimg * crow * ccol), dtype = np.uint8)


Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Done.
Downloading test data
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz


  res = np.fromstring(gz.read(cimg), dtype = np.uint8)


Done.
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Done.


In [5]:
train_labels = train[:,-1]
train_data = train[:,:-1]
test_labels = test[:,-1]
test_data = test[:,:-1]
print('Train features shape:', train_data.shape)
print('Test features shape:', test_data.shape)

Train features shape: (60000, 784)
Test features shape: (10000, 784)


### The hyper parameters

In [6]:
# Number of nearest-neighbors
k = 10

## Evaluate PQKNN approach

### Multi core

In [7]:
pqknn = ProductQuantizationKNN(7, 4)

start = time.time()
pqknn.compress(train_data, train_labels)
end = time.time()
print('Compressing the train_data to PQKNN classifier took ' + str(end - start) + ' seconds.')

Compressing the train_data to PQKNN classifier took 25.09275221824646 seconds.


Since $c$=4, we use np.uint8 (thus 1 byte) to store the centroid_ids in the compressedData array

In [8]:
print('Compressed data shape:', pqknn.compressed_data.shape)
print('Compressed data in bytes:', pqknn.compressed_data.nbytes)
print('Original data in bytes:', train_data.nbytes)
print('Compression factor:', train_data.nbytes / pqknn.compressed_data.nbytes)

Compressed data shape: (60000, 7)
Compressed data in bytes: 420000
Original data in bytes: 47040000
Compression factor: 112.0


In [9]:
start = time.time()
preds = pqknn.predict(test_data, k)
end = time.time()
print('Predicting the test_data with PQKNN classifier took ' + str(end - start) + ' seconds.')

Predicting the test_data with PQKNN classifier took 7.444230079650879 seconds.


In [10]:
print('Accuracy: ' + str(accuracy_score(test_labels, preds)*100) + '%')

Accuracy: 92.35%


#### If we increase the number of clusters ($c$), then the accuracy increases (together with the storage)

In [11]:
pqknn = ProductQuantizationKNN(7, 9)

start = time.time()
pqknn.compress(train_data, train_labels)
end = time.time()
print('Compressing the train_data to PQKNN classifier took ' + str(end - start) + ' seconds.')

Compressing the train_data to PQKNN classifier took 132.7194163799286 seconds.


Since $c$=9, we use np.uint16 (thus 2 bytes) to store the centroid_ids in the compressedData array  
-> Resulting in twice the storage size of the example above

In [12]:
print('Compressed data shape:', pqknn.compressed_data.shape)
print('Compressed data in bytes:', pqknn.compressed_data.nbytes)
print('Original data in bytes:', train_data.nbytes)
print('Compression factor:', train_data.nbytes / pqknn.compressed_data.nbytes)

Compressed data shape: (60000, 7)
Compressed data in bytes: 840000
Original data in bytes: 47040000
Compression factor: 56.0


In [13]:
start = time.time()
preds = pqknn.predict(test_data, k)
end = time.time()
print('Predicting the test_data with PQKNN classifier took ' + str(end - start) + ' seconds.')

Predicting the test_data with PQKNN classifier took 15.732435703277588 seconds.


In [14]:
print('Accuracy: ' + str(accuracy_score(test_labels, preds)*100) + '%')

Accuracy: 96.6%


#### With some significantly smaller space we obtain a (good) accuracy between the first and the second example. With a very good compression factor and fast compression and predict time!

In [15]:
pqknn = ProductQuantizationKNN(4, 8)

start = time.time()
pqknn.compress(train_data, train_labels)
end = time.time()
print('Compressing the train_data to PQKNN classifier took ' + str(end - start) + ' seconds.')

Compressing the train_data to PQKNN classifier took 58.23953890800476 seconds.


Since $c$=8, we use np.uint8 (thus 1 byte) to store the centroid_ids in the compressedData array

In [16]:
print('Compressed data shape:', pqknn.compressed_data.shape)
print('Compressed data in bytes:', pqknn.compressed_data.nbytes)
print('Original data in bytes:', train_data.nbytes)
print('Compression factor:', train_data.nbytes / pqknn.compressed_data.nbytes)

Compressed data shape: (60000, 4)
Compressed data in bytes: 240000
Original data in bytes: 47040000
Compression factor: 196.0


In [17]:
start = time.time()
preds = pqknn.predict(test_data, k)
end = time.time()
print('Predicting the test_data with PQKNN classifier took ' + str(end - start) + ' seconds.')

Predicting the test_data with PQKNN classifier took 7.140597343444824 seconds.


In [18]:
print('Accuracy: ' + str(accuracy_score(test_labels, preds)*100) + '%')

Accuracy: 96.08%


## Evaluate SKlearn K-NN approach on data

### Single core

In [19]:
from sklearn.neighbors import KNeighborsClassifier
kNN = KNeighborsClassifier(n_neighbors=k) #n_jobs is now 1

start = time.time()
kNN.fit(train_data, train_labels)
end = time.time()
print('Fitting the train_data to SKlearn KNN classifier took ' + str(end - start) + ' seconds.')

Fitting the train_data to SKlearn KNN classifier took 13.833713054656982 seconds.


In [20]:
start = time.time()
preds = kNN.predict(test_data)
end = time.time()
print('Predicting the test_data with SKlearn KNN classifier took ' + str(end - start) + ' seconds.')

Predicting the test_data with SKlearn KNN classifier took 561.5949828624725 seconds.


In [21]:
print('Accuracy: ' + str(accuracy_score(test_labels, preds)*100) + '%')

Accuracy: 96.65%


### Multi core

In [22]:
from sklearn.neighbors import KNeighborsClassifier
kNN = KNeighborsClassifier(n_neighbors=k, n_jobs=-1)

start = time.time()
kNN.fit(train_data, train_labels)
end = time.time()
print('Fitting the train_data to SKlearn KNN classifier took ' + str(end - start) + ' seconds.')

Fitting the train_data to SKlearn KNN classifier took 12.972641468048096 seconds.


In [23]:
start = time.time()
preds = kNN.predict(test_data)
end = time.time()
print('Predicting the test_data with SKlearn KNN classifier took ' + str(end - start) + ' seconds.')

Predicting the test_data with SKlearn KNN classifier took 209.55382823944092 seconds.


In [24]:
print('Accuracy: ' + str(accuracy_score(test_labels, preds)*100) + '%')

Accuracy: 96.65%
