In [1]:
import math
import os

import matplotlib.pyplot as plt
import numpy as np
from scipy.spatial import distance
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer

# Number of bytes in one mebibyte (1024 * 1024 == 1048576).
mbyte = 1024 * 1024
# scikit-learn normalizers configured for the L1 and the L2 norm respectively.
norm_l1, norm_l2 = Normalizer(norm='l1'), Normalizer(norm='l2')

# Set your dataset path

In [2]:
# Root directory of the datasets, e.g. /home/username/datasets/
# Set the DATASET_PATH environment variable to point at your own copy;
# when it is unset the previous hard-coded default is used.
# os.path.join(..., '') appends a trailing separator when one is missing,
# because later cells build file names with `path + 'sift/...'`.
path = os.path.join(os.environ.get('DATASET_PATH', '/home/kleist/file/datasets/'), '')

# Tools

In [3]:
from hashlib import md5

def find_nearest_neighbor(P, q):
    """Return the index of the point in ``P`` that is closest to ``q``.

    Closeness is measured with ``scipy.spatial.distance.euclidean``.

    Parameters
    ----------
    P : iterable of 1-D vectors
        Candidate points; each element is compared against ``q``.
    q : 1-D vector
        Query point.

    Returns
    -------
    int
        Position in ``P`` of the nearest point. When several points are
        equally close, the first one is returned.

    Raises
    ------
    IndexError
        If ``P`` contains no points.
    """
    dis = np.array([distance.euclidean(p, q) for p in P])
    if dis.size == 0:
        raise IndexError("P is empty: there is no nearest neighbor")
    # argmin is a single linear pass; sorting the whole array just to read
    # its first element is unnecessary work.
    return np.argmin(dis)

def fvecs_read(filename, c_contiguous=True):
    """Load a ``.fvecs`` file into a 2-D ``float32`` array.

    The file is read as a flat sequence of 32-bit words. The first word,
    reinterpreted as ``int32``, gives the vector dimension ``dim``; the data
    is then viewed as rows of ``1 + dim`` words, where the first word of each
    row must again equal ``dim`` and the remaining ``dim`` words are the
    ``float32`` components.

    Parameters
    ----------
    filename : str
        Path of the file to read.
    c_contiguous : bool, default True
        When true, return a fresh C-contiguous copy instead of a strided
        view into the raw buffer.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_vectors, dim)`` and dtype ``float32``. An empty
        file yields an array of shape ``(0, 0)``.

    Raises
    ------
    IOError
        If the rows do not all carry the same dimension header.
    """
    fv = np.fromfile(filename, dtype=np.float32)
    if fv.size == 0:
        # Same dtype as the non-empty case so callers get a consistent type.
        return np.zeros((0, 0), dtype=np.float32)
    # The leading word of every record is an int32 stored in the float buffer.
    dim = fv.view(np.int32)[0]
    assert dim > 0
    fv = fv.reshape(-1, 1 + dim)
    # Vectorized check: every row header must match the first one.
    if not np.all(fv.view(np.int32)[:, 0] == dim):
        raise IOError("Non-uniform vector sizes in " + filename)
    # Drop the per-row dimension header, keeping only the components.
    fv = fv[:, 1:]
    if c_contiguous:
        fv = fv.copy()
    return fv


def store_dataset(title, train, test, K, distance_function):
    """Bundle a train/test split with its brute-force nearest-neighbor ground truth.

    For every query in ``test`` the distance to every row of ``train`` is
    computed with ``distance_function(query, point)`` and the training
    indices are ranked by ascending distance.

    Parameters
    ----------
    title : str
        Name stored under the ``'title'`` key.
    train, test : array-like with a ``shape`` attribute
        Indexed points and query points; both are iterated row by row.
    K : int
        Number of leading neighbors kept per query.
    distance_function : callable
        Called as ``distance_function(query, point)``; must return a scalar.

    Returns
    -------
    dict
        Keys ``'title'``, ``'train'``, ``'test'``, ``'K'``, ``'n_train'``,
        ``'n_test'``, ``'k_near_neighbors'`` (one index array of up to ``K``
        entries per query) and ``'nearest_neighbor'`` (one index per query).
    """
    # One full ranking of the training indices per query.
    rankings = [
        np.array([distance_function(query, point) for point in train]).argsort()
        for query in test
    ]

    return {
        'title': title,
        'train': train,
        'test': test,
        'K': K,
        'n_train': train.shape[0],
        'n_test': test.shape[0],
        'k_near_neighbors': [ranking[:K] for ranking in rankings],
        'nearest_neighbor': [ranking[0] for ranking in rankings],
    }

# Datasets Arguments

In [4]:
# Registry of prepared datasets, keyed by dataset title.
Datasets = {}

# SIFT
[website](http://corpus-texmex.irisa.fr/)

## 10K
[download](ftp://ftp.irisa.fr/local/texmex/corpus/siftsmall.tar.gz)

In [None]:
# Load the SIFT-10K base and query vectors and convert them to integers.
# `np.int` was only an alias of the builtin `int`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so the builtin is used directly.
train = fvecs_read(path + 'sift/sifts-10K/sift_base.fvecs').astype(int)
test = fvecs_read(path + 'sift/sifts-10K/sift_query.fvecs').astype(int)

In [None]:
%%time
sift = store_dataset(title='SIFT-10K', train=train, test=test, K=50, 
                     distance_function=distance.euclidean)

In [None]:
# Register the SIFT-10K ground-truth bundle in the dataset registry.
Datasets['SIFT-10K'] = sift

In [None]:
# Euclidean (L2) norm of every training vector; squaring is done in int64.
norm_distance = np.sqrt(np.square(train.astype(np.int64)).sum(axis=1))

# Histogram of the norms on an explicit axes object.
fig, ax = plt.subplots()
ax.hist(norm_distance, bins=100)
ax.set_ylabel('number')
ax.set_xlabel('distance')
plt.show()

## data structure

In [None]:
# Use the first query vector as the reference point for the distance plots below.
q = test[0]

In [None]:
%%time
d2 = np.array([distance.cosine(q, p) for p in train])
d2 = np.arccos(np.array([distance.cosine(q, p) for p in train]))
plt.hist(d2, bins=100, color='r', alpha=0.5, density=True)

In [None]:
%%time
d2 = np.array([distance.euclidean(q, p) for p in train])
plt.hist(d2, bins=100, color='r', alpha=0.5, density=True)

# MNIST

In [None]:
# `datasets.fetch_mldata` was removed from scikit-learn, so MNIST is fetched
# from OpenML instead. `as_frame=False` returns a plain NumPy array
# (needs scikit-learn >= 0.22).
# The cast to uint8 matches the 0-255 pixel range.
# NOTE(review): row order may differ from the old mldata copy, so the random
# train/test split below can select different samples than before.
mnist = np.asarray(
    datasets.fetch_openml('mnist_784', version=1, as_frame=False)['data'],
    dtype=np.uint8,
)

In [None]:
# Euclidean (L2) norm of every MNIST image vector; squaring is done in int64.
norm_distance = np.sqrt(np.square(mnist.astype(np.int64)).sum(axis=1))

# Histogram of the norms on an explicit axes object.
fig, ax = plt.subplots()
ax.hist(norm_distance, bins=100)
ax.set_ylabel('number')
ax.set_xlabel('distance')
plt.show()

## Hamming

In [None]:
# Work on a copy so that `mnist` keeps its original pixel values.
mnist_hamming = mnist.copy()

In [None]:
# Binarize in place: every entry greater than zero becomes 1, the rest stay unchanged.
mnist_hamming[mnist_hamming > 0] = 1

In [None]:
# Hold out 500 binarized vectors as queries; the fixed seed keeps the split repeatable.
train, test = train_test_split(
    mnist_hamming,
    test_size=500,
    random_state=1,
)
(train.shape, test.shape)

In [None]:
# Hamming distance from the first query to each training vector, as a density histogram.
q = test[0]
d2 = np.array([distance.hamming(q, row) for row in train])
plt.hist(
    d2,
    bins=100,
    color='r',
    alpha=0.5,
    density=True,
)

In [None]:
# Distribution of the per-row sums of the training vectors.
plt.hist(
    np.sum(train, axis=1),
    bins=100,
    color='r',
    alpha=0.5,
    density=True,
)

In [None]:
# Brute-force Hamming ground truth: the 100 nearest training vectors per query.
data_mnist_hamming = store_dataset(
    title='MNIST-Hamming',
    train=train,
    test=test,
    K=100,
    distance_function=distance.hamming,
)

In [None]:
# Register the MNIST-Hamming ground-truth bundle in the dataset registry.
Datasets['MNIST-Hamming'] = data_mnist_hamming

# Store Dataset

In [None]:
# Nearest-neighbor data: persist it with IPython's %store magic.
%store Datasets

**To read the data in another notebook, run:**

In [None]:
# %store -r Datasets