Download the vectors from http://ann-benchmarks.com/glove-100-angular.hdf5 and save the file as `glove-100-angular.hdf5` in the notebook's working directory.

In [None]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
# Set before the heavier libraries below are imported.
# NOTE(review): "-1" presumably hides every GPU so the rest of the notebook
# (including the fast_ivf compressor) runs on CPU — confirm this is intended.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [None]:
import h5py
import numpy as np
# Open the benchmark file read-only from the current working directory.
# The handle is intentionally left open: later cells read the 'train',
# 'test' and 'neighbors' datasets from it.
data = h5py.File('glove-100-angular.hdf5', 'r')

In [None]:
# Read the 'train' and 'test' datasets fully into memory (the `[:]` slice
# materialises each HDF5 dataset as an in-memory array).
train_embeddings, test_embeddings = (
    data[split_name][:] for split_name in ('train', 'test')
)

In [None]:
# Scale every row to unit L2 norm. `keepdims=True` keeps the norms as a
# column vector so the division broadcasts row-wise.
train_embeddings = train_embeddings / np.linalg.norm(
    train_embeddings, axis=1, keepdims=True
)
test_embeddings = test_embeddings / np.linalg.norm(
    test_embeddings, axis=1, keepdims=True
)

In [None]:
# Keep only column 0 of the 'neighbors' dataset: one id per test query.
# NOTE(review): presumably this is the id of the single nearest training
# vector for each query (ground truth for top-1 accuracy) — confirm against
# the dataset description.
exact_indices = data['neighbors'][:, 0]

In [None]:
# Embedding dimensionality, taken from the second axis of the test matrix;
# it is passed to the index constructors below.
d = test_embeddings.shape[1]
# Display the dimensionality together with both array shapes as a sanity check.
d, train_embeddings.shape, test_embeddings.shape

In [None]:
from fast_ivf import FastIVF
# Build the FastIVF index and override its compressor's training settings
# before fitting it on the training vectors.
nlist = 2048
index = FastIVF(d, nlist=nlist, nprobe=10, compression_ndim=64)

# Attribute name -> value, applied in this order to `index.compressor`.
compressor_settings = {
    "learning_rate": 0.001,
    "steps_per_epoch": 10000,
    "epochs": 10,
    "activation": None,
    "dropout": 0.0,
}
for setting_name, setting_value in compressor_settings.items():
    setattr(index.compressor, setting_name, setting_value)

index.train(train_embeddings)

In [None]:
import matplotlib.pyplot as plt
# Diagnostics for the k-means stage of the trained index, drawn on explicit
# axes so that each panel carries its own labels and title.
fig, (history_ax, counts_ax) = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: the sequence stored in index.kmeans._history, on a log y-axis.
history_ax.plot(index.kmeans._history)
history_ax.set_yscale("log")
history_ax.set_xlabel("position in index.kmeans._history")
history_ax.set_ylabel("recorded value (log scale)")
history_ax.set_title("index.kmeans._history")

# Right panel: how often each distinct value occurs in index.kmeans._labels.
# The unique values themselves are not needed, so they get their own name
# instead of clobbering `indices`, which later cells use for search results.
label_values, counts = np.unique(index.kmeans._labels, return_counts=True)
counts_ax.hist(counts, 100)
counts_ax.set_xlabel("occurrences of a label in index.kmeans._labels")
counts_ax.set_ylabel("number of distinct labels")
counts_ax.set_title("Distribution of per-label counts");

In [None]:
import numba as nb

# Search all test queries with nprobe=10 and rescore_num_samples=20,
# requesting k=500 results per query with rescoring enabled.
index.nprobe = 10
index.rescore_num_samples = 20
distances, indices = index.search(test_embeddings, k=500, rescore=True)

# Inspect the first query only.
idx = 0
query = test_embeddings[idx]
# Recompute the inner product between the query and each training vector the
# index returned for it (both sets were L2-normalised in an earlier cell).
exact_distances = train_embeddings[indices[idx]] @ query

# x: recomputed inner products, y: values reported by index.search.
plt.scatter(exact_distances, distances[idx])

In [None]:
# Re-run the search with a much larger probe count, a zero ratio threshold
# and sorted output.
# NOTE(review): rescore_num_samples = -1 presumably means "rescore every
# candidate" — confirm against the fast_ivf documentation.
index.nprobe = 250
index.ratio_threshold = 0.0
index.rescore_num_samples = -1
distances, indices = index.search(test_embeddings, k=500, rescore=True, sort=True)

# Top-1 hit per query: does the first returned id equal the reference id?
# (Testing membership in the whole row instead of `[:1]` would measure
# whether the reference id appears anywhere among the k results.)
accuracies = [
    exact_indices[i] in query_points_indices[:1]
    for i, query_points_indices in enumerate(indices)
]
np.mean(accuracies)

In [None]:
import faiss
from collections import defaultdict

# Baseline for comparison: a FAISS IVF-Flat index using inner product.
# Note that this cell rebinds the notebook-level `d` and `nlist`; nlist is
# 1024 here, whereas the FastIVF cell above used 2048.
d = train_embeddings.shape[1]
nlist = 1024

# A flat inner-product index is passed to IndexIVFFlat as its quantizer.
quantizer = faiss.IndexFlatIP(d)
faiss_index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_INNER_PRODUCT)
# Initial probe count; the search cell below overrides it before querying.
faiss_index.nprobe = 5
# Train the index on the training vectors, then add those same vectors to it.
faiss_index.train(train_embeddings)
faiss_index.add(train_embeddings)

In [None]:
# Query the FAISS baseline with 100 probes, asking for 100 results per query.
faiss_index.nprobe = 100
distances, indices = faiss_index.search(test_embeddings, 100)
# Fraction of queries whose first returned id equals the reference id.
np.mean(indices[:, 0] == exact_indices)