<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Profile-pqlite" data-toc-modified-id="Profile-pqlite-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Profile pqlite</a></span><ul class="toc-item"><li><span><a href="#Time" data-toc-modified-id="Time-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Time</a></span></li><li><span><a href="#Quality" data-toc-modified-id="Quality-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Quality</a></span><ul class="toc-item"><li><span><a href="#Plotting-pq-neighbors-vs-true-neighbors" data-toc-modified-id="Plotting-pq-neighbors-vs-true-neighbors-1.2.1"><span class="toc-item-num">1.2.1&nbsp;&nbsp;</span>Plotting <code>pq neighbors</code> vs <code>true neighbors</code></a></span></li></ul></li><li><span><a href="#precision,-recall,-query_time-vs-n_subvectors-&amp;--n_cells" data-toc-modified-id="precision,-recall,-query_time-vs-n_subvectors-&amp;--n_cells-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span><code>precision, recall, query_time</code> vs <code>n_subvectors</code> &amp;  <code>n_cells</code></a></span></li></ul></li></ul></div>

## Profile pqlite

In [1]:
%load_ext autoreload
%autoreload 2


In [2]:
import pqlite
print(pqlite.__path__)
import time

import jina
from jina.math.distance import cdist

import sklearn
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split

['/Users/davidbuchaca1/Documents/jina_stuff/pqlite/pqlite']


In [3]:
import random
import numpy as np
from pqlite import PQLite

# Experiment configuration for the small profiling index built in this cell.
Nt = 12_500  # total samples generated by make_blobs, before the train/test split
Nq = 1  # not read anywhere in this cell
D = 128  # features per sample; also passed to PQLite as d_vector
top_k = 100  # used as k by the search / ground-truth cells further down
n_cells = 18
n_subvectors = 128

# Seed NumPy's global RNG: neither make_blobs nor train_test_split is given a
# random_state, so reproducibility of the data and the split relies on this call.
np.random.seed(123)
# make_blobs returns (X, y); [0] keeps only X. test_size=20 is an integer, i.e. an
# absolute number of held-out rows (Xte) rather than a fraction.
Xtr, Xte = train_test_split(make_blobs(n_samples = Nt, n_features = D)[0].astype(np.float32), test_size=20)

# NOTE(review): a column 'x' is declared here, but add() below passes only vectors
# and ids -- confirm the column is actually needed for this profile.
pq = PQLite(d_vector=D, 
            n_cells=n_cells,
            n_subvectors=n_subvectors, 
            columns=[('x', float, True)])

# Train on the whole training split, then index the same vectors.
pq.fit(Xtr)
# ids are the row positions in Xtr, so ids returned by search can index Xtr directly.
pq.add(Xtr, ids=list(range(len(Xtr))))

2021-11-15 13:14:59.867 | INFO     | pqlite.index:fit:95 - => start training VQ codec with 12480 data...
2021-11-15 13:15:00.325 | INFO     | pqlite.index:fit:98 - => start training PQ codec with 12480 data...
2021-11-15 13:16:31.817 | INFO     | pqlite.index:fit:101 - => pqlite is successfully trained!
2021-11-15 13:16:32.992 | DEBUG    | pqlite.storage.cell:_expand:148 - => total storage capacity is expanded by 0 for 18 cells
2021-11-15 13:16:32.994 | DEBUG    | pqlite.storage.cell:insert:90 - => 12480 new items added


In [4]:
Nq = 1
# Indexing with a list ([[10]]) selects row 10 of the held-out set as a one-row 2-D array.
query = Xte[[10]] 

# without filtering
# Keep the 5 nearest results: distances go to pq_dists, the matching ids to ids.
pq_dists, ids = pq.search(query, k=5)

In [19]:
# Same call as the previous cell, left unassigned so the (distances, ids) tuple is displayed.
pq.search(query, k=5)

(array([[165.26141, 165.34329, 167.90637, 169.76   , 169.87177]],
       dtype=float32),
 array([[b'1192', b'3580', b'7', b'4826', b'12076']], dtype='|S36'))

The search in pqlite is done in two steps:

1. Find the best cell by computing `cdist(query, pq.vq_codec.codebook, metric=pq.metric)`.

2. Perform a nearest-neighbour search on the elements of the best cell.

In [None]:
from jina.math.distance import cdist

# Time step 1 of the search on its own: distances from the query to the VQ codebook.
%timeit cdist(query,  pq.vq_codec.codebook, metric=pq.metric)

In [None]:
# Run the search once with k=top_k; the unassigned result is shown as the cell output.
pq.search(query,  k=top_k)

### Time

In [None]:
# Time the complete pqlite search for a single query.
%timeit  pq.search(query,  k=top_k)

In [None]:
# Baseline: time the exact Euclidean distances from the query to every training vector.
%timeit res = cdist(query, Xtr, metric='euclidean')

### Quality

Inspecting 2-D slices of the high-dimensional space by eye, the items retrieved by pqlite
appear to lie close both to the query and to the true nearest neighbours found by the exact, exhaustive distance computation.

In [None]:
def _precision(predicted, relevant, eval_at):
    """
    fraction of retrieved documents that are relevant to the query
    """
    if eval_at == 0:
        return 0.0
    predicted_at_k = predicted[:eval_at]
    n_predicted_and_relevant = len(set(predicted_at_k).intersection(set(relevant)))

    return n_predicted_and_relevant / len(predicted)

def _recall(predicted, relevant, eval_at):
    """
    fraction of the relevant documents that are successfully retrieved
    """
    if eval_at == 0:
        return 0.0
    predicted_at_k = predicted[:eval_at]
    n_predicted_and_relevant = len(set(predicted_at_k).intersection(set(relevant)))
    return n_predicted_and_relevant/ len(relevant)


In [None]:
# Ground truth for one held-out query: exact Euclidean distance to every training vector.
query = Xte[[11]]
true_distances = cdist(query, Xtr, metric='euclidean').flatten()
# Positions of the top_k smallest distances, nearest first, and the distances themselves.
true_ids = np.argsort(true_distances)[:top_k]
true_dists = np.take(true_distances, true_ids)

In [None]:
# Show the ground-truth ids in ascending order WITHOUT mutating true_ids: an in-place
# sort would discard the nearest-first ranking and break the positional pairing
# between true_ids and true_dists.
np.sort(true_ids)

In [None]:
# Approximate neighbours of the same query from the pqlite index.
pq_dists, pq_ids = pq.search(query,  k=top_k)
# Take the id row of the single query and convert each id to int, so the ids can be
# compared with true_ids and used to index rows of Xtr.
pq_ids = np.array([int(x) for x in pq_ids[0]])

In [None]:
# Show the retrieved ids in ascending order WITHOUT mutating pq_ids: an in-place sort
# would discard the ranking returned by the search and break the positional pairing
# between pq_ids and pq_dists.
np.sort(pq_ids)

In [None]:
# The pqlite results are the predictions; the exhaustive-search ids are the ground truth.
# (Passing them the other way round swaps the meaning of precision and recall.)
print('precision:', _precision(pq_ids, true_ids, top_k))
print('recall:', _recall(pq_ids, true_ids, top_k))

#### Plotting `pq neighbors` vs `true neighbors`

In [None]:
import matplotlib.pyplot as plt

def paint_slice(Xtr, query, feat1, feat2, pq_neighbors=None, true_neighbors=None):
    """
    Scatter-plot a 2-D slice (two feature columns) of the data and highlight neighbours.

    :param Xtr: 2-D array of indexed vectors (rows are samples)
    :param query: 2-D array holding the query vector(s)
    :param feat1: column index plotted on the x axis
    :param feat2: column index plotted on the y axis
    :param pq_neighbors: row indices into ``Xtr`` returned by pqlite; when omitted,
        the notebook-level ``pq_ids`` is used
    :param true_neighbors: row indices into ``Xtr`` of the exact neighbours; when
        omitted, the notebook-level ``true_ids`` is used
    """
    # Fall back to the notebook globals so existing four-argument calls keep working.
    if pq_neighbors is None:
        pq_neighbors = pq_ids
    if true_neighbors is None:
        true_neighbors = true_ids

    # Draw order matters: later groups are painted over earlier ones.
    plt.scatter(Xtr[:, feat1], Xtr[:, feat2], color='blue', alpha=0.2, label='data')
    # One scatter call per group instead of one call per point.
    plt.scatter(Xtr[pq_neighbors, feat1], Xtr[pq_neighbors, feat2],
                color='black', label='pq neighbors')
    plt.scatter(Xtr[true_neighbors, feat1], Xtr[true_neighbors, feat2],
                color='orange', label='true neighbors')
    plt.scatter(query[:, feat1], query[:, feat2], color='red', label='query')

    plt.xlabel(f'feature {feat1}')
    plt.ylabel(f'feature {feat2}')
    plt.legend()
    

In [None]:
# Slice of the space spanned by features 0 and 1.
feat1, feat2 = 0, 1
paint_slice(Xtr, query, feat1, feat2)

In [None]:
# Slice of the space spanned by features 8 and 100.
feat1, feat2 = 8, 100
paint_slice(Xtr, query, feat1, feat2)

In [None]:
# Slice of the space spanned by features 3 and 4.
feat1, feat2 = 3,4
paint_slice(Xtr, query, feat1, feat2)


###  `precision, recall, query_time` vs `n_subvectors` &  `n_cells` 

In [None]:
import time
import numpy as np
from pqlite import PQLite

from jina.math.distance import cdist
from jina.math.helper import top_k as _top_k
import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split

def _precision(predicted, relevant, eval_at):
    """
    fraction of retrieved documents that are relevant to the query
    """
    if eval_at == 0:
        return 0.0
    predicted_at_k = predicted[:eval_at]
    n_predicted_and_relevant = len(set(predicted_at_k).intersection(set(relevant)))

    return n_predicted_and_relevant / len(predicted)

def _recall(predicted, relevant, eval_at):
    """
    fraction of the relevant documents that are successfully retrieved
    """
    if eval_at == 0:
        return 0.0
    predicted_at_k = predicted[:eval_at]
    n_predicted_and_relevant = len(set(predicted_at_k).intersection(set(relevant)))
    return n_predicted_and_relevant/ len(relevant)

def evaluate(predicts, relevants, eval_at):
    """
    Mean recall@k and precision@k over a batch of queries.

    :param predicts: one sequence of retrieved ids per query; every id is cast
        with ``int()`` before scoring
    :param relevants: one sequence of ground-truth ids per query, aligned with ``predicts``
    :param eval_at: cut-off k forwarded to ``_recall`` and ``_precision``
    :return: ``(mean_recall, mean_precision)``; ``(0.0, 0.0)`` when ``predicts`` is empty
    """
    n_evaluated = len(predicts)
    if n_evaluated == 0:
        # No queries to score: avoid dividing by zero below.
        return 0.0, 0.0

    recall = 0.0
    precision = 0.0
    for _predict, _relevant in zip(predicts, relevants):
        _predict = np.array([int(x) for x in _predict])
        # Use the caller's cut-off rather than whatever the global `top_k` happens to be.
        recall += _recall(_predict, _relevant, eval_at)
        precision += _precision(_predict, _relevant, eval_at)

    return recall / n_evaluated, precision / n_evaluated


# --- Benchmark configuration ---
#N = 100_000 # number of data points
Nt = 125_000  # total generated samples (training vectors + held-out queries)
Nq = 1
D = 128 # dimensionality / number of features
top_k = 10
n_cells = 64        # overwritten by the grid loop below
n_subvectors = 64   # overwritten by the grid loop below
# Size of the held-out query set. The split previously hard-coded test_size=20 while
# this constant said 1000; the value is set to 20 so the results stay unchanged.
n_queries = 20
n_train = 20_480  # number of leading Xtr rows used to train the codecs

# Generate Nt 128-dim vectors and hold out n_queries of them as queries.
np.random.seed(123)
Xtr, Xte = train_test_split(make_blobs(n_samples = Nt, n_features = D)[0].astype(np.float32), test_size=n_queries)
print(f'Xtr: {Xtr.shape} vs Xte: {Xte.shape}')

# Exact neighbours depend only on the data, not on the index hyper-parameters,
# so compute the ground truth once instead of once per configuration.
dists = cdist(Xte, Xtr, metric='euclidean')
true_dists, true_ids = _top_k(dists, top_k, descending=False)

precision_per_query = []
recall_per_query = []
results = []

for n_cells in [8, 16, 32, 64, 128]:
    for n_subvectors in [32, 64, 128]:

        pq = PQLite(d_vector=D,
                    n_cells=n_cells,
                    n_subvectors=n_subvectors)

        # perf_counter is monotonic, so elapsed times need no abs() guard.
        t0 = time.perf_counter()
        pq.fit(Xtr[:n_train])
        train_time = time.perf_counter() - t0

        t0 = time.perf_counter()
        pq.add(Xtr, ids=list(range(len(Xtr))))
        index_time = time.perf_counter() - t0

        t0 = time.perf_counter()
        pq_dists, pq_ids = pq.search(Xte, k=top_k)
        query_time = time.perf_counter() - t0

        recall, precision = evaluate(pq_ids, true_ids, top_k)

        results_dict = {'precision': precision,
                        'recall': recall,
                        'train_time': train_time,
                        'index_time': index_time,
                        'query_time': query_time,
                        'indexer_hyperparams': {'n_cells': n_cells,
                                                'n_subvectors': n_subvectors}
                        }
        print(results_dict)

        results.append(results_dict)

In [None]:
import pandas as pd
# One row per entry of `results`, i.e. per (n_cells, n_subvectors) configuration.
results_df = pd.DataFrame(results)
# Display the configurations with the highest recall first; sort_values returns a new
# frame, so results_df itself keeps its original row order.
results_df.sort_values('recall', ascending=False)