# PQ Recall

## product quantization

In [None]:
import nanopq
import numpy as np

# Problem size: n1 database vectors, n2 query vectors, each of dimension D.
n1 = 10000
n2 = 2000
D = 128

# Fix the global NumPy seed, then draw the database before the queries
# (the draw order determines the generated values).
np.random.seed(15)
X = np.random.randn(n1, D).astype(np.float32)
queries = np.random.randn(n2, D).astype(np.float32)

# Product quantizer configuration: M sub-spaces, Ks codewords per sub-space.
M = 8
Ks = 256
pq = nanopq.PQ(M=M, Ks=Ks)

# Learn the codewords from the database vectors.
pq.fit(X)

# Encode the database vectors to PQ codes; expected shape (n1, M) = (10000, 8).
X_code = pq.encode(X)

### compute recall

In [2]:
from evaluationRecall import Recall_PQ

# Arguments passed to Recall_PQ:
#   M (int): number of sub-spaces.
#   Ks (int): number of codewords per sub-space (typically 256, so that each
#       sub-vector is quantized to an 8-bit index = 1 byte = uint8).
#   D (int): dimension of each vector.
#   pq_codebook (np.ndarray): shape=(M, Ks, Ds), dtype=np.float32;
#       pq_codebook[m][ks] is the ks-th codeword (Ds-dim) of the m-th sub-space.
#       (Ds is presumably the sub-vector dimension D / M.)
#   pq_codes (np.ndarray): PQ codes with shape=(n, M) and an integer dtype.
#   metric (str): "dot_product" or "l2_distance".
rpq = Recall_PQ(
    M=M,
    Ks=Ks,
    D=D,
    pq_codebook=pq.codewords,
    pq_codes=X_code,
    metric="l2_distance",
)

# Exact (brute-force) search: yields the true nearest neighbor of every query,
# used as the reference when computing recall.
ground_truth = rpq.brute_force_search(X, queries, metric="l2_distance")

In [3]:
# Retrieve the topk neighbors of each query (kept in rpq.neighbors_matrix)
# and compute the recall against the brute-force ground truth.
rpq.pq_recall(
    queries=queries,
    topk=100,
    ground_truth=ground_truth,
)

recall 1@100 = 0.528


# AQ Recall

The following is not additive quantization: it runs k-means stage by stage on the residuals of the previous stage. Only the resulting codebooks and codes have the same structure as those of additive quantization.

In [7]:
import numpy as np
from scipy.cluster.vq import kmeans2

# Synthetic data: n database vectors and nq query vectors of dimension D,
# drawn as float32 Gaussian samples after fixing the global NumPy seed.
n, nq, D = 10000, 2000, 128
np.random.seed(15)
X = np.random.randn(n, D).astype(np.float32)
queries = np.random.randn(nq, D).astype(np.float32)
M, K = 8, 256  # M codebooks (stages), K codewords per codebook

# Stage 1: cluster the raw vectors, initialising the centroids from data
# points. centroid has shape (K, D) = (256, 128); code holds one label per row.
centroid, code = kmeans2(X, K, minit='points')

# Per-stage results are collected in lists and stacked once at the end, so
# the outputs are always 2-D -- codes has shape (n, M) even when M == 1 --
# and the arrays are not re-allocated on every stage.
codebook_parts = [centroid]
code_columns = [code]

# Stages 2..M: subtract the codeword chosen in the previous stage and cluster
# what is left. These stages use kmeans2's default initialisation.
RX = X
for _stage in range(1, M):
    RX = RX - centroid[code]

    centroid, code = kmeans2(RX, K)

    codebook_parts.append(centroid)
    code_columns.append(code)

# codebooks: stage codebooks concatenated row-wise, shape (M*K, D).
# codes: one column of labels per stage, shape (n, M).
codebooks = np.vstack(codebook_parts)
codes = np.column_stack(code_columns)
print(codebooks.shape)
print(codes.shape)

(2048, 128)
(10000, 8)


## compute recall

In [8]:
from evaluationRecall import Recall_AQ

# Arguments passed to Recall_AQ:
#   M (int): number of codebooks.
#   K (int): number of codewords per codebook.
#   D (int): dimension of each vector.
#   aq_codebooks (np.ndarray): shape=(M*K, D), dtype=np.float32;
#       aq_codebooks[0:K, :] holds the K codewords of the first codebook and
#       aq_codebooks[(m-1)*K : m*K, :] holds those of the m-th codebook.
#   aq_codes (np.ndarray): AQ codes with shape=(n, M) and an integer dtype,
#       where n is the number of encoded datapoints;
#       aq_codes[i, j] is in {0, 1, ..., K-1} for all i, j.
#   metric (str): "dot_product" or "l2_distance".
raq = Recall_AQ(
    M=M,
    K=K,
    D=D,
    aq_codebooks=codebooks,
    aq_codes=codes,
    metric="l2_distance",
)

# Exact (brute-force) search: yields the true nearest neighbor of every query,
# used as the reference when computing recall.
ground_truth = raq.brute_force_search(X, queries, metric="l2_distance")

In [9]:
# Retrieve the topk neighbors of each query (kept in raq.neighbors_matrix)
# and compute the recall against the brute-force ground truth.
raq.aq_recall(
    queries=queries,
    topk=100,
    ground_truth=ground_truth,
)

recall 1@100 = 0.6285
