In [18]:
# Import necessary libraries
import numpy as np
from nanopq import PQ
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from tqdm import tqdm



In [19]:
# 1. Define text passages
# Toy corpus: docid -> passage text. Insertion order matters because the
# docids and texts are later pulled out as parallel lists.
passages = dict(
    doc1="Artificial intelligence is transforming industries.",
    doc2="Machine learning helps computers learn from data.",
    doc3="Quantum computing is a new paradigm.",
    doc4="The future of AI includes ethical challenges.",
)



In [20]:
# 2. Load GTR-T5-Base sentence encoder
# Split the corpus into parallel lists first so that position k in `docids`,
# `texts` and the embedding matrix all refer to the same passage.
docids = [*passages.keys()]
texts = [*passages.values()]

model = SentenceTransformer("sentence-transformers/gtr-t5-base")
# One embedding row per passage, returned as a NumPy array.
embeddings = model.encode(texts, convert_to_numpy=True)



In [21]:
# Inspect the sentence embeddings: shape, dtype and (truncated) values.
embeddings.shape, embeddings.dtype, embeddings

((4, 768),
 dtype('float32'),
 array([[ 0.00273099, -0.0302944 ,  0.05884   , ...,  0.03470925,
         -0.01520736,  0.03500362],
        [-0.02251787, -0.05412124,  0.05713512, ...,  0.00543885,
          0.00965429, -0.01286693],
        [ 0.01178704, -0.01564313,  0.01263544, ..., -0.0136919 ,
         -0.00466294,  0.01595624],
        [ 0.01346409, -0.00766769,  0.01716973, ..., -0.00420209,
         -0.02281265,  0.05193746]], dtype=float32))

In [22]:
# 3. Setup Product Quantization
# M=6 splits each 768-dim embedding into 6 sub-vectors of 128 dims;
# Ks=2 learns two centroids per subspace, which keeps the codebook size
# below the number of training vectors available here.
pq = PQ(M=6, Ks=2)
pq.fit(vecs=embeddings)

# 4. Encode with PQ
# Each embedding becomes M centroid indices, one per subspace.
codes = pq.encode(vecs=embeddings)



M: 6, Ks: 2, metric : <class 'numpy.uint8'>, code_dtype: l2
iter: 20, seed: 123
Training the subspace: 0 / 6
Training the subspace: 1 / 6
Training the subspace: 2 / 6
Training the subspace: 3 / 6
Training the subspace: 4 / 6
Training the subspace: 5 / 6
Encoding the subspace: 0 / 6
Encoding the subspace: 1 / 6
Encoding the subspace: 2 / 6
Encoding the subspace: 3 / 6
Encoding the subspace: 4 / 6
Encoding the subspace: 5 / 6


In [24]:
# Inspect the PQ codes: shape, dtype and values (one row per passage).
codes.shape, codes.dtype, codes

((4, 6),
 dtype('uint8'),
 array([[1, 1, 0, 1, 1, 1],
        [1, 0, 0, 1, 0, 1],
        [0, 0, 0, 1, 0, 1],
        [0, 1, 1, 0, 1, 0]], dtype=uint8))

In [25]:
# 5. Simulate T5 vocab offset
# Token ids below this value are treated as belonging to the regular vocabulary.
mock_vocab_size = 32128

# 6. Print encoded docids
# Every subspace gets its own 256-wide slot of ids placed after the mock
# vocabulary, so the same centroid index in different subspaces maps to
# different token ids.
print("Quantized PQ DocIDs:")
encoded_ids = []
for row, pq_code in enumerate(codes):
    token_ids = [
        int(centroid) + subspace * 256 + mock_vocab_size
        for subspace, centroid in enumerate(pq_code)
    ]
    token_string = ','.join(map(str, token_ids))
    encoded_ids.append(token_string)
    print(f"{docids[row]}\t{token_string}")



Quantized PQ DocIDs:
doc1	32129,32385,32640,32897,33153,33409
doc2	32129,32384,32640,32897,33152,33409
doc3	32128,32384,32640,32897,33152,33409
doc4	32128,32385,32641,32896,33153,33408


In [26]:
# 7. Decode the encoded IDs back to quantized codes
# This is the inverse of the encoding step: first remove the mock vocab
# offset, then remove the per-subspace offset (i * 256) that was added when
# the token ids were built. Subtracting the offset recovers the original code
# for any Ks; taking `% pq.Ks` only round-trips when Ks evenly divides 256
# (e.g. with Ks=3, subspace 1 and code 2: 258 % 3 == 0, not 2).
decoded_codes = []
for encoded in encoded_ids:
    new_doc_code = [int(x) - mock_vocab_size for x in encoded.split(',')]
    if len(new_doc_code) != pq.M:
        raise ValueError(
            f"Expected {pq.M} tokens in encoded docid '{encoded}', got {len(new_doc_code)}"
        )
    # 256 must match the per-subspace slot width used when encoding.
    quantized_code = [token - i * 256 for i, token in enumerate(new_doc_code)]
    # Fail loudly instead of silently wrapping an invalid token into range.
    if any(not 0 <= value < pq.Ks for value in quantized_code):
        raise ValueError(
            f"Encoded docid '{encoded}' decodes to {quantized_code}, "
            f"which is outside the valid code range [0, {pq.Ks - 1}]"
        )
    decoded_codes.append(quantized_code)



In [27]:
# Pack the decoded codes into a uint8 matrix (one row per document)
decoded_codes = np.array(decoded_codes, dtype=np.uint8)

# Sanity-check the matrix against the fitted quantizer before decoding:
# one column per subspace, and the same dtype the quantizer uses for codes.
actual_subspaces = decoded_codes.shape[1]
actual_dtype = decoded_codes.dtype
assert actual_subspaces == pq.M, f"Expected {pq.M} subspaces, got {actual_subspaces}"
assert actual_dtype == pq.code_dtype, f"Expected dtype {pq.code_dtype}, got {actual_dtype}"



In [30]:
# 8. Reconstruct embeddings using PQ
# Map every code back to its centroid sub-vectors to get approximate embeddings.
reconstructed_embeddings = pq.decode(decoded_codes)

# 9. Normalize embeddings for cosine similarity
# Both matrices are rescaled row-wise; note that `embeddings` is rebound to
# its normalized version from this point on.
embeddings, reconstructed_embeddings = (
    normalize(matrix, axis=1)
    for matrix in (embeddings, reconstructed_embeddings)
)


In [31]:
# Inspect the reconstructed embeddings: shape, dtype and (truncated) values.
reconstructed_embeddings.shape, reconstructed_embeddings.dtype, reconstructed_embeddings

((4, 768),
 dtype('float32'),
 array([[-0.01120163, -0.04778889,  0.06565516, ...,  0.00998482,
         -0.00385562,  0.01437663],
        [-0.01128512, -0.04814507,  0.0661445 , ...,  0.01005924,
         -0.00388436,  0.01448378],
        [ 0.0145317 , -0.01341508,  0.0171525 , ...,  0.01015014,
         -0.00391946,  0.01461466],
        [ 0.01323483, -0.01221786,  0.01562174, ..., -0.00440487,
         -0.02391352,  0.0544438 ]], dtype=float32))

In [32]:

# 10. Compute cosine similarity
# Rows index the reconstructed (PQ-decoded) vectors, columns index the
# original embeddings, so similarities[i, j] compares decoded doc i with text j.
similarities = cosine_similarity(X=reconstructed_embeddings, Y=embeddings)


In [33]:
# 11. Print all decoded IDs and their closest texts
# For each reconstructed vector, report the best-matching original text
# followed by its similarity to every text in the corpus.
print("\nDecoded IDs and their closest texts:")
for doc_number, scores in enumerate(similarities, start=1):
    best_match = texts[scores.argmax()]  # text whose embedding is most similar
    print(f"Decoded text for doc {doc_number}: {best_match}")
    print(f"Similarity scores for doc {doc_number}:")
    for text_number, score in enumerate(scores, start=1):
        print(f"  Text {text_number}: {texts[text_number - 1]} (Similarity: {score:.4f})")


Decoded IDs and their closest texts:
Decoded text for doc 1: Artificial intelligence is transforming industries.
Similarity scores for doc 1:
  Text 1: Artificial intelligence is transforming industries. (Similarity: 0.8896)
  Text 2: Machine learning helps computers learn from data. (Similarity: 0.7894)
  Text 3: Quantum computing is a new paradigm. (Similarity: 0.7352)
  Text 4: The future of AI includes ethical challenges. (Similarity: 0.7704)
Decoded text for doc 2: Machine learning helps computers learn from data.
Similarity scores for doc 2:
  Text 1: Artificial intelligence is transforming industries. (Similarity: 0.8200)
  Text 2: Machine learning helps computers learn from data. (Similarity: 0.8717)
  Text 3: Quantum computing is a new paradigm. (Similarity: 0.8220)
  Text 4: The future of AI includes ethical challenges. (Similarity: 0.6684)
Decoded text for doc 3: Quantum computing is a new paradigm.
Similarity scores for doc 3:
  Text 1: Artificial intelligence is transform