In [2]:
# This notebook is for Item–Query Embedding Retrieval practice 
# (embedding was trained in the previous notebook, this one focuses on retrieval)

!pip install tensorflow-metadata==1.13.0
!pip install tensorflow-recommenders==0.7.3
!pip install annoy hnswlib

Collecting annoy
  Downloading annoy-1.17.3.tar.gz (647 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m647.5/647.5 kB[0m [31m25.2 MB/s[0m  [33m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting hnswlib
  Downloading hnswlib-0.8.0.tar.gz (36 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: annoy, hnswlib
[33m  DEPRECATION: Building 'annoy' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'annoy'. Discussion can be found at https://github.com/pypa/pip/issues/6334[0m[33m
[0m  Build

In [3]:
# Set local data path
rootpath = "../Data/"

In [4]:
import pandas as pd

In [5]:
train = pd.read_csv("../Data/train.csv", encoding="ISO-8859-1")

In [6]:
items = train[['product_uid', 'product_title']].drop_duplicates()

In [8]:
items

Unnamed: 0,product_uid,product_title
0,100001,Simpson Strong-Tie 12-Gauge Angle
2,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...
3,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...
5,100006,Whirlpool 1.9 cu. ft. Over the Range Convectio...
8,100007,Lithonia Lighting Quantum 2-Light Black LED Em...
...,...,...
74062,206638,Atlantic Windowpane 576 CD or 192 DVD Blu-Ray ...
74063,206639,Philips 40-Watt Halogen R20 Flood Light Bulb (...
74064,206641,Schlage Camelot In-Active Aged Bronze Handlese...
74065,206648,Plastec 11 in. x 24 in. Rose Garden Wall Decor...


In [14]:
import numpy as np

In [16]:
# Load pre-computed item embeddings from local Data folder
with open("../Data/embedding.npy", "rb") as f:
    embeddings = np.load(f)

In [17]:
embeddings.shape

(54682, 16)

In [18]:
import tensorflow as tf

In [19]:
import tensorflow_recommenders as tfrs

In [20]:
embedding_ds = tf.data.Dataset.from_tensor_slices(embeddings)

2025-08-29 03:19:45.388320: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3
2025-08-29 03:19:45.388364: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-08-29 03:19:45.388389: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2025-08-29 03:19:45.388546: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-08-29 03:19:45.388579: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [21]:
for i in embedding_ds.take(2).batch(1).as_numpy_iterator():
    print(i)

[[-0.38924745 -0.39065138 -0.56249106  0.22164929 -0.58737195  0.57823646
  -0.04811786 -0.12114553  0.10812668  0.39210144 -0.6589462  -0.48557952
  -0.00273136  0.31075844 -0.03955091 -0.12801108]]
[[-0.4242165  -0.40162492 -0.5722065   0.22789298 -0.5901483   0.591919
  -0.03703084 -0.10564349 -0.802482   -0.6809432   0.4003952   0.65503156
  -0.7878822  -0.45608845 -0.66498524 -0.8190717 ]]


In [22]:
# Create a tf.data.Dataset from the items dictionary and extract only product_uid
uid_ds = tf.data.Dataset.from_tensor_slices(dict(items)) \
            .map(lambda x: x["product_uid"])

In [24]:
new_model = tf.keras.models.load_model("../Data/siamese_white_space")

# Compile manually if you want to continue training
new_model.compile(optimizer="adam")



In [26]:
from annoy import AnnoyIndex

# Build an Annoy retrieval index using the trained embeddings
dim = embeddings.shape[1]
ann_index = AnnoyIndex(dim, 'angular')  # 'angular' ≈ cosine similarity

# Index the item embeddings with their corresponding indices
for i, vec in enumerate(embeddings):
    ann_index.add_item(i, vec)

# Build the index (10 trees, can increase for better recall)
ann_index.build(10)

print("Annoy index built successfully.")

Annoy index built successfully.


In [28]:
# Encode query into embedding (pass string tensor directly)
query_vec = new_model(tf.constant(["table"]))

# Retrieve top-5 nearest items from Annoy index
idxs, dists = ann_index.get_nns_by_vector(query_vec[0].numpy(), 5, include_distances=True)

print("Retrieved indices:", idxs)
print("Distances:", dists)

Retrieved indices: [21896, 34258, 13332, 26254, 6044]
Distances: [0.06581147015094757, 0.06581147015094757, 0.08025649935007095, 0.09286623448133469, 0.09926249086856842]


In [30]:
def annoy_search(query_text, k=10):
    # Encode query into embedding
    query_vec = new_model(tf.constant([query_text]))[0].numpy()
    
    # Search top-k nearest items
    idxs, dists = ann_index.get_nns_by_vector(query_vec, k, include_distances=True)
    
    # Convert distances to similarity-like scores
    scores = [1 - d for d in dists]  
    
    # Wrap in Tensors to mimic ScaNN output format
    return (
        tf.constant([scores], dtype=tf.float32),
        tf.constant([ [items["product_uid"][i] for i in idxs] ], dtype=tf.int64)
    )

# Example usage
scores, ids = annoy_search("desk", k=10)
print(scores)
print(ids)

tf.Tensor(
[[1.         1.         0.9412531  0.9342265  0.9310173  0.92739373
  0.92525333 0.9237086  0.92249984 0.9169545 ]], shape=(1, 10), dtype=float32)
tf.Tensor([[103139 116311 126536 107753 112135 147145 103352 150437 142134 105487]], shape=(1, 10), dtype=int64)


In [31]:
# Save Annoy index to file
ann_index.save("../Data/retrieval.ann")
print("Annoy index saved to ../Data/retrieval.ann")

Annoy index saved to ../Data/retrieval.ann
