# Item–Query Embedding for Search Recommendation

In this e-commerce project, we focus on **embedding-based search recommendation**.  
Queries and items are encoded with a **two-tower model** (implemented below as a Siamese network, i.e. both towers share the same weights), where item embeddings are pre-computed and stored.  
At runtime, the query embedding is compared against item embeddings using **ANN (Approximate Nearest Neighbor) search** to efficiently retrieve the most relevant products.

In [4]:
!pip install tensorflow-recommenders



In [5]:
# Set local data path: directory prefix (with trailing slash) that is
# concatenated with file names when saving and loading the trained model below
rootpath = "../Data/"

In [6]:
!pip install "dask[complete]"



In [7]:
import dask.dataframe as dd
import pandas as pd

In [8]:
# Read train.csv lazily with Dask. The path is built from `rootpath` so the
# data directory is configured in one place, matching the model save/load cells.
train = dd.read_csv(rootpath + "train.csv", encoding="ISO-8859-1")
train.head()

Unnamed: 0,id,product_uid,product_title,search_term,relevance
0,2,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.0
1,3,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.5
2,9,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,3.0
3,16,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,rain shower head,2.33
4,17,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,shower only faucet,2.67


In [9]:
# Tally how often each relevance score occurs (still a lazy Dask expression)
values = train.relevance.value_counts()

# Dask defers the work until asked, so materialize the counts here
values.compute()

relevance
1.00     2105
1.25        4
1.33     3006
1.50        5
1.67     6780
1.75        9
2.00    11730
2.25       11
2.33    16060
2.50       19
2.67    15202
2.75       11
3.00    19125
Name: count, dtype: int64

In [10]:
# Treat relevance scores of 2.0 and above as positive query-product matches
positive_only_train = train[train["relevance"] >= 2.0]

# Retain just the product/query fields by removing the id and relevance columns
dataset = positive_only_train.drop(columns=["id", "relevance"])

## Features
- **title** (Siamese network input)  
- **id**  
- **description**  

## Preprocessing
- **TextVectorization** (whitespace tokenization)  
- **N-grams**  
- **Other tokenizers**  

## Models
- **Bidirectional RNNs**  
- **Transformers**

In [11]:
import tensorflow as tf
import tensorflow_recommenders as tfrs
# Report the TensorFlow and TensorFlow Recommenders versions, one per line
print(tf.__version__, tfrs.__version__, sep="\n")

2.15.0
v0.7.3


In [12]:
# Materialize the lazy Dask frame, then slice it column-wise so that every
# dataset element is a dict keyed by column name.
ds = tf.data.Dataset.from_tensor_slices(
    dataset.compute().to_dict(orient="series")
)

2025-08-29 03:14:08.252239: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3
2025-08-29 03:14:08.252257: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-08-29 03:14:08.252261: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2025-08-29 03:14:08.252646: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-08-29 03:14:08.252992: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [13]:
# Project the two text fields out of each row ...
product_title = ds.map(lambda row: row["product_title"])
query = ds.map(lambda row: row["search_term"])
# ... and chain them (titles first, then queries) into a single text corpus
text_ds = product_title.concatenate(query)

In [14]:
# Peek at the first two rows, shown together as one batch of NumPy arrays
for first_pair in ds.batch(2).take(1).as_numpy_iterator():
    print(first_pair)

{'product_uid': array([100001, 100001]), 'product_title': array([b'Simpson Strong-Tie 12-Gauge Angle',
       b'Simpson Strong-Tie 12-Gauge Angle'], dtype=object), 'search_term': array([b'angle bracket', b'l bracket'], dtype=object)}


---
- Tokenization: split each text on whitespace (the `TextVectorization` default)

In [15]:
# Text -> fixed-length integer token ids.
vectorization = tf.keras.layers.TextVectorization(
    output_mode='int',
    output_sequence_length=15,  # every text is padded or cut to 15 ids
    max_tokens=1000,  # deliberately tiny vocabulary cap
)

In [16]:
# Build the vocabulary from the combined title + query corpus, 256 texts at a time
vectorization.adapt(text_ds.batch(256))

2025-08-29 03:14:08.441110: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


In [17]:
# Lookup table from token id (position in the vocabulary list) to token string
vocabs = dict(enumerate(vectorization.get_vocabulary()))

In [18]:
import numpy as np

In [19]:
# Sanity-check tokenization on the first row: show the raw title, then for the
# title and the search term print the token ids followed by the tokens they
# decode to via `vocabs`.
for example in ds.take(1).batch(1).as_numpy_iterator():
    # Raw product title as stored in the dataset
    print(example['product_title'])

    for field in ('product_title', 'search_term'):
        # Text -> integer token ids, with the batch dimension squeezed away
        vector = np.squeeze(vectorization(example[field]))
        print(vector)

        # Ids -> vocabulary strings, for readability/debugging
        print([vocabs[token_id] for token_id in vector])

[b'Simpson Strong-Tie 12-Gauge Angle']
[441 608   1 379   0   0   0   0   0   0   0   0   0   0   0]
['simpson', 'strongtie', '[UNK]', 'angle', '', '', '', '', '', '', '', '', '', '', '']
[379 301   0   0   0   0   0   0   0   0   0   0   0   0   0]
['angle', 'bracket', '', '', '', '', '', '', '', '', '', '', '', '', '']


In [20]:
# One text tower, reused for both the query and the product-title input
text_model = tf.keras.Sequential()
# raw strings -> [batch, 15] token ids
text_model.add(vectorization)
# [batch, 15] -> [batch, 15, 8]
# NOTE(review): padding id 0 is not masked here (no mask_zero) - confirm intended
text_model.add(tf.keras.layers.Embedding(1001, 8))
# forward + backward LSTM states concatenated -> [batch, 16]
text_model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(8)))

In [21]:
# Smoke-test the tower: embed the product titles of the first two rows
# (one batch of size 2) and print the resulting tensor
for batch_of_two in ds.batch(2).take(1).as_numpy_iterator():
    print(text_model(batch_of_two['product_title']))

tf.Tensor(
[[ 0.01280924 -0.00461081 -0.00430214 -0.01854529  0.04298591  0.00845828
  -0.00602569 -0.00293244  0.00237849  0.01971872 -0.02133274 -0.00399088
   0.00746611  0.01153365  0.00877599  0.00580332]
 [ 0.01280924 -0.00461081 -0.00430214 -0.01854529  0.04298591  0.00845828
  -0.00602569 -0.00293244  0.00237849  0.01971872 -0.02133274 -0.00399088
   0.00746611  0.01153365  0.00877599  0.00580332]], shape=(2, 16), dtype=float32)


In [22]:
# Define a Siamese Network using TensorFlow Recommenders
class SiameseNetwork(tfrs.Model):
    """Retrieval model that embeds queries and product titles with one shared tower.

    Args:
        model: Keras model mapping a batch of raw strings to embeddings. The
            same instance encodes both `search_term` and `product_title`.
        **kwargs: Forwarded unchanged to `tfrs.Model.__init__`.
    """

    def __init__(self, model, **kwargs):
        super().__init__(**kwargs)
        self.model = model
        # Retrieval task with Top-K metrics. The candidate set is the
        # module-level `product_title` dataset, embedded with the same tower
        # in batches of 128.
        self.task = tfrs.tasks.Retrieval(
            metrics=[tfrs.metrics.FactorizedTopK(
                product_title.batch(128).map(model))]
        )

    def compute_loss(self, features, training=False):
        """Return the retrieval loss for one batch.

        Args:
            features: Mapping with `search_term` and `product_title` entries,
                each a batch of raw strings.
            training: True when called from the training step. The Top-K
                metrics are skipped in that case and only computed during
                evaluation.

        Returns:
            The loss produced by the `tfrs.tasks.Retrieval` task.
        """
        # Encode search_term into query embeddings
        query_embedding = self.model(features['search_term'])
        # Encode product_title into item embeddings
        title_embedding = self.model(features['product_title'])

        # FactorizedTopK scores queries against the whole candidate dataset,
        # which is far too costly to repeat on every training step; restrict
        # it to evaluation, as the TFRS retrieval tutorials do.
        return self.task(
            query_embedding,
            title_embedding,
            compute_metrics=not training,
        )

In [23]:
# Wrap the shared text tower in the retrieval model
siamese = SiameseNetwork(model=text_model)

# Use the Adam optimizer, selected by its Keras string identifier
siamese.compile(optimizer='adam')

In [24]:
# Prepare the training dataset
ds_train = (
    ds.map(lambda x: {
        "search_term": x['search_term'],       # input query
        "product_title": x['product_title'],   # target product
    })
    # Cache the raw (query, title) pairs BEFORE shuffling/batching: caching
    # afterwards would freeze the first epoch's order and batch composition
    # and replay it unchanged on every later epoch.
    .cache()
    # Buffer larger than the dataset (the relevance counts above sum to ~62k
    # positive rows) gives a full shuffle. The source rows are grouped by
    # product, so a small buffer would leave near-identical titles in the
    # same batch.
    .shuffle(100_000)
    .batch(256)      # batch into groups of 256
)

In [25]:
# Single pass over the training batches; the cell displays the returned History
siamese.fit(x=ds_train, epochs=1)



<keras.src.callbacks.History at 0x30bea4d90>

In [26]:
# Persist only the shared text tower (not the tfrs wrapper) under the data directory
siamese.model.save(f"{rootpath}siamese_white_space")

INFO:tensorflow:Assets written to: ../Data/siamese_white_space/assets


INFO:tensorflow:Assets written to: ../Data/siamese_white_space/assets


In [27]:
# Catalogue of distinct (product_uid, product_title) pairs, taken from ALL rows
# of `train` (not just the positive ones) and materialized as a pandas frame
items = (
    train[["product_uid", "product_title"]]
    .drop_duplicates()
    .compute()
)

In [28]:
# Column-wise slices of the catalogue: one dict per item, keyed by column name
items_ds = tf.data.Dataset.from_tensor_slices(
    {column: items[column] for column in items}
)

In [29]:
# Embed the title of every row in `items_ds` with the trained tower,
# 256 titles per prediction step
embeddings = siamese.model.predict(
    items_ds.map(lambda item: item["product_title"]).batch(256)
)



In [30]:
# Write the item embeddings to disk in NumPy's binary .npy format.
# Row i of the array was computed from row i of `items`.
np.save("../Data/embedding.npy", embeddings)

In [31]:
# Reload the saved text tower from disk to check that it round-trips
new_model = tf.keras.models.load_model(f"{rootpath}siamese_white_space")





In [32]:
# Embed a single ad-hoc query string with the reloaded tower
new_model.predict(x=np.array(['desk']))



array([[-0.07317442, -0.21368533, -0.39363   , -0.06072724, -0.7076145 ,
         0.48943055,  0.5004328 ,  0.23205033, -0.16328764,  0.17384724,
        -0.46296415, -0.22964033, -0.20018947,  0.260493  , -0.13107312,
        -0.3654328 ]], dtype=float32)