# Minimal working example using BERT (`DistilBERT` model)

In [1]:
pip install pandas torch transformers faiss-cpu

Note: you may need to restart the kernel to use updated packages.


## Setup

In [12]:
import pandas as pd
import numpy as np
import torch # primarily deals with neural networks - ability to do autodifferentiation
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel # using pre-trained models
import faiss

## Pseudo data creation

In [13]:
# ----------------------------
# 1) Sample pseudo "data"
# ----------------------------
# Item catalogue: one row per item, with the free-text fields used for embedding.
items = pd.DataFrame(
    [
        ("i1", "Noise Cancelling Headphones",
         "Wireless noise-cancelling headphones with 30-hour battery life", "electronics"),
        ("i2", "Mechanical Keyboard",
         "Mechanical keyboard with RGB and hot-swappable switches", "electronics"),
        ("i3", "Running Shoes",
         "Running shoes designed for long distance comfort and stability", "sports"),
        ("i4", "Vegetarian Cookbook",
         "Cookbook featuring quick vegetarian recipes for busy weeknights", "books"),
        ("i5", "Fitness Smartwatch",
         "Smartwatch with heart rate monitoring, GPS, and sleep tracking", "electronics"),
    ],
    columns=["item_id", "title", "description", "category"],
)

# Interaction log: which user touched which item, and when.
# The timestamp strings are parsed into real datetimes so they sort chronologically.
events = pd.DataFrame(
    [
        ("u1", "i1", "2026-01-10"),
        ("u1", "i5", "2026-01-11"),
        ("u2", "i2", "2026-01-12"),
        ("u3", "i3", "2026-01-13"),
        ("u3", "i4", "2026-01-14"),
    ],
    columns=["user_id", "item_id", "ts"],
).assign(ts=lambda frame: pd.to_datetime(frame["ts"]))


## BERT encoder (What makes BERT work?)

In [4]:
# DistilBERT checkpoint used for illustration (66M model).
MODEL_NAME = "distilbert-base-uncased"

# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Load the tokenizer and the pre-trained weights for the checkpoint,
# then move the encoder onto the selected device.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
encoder = AutoModel.from_pretrained(MODEL_NAME)
encoder = encoder.to(DEVICE)


In [14]:
# Switch the model to evaluation (inference) mode -- this does not run an
# evaluation. In PyTorch, eval() changes the behaviour of training-only layers
# such as the Dropout modules visible in the repr echoed below.
encoder.eval()

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): DistilBertSdpaAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

**Evaluate the BERT encoder**

In [15]:
# ----------------------------
# 2) BERT encode helper
# ----------------------------

# turn off gradient calculation (no back propagation in using BERT)
@torch.no_grad()
def bert_embed(texts, max_len=128):
    batch = tokenizer(
        texts, padding=True, truncation=True, max_length=max_len, return_tensors="pt"
    )
    batch = {k: v.to(DEVICE) for k, v in batch.items()}
    out = encoder(**batch)
    cls = out.last_hidden_state[:, 0]          # first column [CLS]-like token for classification
    emb = F.normalize(cls, dim=-1)             # normalization
    return emb.cpu().numpy().astype("float32") # size (B, 768)


### Notes

- We start with an initial catalog (table) of items.
- Events: records of interactions between users and items (e.g. clicking on, visiting, saving).
- BERT takes a sequence of tokens and transforms each token into a vector of numbers.
- Each token is represented by a 768-dimensional vector.
- Most of the time we don't know the user history, but we do know the items. In this case, the item embeddings should be computed offline; they shouldn't need any kind of real-time action.
- What do we need to give recommendations? What the users have done (their interaction history).

## Embedding items into vectors (for comparisons)

In [19]:
# ----------------------------
# 3) Offline job: item embeddings
# ----------------------------

# Text that represents each item: "<title>. <description>".
items["text"] = items["title"].str.cat(items["description"], sep=". ")

# Keep the ids in the same order as the embedding rows, so that a row
# position returned by the index can be mapped back to its item id.
item_id_list = items["item_id"].to_list()
item_vecs = bert_embed(items["text"].to_list())

# Flat inner-product index: an exact (brute-force) search rather than an
# approximate one. Because the vectors are L2-normalised, the inner product
# equals cosine similarity.
index = faiss.IndexFlatIP(item_vecs.shape[-1])
index.add(item_vecs)

# This part typically changes far less often than the events: the set of items stays the same,
# while the events can evolve quickly over time.
# Open question: how often should we update the embeddings?


In [21]:
# Only the last expression of a cell is displayed, so show just the shape:
# one row per item, one column per embedding dimension.
item_vecs.shape
# 5 embeddings with 768 numbers

(5, 768)

## What user-specific data are there?

In [23]:
# ----------------------------
# 4) Feature builder: user text from last N clicks
# ----------------------------
def build_user_text(user_id, events, items, N=3):
    """Summarise a user's most recent interactions as a single text string.

    Parameters
    ----------
    user_id
        Identifier matched against ``events["user_id"]``.
    events : pandas.DataFrame
        Interaction log with ``user_id``, ``item_id`` and ``ts`` columns.
    items : pandas.DataFrame
        Item table with ``item_id`` and ``text`` columns.
    N : int
        Number of most recent events (ordered by ``ts``) to keep.

    Returns
    -------
    tuple of (str, set)
        The texts of the last ``N`` items joined by single spaces (oldest
        first), and the set of those item ids. Note the set covers only the
        last ``N`` events, not the user's full history. A user without any
        events yields ``("no history", set())``.

    Raises
    ------
    KeyError
        If an item id from the user's history is missing from ``items``.
    """
    recent_ids = (
        events.loc[events["user_id"] == user_id]
        .sort_values("ts")
        .tail(N)["item_id"]
        .tolist()
    )
    if not recent_ids:
        # Return the same (text, seen) shape as the normal path so callers
        # that unpack two values do not fail for users without history.
        return "no history", set()
    texts = items.set_index("item_id").loc[recent_ids, "text"].tolist()
    return " ".join(texts), set(recent_ids)


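# Can we just use the last 3 items the user has seen as our data?
# Why not the entire user history? Responses would get slower, and bert_embed
# truncates its input to max_len tokens anyway, so a very long history would be cut off.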


## How to recommend "similar" item with BERT?

In [24]:
# ----------------------------
# 5) "What to recommend" function
# ----------------------------
def recommend(user_id, k=3):
    """Return up to ``k`` item ids most similar to the user's recent history.

    The text built by ``build_user_text`` for the user's last 3 events is
    embedded with ``bert_embed`` and searched against the module-level FAISS
    ``index``. Items reported as already seen are filtered out.

    Parameters
    ----------
    user_id
        User to recommend for.
    k : int
        Maximum number of recommendations to return.

    Returns
    -------
    list
        Item ids in the order returned by the index search. The list may hold
        fewer than ``k`` ids when the index does not contain ``k`` unseen
        items, and is empty when ``k <= 0``.
    """
    if k <= 0:
        return []

    user_text, seen = build_user_text(user_id, events, items, N=3)
    query = bert_embed([user_text])  # (1, 768)

    # Over-fetch by the number of seen items so that k unseen ones can survive
    # the filter, but never ask for more neighbours than the index holds:
    # FAISS pads missing results with the label -1, and item_id_list[-1] would
    # silently resolve to the last catalogue item.
    top_k = min(k + len(seen), index.ntotal)
    if top_k <= 0:
        return []

    _, idx = index.search(query, top_k)
    recs = []
    for j in idx[0]:
        if j < 0:  # FAISS padding label, not a real row
            continue
        iid = item_id_list[j]
        if iid not in seen:
            recs.append(iid)
            if len(recs) == k:
                break
    return recs


In [25]:

# Print the top-3 recommendations for each user in the pseudo event log.
for user_id in ("u1", "u2", "u3"):
    top_items = recommend(user_id, k=3)
    print(user_id, "->", top_items)


u1 -> ['i2', 'i3', 'i4']
u2 -> ['i1', 'i3', 'i5']
u3 -> ['i5', 'i1', 'i2']
