In [1]:
import logging
import sys

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler("pipeline.log"),
        logging.StreamHandler(sys.stdout),  # still captured in notebook
    ],
)
log = logging.getLogger(__name__)

log.info("Starting US ESCI pipeline...")

2025-11-24 04:57:44,901 [INFO] Starting US ESCI pipeline...


In [2]:
import platform, psutil, shutil, torch

print("=== SYSTEM INFO ===")
print("Machine:", platform.machine())
print("Processor:", platform.processor())
print("Python:", platform.python_version())

# CPU
print("\n=== CPU ===")
print("Cores:", psutil.cpu_count())
print("CPU Usage:", psutil.cpu_percent(), "%")

# RAM
print("\n=== RAM ===")
ram = psutil.virtual_memory()
print("Total:", round(ram.total/1e9, 2), "GB")
print("Used:", ram.percent, "%")

# Disk
print("\n=== DISK ===")
disk = shutil.disk_usage("/")
print("Total:", round(disk.total/1e9, 2), "GB")
print("Used:", round(disk.used/1e9, 2), "GB")

# GPU
print("\n=== GPU ===")
print("CUDA Available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))


=== SYSTEM INFO ===
Machine: x86_64
Processor: x86_64
Python: 3.11.11

=== CPU ===
Cores: 64
CPU Usage: 1.0 %

=== RAM ===
Total: 270.29 GB
Used: 4.0 %

=== DISK ===
Total: 75.13 GB
Used: 65.09 GB

=== GPU ===
CUDA Available: True
GPU: NVIDIA A100 80GB PCIe


# 0. Imports, Install (if needed) & Config


 If you don't need installs, you can comment this cell out.
 Make sure you're using the same Python env as your previous notebook.

 !pip install pandas pyarrow numpy fastparquet scikit-learn rank-bm25 lightgbm datasets transformers sentencepiece einops

In [3]:
import sys

print("Python executable:", sys.executable)
sys.path = [p for p in sys.path if "python3.8" not in p]

print("\nFiltered sys.path (no python3.8 entries):")
for p in sys.path:
    print(" ", p)


Python executable: /opt/modules/devel/python/3.11.11/bin/python

Filtered sys.path (no python3.8 entries):
  
  /mnt/ceph/bibi8250/E-commerce
  /opt/modules/devel/python/3.11.11/lib/python311.zip
  /opt/modules/devel/python/3.11.11/lib/python3.11
  /opt/modules/devel/python/3.11.11/lib/python3.11/lib-dynload
  /mnt/ceph/bibi8250/.local/lib/python3.11/site-packages
  /opt/modules/devel/python/3.11.11/lib/python3.11/site-packages


In [4]:
import sys
print("Python executable:", sys.executable)

import numpy as np
import pandas as pd
import sklearn
from rank_bm25 import BM25Okapi
import lightgbm as lgb
import torch
import pyarrow
import datasets
import transformers
import huggingface_hub
import tokenizers

print("\nALL CODE-CRITICAL IMPORTS LOADED OK\n")

print("numpy:", np.__version__)
print("pandas:", pd.__version__)
print("sklearn:", sklearn.__version__)
print("rank_bm25:", BM25Okapi)
print("lightgbm:", lgb.__version__)
print("torch:", torch.__version__)
print("pyarrow:", pyarrow.__version__)
print("datasets:", datasets.__version__)
print("transformers:", transformers.__version__)
print("huggingface_hub:", huggingface_hub.__version__)
print("tokenizers:", tokenizers.__version__)


Python executable: /opt/modules/devel/python/3.11.11/bin/python
2025-11-24 04:58:10,868 [INFO] TensorFlow version 2.20.0 available.

ALL CODE-CRITICAL IMPORTS LOADED OK

numpy: 1.26.4
pandas: 2.3.3
sklearn: 1.7.2
rank_bm25: <class 'rank_bm25.BM25Okapi'>
lightgbm: 4.6.0
torch: 2.5.1+cu121
pyarrow: 22.0.0
datasets: 4.4.1
transformers: 4.57.1
huggingface_hub: 0.36.0
tokenizers: 0.22.1


In [5]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import roc_auc_score

from rank_bm25 import BM25Okapi
import lightgbm as lgb

import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModel,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)

# ---- CONFIG ----
DATA_ROOT    = Path("esci_pipeline/data")
PATH_TRAIN   = DATA_ROOT / "esci_train.parquet"
PATH_TEST    = DATA_ROOT / "esci_test.parquet"

LOCALE       = "us"  # change later for "es", "jp", etc.
RANDOM_STATE = 42

# Models
GTE_MODEL_NAME  = "Alibaba-NLP/gte-multilingual-base"
XENC_MODEL_NAME = "distilbert-base-multilingual-cased"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print("Using device:", DEVICE)


# ---- PERSISTENCE CONFIG ----
ARTIFACT_DIR = Path("esci_pipeline/artifacts_us")
ARTIFACT_DIR.mkdir(parents=True, exist_ok=True)

PATH_TRAIN_PREP = ARTIFACT_DIR / "train_df_prepared.parquet"
PATH_VALID_PREP = ARTIFACT_DIR / "valid_df_prepared.parquet"
PATH_PROD_EMBS  = ARTIFACT_DIR / "prod_embs.npy"
PATH_PROD_IDS   = ARTIFACT_DIR / "product_ids.npy"
PATH_BM25_PKL   = ARTIFACT_DIR / "bm25_corpus.pkl"
PATH_LGB_MODEL  = ARTIFACT_DIR / "ltr_model_us.txt"

2025-11-24 04:58:40.065383: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Using device: cuda


# 1. Load & Basic Cleaning

In [6]:
# 1.1 Load ESCI train/test
esci_train = pd.read_parquet(PATH_TRAIN)
esci_test  = pd.read_parquet(PATH_TEST)

print("Raw shapes:", esci_train.shape, esci_test.shape)
print("Locales train:\n", esci_train["product_locale"].value_counts())

# 1.2 Filter to locale (US for now)
esci_train = esci_train[esci_train["product_locale"] == LOCALE].copy()
esci_test  = esci_test[esci_test["product_locale"] == LOCALE].copy()

# 1.3 Drop rows with missing query/title
esci_train = esci_train.dropna(subset=["query", "product_title"])
esci_test  = esci_test.dropna(subset=["query", "product_title"])

# 1.4 Fill remaining text NaNs
text_cols = [
    "product_description",
    "product_bullet_point",
    "product_brand",
    "product_color",
    "product_text",
]

for col in text_cols:
    esci_train[col] = esci_train[col].fillna("")
    esci_test[col]  = esci_test[col].fillna("")

print("Cleaned shapes:", esci_train.shape, esci_test.shape)

# 1.5 Map ESCI labels to numeric relevance 0–3
label2rel = {
    "Irrelevant": 0,
    "Complement": 1,
    "Substitute": 2,
    "Exact": 3,
}

esci_train["relevance"] = esci_train["esci_label"].map(label2rel).astype("int8")
esci_test["relevance"]  = esci_test["esci_label"].map(label2rel).astype("int8")

print("Relevance distribution (train):")
print(esci_train["relevance"].value_counts().sort_index())


Raw shapes: (2027874, 14) (652490, 14)
Locales train:
 product_locale
us    1420372
jp     333112
es     274390
Name: count, dtype: int64
Cleaned shapes: (1420372, 14) (434234, 14)
Relevance distribution (train):
relevance
0    122273
1     29713
2    280324
3    988062
Name: count, dtype: int64


In [7]:
# ---- Balanced 300k subset of US ESCI (by relevance) ----
DESIRED_TOTAL = 300_000
LABEL_COL = "relevance"

# Check class distribution
label_counts = esci_train[LABEL_COL].value_counts().sort_index()
print("Full US label counts:\n", label_counts)

num_classes = label_counts.shape[0]

# Ideal per-class target (for 300k total)
ideal_per_class = DESIRED_TOTAL // num_classes
print("Ideal per-class target:", ideal_per_class)

# But make sure we don't ask more than the smallest class has
min_available = label_counts.min()
n_per_class = min(ideal_per_class, min_available)

if n_per_class < ideal_per_class:
    print(
        f"⚠ Smallest class only has {min_available} examples; "
        f"using n_per_class={n_per_class} (total={n_per_class * num_classes}) "
        f"instead of full 10,000."
    )

print("Using n_per_class =", n_per_class)

# Sample a balanced subset
esci_train_balanced = (
    esci_train
    .groupby(LABEL_COL, group_keys=False)
    .apply(lambda g: g.sample(n=n_per_class, random_state=RANDOM_STATE))
    .reset_index(drop=True)
)

print("Balanced subset shape:", esci_train_balanced.shape)
print(esci_train_balanced[LABEL_COL].value_counts().sort_index())

# Optionally save for translation / reuse
BALANCED_PATH = DATA_ROOT / "esci_us_balanced_300k.parquet"
esci_train_balanced.to_parquet(BALANCED_PATH, index=False)
print("Saved balanced subset to:", BALANCED_PATH)

esci_train = esci_train_balanced


Full US label counts:
 relevance
0    122273
1     29713
2    280324
3    988062
Name: count, dtype: int64
Ideal per-class target: 75000
⚠ Smallest class only has 29713 examples; using n_per_class=29713 (total=118852) instead of full 10,000.
Using n_per_class = 29713


  .apply(lambda g: g.sample(n=n_per_class, random_state=RANDOM_STATE))


Balanced subset shape: (118852, 15)
relevance
0    29713
1    29713
2    29713
3    29713
Name: count, dtype: int64
Saved balanced subset to: esci_pipeline/data/esci_us_balanced_300k.parquet


In [8]:
# ---- Balanced 10k subset of US ESCI TEST (by relevance) ----
DESIRED_TEST = 60_000
LABEL_COL_TEST = "relevance"

# Check test distribution
label_counts_test = esci_test[LABEL_COL_TEST].value_counts().sort_index()
print("Full US TEST label counts:\n", label_counts_test)

num_classes_test = label_counts_test.shape[0]
ideal_per_class_test = DESIRED_TEST // num_classes_test
print("Ideal per-class target (test):", ideal_per_class_test)

min_available_test = label_counts_test.min()
n_per_class_test = min(ideal_per_class_test, min_available_test)

if n_per_class_test < ideal_per_class_test:
    print(
        f"⚠ Smallest class only has {min_available_test} examples; "
        f"using n_per_class_test={n_per_class_test} "
        f"(total={n_per_class_test * num_classes_test}) instead of {DESIRED_TEST}."
    )

print("Using n_per_class_test =", n_per_class_test)

# Create balanced test dataset
esci_test_balanced = (
    esci_test
    .groupby(LABEL_COL_TEST, group_keys=False)
    .apply(lambda g: g.sample(n=n_per_class_test, random_state=RANDOM_STATE))
    .reset_index(drop=True)
)

print("Balanced TEST subset shape:", esci_test_balanced.shape)
print(esci_test_balanced[LABEL_COL_TEST].value_counts().sort_index())

# Save the FINAL 10k parquest (no translation)
test_path = DATA_ROOT / "us_test_balanced_60k.parquet"
esci_test_balanced.to_parquet(test_path, index=False)

print("Saved balanced 10k US TEST dataset to:", test_path)

esci_test = pd.read_parquet(DATA_ROOT / "us_test_balanced_60k.parquet")


Full US TEST label counts:
 relevance
0     43505
1     11147
2     97163
3    282419
Name: count, dtype: int64
Ideal per-class target (test): 15000
⚠ Smallest class only has 11147 examples; using n_per_class_test=11147 (total=44588) instead of 60000.
Using n_per_class_test = 11147
Balanced TEST subset shape: (44588, 15)
relevance
0    11147
1    11147
2    11147
3    11147
Name: count, dtype: int64


  .apply(lambda g: g.sample(n=n_per_class_test, random_state=RANDOM_STATE))


Saved balanced 10k US TEST dataset to: esci_pipeline/data/us_test_balanced_60k.parquet


# 2. Unified Text + Explicit Context Features

In [9]:
# 2.1 Unified product_text_clean
def build_product_text_clean(df: pd.DataFrame) -> pd.Series:
    parts = [
        df["product_title"],
        df["product_description"],
        df["product_bullet_point"],
        df["product_brand"],
        df["product_color"],
    ]
    return (
        parts[0].fillna("")
        + " [SEP] " + parts[1].fillna("")
        + " [SEP] " + parts[2].fillna("")
        + " [SEP] " + parts[3].fillna("")
        + " [SEP] " + parts[4].fillna("")
    )

esci_train["product_text_clean"] = build_product_text_clean(esci_train)
esci_test["product_text_clean"]  = build_product_text_clean(esci_test)

# 2.2 Explicit "context" features from metadata
def add_context_features(df: pd.DataFrame) -> pd.DataFrame:
    # --- Basic lengths ---
    df["ctx_title_len"]  = df["product_title"].fillna("").str.split().str.len().astype("float32")
    df["ctx_desc_len"]   = df["product_description"].fillna("").str.split().str.len().astype("float32")
    df["ctx_bullet_len"] = df["product_bullet_point"].fillna("").str.split().str.len().astype("float32")

    # Precompute lowercased strings
    title_lower = df["product_title"].fillna("").str.lower()
    brand_lower = df["product_brand"].fillna("").str.lower()
    color_lower = df["product_color"].fillna("").str.lower()

    # --- Brand in title (row-wise substring check) ---
    df["ctx_brand_in_title"] = [
        int((b != "") and (b in t))
        for b, t in zip(brand_lower, title_lower)
    ]

    # --- Color in title (row-wise substring check) ---
    df["ctx_color_in_title"] = [
        int((c != "") and (c in t))
        for c, t in zip(color_lower, title_lower)
    ]

    # Cast to int8 for compactness
    df["ctx_brand_in_title"] = df["ctx_brand_in_title"].astype("int8")
    df["ctx_color_in_title"] = df["ctx_color_in_title"].astype("int8")

    return df

esci_train = add_context_features(esci_train)
esci_test  = add_context_features(esci_test)

# 2.3 Popularity / behaviour proxies (within ESCI train)
prod_stats = (
    esci_train.groupby("product_id")["relevance"]
    .agg(prod_count="size", prod_mean_rel="mean", prod_max_rel="max")
    .reset_index()
)

query_stats = (
    esci_train.groupby("query_id")["relevance"]
    .agg(query_count="size", query_mean_rel="mean")
    .reset_index()
)

esci_train = esci_train.merge(prod_stats, on="product_id", how="left")
esci_train = esci_train.merge(query_stats, on="query_id", how="left")

# For test, merge stats computed from train only
esci_test = esci_test.merge(prod_stats, on="product_id", how="left")
esci_test = esci_test.merge(query_stats, on="query_id", how="left")

# Fill NaNs for unseen products/queries in test
pop_cols = ["prod_count", "prod_mean_rel", "prod_max_rel", "query_count", "query_mean_rel"]
for col in pop_cols:
    esci_train[col] = esci_train[col].fillna(0).astype("float32")
    esci_test[col]  = esci_test[col].fillna(0).astype("float32")

esci_train.head()


Unnamed: 0,example_id,query,query_id,product_id,product_locale,esci_label,small_version,large_version,product_title,product_description,...,ctx_title_len,ctx_desc_len,ctx_bullet_len,ctx_brand_in_title,ctx_color_in_title,prod_count,prod_mean_rel,prod_max_rel,query_count,query_mean_rel
0,193260,amazon.com/code activate,8677,B0066TUXU6,us,Irrelevant,1,1,Hulu,,...,1.0,0.0,122.0,1,0,1.0,0.0,0.0,4.0,0.75
1,264260,babygirl anklet,12307,B07663TBC4,us,Irrelevant,1,1,Goldenchen Fashion Jewelry 925 Silver Plated A...,Goldenchen is a fashion jewelry leader that eq...,...,13.0,102.0,70.0,1,1,1.0,0.0,0.0,4.0,0.5
2,1311389,max japan,66517,B06Y1CBVKT,us,Irrelevant,1,1,Max Vaimo HD-11FLK Flat Clinch Stapler with 3 ...,,...,12.0,0.0,39.0,0,0,1.0,0.0,0.0,5.0,0.0
3,1632982,pressure washer oil,83198,B004S67CUS,us,Irrelevant,0,1,SIMPSON Cleaning Pressure Washer Pump Guard,,...,6.0,0.0,33.0,1,0,1.0,0.0,0.0,2.0,0.0
4,300511,beach hats for women,14164,B072BJPTKJ,us,Irrelevant,0,1,CUPSHE Women's One Piece Swimsuit Halter Tummy...,<b>CUPSHE Intro</b><br> To inspire confidence ...,...,15.0,271.0,80.0,1,1,1.0,0.0,0.0,5.0,1.8


# 3. Train/Valid Split (Grouped by query_id)

In [10]:
gss = GroupShuffleSplit(n_splits=1, test_size=0.1, random_state=RANDOM_STATE)
train_idx, valid_idx = next(gss.split(esci_train, groups=esci_train["query_id"]))

train_df = esci_train.iloc[train_idx].reset_index(drop=True)
valid_df = esci_train.iloc[valid_idx].reset_index(drop=True)

print("Train / Valid shapes:", train_df.shape, valid_df.shape)


Train / Valid shapes: (107125, 26) (11727, 26)


# 4. BM25 Candidate Generation

In [11]:
def build_bm25_corpus(df: pd.DataFrame):
    # one doc per product_id
    prod_group = df.groupby("product_id")["product_text_clean"].first()
    product_ids = prod_group.index.to_list()
    corpus = [doc.split() for doc in prod_group.values]
    return product_ids, corpus

bm25_product_ids, bm25_corpus = build_bm25_corpus(esci_train)
bm25 = BM25Okapi(bm25_corpus)

# Map product_id -> index in BM25 corpus
prodid_to_idx = {pid: i for i, pid in enumerate(bm25_product_ids)}

def bm25_candidates_for_query(query_text: str, top_k: int = 100):
    tokenized = query_text.split()
    scores = bm25.get_scores(tokenized)
    top_idx = np.argsort(scores)[::-1][:top_k]
    return [(bm25_product_ids[i], float(scores[i])) for i in top_idx]

# Example sanity check
print(bm25_candidates_for_query("bathroom fan without light", top_k=5))


[('B001PO29TA', 22.126202648051027), ('B07YBRY2RL', 16.25763903390984), ('B07CR7449N', 15.901836445679972), ('B092MQ8F2V', 15.486902814225328), ('B08JR2C7JC', 15.286855978338346)]


In [12]:
# ---- SAVE BM25 corpus (optional but useful) ----
import pickle

with open(PATH_BM25_PKL, "wb") as f:
    pickle.dump((bm25_product_ids, bm25_corpus), f)

print("Saved BM25 corpus to:", PATH_BM25_PKL)

Saved BM25 corpus to: esci_pipeline/artifacts_us/bm25_corpus.pkl


# 5. Attach BM25 Scores & Ranks to train/valid

In [13]:
def attach_bm25_scores(df_queries: pd.DataFrame, top_k: int = 100) -> pd.DataFrame:
    df_queries = df_queries.copy()
    rows = []

    for qid, sub in df_queries.groupby("query_id"):
        q_text = sub["query"].iloc[0]
        cand = bm25_candidates_for_query(q_text, top_k=top_k)
        cand_map = {pid: score for pid, score in cand}
        for idx, row in sub.iterrows():
            pid = row["product_id"]
            bm25_score = cand_map.get(pid, 0.0)
            rows.append((idx, bm25_score))

    bm25_scores = pd.DataFrame(rows, columns=["row_idx", "bm25_score"]).set_index("row_idx")
    df_queries = df_queries.join(bm25_scores, how="left")
    df_queries["bm25_score"] = df_queries["bm25_score"].fillna(0.0)

    # Rank within each query (higher score = better rank)
    df_queries["bm25_rank"] = (
        df_queries.groupby("query_id")["bm25_score"]
        .rank(ascending=False, method="first")
        .astype("int32")
    )
    return df_queries

train_df = attach_bm25_scores(train_df, top_k=100)
valid_df = attach_bm25_scores(valid_df, top_k=100)

train_df[["query_id", "bm25_score", "bm25_rank"]].head()


Unnamed: 0,query_id,bm25_score,bm25_rank
0,8677,0.0,1
1,12307,0.0,2
2,66517,0.0,2
3,83198,16.197333,1
4,79687,0.0,1


# 6. GTE Multilingual Embeddings & Similarity

In [14]:
from sentence_transformers import SentenceTransformer
import numpy as np
import torch

# ---- CONFIG ----
GTE_MODEL_NAME = "Alibaba-NLP/gte-multilingual-base"

# Load model on correct device
gte_model = SentenceTransformer(
    GTE_MODEL_NAME,
    trust_remote_code=True,
    device=DEVICE,   # "cuda" or "cpu" from your earlier config
)

@torch.no_grad()
def encode_texts(texts, batch_size=64):
    # SentenceTransformer handles tokenization + model + pooling internally
    embs = gte_model.encode(
        texts,
        batch_size=batch_size,
        convert_to_numpy=True,
        normalize_embeddings=True,    # already L2-normalized -> cosine via dot product
        show_progress_bar=False,
    )
    return embs

# 6.1 Precompute product embeddings (train universe)
prod_group = esci_train.groupby("product_id")["product_text_clean"].first()
product_ids = prod_group.index.to_list()
prod_embs   = encode_texts(prod_group.tolist(), batch_size=64)

prodid_to_emb = {pid: emb for pid, emb in zip(product_ids, prod_embs)}

# ---- SAVE product embeddings + IDs ----
np.save(PATH_PROD_EMBS, prod_embs)
np.save(PATH_PROD_IDS, np.array(product_ids))

print("Saved product embeddings to:", PATH_PROD_EMBS)
print("Saved product IDs to:", PATH_PROD_IDS)


# 6.2 Compute query embeddings for train/valid (correctly aligned)
train_q = train_df.groupby("query_id")["query"].first().reset_index()
valid_q = valid_df.groupby("query_id")["query"].first().reset_index()

train_query_embs = encode_texts(train_q["query"].tolist())
valid_query_embs = encode_texts(valid_q["query"].tolist())

qid_to_emb_train = {
    int(qid): emb for qid, emb in zip(train_q["query_id"].tolist(), train_query_embs)
}
qid_to_emb_valid = {
    int(qid): emb for qid, emb in zip(valid_q["query_id"].tolist(), valid_query_embs)
}

def attach_gte_scores(df: pd.DataFrame, qid_to_emb: dict) -> pd.DataFrame:
    df = df.copy()
    scores = []
    for idx, row in df.iterrows():
        q_emb = qid_to_emb[int(row["query_id"])]
        p_emb = prodid_to_emb[row["product_id"]]
        score = float(np.dot(q_emb, p_emb))  # cosine because normalized
        scores.append(score)

    df["gte_score"] = scores
    df["gte_rank"] = (
        df.groupby("query_id")["gte_score"]
        .rank(ascending=False, method="first")
        .astype("int32")
    )
    return df

train_df = attach_gte_scores(train_df, qid_to_emb_train)
valid_df = attach_gte_scores(valid_df, qid_to_emb_valid)

train_df[["query_id", "gte_score", "gte_rank"]].head()


2025-11-24 06:54:08,917 [INFO] Load pretrained SentenceTransformer: Alibaba-NLP/gte-multilingual-base


Some weights of the model checkpoint at Alibaba-NLP/gte-multilingual-base were not used when initializing NewModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Saved product embeddings to: esci_pipeline/artifacts_us/prod_embs.npy
Saved product IDs to: esci_pipeline/artifacts_us/product_ids.npy


Unnamed: 0,query_id,gte_score,gte_rank
0,8677,0.515014,2
1,12307,0.652051,3
2,66517,0.623146,1
3,83198,0.651029,2
4,79687,0.486416,4


# 7. RRF Fusion Baseline (BM25 + GTE)

In [15]:
def rrf_fusion(bm25_rank, gte_rank, c: float = 60.0):
    return 1.0 / (c + bm25_rank) + 1.0 / (c + gte_rank)

for df in (train_df, valid_df):
    df["rrf_score"] = rrf_fusion(df["bm25_rank"], df["gte_rank"])
    df["rrf_rank"]  = (
        df.groupby("query_id")["rrf_score"]
        .rank(ascending=False, method="first")
        .astype("int32")
    )

valid_df[["query_id", "rrf_score", "rrf_rank"]].head()


Unnamed: 0,query_id,rrf_score,rrf_rank
0,14164,0.031778,2
1,45325,0.032018,2
2,52411,0.032522,1
3,63949,0.032018,2
4,100206,0.032266,2


# 8. MMR Baseline (Using GTE embeddings)

In [16]:
def mmr_rerank(df_q: pd.DataFrame, lambda_: float = 0.7, top_k: int = 20) -> pd.DataFrame:
    df_q = df_q.copy()
    cand_idx = df_q.index.to_list()
    selected = []
    remaining = cand_idx.copy()

    # Pre-collect product embeddings
    pid_to_emb_local = {
        pid: prodid_to_emb[pid] for pid in df_q["product_id"].unique()
    }

    while remaining and len(selected) < top_k:
        best_i = None
        best_score = -1e9
        for i in remaining:
            row = df_q.loc[i]
            rel = float(row["gte_score"])
            if not selected:
                score = rel
            else:
                emb_i = pid_to_emb_local[row["product_id"]]
                max_sim = max(
                    float(np.dot(emb_i, pid_to_emb_local[df_q.loc[j, "product_id"]]))
                    for j in selected
                )
                score = lambda_ * rel - (1.0 - lambda_) * max_sim
            if score > best_score:
                best_score = score
                best_i = i
        selected.append(best_i)
        remaining.remove(best_i)

    # Build rank series for selected items
    mmr_order = pd.Series(range(1, len(selected) + 1), index=selected)
    df_q["mmr_rank"] = (
        df_q.index.to_series().map(mmr_order).fillna(len(df_q) + 1).astype("int32")
    )
    return df_q

def apply_mmr(df: pd.DataFrame, lambda_: float = 0.7, top_k: int = 20) -> pd.DataFrame:
    parts = []
    for qid, sub in df.groupby("query_id"):
        parts.append(mmr_rerank(sub, lambda_=lambda_, top_k=top_k))
    return pd.concat(parts, axis=0).sort_index()

train_df = apply_mmr(train_df, lambda_=0.7, top_k=20)
valid_df = apply_mmr(valid_df, lambda_=0.7, top_k=20)

# Convert MMR ranks into scores for evaluation
for df in (train_df, valid_df):
    # Lower rank = better item => use negative rank as score
    df["mmr_score"] = -df["mmr_rank"].astype("float32")


valid_df[["query_id", "mmr_rank"]].head()


Unnamed: 0,query_id,mmr_rank
0,14164,4
1,45325,4
2,52411,2
3,63949,4
4,100206,3


# 9. LambdaMART (Context-Aware LTR)

In [17]:
# IMPORTANT: sort by query_id so LightGBM group sizes match row order
train_df = train_df.sort_values("query_id").reset_index(drop=True)
valid_df = valid_df.sort_values("query_id").reset_index(drop=True)

ltr_features = [
    "bm25_score",
    "gte_score",
    "ctx_title_len",
    "ctx_desc_len",
    "ctx_bullet_len",
    "ctx_brand_in_title",
    "ctx_color_in_title",
    "prod_count",
    "prod_mean_rel",
    "prod_max_rel",
    "query_count",
    "query_mean_rel",
]

X_train = train_df[ltr_features].astype("float32")
y_train = train_df["relevance"].astype("float32")
group_train = train_df.groupby("query_id").size().to_list()

X_valid = valid_df[ltr_features].astype("float32")
y_valid = valid_df["relevance"].astype("float32")
group_valid = valid_df.groupby("query_id").size().to_list()

lgb_train = lgb.Dataset(X_train, label=y_train, group=group_train)
lgb_valid = lgb.Dataset(X_valid, label=y_valid, group=group_valid, reference=lgb_train)

params = {
    "objective": "lambdarank",
    "metric": "ndcg",
    "ndcg_eval_at": [10],
    "learning_rate": 0.05,
    "num_leaves": 63,
    "min_data_in_leaf": 50,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 1,
    "verbosity": -1,
    "random_state": RANDOM_STATE,
}

callbacks = [
    lgb.early_stopping(stopping_rounds=50, verbose=False),
]

lgbm_model = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_train, lgb_valid],
    valid_names=["train", "valid"],
    num_boost_round=300,
    callbacks=callbacks,
)

# ---- SAVE LTR model ----
lgbm_model.save_model(str(PATH_LGB_MODEL))
print("Saved LightGBM LambdaMART model to:", PATH_LGB_MODEL)

# 9.1 Predict LTR scores
valid_df["ltr_score"] = lgbm_model.predict(
    X_valid,
    num_iteration=lgbm_model.best_iteration
)
valid_df["ltr_rank"]  = (
    valid_df.groupby("query_id")["ltr_score"]
    .rank(ascending=False, method="first")
    .astype("int32")
)

# ---- SAVE prepared train/valid DataFrames ----
train_df.to_parquet(PATH_TRAIN_PREP, index=False)
valid_df.to_parquet(PATH_VALID_PREP, index=False)

print("Saved prepared train_df to:", PATH_TRAIN_PREP)
print("Saved prepared valid_df to:", PATH_VALID_PREP)


valid_df[["query_id", "ltr_score", "ltr_rank"]].head()


Saved LightGBM LambdaMART model to: esci_pipeline/artifacts_us/ltr_model_us.txt
Saved prepared train_df to: esci_pipeline/artifacts_us/train_df_prepared.parquet
Saved prepared valid_df to: esci_pipeline/artifacts_us/valid_df_prepared.parquet


Unnamed: 0,query_id,ltr_score,ltr_rank
0,8,1.116704,1
1,70,-1.104282,3
2,70,-1.104282,4
3,70,-1.104282,5
4,70,0.80287,1


# 10. Cross-Encoder (DistilBERT) 

In [18]:
def make_hf_dataset(df: pd.DataFrame) -> Dataset:
    tmp = df[["query", "product_text_clean", "relevance"]].rename(
        columns={"relevance": "label"}
    )
    # preserve_index=False avoids keeping the Pandas index as a separate column
    return Dataset.from_pandas(tmp, preserve_index=False)

train_hf = make_hf_dataset(train_df)
valid_hf = make_hf_dataset(valid_df)

tokenizer_x = AutoTokenizer.from_pretrained(XENC_MODEL_NAME)

def tokenize_batch(batch):
    enc = tokenizer_x(
        batch["query"],
        batch["product_text_clean"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )
    enc["labels"] = batch["label"]
    return enc

train_tok = train_hf.map(tokenize_batch, batched=True)
valid_tok = valid_hf.map(tokenize_batch, batched=True)

train_tok = train_tok.remove_columns(["query", "product_text_clean"])
valid_tok = valid_tok.remove_columns(["query", "product_text_clean"])

train_tok.set_format("torch")
valid_tok.set_format("torch")

# 10.1 Model (4-class classification)
num_labels = 4
xenc_model = AutoModelForSequenceClassification.from_pretrained(
    XENC_MODEL_NAME,
    num_labels=num_labels,
    problem_type="single_label_classification",
)

training_args = TrainingArguments(
    output_dir="models/esci_us_xenc",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_steps=200,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = (preds == labels).mean()
    return {"accuracy": float(acc)}

trainer = Trainer(
    model=xenc_model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=valid_tok,
    compute_metrics=compute_metrics,
)


trainer.train()

# 10.2 Getting CE scores for valid_df after training:
preds = trainer.predict(valid_tok)
logits = preds.predictions
probs  = torch.softmax(torch.tensor(logits), dim=-1).numpy()
valid_df["ce_score"] = probs[:, 3] + 0.5 * probs[:, 2]  # weight Exact/Substitute
valid_df["ce_rank"]  = (
    valid_df.groupby("query_id")["ce_score"]
    .rank(ascending=False, method="first")
    .astype("int32")
)


Map:   0%|          | 0/107125 [00:00<?, ? examples/s]

Map:   0%|          | 0/11727 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.




Epoch,Training Loss,Validation Loss,Accuracy
1,1.003,1.052852,0.55837
2,0.8643,1.057088,0.570905
3,0.7164,1.164926,0.559734
4,0.5563,1.276992,0.559563
5,0.4487,1.435516,0.555726
6,0.3601,1.599343,0.554021
7,0.281,1.834243,0.555044
8,0.2235,2.241846,0.546175
9,0.1546,2.501801,0.55223
10,0.1698,2.727857,0.55078


# 11. Evaluation (nDCG@K, P@K, gAUC)

In [19]:
def dcg_at_k(rels, k: int):
    rels = np.asarray(rels)[:k]
    if rels.size == 0:
        return 0.0
    discounts = np.log2(np.arange(2, rels.size + 2))
    return float(np.sum((2 ** rels - 1) / discounts))

def ndcg_at_k(rels, k: int):
    rels = np.asarray(rels)
    best = np.sort(rels)[::-1]
    best_dcg = dcg_at_k(best, k)
    if best_dcg == 0:
        return 0.0
    return dcg_at_k(rels, k) / best_dcg

def eval_ranking(df: pd.DataFrame, score_col: str, k: int = 10):
    ndcgs = []
    precs = []
    auc_labels = []
    auc_scores = []

    for qid, sub in df.groupby("query_id"):
        sub_sorted = sub.sort_values(score_col, ascending=False)
        rels = sub_sorted["relevance"].values

        ndcgs.append(ndcg_at_k(rels, k))
        precs.append((rels[:k] > 0).mean())  # non-zero relevance => positive

        auc_labels.extend((rels > 0).astype(int).tolist())
        auc_scores.extend(sub_sorted[score_col].tolist())

    ndcg = float(np.mean(ndcgs))
    prec = float(np.mean(precs))
    try:
        gauc = float(roc_auc_score(auc_labels, auc_scores))
    except ValueError:
        gauc = float("nan")

    return {"nDCG@{}".format(k): ndcg, "P@{}".format(k): prec, "gAUC": gauc}

for col in ["bm25_score", "gte_score", "rrf_score", "ltr_score","mmr_score","ce_score"]:
    metrics = eval_ranking(valid_df, col, k=10)
    print(col, metrics)


bm25_score {'nDCG@10': 0.8234731674889765, 'P@10': 0.7967233652343497, 'gAUC': 0.5792894990043588}
gte_score {'nDCG@10': 0.8401853983140759, 'P@10': 0.7970691748356517, 'gAUC': 0.7507843379776473}
rrf_score {'nDCG@10': 0.81520136450779, 'P@10': 0.7966623400105906, 'gAUC': 0.5632964850187847}
ltr_score {'nDCG@10': 0.8758506878745544, 'P@10': 0.7974963514019657, 'gAUC': 0.9650283057502457}
mmr_score {'nDCG@10': 0.8398444855474029, 'P@10': 0.796804732199362, 'gAUC': 0.5977450099316454}
ce_score {'nDCG@10': 0.8406037396049565, 'P@10': 0.7964996060805662, 'gAUC': 0.6405646118871844}
