In [None]:
# Core dependencies: data handling, progress bars, and sentence-transformers (CrossEncoder re-ranker).
!pip -q install numpy pandas tqdm sentence-transformers
# Optional but recommended for speed; if it fails, the code will still work with the NumPy fallback.
!pip -q install faiss-cpu || echo "FAISS install failed; will use NumPy fallback."
# OpenAI SDK: build_vector_store() and search() both embed text through embed_texts_openai().
!pip -q install openai


In [None]:
from __future__ import annotations
import json, re
from pathlib import Path
from typing import Iterable, List

import numpy as np
import pandas as pd
from tqdm import tqdm


In [None]:
from google.colab import userdata
import os

# Retrieve the API key from Colab secrets.
# NOTE: the secret/env name is 'OPEN_API_KEY' (not the more common 'OPENAI_API_KEY');
# get_openai_client() reads os.environ["OPEN_API_KEY"], so the names must stay in sync.
OPEN_API_KEY = userdata.get('OPEN_API_KEY')

# Set it as an environment variable so later cells can read it from the process environment.
os.environ['OPEN_API_KEY'] = OPEN_API_KEY

print("OPEN_API_KEY loaded into environment variables.")

OPEN_API_KEY loaded into environment variables.


In [None]:
try:
    import faiss   # from faiss-cpu
    FAISS_AVAILABLE = True
except Exception:
    # Any import problem (missing wheel, ABI mismatch) selects the NumPy fallback.
    faiss = None
    FAISS_AVAILABLE = False

def build_ip_index(emb: np.ndarray):
    """
    Build an exact inner-product index over the rows of ``emb``.

    If FAISS is importable a ``faiss.IndexFlatIP`` is returned; otherwise a
    small NumPy implementation exposing the same ``add(X)`` / ``search(Q, k)``
    methods is returned.

    Parameters
    ----------
    emb : np.ndarray
        2-D array with one vector per row. It is cast to float32.

    Returns
    -------
    An object whose ``search(Q, k)`` returns ``(D, I)``: float32 scores and
    int64 row indices, best match first.

    Notes
    -----
    When ``k`` exceeds the number of indexed rows, FAISS pads the result with
    index ``-1``, while the NumPy fallback returns only as many columns as
    there are rows. Callers should ignore negative indices.
    """
    if FAISS_AVAILABLE:
        d = emb.shape[1]
        index = faiss.IndexFlatIP(d)
        index.add(emb.astype("float32"))
        return index

    class NumpyIPIndex:
        """Brute-force inner-product search with a FAISS-like interface."""

        def __init__(self, X):
            self.X = X.astype("float32")

        def add(self, X):
            # Append rows, mirroring faiss.Index.add (previously a silent no-op).
            self.X = np.vstack([self.X, np.asarray(X, dtype="float32")])

        def search(self, Q, k):
            n_rows = self.X.shape[0]
            # argpartition requires kth < n_rows, so never ask for more than exists.
            k_eff = min(int(k), n_rows)
            if k_eff <= 0:
                n_queries = np.shape(Q)[0]
                return (np.zeros((n_queries, 0), dtype="float32"),
                        np.zeros((n_queries, 0), dtype="int64"))
            S = Q @ self.X.T
            # Unordered top-k first (linear time), then sort only those k columns.
            I = np.argpartition(-S, k_eff - 1, axis=1)[:, :k_eff]
            S_topk = np.take_along_axis(S, I, axis=1)
            order = np.argsort(-S_topk, axis=1)
            I_sorted = np.take_along_axis(I, order, axis=1)
            D_sorted = np.take_along_axis(S_topk, order, axis=1)
            return D_sorted.astype("float32"), I_sorted.astype("int64")

    return NumpyIPIndex(emb)

def save_faiss_index(index, path: Path):
    """
    Persist ``index`` to ``path`` with ``faiss.write_index``.

    Does nothing when FAISS is unavailable: the NumPy fallback index lives in
    memory only and has no on-disk format.
    """
    if not FAISS_AVAILABLE:
        return
    target = str(path)
    faiss.write_index(index, target)


In [None]:
def resolve_existing_file(preferred: str, keywords=("products",), exts=(".json", ".jsonl")) -> Path:
    """
    Locate a data file, preferring an explicit path and falling back to a search.

    Parameters
    ----------
    preferred : str
        Path tried first; returned as-is when it is an existing regular file.
    keywords : iterable of str
        Every keyword must occur (case-insensitively) in the file name.
    exts : iterable of str
        Allowed file-name suffixes, tried in the given order.

    Returns
    -------
    Path
        ``preferred`` if it is a file, otherwise the first match (sorted by
        path) found under CWD, /content (Colab) or /mnt/data, in that order.

    Raises
    ------
    FileNotFoundError
        If no regular file matches.
    """
    cand = Path(preferred)
    # is_file() rather than exists(): a directory with a matching name cannot be opened.
    if cand.is_file():
        return cand

    search_roots = [Path("."), Path("/content"), Path("/mnt/data")]
    for root in search_roots:
        if not root.exists():
            continue
        for ext in exts:
            # Sort so the chosen file does not depend on filesystem listing order.
            matches = sorted(
                p for p in root.rglob(f"*{ext}")
                if p.is_file() and all(k.lower() in p.name.lower() for k in keywords)
            )
            if matches:
                return matches[0]

    raise FileNotFoundError(
        f"Could not find products file. Tried '{preferred}' and searched CWD, /content, /mnt/data "
        f"for {keywords} with exts {exts}."
    )

def load_products_any(products_path_hint: str = "products.json") -> list[dict]:
    """
    Read product records from a file located via ``resolve_existing_file``.

    Accepted layouts: a JSON array, a JSON object holding the records under
    the ``"products"`` key, or JSON Lines (one JSON value per non-blank line).
    Any document that parses as JSON but is neither of the first two layouts
    is re-read line by line as JSON Lines.
    """
    source = resolve_existing_file(products_path_hint, keywords=("products",), exts=(".json", ".jsonl"))

    # Attempt 1: the whole file is a single JSON document.
    try:
        with open(source, "r", encoding="utf-8") as handle:
            parsed = json.load(handle)
    except json.JSONDecodeError:
        parsed = None

    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, dict) and "products" in parsed:
        return parsed["products"]

    # Attempt 2: JSON Lines; blank lines are skipped.
    with open(source, "r", encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        return [json.loads(entry) for entry in stripped_lines if entry]


In [None]:
from typing import Iterable, List # Import Iterable

def norm_text(s: str) -> str:
    """
    Normalise free text for embedding.

    Steps, in order: treat a falsy input as "", lowercase, collapse whitespace
    runs, replace every run of characters outside ``a-z 0-9 % . space`` with a
    single space, then drop a short list of English stopwords.
    """
    lowered = (s or "").lower()
    collapsed = re.sub(r"\s+", " ", lowered)
    cleaned = re.sub(r"[^a-z0-9%\. ]+", " ", collapsed)

    # Small hand-picked stopword list; extend as needed.
    dropped = {"a", "an", "the", "and", "is", "in", "it", "to", "for", "with", "on", "this", "that"}
    kept = (token for token in cleaned.split() if token not in dropped)
    return " ".join(kept).strip()

def batch_embed(texts: Iterable[str], embed_fn, batch_size: int = 512, empty_dim: int = 384) -> np.ndarray:
    """
    Embed ``texts`` in consecutive batches and stack the results row-wise.

    Parameters
    ----------
    texts : iterable of str
        Materialised into a list before batching.
    embed_fn : callable
        Receives a list of strings and returns a 2-D array, one row per string.
    batch_size : int
        Number of texts per ``embed_fn`` call; must be positive.
    empty_dim : int
        Width of the ``(0, empty_dim)`` array returned when ``texts`` is empty.
        The default of 384 is kept for backward compatibility; pass the real
        model dimension when the result is stacked with other embeddings.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive. A negative step would make the
        batching range empty and silently drop every text.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    texts = list(texts)
    out = [
        embed_fn(texts[i:i + batch_size])
        for i in tqdm(range(0, len(texts), batch_size), desc="Embedding", unit="batch")
    ]
    return np.vstack(out) if out else np.zeros((0, empty_dim), dtype="float32")

In [None]:
def build_vector_store(
    products_json_path: str = "products.json",   # a hint; loader will auto-find
    out_dir: str = "vector_store",
    use_openai: bool = False
):
    """
    Build the on-disk vector store for the product catalogue.

    Loads products via ``load_products_any``, normalises title / brand /
    category / description with ``norm_text``, embeds each field separately,
    concatenates the four embeddings per product, and writes into ``out_dir``:

    - ``products.parquet``      raw and normalised product fields
    - ``emb_combined.npy``      concatenated per-field embeddings
    - ``index_combined.faiss``  only when FAISS is available
    - ``rowid_to_pid.json``     row position -> product_id (as strings)

    Raises ``ValueError`` when no products are loaded. Returns nothing.

    NOTE(review): ``use_openai`` is accepted but never consulted; the
    embedding function is always ``embed_texts_openai``.
    """
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)

    # 1) Load products
    products = load_products_any(products_json_path)
    if not products:
        raise ValueError("No products loaded. Check your products file.")

    # 2) Canonicalize & normalize
    # Each field falls back through alternative key names; missing values become "".
    rows = []
    for r in products:
        pid = str(r.get("product_id") or r.get("id") or r.get("sku") or "")
        title = r.get("title") or r.get("name") or ""
        brand = r.get("brand") or ""
        category = r.get("category_path") or r.get("category") or ""
        desc = r.get("description") or ""
        rows.append({
            "product_id": pid,
            "title": title,
            "brand": brand,
            "category_path": category,
            "description": desc,
            "norm_title": norm_text(title),
            "norm_brand": norm_text(brand),
            "norm_category": norm_text(category),
            "norm_description": norm_text(desc),
        })
    df = pd.DataFrame(rows)
    df.to_parquet(out / "products.parquet", index=False)

    # 3) Choose embedding fn
    # Always the OpenAI embedder, regardless of the use_openai argument.
    embed_fn = embed_texts_openai

    # 4) Per-field embeddings
    print("Embedding title...")
    emb_title = batch_embed(df["norm_title"].fillna("").astype(str).tolist(), embed_fn)
    print("Embedding brand...")
    emb_brand = batch_embed(df["norm_brand"].fillna("").astype(str).tolist(), embed_fn)
    print("Embedding category...")
    emb_cat   = batch_embed(df["norm_category"].fillna("").astype(str).tolist(), embed_fn)
    print("Embedding description...")
    emb_desc  = batch_embed(df["norm_description"].fillna("").astype(str).tolist(), embed_fn)

    # 5) Concatenate embeddings
    # Column order (title, brand, category, description) must match the order
    # in which search() concatenates its weighted query vectors.
    print("Concatenating embeddings...")
    emb_combined = np.concatenate([emb_title, emb_brand, emb_cat, emb_desc], axis=1)

    # 6) Save combined embeddings
    np.save(out / "emb_combined.npy", emb_combined)

    # 7) Build index (FAISS or NumPy fallback)
    print("Building combined index (FAISS available? ->", FAISS_AVAILABLE, ")")
    idx_combined = build_ip_index(emb_combined)

    # 8) Save FAISS index if present
    if FAISS_AVAILABLE:
        save_faiss_index(idx_combined, out / "index_combined.faiss")
    else:
        print("FAISS not installed: skipping .faiss file (NumPy fallback will be used at query time).")

    # 9) Row → product_id map
    pid_map = df["product_id"].astype(str).tolist()
    pd.Series(pid_map).to_json(out / "rowid_to_pid.json", orient="values")

    print("✅ Finished Step 1. Vector store written to:", out.resolve())

In [None]:
from pathlib import Path

print("CWD:", Path.cwd())

# In Colab the working directory is /content, so Path(".") and Path("/content")
# are the same tree. Collect resolved paths in a set so each file is listed
# once, then sort for a stable listing.
search_roots = [Path("."), Path("/content")]
candidates = sorted({
    p.resolve()
    for root in search_roots
    if root.exists()
    for p in root.rglob("products*.json*")
})

print("Found candidates:")
for c in candidates:
    print(" -", c)

if not candidates:
    print("No products*.json/.jsonl files found in CWD or /content.")



CWD: /content
Found candidates:
No products*.json/.jsonl files found in CWD or /content.


In [None]:
from google.colab import files
# Opens Colab's upload dialog; pick products.json or products.jsonl.
up = files.upload()  # choose your products.json or products.jsonl
# Last expression: the notebook displays the uploaded filename(s).
list(up.keys())      # shows uploaded filename(s)


Saving products.json to products.json


['products.json']

In [None]:
# OpenAI client (created lazily and cached in _OPENAI_CLIENT) and the embedding helper built on it.
_OPENAI_CLIENT = None
def get_openai_client():
    """
    Return a process-wide OpenAI client, creating it on first use.

    The API key is read from the ``OPEN_API_KEY`` environment variable (the
    name this notebook exports) and, if that is unset or empty, from the
    conventional ``OPENAI_API_KEY``.

    Raises
    ------
    KeyError
        If neither variable holds a non-empty key.
    """
    global _OPENAI_CLIENT
    if _OPENAI_CLIENT is None:
        api_key = os.environ.get("OPEN_API_KEY") or os.environ.get("OPENAI_API_KEY")
        if not api_key:
            # Same exception type as a plain os.environ[...] lookup, with an actionable message.
            raise KeyError(
                "OPEN_API_KEY is not set (OPENAI_API_KEY is also accepted); "
                "run the cell that loads the API key first."
            )
        _OPENAI_CLIENT = OpenAI(api_key=api_key)
    return _OPENAI_CLIENT

def embed_texts_openai(texts: List[str], model: str = "text-embedding-3-large") -> np.ndarray:
    """
    Embed ``texts`` with the OpenAI embeddings endpoint and L2-normalise each row.

    Parameters
    ----------
    texts : list of str
        Inputs to embed. ``None`` and empty / whitespace-only strings are sent
        as a single space, because the API rejects empty input.
    model : str
        Embedding model name passed to the API.

    Returns
    -------
    np.ndarray
        float32 array with one unit-length row per input, in input order.

    Raises
    ------
    RuntimeError
        If the API returns a different number of embeddings than inputs.
    """
    client = get_openai_client()
    # Replace empty strings with a single space to avoid OpenAI API errors
    processed_texts = [t if (t or "").strip() != "" else " " for t in texts]
    resp = client.embeddings.create(model=model, input=processed_texts)
    # Each item carries the position of its input; sort on it so row i always
    # belongs to texts[i] rather than relying on response ordering.
    ordered = sorted(resp.data, key=lambda d: d.index)
    if len(ordered) != len(processed_texts):
        raise RuntimeError(
            f"Expected {len(processed_texts)} embeddings, received {len(ordered)}."
        )
    V = np.array([d.embedding for d in ordered], dtype="float32")
    # The epsilon guards against division by zero for an all-zero vector.
    V /= (np.linalg.norm(V, axis=1, keepdims=True) + 1e-12)
    return V

from openai import OpenAI # Import OpenAI class

In [None]:
# EXAMPLES — pick ONE that matches your file
# products_path = "/content/products.json"
# products_path = "/content/products.jsonl"
# products_path = "/content/drive/MyDrive/datathon/products.json"

products_path = "/content/products.json"   # <- CHANGE THIS to your real path

# Runs the full build: one embedding pass per field through the OpenAI API.
# NOTE(review): build_vector_store currently ignores use_openai and always
# uses embed_texts_openai, so the flag below has no effect.
build_vector_store(
    products_json_path=products_path,
    out_dir="vector_store",
    use_openai=True
)

Embedding title...


Embedding: 100%|██████████| 7/7 [00:12<00:00,  1.72s/batch]


Embedding brand...


Embedding: 100%|██████████| 7/7 [00:09<00:00,  1.43s/batch]


Embedding category...


Embedding: 100%|██████████| 7/7 [00:08<00:00,  1.27s/batch]


Embedding description...


Embedding: 100%|██████████| 7/7 [00:11<00:00,  1.68s/batch]


Concatenating embeddings...
Building combined index (FAISS available? -> True )
✅ Finished Step 1. Vector store written to: /content/vector_store


In [None]:
from pathlib import Path
import numpy as np, json, pandas as pd

# Sanity check of the artefacts written by build_vector_store().
root = Path("vector_store")
print("Files:", [p.name for p in root.iterdir()])
# The row count of the embeddings should equal the length of the id map below.
print("emb_combined.npy shape:", np.load(root/"emb_combined.npy").shape)
print("rowid_to_pid.json length:", len(json.loads((root/"rowid_to_pid.json").read_text())))
# Last expression: rendered by the notebook as a table.
pd.read_parquet(root/"products.parquet").head(3)

In [None]:
import pandas as pd
from pathlib import Path

# Print the first rows of the stored product table, if the file exists.
root = Path("vector_store")
if (root / "products.parquet").exists():
    print("Head of products.parquet:")
    print(pd.read_parquet(root / "products.parquet").head())
else:
    print(f"products.parquet not found in {root}")

In [None]:
from pathlib import Path
import json, numpy as np
import pandas as pd # Import pandas for displaying dataframe

root = Path("vector_store")

if not root.exists():
    print(f"Error: Directory {root} not found.")
elif not list(root.iterdir()):
    print(f"Error: Directory {root} is empty.")
else:
    print(f"Files in {root}:")
    for p in root.iterdir():
        print(" -", p.name)

    # build_vector_store() writes a single concatenated matrix, emb_combined.npy.
    # mmap_mode="r" reads only the header, so the shape is reported without
    # loading the whole matrix into memory.
    combined_path = root / "emb_combined.npy"
    if combined_path.exists():
        E = np.load(combined_path, mmap_mode="r")
        print(f"Embeddings shape (combined): {E.shape}")
    else:
        print(f"Embedding file not found: {combined_path}")

    # Per-field files are not produced by the current pipeline; report them
    # only when present (e.g. left over from an older store layout).
    legacy_embedding_files = {
        "title": "emb_title.npy",
        "brand": "emb_brand.npy",
        "category": "emb_category.npy",
        "description": "emb_description.npy",
    }
    for name, filename in legacy_embedding_files.items():
        file_path = root / filename
        if file_path.exists():
            E = np.load(file_path, mmap_mode="r")
            print(f"Embeddings shape ({name}): {E.shape}")

    # Load pid map and peek at product metadata
    pid_map_path = root / "rowid_to_pid.json"
    if pid_map_path.exists():
        pid_map = json.loads(pid_map_path.read_text())
        print("Products indexed:", len(pid_map))
    else:
        print(f"Product ID map not found: {pid_map_path}")

    products_parquet_path = root / "products.parquet"
    if products_parquet_path.exists():
        meta = pd.read_parquet(products_parquet_path).head(3)
        print("\nHead of products.parquet:")
        display(meta)
    else:
        print(f"products.parquet not found: {products_parquet_path}")


In [None]:
VSTORE = Path("vector_store")  # change if you saved elsewhere

# Load combined embeddings
E_combined = np.load(VSTORE/"emb_combined.npy")

# Build in-memory IP index (fast; a few seconds)
# The index is rebuilt from the .npy matrix; index_combined.faiss is not read here.
IDX_combined = build_ip_index(E_combined)

# Load ID map and metadata
rowid_to_pid = json.loads((VSTORE/"rowid_to_pid.json").read_text())
meta = pd.read_parquet(VSTORE/"products.parquet")
# Inverse lookup; if a product_id occurs more than once, the last row wins.
pid_to_row = {pid: i for i, pid in enumerate(rowid_to_pid)}

# Last expression: the notebook shows (number of ids, metadata shape).
len(rowid_to_pid), meta.shape

In [None]:
from google.colab import files
# Opens Colab's upload dialog for the queries file.
up = files.upload()  # pick your queries file, e.g., queries_synth_train.json
# Last expression: the notebook displays the uploaded filename(s).
list(up.keys())


In [None]:
import json
from pathlib import Path

# Loader for query files: a JSON array, a {"queries": [...]} object, or JSON Lines.
# QPATH is assigned at the bottom of this cell; find_queries_file is not needed here.


def load_queries_any(path):
    """
    Load queries from ``path`` and normalise them to a uniform shape.

    Accepted layouts: a JSON array of objects, a JSON object with the records
    under ``"queries"``, or JSON Lines (one object per non-blank line).

    Returns
    -------
    list of dict
        ``[{"query_id": str, "query": str}, ...]``. The id is taken from
        ``query_id`` / ``qid`` / ``id`` and the text from ``query`` / ``text``.
        Entries that are not objects, or that lack an id or a text, are skipped.

    Raises
    ------
    ValueError
        If the file is valid JSON but neither an array nor a ``{"queries": ...}``
        object.
    """
    path = Path(path)
    # try JSON (array or dict)
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except json.JSONDecodeError:
        # JSONL fallback: iterate over the file handle, one record per line.
        items = []
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line:
                    items.append(json.loads(line))
    else:
        if isinstance(data, list):
            items = data
        elif isinstance(data, dict) and "queries" in data:
            items = data["queries"]
        else:
            raise ValueError("Unsupported JSON structure")

    # normalize keys
    norm = []
    for j in items:
        if not isinstance(j, dict):
            continue
        qid = j.get("query_id") or j.get("qid") or j.get("id")
        qtext = j.get("query") or j.get("text")
        if qid is None or qtext is None:
            continue
        norm.append({"query_id": str(qid), "query": str(qtext)})
    return norm

# Hard-coded Colab path of the uploaded training queries; adjust if the file lives elsewhere.
QPATH = "/content/queries_synth_train.json"
queries = load_queries_any(QPATH)
print("Total queries:", len(queries))
print(queries[:2])

Total queries: 288
[{'query_id': 's334', 'query': 'hearty organic soups for dinner'}, {'query_id': 's255', 'query': 'versatile cut chicken for grilling'}]


In [None]:
import json
from pathlib import Path

def find_queries_file(preferred="queries_synth_train.json"):
    """
    Locate the queries file and return its resolved path.

    Search order:
      1. ``preferred`` itself, if it is an existing file;
      2. files whose name starts with the stem of ``preferred`` (so copies such
         as ``queries_synth_train (2).json`` or ``.jsonl`` variants match);
      3. any ``queries_synth_*.json*`` file, with a printed notice, so that a
         differently named split is still found as before.
    Steps 2 and 3 search CWD, /content and /mnt/data recursively and pick the
    first match in sorted order.

    Raises
    ------
    FileNotFoundError
        If nothing matches.
    """
    p = Path(preferred)
    if p.is_file():
        return p.resolve()

    roots = [Path("."), Path("/content"), Path("/mnt/data")]
    preferred_pattern = f"{p.stem}*.json*"
    for pattern in (preferred_pattern, "queries_synth_*.json*"):
        for r in roots:
            if not r.exists():
                continue
            matches = sorted(c for c in r.rglob(pattern) if c.is_file())
            if matches:
                if pattern != preferred_pattern:
                    print(f"Note: no file matching '{preferred_pattern}' found; using {matches[0]}")
                return matches[0].resolve()
    raise FileNotFoundError(
        f"Couldn't find '{preferred}' or any queries_synth_*.json file — "
        "upload it or pass the exact path."
    )

QPATH = find_queries_file()  # the preferred path if it exists, otherwise the first fallback match
print("Queries file:", QPATH)

# Load as [{"query_id": "...", "query": "..."}]
# NOTE(review): this expects a JSON array of objects; a {"queries": [...]} wrapper
# or a JSONL file would fail here. load_queries_any() handles those layouts.
with open(QPATH, "r", encoding="utf-8") as f:
    queries_raw = json.load(f)

queries = []
for j in queries_raw:
    qid = j.get("query_id") or j.get("qid") or j.get("id")
    qtx = j.get("query") or j.get("text")
    # Entries with a missing or empty id/text are dropped.
    if qid and qtx:
        queries.append({"query_id": str(qid), "query": str(qtx)})

print("Total queries:", len(queries))
print(queries[:3])

Queries file: /content/queries_synth_train.json
Total queries: 288
[{'query_id': 's334', 'query': 'hearty organic soups for dinner'}, {'query_id': 's255', 'query': 'versatile cut chicken for grilling'}, {'query_id': 's245', 'query': 'quick cook pork steaks'}]


In [None]:
# Upgrade the Hugging Face packages needed by the CrossEncoder re-ranker loaded in the next cell.
!pip install -U sentence-transformers huggingface_hub transformers


Collecting huggingface_hub
  Downloading huggingface_hub-1.1.2-py3-none-any.whl.metadata (13 kB)
Collecting typer-slim (from huggingface_hub)
  Downloading typer_slim-0.20.0-py3-none-any.whl.metadata (16 kB)


In [None]:
from sentence_transformers import CrossEncoder

# Choose a different pre-trained cross-encoder model
# Pre-trained cross-encoder that search() uses to re-score (query, product text) pairs.
cross_encoder_model_name = 'cross-encoder/ms-marco-MiniLM-L-6-v2'

# Instantiate the CrossEncoder model
cross_encoder_model = CrossEncoder(cross_encoder_model_name)

In [None]:
# The previous cell already instantiates the model. Only load it here when it
# is missing from the kernel, so the weights are not loaded a second time.
if globals().get("cross_encoder_model") is None:
    cross_encoder_model = CrossEncoder(cross_encoder_model_name)

In [None]:
from google.colab import userdata
import os
import json
from pathlib import Path
import pandas as pd # Import pandas for displaying dataframe
import re
from openai import OpenAI
from typing import List
import numpy as np # Import numpy

# Retrieve the API key from Colab secrets and set it as an environment variable
# Same key-loading steps as the earlier setup cell, repeated so this cell can run after a kernel restart.
OPEN_API_KEY = userdata.get('OPEN_API_KEY')
os.environ['OPEN_API_KEY'] = OPEN_API_KEY

print("OPEN_API_KEY loaded into environment variables.")

# find_queries_file is defined again here so that this cell can run on its own.
def find_queries_file(preferred="queries_synth_test.json"):
    """
    Locate the queries file and return its resolved path.

    Search order:
      1. ``preferred`` itself, if it is an existing file;
      2. files whose name starts with the stem of ``preferred`` (so copies such
         as ``queries_synth_test (1).json`` or ``.jsonl`` variants match);
      3. any ``queries_synth_*.json*`` file (for example the training split),
         with a printed notice. This keeps the earlier behaviour of falling
         back to the train file when no test file has been uploaded.
    Steps 2 and 3 search CWD, /content and /mnt/data recursively and pick the
    first match in sorted order.

    Raises
    ------
    FileNotFoundError
        If nothing matches.
    """
    p = Path(preferred)
    if p.is_file():
        return p.resolve()

    roots = [Path("."), Path("/content"), Path("/mnt/data")]
    preferred_pattern = f"{p.stem}*.json*"
    for pattern in (preferred_pattern, "queries_synth_*.json*"):
        for r in roots:
            if not r.exists():
                continue
            matches = sorted(c for c in r.rglob(pattern) if c.is_file())
            if matches:
                if pattern != preferred_pattern:
                    print(f"Note: no file matching '{preferred_pattern}' found; using {matches[0]}")
                return matches[0].resolve()
    raise FileNotFoundError(
        f"Couldn't find '{preferred}' or any queries_synth_*.json file — "
        "upload it or pass the exact path."
    )

QPATH = find_queries_file()  # auto-discovers
print("Queries file:", QPATH)

# Load as [{"query_id": "...", "query": "..."}]
# NOTE(review): expects a JSON array of objects (no JSONL or {"queries": [...]} support).
with open(QPATH, "r", encoding="utf-8") as f:
    queries_raw = json.load(f)

queries = []
for j in queries_raw:
    qid = j.get("query_id") or j.get("qid") or j.get("id")
    qtx = j.get("query") or j.get("text")
    # Entries with a missing or empty id/text are dropped.
    if qid and qtx:
        queries.append({"query_id": str(qid), "query": str(qtx)})

print("Total queries:", len(queries))
print(queries[:3])

# Assuming cross_encoder_model, IDX_combined, rowid_to_pid, and meta are loaded from previous steps
# If not, load them here:
# from sentence_transformers import CrossEncoder
# cross_encoder_model_name = 'cross-encoder/ms-marco-MiniLM-L-6-v2'
# cross_encoder_model = CrossEncoder(cross_encoder_model_name)
# VSTORE = Path("vector_store")
# E_combined = np.load(VSTORE/"emb_combined.npy")
# IDX_combined = build_ip_index(E_combined)
# rowid_to_pid = json.loads((VSTORE/"rowid_to_pid.json").read_text())
# meta = pd.read_parquet(VSTORE/"products.parquet")
# pid_to_row = {pid: i for i, pid in enumerate(rowid_to_pid)}


def search(query_text: str, k_return: int = 10, k_rerank: int = 30,
           w_title=0.8, w_brand=0.05, w_cat=0.10, w_desc=0.05):
    """
    Retrieve products for a query and re-rank them with the cross-encoder.

    Steps:
      1. normalise the query with ``norm_text`` and embed it once with
         ``embed_texts_openai``;
      2. build the combined query vector by concatenating that embedding four
         times, scaled by the field weights, in the order title / brand /
         category / description (the order used for the stored product vectors);
      3. fetch the top ``k_rerank`` rows from ``IDX_combined``;
      4. score each (raw query, product title + description) pair with
         ``cross_encoder_model`` and sort by that score, highest first.

    Parameters
    ----------
    query_text : str
        Raw user query; the raw text is what the cross-encoder sees.
    k_return : int
        Maximum number of results returned.
    k_rerank : int
        Number of candidates pulled from the index before re-ranking.
    w_title, w_brand, w_cat, w_desc : float
        Weights applied to the query vector in each field slot.

    Returns
    -------
    list of dict
        Each with ``product_id``, ``name``, ``brand``, ``category`` and
        ``score`` (the cross-encoder score). Empty if no candidate was found.

    Relies on module-level ``IDX_combined``, ``rowid_to_pid``, ``meta`` and
    ``cross_encoder_model``.
    """
    qn = norm_text(query_text)

    # Every field slot uses the same query text, so one embedding call is
    # enough; embedding it once per field only multiplied API cost and latency.
    qv = embed_texts_openai([qn])  # (1, D)

    # Concatenate query embeddings with weights
    qv_combined = np.concatenate(
        [qv * w_title, qv * w_brand, qv * w_cat, qv * w_desc], axis=1
    ).astype("float32")

    # top-k_rerank from the combined index (inner-product scores)
    D_combined, I_combined = IDX_combined.search(qv_combined, k_rerank)

    # Prepare data for re-ranking
    rerank_candidates = []
    sentence_pairs = []
    for raw_idx in I_combined[0]:
        i = int(raw_idx)
        # FAISS pads with -1 when fewer than k_rerank rows exist; a negative
        # index would otherwise silently select the last product.
        if i < 0:
            continue
        pid = rowid_to_pid[i]
        m = meta.iloc[i]
        product_info = f"product title: {m['title']} product description: {m['description']}"
        sentence_pairs.append([query_text, product_info])
        rerank_candidates.append({
            "product_id": pid,
            "name": m["title"],
            "brand": m["brand"],
            "category": m["category_path"],
            "original_row_index": i,  # Keep track of original index
        })

    # Handle case where no candidates were found
    if not sentence_pairs:
        return []

    # Predict relevance scores using the cross-encoder model
    rerank_scores = cross_encoder_model.predict(sentence_pairs)
    for candidate, score in zip(rerank_candidates, rerank_scores):
        candidate["rerank_score"] = float(score)

    # Sort candidates by re-rank score in descending order (stable for ties)
    rerank_candidates.sort(key=lambda r: r["rerank_score"], reverse=True)

    # Return top k_return results
    return [
        {
            "product_id": candidate["product_id"],
            "name": candidate["name"],
            "brand": candidate["brand"],
            "category": candidate["category"],
            "score": candidate["rerank_score"],  # Return the re-ranked score
        }
        for candidate in rerank_candidates[:max(k_return, 0)]
    ]

# Smoke test: run the re-ranked search on the first loaded query and show the top 10 as a table.
sample_query = queries[0]["query"]
top_results = search(sample_query, k_return=10, k_rerank=50)
display(pd.DataFrame(top_results))

OPEN_API_KEY loaded into environment variables.
Queries file: /content/queries_synth_train.json
Total queries: 288
[{'query_id': 's334', 'query': 'hearty organic soups for dinner'}, {'query_id': 's255', 'query': 'versatile cut chicken for grilling'}, {'query_id': 's245', 'query': 'quick cook pork steaks'}]


Unnamed: 0,product_id,name,brand,category,score
0,1728263,H-E-B Organics Tomato Basil Soup,H-E-B,Pantry -> Soups & chili,5.999448
1,1728261,H-E-B Organics Chicken Noodle Soup,H-E-B,Pantry -> Soups & chili,5.985342
2,1728279,H-E-B Organics Southwest Style Black Bean Soup,H-E-B,Pantry -> Soups & chili,5.878461
3,1728270,H-E-B Organics Lentil Soup,H-E-B,Pantry -> Soups & chili,5.432003
4,5851502,H-E-B Hearty Wrangler Soup,H-E-B,Pantry -> Soups & chili,3.576779
5,497561,Hill Country Fare Chunky Chicken Corn Chowder ...,Hill Country Fare,Pantry -> Soups & chili,2.34043
6,369154,Hill Country Fare Chunky Classic Chicken Noodl...,Hill Country Fare,Pantry -> Soups & chili,1.799685
7,1287774,Hill Country Fare Hearty Chicken Noodle Soup,Hill Country Fare,Pantry -> Soups & chili,1.66287
8,10438511,H-E-B Italian-Style Wedding Soup,H-E-B,Pantry -> Soups & chili,1.65016
9,1417008,Hill Country Fare Hearty Chicken Broccoli Chee...,Hill Country Fare,Pantry -> Soups & chili,1.544112


In [None]:
# Assuming the necessary functions (norm_text, embed_texts_openai, get_openai_client)
# and imports (re, OpenAI, os, List, np, pandas, json, Path) are already in the kernel.
# If not, they should be included in this cell or ensured from prior cells.

# Load the necessary data and index
from pathlib import Path
import numpy as np
import json
import pandas as pd

# Directory written by build_vector_store().
VSTORE = Path("vector_store")

# Load combined embeddings
E_combined = np.load(VSTORE/"emb_combined.npy")

# Build in-memory IP index (fast; a few seconds)
# build_ip_index is defined again below (with its own FAISS import) so this cell can run on a fresh kernel.
def build_ip_index(emb: np.ndarray):
    """
    Build an exact inner-product index over the rows of ``emb``.

    FAISS is imported inside the function; if that import fails for any
    reason, a NumPy implementation exposing the same ``add(X)`` /
    ``search(Q, k)`` methods is returned instead.

    Parameters
    ----------
    emb : np.ndarray
        2-D array with one vector per row. It is cast to float32.

    Returns
    -------
    An object whose ``search(Q, k)`` returns ``(D, I)``: float32 scores and
    int64 row indices, best match first.

    Notes
    -----
    When ``k`` exceeds the number of indexed rows, FAISS pads the result with
    index ``-1``, while the NumPy fallback returns only as many columns as
    there are rows. Callers should ignore negative indices.
    """
    try:
        import faiss   # from faiss-cpu
    except Exception:
        faiss = None

    if faiss is not None:
        d = emb.shape[1]
        index = faiss.IndexFlatIP(d)
        index.add(emb.astype("float32"))
        return index

    class NumpyIPIndex:
        """Brute-force inner-product search with a FAISS-like interface."""

        def __init__(self, X):
            self.X = X.astype("float32")

        def add(self, X):
            # Append rows, mirroring faiss.Index.add (previously a silent no-op).
            self.X = np.vstack([self.X, np.asarray(X, dtype="float32")])

        def search(self, Q, k):
            n_rows = self.X.shape[0]
            # argpartition requires kth < n_rows, so never ask for more than exists.
            k_eff = min(int(k), n_rows)
            if k_eff <= 0:
                n_queries = np.shape(Q)[0]
                return (np.zeros((n_queries, 0), dtype="float32"),
                        np.zeros((n_queries, 0), dtype="int64"))
            S = Q @ self.X.T
            # Unordered top-k first (linear time), then sort only those k columns.
            I = np.argpartition(-S, k_eff - 1, axis=1)[:, :k_eff]
            S_topk = np.take_along_axis(S, I, axis=1)
            order = np.argsort(-S_topk, axis=1)
            I_sorted = np.take_along_axis(I, order, axis=1)
            D_sorted = np.take_along_axis(S_topk, order, axis=1)
            return D_sorted.astype("float32"), I_sorted.astype("int64")

    return NumpyIPIndex(emb)

# Rebuild the in-memory index from the saved embedding matrix.
IDX_combined = build_ip_index(E_combined)

# Load ID map and metadata
rowid_to_pid = json.loads((VSTORE/"rowid_to_pid.json").read_text())
meta = pd.read_parquet(VSTORE/"products.parquet")

# Assuming cross_encoder_model and queries are loaded from previous cells
# (search() also needs norm_text and embed_texts_openai to be defined already).

# Test the updated search function
sample_query = queries[0]["query"]
top_results = search(sample_query, k_return=10, k_rerank=50)
display(pd.DataFrame(top_results))

Unnamed: 0,product_id,name,brand,category,score
0,1728263,H-E-B Organics Tomato Basil Soup,H-E-B,Pantry -> Soups & chili,5.999448
1,1728261,H-E-B Organics Chicken Noodle Soup,H-E-B,Pantry -> Soups & chili,5.985342
2,1728279,H-E-B Organics Southwest Style Black Bean Soup,H-E-B,Pantry -> Soups & chili,5.878461
3,1728270,H-E-B Organics Lentil Soup,H-E-B,Pantry -> Soups & chili,5.432003
4,5851502,H-E-B Hearty Wrangler Soup,H-E-B,Pantry -> Soups & chili,3.576779
5,497561,Hill Country Fare Chunky Chicken Corn Chowder ...,Hill Country Fare,Pantry -> Soups & chili,2.34043
6,369154,Hill Country Fare Chunky Classic Chicken Noodl...,Hill Country Fare,Pantry -> Soups & chili,1.799685
7,1287774,Hill Country Fare Hearty Chicken Noodle Soup,Hill Country Fare,Pantry -> Soups & chili,1.66287
8,10438511,H-E-B Italian-Style Wedding Soup,H-E-B,Pantry -> Soups & chili,1.65016
9,1417008,Hill Country Fare Hearty Chicken Broccoli Chee...,Hill Country Fare,Pantry -> Soups & chili,1.544112


In [None]:
from typing import List, Dict
import json
from pathlib import Path
import pandas as pd
from sklearn.metrics import average_precision_score, precision_score, recall_score
from collections import defaultdict
import time # Import time for logging duration

def rank_all_queries_topk_reranked(queries: List[Dict], k: int = 30, k_rerank: int = 100) -> List[Dict]:
    """
    Run the re-ranked ``search()`` for every query and flatten the results.

    Parameters
    ----------
    queries : list of dict
        Each with ``"query_id"`` and ``"query"``.
    k : int
        Number of ranked products to keep per query.
    k_rerank : int
        Size of the candidate pool retrieved before re-ranking. It is raised
        to ``k`` when smaller, so a request for more than ``k_rerank`` results
        is not silently truncated.

    Returns
    -------
    list of dict
        A flat list such as
        ``[{"query_id": "s181", "rank": 1, "product_id": "6033004"}, ...]``,
        ranks starting at 1 within each query. Results are ordered by score
        (highest first) with ``product_id`` as a deterministic tie-break.
    """
    output = []
    total = len(queries)
    pool_size = max(k, k_rerank)
    print(f"Starting re-ranking for {total} queries...")
    start_time = time.time()
    for i, q in enumerate(queries):
        qid, qtext = q["query_id"], q["query"]
        # Progress line for the first, every 50th, and the last query.
        if (i + 1) % 50 == 0 or i == 0 or i == total - 1:
            print(f"  Processing query {i+1}/{total}: {qtext[:50]}...")

        # Retrieve pool_size candidates, re-rank, keep the best k.
        results = search(qtext, k_return=k, k_rerank=pool_size)

        # Tie-break by product_id for deterministic order (already sorted by score desc in search)
        results = sorted(results, key=lambda r: (-r["score"], str(r["product_id"])))

        # Flatten with ranks
        output.extend(
            {"query_id": qid, "rank": rank, "product_id": str(r["product_id"])}
            for rank, r in enumerate(results, start=1)
        )
    end_time = time.time()
    print(f"Finished re-ranking {total} queries in {end_time - start_time:.2f} seconds.")
    return output

# Use the training queries loaded previously
# queries = load_queries_any(qpath) # assuming qpath is defined and contains the path to the training queries

# Rank every loaded query; this calls the embedding API and the cross-encoder once per query.
flat_topk_reranked = rank_all_queries_topk_reranked(queries, k=30) # Explicitly setting k=30
print(f"Generated {len(flat_topk_reranked)} re-ranked results.")
print("First 5 re-ranked results:", flat_topk_reranked[:5])


# Generate submission file for re-ranked results
OUT_FILE_RERANKED = "submission_reranked.json"
print(f"Writing re-ranked submission to {OUT_FILE_RERANKED}...")
with open(OUT_FILE_RERANKED, "w", encoding="utf-8") as f:
    json.dump(flat_topk_reranked, f, ensure_ascii=False, indent=2)

print("Wrote re-ranked submission:", OUT_FILE_RERANKED)

# Load the original submission file (if it exists) for comparison
# Assuming the original submission file is named "submission.json"
# NOTE(review): original_results is only loaded here; no comparison is performed in this cell.
original_submission_file = "submission.json"
original_results = []
if Path(original_submission_file).exists():
    print(f"Loading original submission from {original_submission_file}...")
    with open(original_submission_file, "r", encoding="utf-8") as f:
        original_results = json.load(f)
    print("Loaded original submission:", original_submission_file)
else:
    print("Original submission file not found. Skipping comparison.")

Starting re-ranking for 288 queries...
  Processing query 1/288: hearty organic soups for dinner...
  Processing query 50/288: natural boneless pork...
  Processing query 100/288: wild caught tuna in olive oil...
  Processing query 150/288: minimally processed pork chunks...
  Processing query 200/288: H-E-B whole roasting chicken...
  Processing query 250/288: quick dinner chicken kabobs...
  Processing query 288/288: quick chicken fajita meal...
Finished re-ranking 288 queries in 399.13 seconds.
Generated 8640 re-ranked results.
First 5 re-ranked results: [{'query_id': 's334', 'rank': 1, 'product_id': '1728263'}, {'query_id': 's334', 'rank': 2, 'product_id': '1728261'}, {'query_id': 's334', 'rank': 3, 'product_id': '1728279'}, {'query_id': 's334', 'rank': 4, 'product_id': '1728270'}, {'query_id': 's334', 'rank': 5, 'product_id': '5851502'}]
Writing re-ranked submission to submission_reranked.json...
Wrote re-ranked submission: submission_reranked.json
Original submission file not fou