In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

# Anchor every data path at the project root so loading does not depend on the
# kernel's current working directory.
# NOTE(review): Path("..") assumes the notebook runs from a direct
# sub-directory of the project root — confirm for your layout.
PROJECT_ROOT = Path("..").resolve()
CSV_PATH = PROJECT_ROOT / "data" / "san_leandro_products.csv"
EMBS_PATH = PROJECT_ROOT / "data" / "image_embs.npy"
IMAGES_DIR = PROJECT_ROOT / "images"

# The two paths above used to be re-assigned to cwd-relative "data/..." paths
# right here, which shadowed the anchored versions and raised
# FileNotFoundError whenever the kernel was not started in the project root.

df = pd.read_csv(CSV_PATH)

# Keep only rows that have images (match your embedding build logic)
df = df[df["image_filename"].notna() & (df["image_filename"].astype(str) != "")].reset_index(drop=True)

image_embs = np.load(EMBS_PATH)  # defines image_embs
print("df rows:", len(df))
print("image_embs shape:", image_embs.shape)

# Later cells use neighbour indices computed on image_embs to index arrays
# built from df, so row i of both must describe the same product. Fail loudly
# here instead of returning results for the wrong products further down.
if len(df) != image_embs.shape[0]:
    raise ValueError(
        f"Row mismatch: {len(df)} product rows with images vs "
        f"{image_embs.shape[0]} embeddings; rebuild the embeddings or fix the filter."
    )


FileNotFoundError: [Errno 2] No such file or directory: 'data\\san_leandro_products.csv'

In [None]:
from time import perf_counter
import numpy as np
from sklearn.neighbors import NearestNeighbors

# Cast the embeddings to float32 once, up front, before they are indexed.
image_embs = np.asarray(image_embs, dtype=np.float32)

# Derive the material bucket column only when the frame does not carry it yet.
# This relies on compute_material_bucket having been defined in an earlier cell.
if "material_bucket" not in df.columns:
    df["material_bucket"] = df.apply(compute_material_bucket, axis=1)

# Brute-force cosine kNN index over all embeddings, using every available core.
nn = NearestNeighbors(n_neighbors=20, metric="cosine", algorithm="brute", n_jobs=-1).fit(image_embs)

# Column caches as plain numpy arrays, built a single time so the search
# functions never touch pandas on the hot path.
sku_arr = df["sku"].astype(str).str.strip().to_numpy()
name_arr = df["name"].astype(str).to_numpy()
if "category_slug" in df.columns:
    cat_arr = df["category_slug"].astype(str).to_numpy()
else:
    # No category column: fall back to one empty string per row.
    cat_arr = np.array([""] * len(df))
bucket_arr = df["material_bucket"].astype(str).to_numpy()

# SKUs of the unfiltered frame, used only to produce a more helpful error
# message. raw_df is optional; when it was never defined the set stays None.
try:
    raw_sku_set = set(raw_df["sku"].astype(str).str.strip().to_list())
except NameError:
    raw_sku_set = None

# SKU -> row position map for constant-time lookups. setdefault keeps the
# first position seen, so duplicated SKUs resolve to their earliest row.
sku_to_index = {}
for position, sku in enumerate(sku_arr):
    sku_to_index.setdefault(sku, position)

def get_index_by_sku_fast(query_sku: str) -> int:
    """Resolve a SKU to its row position using the prebuilt ``sku_to_index`` map.

    Args:
        query_sku: SKU to look up. It is converted to ``str`` and stripped of
            surrounding whitespace before the lookup.

    Returns:
        The row position stored for the SKU in ``sku_to_index``.

    Raises:
        ValueError: If the SKU is not in ``sku_to_index``. When ``raw_sku_set``
            is available and contains the SKU, the message explains that the
            product exists but has no embedding.
    """
    key = str(query_sku).strip()

    position = sku_to_index.get(key)
    if position is not None:
        return int(position)

    # Distinguish "known product without an embedding" from "unknown SKU".
    known_without_embedding = raw_sku_set is not None and key in raw_sku_set
    if not known_without_embedding:
        raise ValueError(f"SKU {key!r} not found.")

    raise ValueError(
        f"SKU {key!r} exists in san_leandro_products.csv but "
        "does not have an embedding (likely missing image_filename)."
    )

def search_similar_by_index_fast(query_idx: int, top_k: int = 10, exclude_same: bool = True, overfetch: int = 30):
    """Return up to ``top_k`` nearest neighbours that share the query's material bucket.

    The fitted ``nn`` index is queried for ``top_k + overfetch`` candidates,
    which are then filtered down to rows whose ``material_bucket`` equals the
    query row's bucket.

    Args:
        query_idx: Row position of the query item in ``image_embs`` and the
            cached column arrays.
        top_k: Maximum number of results to return.
        exclude_same: When True, the query row itself is dropped from the results.
        overfetch: Extra candidates requested on top of ``top_k`` so that
            enough rows survive the bucket filter.

    Returns:
        A list of dicts with keys ``index``, ``rank`` (1-based, assigned after
        filtering), ``sku``, ``name``, ``category_slug``, ``material_bucket``
        and ``distance`` (cosine distance reported by ``nn``). The list can be
        shorter than ``top_k`` when too few candidates share the bucket.
    """
    query_vec = image_embs[query_idx:query_idx + 1]

    # kneighbors raises ValueError when asked for more neighbours than there
    # are fitted samples, so cap the request for small catalogues / large top_k.
    n_candidates = min(top_k + overfetch, nn.n_samples_fit_)
    distances, indices = nn.kneighbors(query_vec, n_neighbors=n_candidates)

    idxs = indices[0]
    dists = distances[0]

    # Keep only candidates in the same material bucket as the query item.
    query_bucket = bucket_arr[query_idx]
    mask = (bucket_arr[idxs] == query_bucket)
    if exclude_same:
        mask &= (idxs != query_idx)

    idxs = idxs[mask][:top_k]
    dists = dists[mask][:top_k]

    return [
        {
            "index": int(idx),
            "rank": int(rank),
            "sku": sku_arr[idx],
            "name": name_arr[idx],
            "category_slug": cat_arr[idx],
            "material_bucket": bucket_arr[idx],
            "distance": float(dist),
        }
        for rank, (idx, dist) in enumerate(zip(idxs, dists), start=1)
    ]

# Report that the lookup caches are ready and how many rows they cover.
# (The leading character was mis-encoded as mojibake; restored to the check mark.)
print("✅ Caches built:", len(df), "rows")


In [None]:
def bench_ms(fn, n=500, warmup=20):
    """Return the mean wall-clock time of one ``fn()`` call, in milliseconds.

    Args:
        fn: Zero-argument callable to time.
        n: Number of timed calls; must be positive.
        warmup: Number of untimed calls made before timing starts, so that
            one-off start-up costs are not counted.

    Returns:
        Elapsed time across the ``n`` timed calls divided by ``n``, in ms.

    Raises:
        ValueError: If ``n`` is not positive (the average would be undefined).
    """
    if n <= 0:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    # warm-up (not timed)
    for _ in range(warmup):
        fn()

    t0 = perf_counter()
    for _ in range(n):
        fn()
    return (perf_counter() - t0) * 1000 / n

# Query with the first SKU in the frame, which is guaranteed to be present.
query_sku = str(df["sku"].iloc[0])


def _run_baseline_search():
    """Baseline end-to-end path: SKU lookup followed by the similarity search."""
    return search_similar_by_index(get_index_by_sku(query_sku), top_k=10)


def _run_fast_search():
    """Cached end-to-end path: dict lookup followed by the array-based search."""
    return search_similar_by_index_fast(get_index_by_sku_fast(query_sku), top_k=10)


# Time the SKU lookup on its own.
ms_lookup_baseline = bench_ms(lambda: get_index_by_sku(query_sku), n=200)
ms_lookup_fast = bench_ms(lambda: get_index_by_sku_fast(query_sku), n=200)

# Time the full path (lookup + knn + filter + result build).
ms_search_baseline = bench_ms(_run_baseline_search, n=300)
ms_search_fast = bench_ms(_run_fast_search, n=300)

# Summary table; the empty string produces the blank separator line.
print(
    f"SKU lookup baseline: {ms_lookup_baseline:.4f} ms/call",
    f"SKU lookup fast    : {ms_lookup_fast:.4f} ms/call",
    "",
    f"Search baseline    : {ms_search_baseline:.4f} ms/call",
    f"Search fast        : {ms_search_fast:.4f} ms/call",
    sep="\n",
)

# Sanity check: both implementations should return the same SKUs in the same
# order (order should match in most cases).
base = _run_baseline_search()
fast = _run_fast_search()
base_skus = [r["sku"] for r in base]
fast_skus = [r["sku"] for r in fast]
print()
print("Top-10 SKU match:", base_skus == fast_skus)
