In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from PIL import Image  # just for loading images

# Input locations, all anchored one directory above the current working directory
PROJECT_ROOT = Path("..").resolve()
IMAGES_DIR = PROJECT_ROOT / "images"  # folder holding the product JPGs
CSV_PATH = PROJECT_ROOT / "data" / "san_leandro_products.csv"
EMBS_PATH = PROJECT_ROOT / "data" / "image_embs.npy"

print(f"CSV: {CSV_PATH}\nEmbeddings: {EMBS_PATH}\nImages dir: {IMAGES_DIR}")

# Every product row, exactly as stored in the CSV
raw_df = pd.read_csv(CSV_PATH)

# The embedding pipeline keys off this column, so fail early if it is absent
IMAGE_COL = "image_filename"
if IMAGE_COL not in raw_df:
    raise ValueError(f"{IMAGE_COL!r} column not found in CSV")

# Working view: rows with a non-null, non-empty image filename, renumbered from 0
df = raw_df.loc[
    raw_df[IMAGE_COL].notna() & raw_df[IMAGE_COL].ne("")
].reset_index(drop=True)

print(f"Full CSV rows: {len(raw_df)}")
print(f"Filtered rows with images (used for embeddings): {len(df)}")
df.head()


In [None]:
from sklearn.neighbors import NearestNeighbors

# If you don't already have these installed in your venv, run:
#   pip install "torch" "transformers"
import torch
from transformers import CLIPModel, CLIPProcessor

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)


def build_clip_image_embs(df, images_dir, image_col="image_filename", batch_size=32):
    """
    Build CLIP image embeddings for the rows of df whose image file exists.

    Parameters
    ----------
    df : DataFrame holding one product per row.
    images_dir : directory (str or Path) containing the image files.
    image_col : column of df holding the image file name, relative to images_dir.
    batch_size : number of rows handed to the model per forward pass (>= 1).

    Returns
    -------
    image_embs : numpy array with one row per embedded image.
    df_aligned : the rows of df that were embedded, in the same order as
        image_embs and with a fresh 0..N-1 index.

    Raises
    ------
    ValueError : if batch_size is smaller than 1.
    RuntimeError : if not a single image file could be found.

    Rows whose image file is missing are reported with a WARNING line and left
    out of both return values.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    images_dir = Path(images_dir)

    model_name = "openai/clip-vit-base-patch32"
    print(f"Loading CLIP model: {model_name}")
    processor = CLIPProcessor.from_pretrained(model_name)
    model = CLIPModel.from_pretrained(model_name).to(device)
    model.eval()

    # Rows are tracked by POSITION, not by index label, so the returned frame
    # lines up with the embeddings even if df has a non-unique or non-default
    # index.
    filenames = [str(value) for value in df[image_col].tolist()]
    n = len(filenames)
    print(f"Computing embeddings for {n} products...")

    all_embs = []
    keep_positions = []

    for start in range(0, n, batch_size):
        stop = min(start + batch_size, n)
        images = []
        batch_positions = []

        for pos in range(start, stop):
            img_path = images_dir / filenames[pos]
            if not img_path.is_file():
                print(
                    f"WARNING: image file missing for index {df.index[pos]}: {img_path}"
                )
                continue

            # convert() returns an image that no longer depends on the open
            # file, so the handle can be released right away.
            with Image.open(img_path) as raw_img:
                images.append(raw_img.convert("RGB"))
            batch_positions.append(pos)

        if not images:
            continue

        inputs = processor(images=images, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            emb = model.get_image_features(**inputs)

        all_embs.append(emb.cpu().numpy())
        keep_positions.extend(batch_positions)

        print(f"  processed {stop}/{n} rows", end="\r")

    if not all_embs:
        raise RuntimeError(
            "No embeddings were built. Check that your images directory is correct."
        )

    image_embs = np.vstack(all_embs)
    df_aligned = df.iloc[keep_positions].reset_index(drop=True)

    print()
    print("Embeddings built. image_embs shape:", image_embs.shape)
    print("Aligned df length:", len(df_aligned))

    return image_embs, df_aligned


# --- Load or rebuild embeddings so that they ALWAYS match df ---
#
# A row count alone cannot prove alignment: rows without an image file are
# dropped while embedding, so the saved array can be shorter than df (which
# would force a rebuild on every run), and a re-ordered CSV can keep the same
# length. The cache is therefore stored with a sidecar listing the image
# filename behind each embedding row, and is reused only when that list equals
# the filenames of the current df rows whose image exists on disk.

EMBS_FILES_PATH = EMBS_PATH.with_name(EMBS_PATH.stem + "_files.npy")


def _load_cached_embeddings(frame):
    """
    Return (embeddings, aligned_frame) from the on-disk cache, or None when the
    cache is missing, has no filename sidecar, or does not describe `frame`.

    aligned_frame is `frame` restricted to rows whose image file exists under
    IMAGES_DIR, re-indexed 0..N-1 so row i corresponds to embeddings[i].
    """
    if not EMBS_PATH.exists():
        print("No existing embeddings file found at", EMBS_PATH)
        return None

    cached_embs = np.load(EMBS_PATH)
    print("Loaded existing embeddings:", cached_embs.shape)

    if not EMBS_FILES_PATH.exists():
        print("No filename sidecar found at", EMBS_FILES_PATH)
        return None
    cached_files = np.load(EMBS_FILES_PATH).astype(str).tolist()

    # Same keep/drop rule the builder applies: a row survives iff its file exists.
    has_file = (
        frame[IMAGE_COL]
        .map(lambda fname: (IMAGES_DIR / str(fname)).is_file())
        .astype(bool)
    )
    expected_files = [str(f) for f in frame.loc[has_file, IMAGE_COL].tolist()]

    if (
        not expected_files
        or cached_embs.shape[0] != len(cached_files)
        or cached_files != expected_files
    ):
        return None

    return cached_embs, frame.loc[has_file].reset_index(drop=True)


cached = _load_cached_embeddings(df)

if cached is None:
    print("Embeddings are missing or out of sync with CSV -> rebuilding from images.")
    image_embs, df = build_clip_image_embs(df, IMAGES_DIR, IMAGE_COL, batch_size=32)
    np.save(EMBS_PATH, image_embs)
    np.save(
        EMBS_FILES_PATH,
        np.array([str(f) for f in df[IMAGE_COL].tolist()], dtype=str),
    )
    print("Saved fresh embeddings to", EMBS_PATH)
else:
    image_embs, df = cached

print("Final check -> image_embs.shape[0]:", image_embs.shape[0], " len(df):", len(df))

if image_embs.shape[0] != len(df):
    raise ValueError(
        "After rebuild, embeddings still do not align with df. "
        "This would only happen if some rows were dropped while building embeddings "
        "but df was not updated. Double-check your CSV and images."
    )

# Cap the default neighbour count so a tiny catalogue cannot make default
# kneighbors() calls ask for more neighbours than there are samples.
nn = NearestNeighbors(n_neighbors=min(20, image_embs.shape[0]), metric="cosine")
nn.fit(image_embs)

print("kNN index built over", image_embs.shape[0], "products")


In [None]:
# --- material bucketing + helpers ---

def _compute_material_bucket(row) -> str:
    """
    Coarse material/type bucket so we don't mix wood planks with vinyl, tile,
    or installation trim (stair nose, etc.). Uses category_slug + name + URL.

    `row` is anything with a dict-style `.get` (a DataFrame row or a plain
    dict). Missing fields are treated as empty text.

    Returns one of: "install", "trim", "wood", "laminate", "vinyl", "tile",
    "stone", "decoratives", "fixtures", "other".

    Slugs of the form "<appearance>-look-<material>" (e.g. "wood-look-tile")
    are classified by the part after "-look-", so a tile that merely looks
    like wood lands in "tile" rather than "wood".
    """
    cat = str(row.get("category_slug", "")).lower().strip()
    base = cat.strip("/").split("/")[-1]  # e.g. "engineered-hardwood-wood"
    name = str(row.get("name", "")).lower()
    url = str(row.get("product_url", "")).lower()

    # --- installation / trim first (never treated as surface) ---
    if "installation-materials" in base:
        return "install"
    if any(k in name for k in ("stair nose", "stairnose", "stair-nose")):
        return "trim"
    if "moldings-wood" in url or "molding" in name:
        return "trim"

    # "<x>-look-<material>": the leading word describes appearance only, so
    # match the surface keywords against what follows "-look-".
    if "-look-" in base:
        base = base.split("-look-", 1)[1]

    # --- main surface families (from category_slug), first match wins ---
    surface_keywords = (
        ("wood", ("wood",)),
        ("laminate", ("laminate",)),
        ("vinyl", ("vinyl", "nucore")),
        ("tile", ("tile",)),
        ("stone", ("stone",)),
        ("decoratives", ("decoratives",)),
        ("fixtures", ("fixtures", "bathroom-accessories")),
    )
    for bucket, keywords in surface_keywords:
        if any(keyword in base for keyword in keywords):
            return bucket

    return "other"


# (Re)compute the bucket for every row of the DataFrame used by embeddings.
# This runs on every execution of the cell: a "column already exists" guard
# would keep stale buckets after _compute_material_bucket is edited and the
# cell is re-run.
df["material_bucket"] = [_compute_material_bucket(row) for _, row in df.iterrows()]


def get_index_by_sku(query_sku: str) -> int:
    """
    Look up a SKU in df and return the index of its first matching row.

    The comparison is done on whitespace-stripped string forms of both the
    query and the "sku" column. Raises ValueError when the SKU is absent from
    df; the message says whether the SKU is at least present in raw_df.
    """
    wanted = str(query_sku).strip()

    def _sku_hits(frame):
        # Boolean mask of the rows of `frame` whose normalised SKU equals the query
        return frame["sku"].astype(str).str.strip() == wanted

    hits = df.index[_sku_hits(df)].tolist()
    if hits:
        return hits[0]

    # Not embedded: tell "known product without an embedding" apart from "unknown SKU"
    if _sku_hits(raw_df).any():
        raise ValueError(
            f"SKU {wanted!r} exists in san_leandro_products.csv but "
            "does not have an embedding (likely missing image_filename "
            "in the filtered set)."
        )
    raise ValueError(f"SKU {wanted!r} not found in san_leandro_products.csv.")


def find_index_by_name_substring(substr: str, occurrence: int = 0) -> int:
    """
    Find the row position in df of a product whose name contains the given
    substring (case-insensitive). If multiple matches exist, `occurrence`
    chooses which one (0 = first).

    The substring is matched literally: characters such as ".", "(" or "+"
    have no special meaning. Rows with a missing name never match.

    Returns a plain int row position, usable with df.iloc and image_embs.
    Raises ValueError when nothing matches or when `occurrence` is beyond the
    number of matches.
    """
    # regex=False: the argument is documented as a substring, not a pattern.
    mask = df["name"].str.contains(substr, case=False, na=False, regex=False)
    positions = np.flatnonzero(mask.to_numpy(dtype=bool))
    if positions.size == 0:
        raise ValueError(f"No products with name containing {substr!r}")
    if occurrence >= positions.size:
        raise ValueError(
            f"Only {positions.size} matches for {substr!r}, but occurrence={occurrence}"
        )
    return int(positions[occurrence])


def search_similar_by_index(
    query_idx: int,
    top_k: int = 10,
    exclude_same: bool = True,
    extra_neighbors: int = 30,
):
    """
    Given a row position in df, return up to top_k most similar products by
    image embedding, RESTRICTED to the query's material bucket.

    Parameters
    ----------
    query_idx : row position of the query product in df / image_embs.
    top_k : maximum number of results; values <= 0 give an empty list.
    exclude_same : when True, the query row itself is left out.
    extra_neighbors : how many neighbours beyond top_k to request on the first
        pass, to absorb candidates discarded by the bucket filter.

    Returns
    -------
    A list of dicts, nearest first, with keys "index", "rank", "sku", "name",
    "category_slug", "material_bucket" and "distance".

    The neighbour request never exceeds the number of indexed products, and it
    is widened (doubled) until top_k same-bucket results are found or every
    indexed product has been considered.
    """
    if top_k <= 0:
        return []

    query_vec = image_embs[query_idx].reshape(1, -1)
    n_indexed = image_embs.shape[0]

    # Neighbour ids returned by the index are row positions, so every lookup
    # below is positional.
    buckets = df["material_bucket"].to_numpy()
    query_bucket = buckets[query_idx]

    n_query = max(1, min(top_k + max(extra_neighbors, 0), n_indexed))

    while True:
        distances, indices = nn.kneighbors(query_vec, n_neighbors=n_query)

        results = []
        for dist, idx in zip(distances[0].tolist(), indices[0].tolist()):
            if exclude_same and idx == query_idx:
                continue
            if buckets[idx] != query_bucket:
                # different material/type -> skip
                continue

            row = df.iloc[idx]
            results.append(
                {
                    "index": int(idx),
                    "rank": len(results) + 1,
                    "sku": row["sku"],
                    "name": row["name"],
                    "category_slug": row.get("category_slug"),
                    "material_bucket": buckets[idx],
                    "distance": float(dist),
                }
            )
            if len(results) >= top_k:
                break

        # Done once enough results were found or the whole index was searched.
        if len(results) >= top_k or n_query >= n_indexed:
            return results

        n_query = min(n_query * 2, n_indexed)


def show_product_image(row, title_prefix: str = ""):
    """
    Display the product image for a given row of df.

    `row` must offer dict-style access to "image_filename", "sku" and "name".
    The image is read from IMAGES_DIR and drawn in a small axis-free figure
    titled "<title_prefix>SKU <sku> – <name>".

    Prints a short notice and returns without drawing when the row has no
    usable image filename or the file is not present. Always returns None.
    """
    img_name = row.get("image_filename")
    if not isinstance(img_name, str) or not img_name:
        print("No image_filename for this row.")
        return

    img_path = IMAGES_DIR / img_name
    if not img_path.is_file():
        print("Image file not found:", img_path)
        return

    fig, ax = plt.subplots(figsize=(3, 3))
    # imshow copies the pixel data, so the file can be closed as soon as it returns.
    with Image.open(img_path) as img:
        ax.imshow(img)
    ax.axis("off")
    # \u2013 is an en dash; written as an escape so it survives re-encoding of the file.
    ax.set_title(f"{title_prefix}SKU {row['sku']} \u2013 {row['name']}", fontsize=8)
    fig.tight_layout()
    plt.show()
    # Release the figure so repeated calls (one per search result) do not pile up.
    plt.close(fig)


def show_results_with_images(query_idx: int, top_k: int = 10):
    """
    Print and draw the query product, then each of its top_k similar products.

    For the query and for every result returned by search_similar_by_index, a
    short text summary is printed and the product image is rendered through
    show_product_image.
    """
    query_row = df.iloc[query_idx]
    query_bucket = df.loc[query_idx, "material_bucket"]

    # --- the query itself ---
    print("QUERY PRODUCT")
    print(f"SKU: {query_row['sku']}")
    print(f"Name: {query_row['name']}")
    print(f"Category: {query_row.get('category_slug')}")
    print(f"Material bucket: {query_bucket}")
    show_product_image(query_row, title_prefix="QUERY: ")

    neighbours = search_similar_by_index(query_idx, top_k=top_k)

    # --- its neighbours, nearest first ---
    print("\nSIMILAR PRODUCTS")
    print("--------------------------------------------")
    for hit in neighbours:
        hit_row = df.iloc[hit["index"]]
        rank = hit["rank"]
        summary = (
            f"Rank {rank} | SKU {hit['sku']} | dist={hit['distance']:.4f} "
            f"| bucket={hit['material_bucket']}"
        )
        print(summary)
        print(f"  Name: {hit['name']}")
        print(f"  Category: {hit['category_slug']}")
        show_product_image(hit_row, title_prefix=f"RANK {rank}: ")


In [None]:
# Text-only demo: look a product up by SKU and list its nearest neighbours.
query_sku = "101156321"  # any SKU present in df["sku"]

query_idx = get_index_by_sku(query_sku)
query_row = df.iloc[query_idx]

print("QUERY PRODUCT")
print(f"SKU: {query_row['sku']}")
print(f"Name: {query_row['name']}")
print(f"Category: {query_row.get('category_slug')}")
print()

results = search_similar_by_index(query_idx, top_k=5)

print("SIMILAR PRODUCTS\n----------------")
for hit in results:
    # One three-line entry per neighbour, followed by a blank separator line
    print(f"Rank {hit['rank']} | SKU {hit['sku']} | dist={hit['distance']:.4f}")
    print(f"  Name: {hit['name']}")
    print(f"  Category: {hit['category_slug']}")
    print()


In [None]:
# Image demo: same SKU lookup, rendered with product pictures for the top 10.
query_sku = "101156321"  # any SKU present in df["sku"]
query_idx = get_index_by_sku(query_sku)

show_results_with_images(query_idx=query_idx, top_k=10)