In [16]:
from pathlib import Path

import numpy as np
import pandas as pd
from PIL import Image

import torch
from transformers import CLIPModel, CLIPProcessor

# Assume you start Jupyter from the repo root.
# If you start it from notebooks/, change "." to "..".
PROJECT_ROOT = Path("..").resolve()
DATA_DIR = PROJECT_ROOT / "data"
IMAGES_DIR = DATA_DIR / "images"

CSV_PATH = DATA_DIR / "san_leandro_products.csv"
EMB_PATH = DATA_DIR / "image_embs.npy"
FILTERED_CSV_PATH = DATA_DIR / "san_leandro_products_with_embs.csv"

print("Project root:", PROJECT_ROOT)
print("CSV path:", CSV_PATH)
print("Images dir:", IMAGES_DIR)


Project root: C:\Users\j0sep\Mat-Vis-Net
CSV path: C:\Users\j0sep\Mat-Vis-Net\data\san_leandro_products.csv
Images dir: C:\Users\j0sep\Mat-Vis-Net\data\images


In [17]:
df = pd.read_csv(CSV_PATH)

# Adjust the column name if yours is different
IMAGE_COL = "image_filename"

if IMAGE_COL not in df.columns:
    raise ValueError(f"{IMAGE_COL!r} column not found in CSV. Check your column names.")

# Keep only rows that actually have an image file
df = df[df[IMAGE_COL].notna() & (df[IMAGE_COL] != "")]
df = df.reset_index(drop=True)

print("Rows with images:", len(df))
df.head()


Rows with images: 2985


Unnamed: 0,sku,name,category_slug,product_url,image_url,image_filename
0,100997154,Captains Runs Waterproof Laminate Plank,/10mm-and-above-laminate,https://www.flooranddecor.com/aquaguard-perfor...,https://i8.amplience.net/i/flooranddecor/10099...,100997154.jpg
1,100992742,East Bay Breeze Waterproof Laminate Plank,/10mm-and-above-laminate,https://www.flooranddecor.com/aquaguard-perfor...,https://i8.amplience.net/i/flooranddecor/10099...,100992742.jpg
2,100997105,Cocoa Waterproof Laminate Plank,/10mm-and-above-laminate,https://www.flooranddecor.com/aquaguard-perfor...,https://i8.amplience.net/i/flooranddecor/10099...,100997105.jpg
3,101006005,Mountain Valley Waterproof Laminate Plank,/10mm-and-above-laminate,https://www.flooranddecor.com/aquaguard-perfor...,https://i8.amplience.net/i/flooranddecor/10100...,101006005.jpg
4,101235778,Easton Hickory Waterproof Laminate,/10mm-and-above-laminate,https://www.flooranddecor.com/hydroshield-plus...,https://i8.amplience.net/i/flooranddecor/10123...,101235778.jpg


In [18]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

model_name = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(model_name).to(device)
processor = CLIPProcessor.from_pretrained(model_name)
model.eval()


Using device: cpu


CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e-05,

In [19]:
def embed_image(path: Path) -> np.ndarray:
    image = Image.open(path).convert("RGB")
    inputs = processor(images=image, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model.get_image_features(**inputs)

    # Normalize and flatten to 1D numpy
    emb = outputs[0]
    emb = emb / emb.norm(p=2)
    return emb.cpu().numpy()


In [20]:
emb_list = []
kept_rows = []

for idx, row in df.iterrows():
    img_name = row[IMAGE_COL]
    img_path = IMAGES_DIR / img_name

    if not img_path.exists():
        # Skip rows with missing files
        continue

    emb = embed_image(img_path)
    emb_list.append(emb)
    kept_rows.append(row)

len(emb_list)


0

In [None]:
from pathlib import Path
import pandas as pd
from PIL import Image
import numpy as np
import torch
from transformers import CLIPModel, CLIPProcessor
from tqdm.auto import tqdm

# --- paths ---
PROJECT_ROOT = Path("..").resolve()
CSV_PATH = PROJECT_ROOT / "data" / "san_leandro_products.csv"
IMAGES_DIR = PROJECT_ROOT / "images"

# --- load metadata ---
df = pd.read_csv(CSV_PATH)
df = df[df["image_filename"].notna() & (df["image_filename"] != "")].reset_index(drop=True)
print("rows with images:", len(df))

# ---------- STEP 1: material grouping ----------
MATERIAL_SOURCE_COLS = ["body", "material", "category_slug"]

def infer_material_source_col(df):
    for col in MATERIAL_SOURCE_COLS:
        if col in df.columns:
            return col
    return None

MATERIAL_COL = infer_material_source_col(df)
print("Using material source column:", MATERIAL_COL)

def normalize_material(text: str) -> str:
    t = str(text).lower()

    # --- Tile families ---
    if "porcelain" in t:
        if "wood" in t:
            return "wood_look_porcelain"
        return "porcelain"
    if "ceramic" in t:
        return "ceramic"

    # --- Wood / wood-like families ---
    if "laminate" in t:
        return "laminate"
    if "vinyl" in t or "lvp" in t or "lvt" in t:
        return "vinyl"
    if "engineered" in t:
        return "engineered_wood"
    if "solid" in t and ("hardwood" in t or "wood" in t):
        return "solid_wood"
    if "hardwood" in t or "wood" in t:
        return "wood"

    return "other"

if MATERIAL_COL is not None:
    df["material_group"] = df[MATERIAL_COL].apply(normalize_material)
else:
    df["material_group"] = "other"

print(df["material_group"].value_counts())

# ---------- CLIP model ----------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device:", device)

model_name = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(model_name).to(device)

# first time this line runs, it may download the processor files
processor = CLIPProcessor.from_pretrained(model_name)

model.eval()

# --- helpers ---
def load_image(path: Path):
    img = Image.open(path).convert("RGB")
    return img

def embed_images(image_paths, batch_size=16):
    all_embs = []

    for i in tqdm(range(0, len(image_paths), batch_size)):
        batch_paths = image_paths[i:i+batch_size]
        images = [load_image(p) for p in batch_paths]

        inputs = processor(images=images, return_tensors="pt", padding=True).to(device)

        with torch.no_grad():
            outputs = model.get_image_features(**inputs)  # [B, D]

        # normalize
        embs = outputs / outputs.norm(p=2, dim=-1, keepdim=True)
        all_embs.append(embs.cpu().numpy())

    return np.vstack(all_embs)

# --- build list of image paths ---
image_paths = [IMAGES_DIR / fname for fname in df["image_filename"].tolist()]
missing = [p for p in image_paths if not p.exists()]
print("missing image files:", len(missing))

# --- actually compute embeddings ---
image_embs = embed_images(image_paths, batch_size=16)
print("image_embs shape:", image_embs.shape)


rows with images: 2985
Using material source column: category_slug
material_group
other              2561
wood                186
vinyl                90
laminate             70
engineered_wood      52
porcelain            15
solid_wood            8
ceramic               3
Name: count, dtype: int64
device: cpu
missing image files: 0


  0%|          | 0/187 [00:00<?, ?it/s]

In [None]:
from sklearn.neighbors import NearestNeighbors

# NearestNeighbors with cosine distance (1 - cosine similarity)
nn = NearestNeighbors(n_neighbors=5, metric="cosine")
nn.fit(image_embs)

print("Index built over", image_embs.shape[0], "products")


In [None]:
def search_similar_by_index(query_idx, top_k=5):
    """
    Return the top_k nearest neighbors for a given product index,
    excluding the product itself, deduping by image, and enforcing
    material-aware filtering.

    - Never return the exact same SKU as the query.
    - Never show the same image twice (even if SKU differs).
    - If the query is any kind of wood/laminate/vinyl/etc, only return
      the *same* material_group (so wood won't match laminate/vinyl/
      solid/engineered and vice versa).
    - Tile vs non-tile is also separated (porcelain/ceramic vs wood stuff).
    """
    # Embedding for the query product
    query_emb = image_embs[query_idx].reshape(1, -1)
    query_row = df.iloc[query_idx]
    query_sku = query_row["sku"]
    query_group = query_row.get("material_group", "other")

    # Ask for some extra neighbors to survive filtering/dedup
    n_neighbors = min(top_k + 20, len(df))
    distances, indices = nn.kneighbors(query_emb, n_neighbors=n_neighbors)
    distances = distances[0]
    indices = indices[0]

    results = []
    seen_images = set()

    # Groups that we treat as wood-family, but we still
    # don't mix them with each other unless group matches exactly.
    wood_groups = {"wood", "engineered_wood", "solid_wood", "laminate", "vinyl"}

    # Tile-like groups
    tile_groups = {"porcelain", "ceramic", "wood_look_porcelain"}

    for dist, idx in zip(distances, indices):
        # Skip the exact same row
        if idx == query_idx:
            continue

        row = df.iloc[idx]

        # Skip same SKU as the query
        if row["sku"] == query_sku:
            continue

        img_key = row["image_filename"]

        # Skip duplicate images
        if img_key in seen_images:
            continue

        candidate_group = row.get("material_group", "other")

        # ---------- MATERIAL FILTERING ----------

        if query_group in wood_groups:
            # If query is wood-like (wood, laminate, vinyl, engineeered, solid),
            # require exact same material_group.
            #
            # This is where your rule kicks in:
            # "if it's wood, exclude laminate vs vinyl vs solid vs engineered"
            # because those are all separate groups.
            if candidate_group != query_group:
                continue
        else:
            # For non-wood queries, keep tile vs non-tile separated.
            query_is_tile = query_group in tile_groups
            candidate_is_tile = candidate_group in tile_groups

            # Don't mix tile with non-tile
            if query_is_tile != candidate_is_tile:
                continue

        # ---------- END MATERIAL FILTERING ----------

        seen_images.add(img_key)

        results.append(
            {
                "rank": len(results) + 1,
                "sku": row["sku"],
                "name": row["name"],
                "category": row["category_slug"],
                "material_group": candidate_group,
                "distance": float(dist),
                "image_path": str(IMAGES_DIR / row["image_filename"]),
            }
        )

        if len(results) >= top_k:
            break

    return results


In [None]:
from IPython.display import display

# Pick a sample index (try different numbers later)
query_idx = 0

query_row = df.iloc[query_idx]
query_img = load_image(IMAGES_DIR / query_row["image_filename"])

print("Query SKU:", query_row["sku"])
print("Name:", query_row["name"])
print("Category:", query_row["category_slug"])
display(query_img)

results = search_similar_by_index(query_idx, top_k=5)
results


In [None]:
for r in results:
    print(f"Rank {r['rank']} | SKU {r['sku']} | dist={r['distance']:.4f}")
    display(load_image(Path(r["image_path"])))


In [None]:
from sklearn.cluster import KMeans

k = 20  # example
kmeans = KMeans(n_clusters=k, random_state=0)
labels = kmeans.fit_predict(image_embs)


In [None]:
EMBS_PATH = PROJECT_ROOT / "data" / "image_embs.npy"
np.save(EMBS_PATH, image_embs)
print("Saved embeddings to:", EMBS_PATH)