In [None]:
from pathlib import Path
import numpy as np
import pandas as pd

# Every input location is derived from the project root, which is the parent
# of the current working directory.
PROJECT_ROOT = Path("..").resolve()
CSV_PATH = PROJECT_ROOT.joinpath("data", "san_leandro_products.csv")
EMBS_PATH = PROJECT_ROOT.joinpath("data", "image_embs.npy")
IMAGES_DIR = PROJECT_ROOT.joinpath("images")

# Echo the resolved paths so a wrong working directory is obvious right away.
print("CSV:", CSV_PATH)
print("Embeddings:", EMBS_PATH)
print("Images dir:", IMAGES_DIR)

# --- load product metadata ---
# Keep only products that reference an image file (non-null, non-empty name),
# then renumber the surviving rows from 0.
df = (
    pd.read_csv(CSV_PATH)
    .loc[lambda frame: frame["image_filename"].notna() & frame["image_filename"].ne("")]
    .reset_index(drop=True)
)
print("rows with images:", len(df))

# --- load precomputed CLIP embeddings ---
image_embs = np.load(EMBS_PATH)
print("image_embs shape:", image_embs.shape)

# Later cells assign per-embedding results straight onto df, so the number of
# embeddings has to equal the number of rows.
if len(df) != image_embs.shape[0]:
    raise ValueError(
        f"Row mismatch: {len(df)} rows in df vs {image_embs.shape[0]} embeddings"
    )

# ---------- material grouping (same logic as KNN notebook) ----------
# Candidate columns to derive the material from, in priority order.
MATERIAL_SOURCE_COLS = ["body", "material", "category_slug"]

def infer_material_source_col(df: pd.DataFrame):
    """Return the first name in MATERIAL_SOURCE_COLS that is a column of ``df``.

    Returns None when ``df`` has none of the candidate columns.
    """
    return next((name for name in MATERIAL_SOURCE_COLS if name in df.columns), None)

# Choose the metadata column that material groups will be derived from
# (None when df has none of the candidate columns).
MATERIAL_COL = infer_material_source_col(df)
print("Using material source column:", MATERIAL_COL)

def normalize_material(text: str) -> str:
    """Collapse a free-text material description into a coarse material family.

    ``text`` is stringified and lower-cased, then checked for keywords in a
    fixed order; the first family that matches wins. Returns one of
    "wood_look_porcelain", "porcelain", "ceramic", "laminate", "vinyl",
    "engineered_wood", "solid_wood", "wood" or "other".
    """
    lowered = str(text).lower()

    def mentions(*keywords: str) -> bool:
        # True when at least one keyword occurs anywhere in the description.
        return any(keyword in lowered for keyword in keywords)

    # --- Tile families (checked first so wood-look tile never counts as wood) ---
    if mentions("porcelain"):
        return "wood_look_porcelain" if mentions("wood") else "porcelain"
    if mentions("ceramic"):
        return "ceramic"

    # --- Wood / wood-like families (deliberately kept apart) ---
    if mentions("laminate"):
        return "laminate"
    if mentions("vinyl", "lvp", "lvt"):
        return "vinyl"
    if mentions("engineered"):
        return "engineered_wood"

    # Anything left only counts as wood if it actually says so.
    if not mentions("hardwood", "wood"):
        return "other"
    return "solid_wood" if mentions("solid") else "wood"

# Label every product with its material family; when no source column was
# found, every row falls back to "other".
df["material_group"] = (
    "other" if MATERIAL_COL is None else df[MATERIAL_COL].apply(normalize_material)
)

# Number of products in each family.
df["material_group"].value_counts()


In [None]:
from sklearn.cluster import KMeans

# Number of clusters to fit; raise or lower it for finer / coarser groups.
N_CLUSTERS = 40

# Seeded so repeated runs produce the same assignment.
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=42)
kmeans.fit(image_embs)
cluster_labels = kmeans.labels_

# Attach each product's cluster and the number of products sharing that cluster.
df["cluster_id"] = cluster_labels
df["cluster_size"] = df.groupby("cluster_id")["sku"].transform("size")

print("n_clusters:", N_CLUSTERS)
df[["sku", "name", "material_group", "cluster_id", "cluster_size"]].head()


In [None]:
# NOTE: OUTPUT_CSV is the same file the metadata was loaded from (CSV_PATH), and
# by this point df no longer contains the rows that had no image. Overwriting
# in place would discard those rows for good, so keep a one-time copy of the
# pre-existing file before replacing it. The output location is unchanged.
import shutil

OUTPUT_CSV = PROJECT_ROOT / "data" / "san_leandro_products.csv"
BACKUP_CSV = OUTPUT_CSV.with_name(OUTPUT_CSV.stem + ".orig.csv")

# Only back up once, so re-running the notebook never replaces the backup
# with an already-clustered file.
if OUTPUT_CSV.exists() and not BACKUP_CSV.exists():
    shutil.copy2(OUTPUT_CSV, BACKUP_CSV)
    print("Backed up existing metadata to:", BACKUP_CSV)

df.to_csv(OUTPUT_CSV, index=False)
print("Wrote clustered metadata to:", OUTPUT_CSV)


In [None]:
from sklearn.decomposition import PCA

# Reduce the embeddings to two principal components so they can be plotted.
pca = PCA(n_components=2, random_state=42)
coords_2d = pca.fit_transform(image_embs)   # one (x, y) pair per row

# Column 0 is the first component, column 1 the second.
df["pca_x"], df["pca_y"] = coords_2d[:, 0], coords_2d[:, 1]

print("Explained variance (PC1, PC2):", pca.explained_variance_ratio_)
df[["pca_x", "pca_y", "cluster_id"]].head()


In [None]:
import matplotlib.pyplot as plt

# --- pick how to get the material label for each row ---

# An existing "material" column is used as-is; otherwise one is derived from
# category_slug and stored under the same name.
if "material" not in df.columns:
    if "category_slug" not in df.columns:
        raise ValueError("Need either 'material' or 'category_slug' in df to color by material.")

    def slug_to_material(slug: str) -> str:
        """Map a category slug to a coarse material label ("other" if nothing matches)."""
        if not isinstance(slug, str):
            return "other"
        # The first rule with any matching fragment wins, so the order matters.
        rules = (
            (("/wood",), "wood"),
            (("/tile", "/porcelain", "/ceramic"), "tile/porcelain"),
            (("/vinyl",), "vinyl"),
            (("/stone",), "stone"),
            (("/decoratives", "/decorative"), "decor"),
        )
        for fragments, label in rules:
            if any(fragment in slug for fragment in fragments):
                return label
        return "other"

    df["material"] = df["category_slug"].apply(slug_to_material)

material_col = "material"

print("Using material column:", material_col)
print(df[material_col].value_counts())

# --- scatter plot colored by material ---

fig, ax = plt.subplots(figsize=(10, 8))

# Drop missing labels before sorting: NaN cannot be ordered against strings
# (sorted() would raise TypeError), and `== NaN` never selects a row anyway.
for material in sorted(df[material_col].dropna().unique()):
    mask = df[material_col] == material
    # One scatter call per material gives each label its own colour and legend entry.
    ax.scatter(
        df.loc[mask, "pca_x"],
        df.loc[mask, "pca_y"],
        s=40,
        alpha=0.8,
        label=material,
    )

ax.set_title("CLIP Image Embeddings Colored by Material")
ax.set_xlabel("PCA 1")
ax.set_ylabel("PCA 2")

# Anchor the legend just outside the right edge of the axes.
ax.legend(
    title="material",
    bbox_to_anchor=(1.05, 1.0),
    loc="upper left",
    borderaxespad=0.0,
)

fig.tight_layout()
plt.show()


In [None]:
# Share of each material_group inside every cluster, as a flat table with
# columns cluster_id, material_group and frac.
cluster_summary = (
    df.groupby("cluster_id")["material_group"]
      .value_counts(normalize=True)
      .reset_index(name="frac")
)

# Put the largest share first within each cluster, then keep that first row.
dominant = (
    cluster_summary
    .sort_values(by=["cluster_id", "frac"], ascending=[True, False])
    .drop_duplicates(subset="cluster_id", keep="first")
)

dominant.loc[:, ["cluster_id", "material_group", "frac"]]

In [None]:
from PIL import Image
from IPython.display import display

def load_image(path: Path):
    """Open the image file at ``path`` and return it converted to RGB mode.

    The file is opened inside a ``with`` block so its handle is released
    deterministically once the conversion is done, instead of whenever the
    source image object happens to be garbage-collected. ``convert`` returns a
    separate image object, which is what gets returned to the caller.
    """
    with Image.open(path) as src:
        return src.convert("RGB")

def show_cluster_examples(cluster_id: int, max_examples: int = 8):
    """Print a summary of one cluster and display up to ``max_examples`` of its images.

    The header line reports the size and material_group breakdown of the whole
    cluster. Only the first ``max_examples`` member rows are then listed and
    rendered. Reads the notebook-level ``df`` and ``IMAGES_DIR``.
    """
    members = df[df["cluster_id"] == cluster_id]

    # Summarise before truncating; otherwise "size" is capped at max_examples
    # and the breakdown only describes the rows being shown.
    print(
        f"Cluster {cluster_id} | size={len(members)} "
        f"| material_groups={members['material_group'].value_counts().to_dict()}"
    )

    for _, row in members.head(max_examples).iterrows():
        print(
            f"SKU {row['sku']} | {row['name']} | "
            f"group={row['material_group']} | cluster={row['cluster_id']}"
        )
        img_path = IMAGES_DIR / row["image_filename"]
        display(load_image(img_path))

# Example: show up to 6 products from cluster 0. Replace 0 with any other
# cluster_id you want to inspect (change the ID after you see the scatter).
show_cluster_examples(0, max_examples=6)
