In [1]:
# 0. Install dependencies (run once per environment)
# `%pip` (rather than `!pip`) targets the environment of the running kernel.
# NOTE(review): the packages are installed without version pins, so a fresh
# environment may resolve to different releases than the ones that produced
# the saved outputs — consider pinning exact versions.

%pip install fastai umap-learn scikit-learn --quiet


Note: you may need to restart the kernel to use updated packages.


In [2]:
# 1. Imports and configuration

from pathlib import Path
import pandas as pd
import numpy as np
import torch
from torch import nn

from fastai.basics import *
import umap
from sklearn.metrics.pairwise import cosine_distances

# Seed the global torch and NumPy random generators.
torch.manual_seed(42)
np.random.seed(42)

# Input datasets: each source exists in an "original" and a "randomized" variant.
GENERATED_CSV_ORIGINAL_PATH = Path("Palettes/Generated/palette_export_generated.csv")
GENERATED_CSV_RANDOMIZED_PATH = Path("Palettes/Generated/palette_export_generated_rand_order.csv")
ADOBE_CSV_ORIGINAL_PATH = Path("Palettes/Adobe/adobe_palettes.csv")
ADOBE_CSV_RANDOMIZED_PATH = Path("Palettes/Adobe/adobe_palettes_randomized.csv")

# Exported learners, one per dataset variant.
MODEL_ORIGINAL_PATH = Path("trained_models/palette_autoencoder.pkl")
MODEL_RANDOMIZED_PATH = Path("trained_models/palette_autoencoder_rand_order.pkl")

# UMAP outputs: one CSV per (dataset variant, model variant) combination.
OUT_NORM_DATA_NORM_MODEL_PATH = Path("out/palettes_umap_houdini_norm_order_dataset_norm_order_model.csv")
OUT_RAND_DATA_NORM_MODEL_PATH = Path("out/palettes_umap_houdini_rand_order_dataset_norm_order_model.csv")
OUT_NORM_DATA_RAND_MODEL_PATH = Path("out/palettes_umap_houdini_norm_order_dataset_rand_order_model.csv")
OUT_RAND_DATA_RAND_MODEL_PATH = Path("out/palettes_umap_houdini_rand_order_dataset_rand_order_model.csv")

# Directory receiving the per-run cosine statistics / assignment CSVs.
OUT_STATS_DIR = Path("out")

COLOR_COUNT = 5  # number of colors per palette
SRGB_TO_LINEAR = True  # convert Adobe palettes from sRGB to linear (expects 0..1 input)

# Keyword arguments forwarded verbatim to umap.UMAP(...).
UMAP_KWARGS = {
    "n_components": 3,
    "n_neighbors": 15,
    "min_dist": 0.05,
    "metric": "euclidean",
    "random_state": 42,
}

# One entry per run; each row below is
# (name, generated CSV, Adobe CSV, model file, UMAP output file).
RUN_CONFIGS = [
    {
        "name": run_label,
        "generated_csv": generated_csv,
        "adobe_csv": adobe_csv,
        "model_path": model_file,
        "umap_out": umap_file,
    }
    for run_label, generated_csv, adobe_csv, model_file, umap_file in (
        (
            "norm_data_norm_model",
            GENERATED_CSV_ORIGINAL_PATH,
            ADOBE_CSV_ORIGINAL_PATH,
            MODEL_ORIGINAL_PATH,
            OUT_NORM_DATA_NORM_MODEL_PATH,
        ),
        (
            "rand_data_norm_model",
            GENERATED_CSV_RANDOMIZED_PATH,
            ADOBE_CSV_RANDOMIZED_PATH,
            MODEL_ORIGINAL_PATH,
            OUT_RAND_DATA_NORM_MODEL_PATH,
        ),
        (
            "norm_data_rand_model",
            GENERATED_CSV_ORIGINAL_PATH,
            ADOBE_CSV_ORIGINAL_PATH,
            MODEL_RANDOMIZED_PATH,
            OUT_NORM_DATA_RAND_MODEL_PATH,
        ),
        (
            "rand_data_rand_model",
            GENERATED_CSV_RANDOMIZED_PATH,
            ADOBE_CSV_RANDOMIZED_PATH,
            MODEL_RANDOMIZED_PATH,
            OUT_RAND_DATA_RAND_MODEL_PATH,
        ),
    )
]


In [3]:
# 2. Define model class (required for load_learner)

class PaletteAutoencoder(nn.Module):
    """Fully connected autoencoder with a symmetric encoder/decoder.

    The encoder maps ``input_dim -> 128 -> 64 -> latent_dim`` with ReLU between
    the linear layers; the decoder mirrors it back to ``input_dim`` and ends in
    a sigmoid, so reconstructions lie in (0, 1).

    Args:
        input_dim: Number of features of one input row.
        latent_dim: Size of the bottleneck (latent) representation.
    """

    def __init__(self, input_dim, latent_dim):
        super().__init__()
        # Layers are created encoder-first so parameter ordering (and the
        # ``encoder.N`` / ``decoder.N`` state-dict keys) stay as they were.
        encoder_layers = [
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, latent_dim),
        ]
        decoder_layers = [
            nn.Linear(latent_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, input_dim),
            nn.Sigmoid(),
        ]
        self.encoder = nn.Sequential(*encoder_layers)
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Encode ``x`` to the latent space and decode it back."""
        latent = self.encoder(x)
        return self.decoder(latent)


In [4]:
# 3. Utilities

# Model feature columns: r1, g1, b1, r2, ... (color-major, channel-minor order).
feature_cols = [
    channel + str(slot)
    for slot in range(1, COLOR_COUNT + 1)
    for channel in "rgb"
]
# The generated CSV must carry "law", "id_palette" and the same channels with a "c" prefix.
expected_generated = ["law", "id_palette", *("c" + col for col in feature_cols)]
# The Adobe CSV must contain exactly the feature columns, in this order.
expected_adobe = list(feature_cols)


def _srgb_to_linear(arr):
    arr = np.clip(arr, 0.0, 1.0)
    return np.where(arr <= 0.04045, arr / 12.92, ((arr + 0.055) / 1.055) ** 2.4)


def _validate_normalized(df, cols, source_name):
    min_v = float(df[cols].min().min())
    max_v = float(df[cols].max().max())
    if min_v < 0.0 or max_v > 1.0:
        raise ValueError(
            f"{source_name} contains non-normalized values (min={min_v:.4f}, max={max_v:.4f}). "
            "Expected all color channels in [0, 1]."
        )


def load_palette_dataframe(generated_csv_path: Path, adobe_csv_path: Path) -> pd.DataFrame:
    """Load the generated and Adobe palette CSVs into one combined frame.

    Generated rows keep their ``law`` / ``id_palette`` values and have their
    ``c``-prefixed channel columns renamed to the plain feature names. Adobe
    rows get ``law = -1`` and ``id_palette = -1`` and, when ``SRGB_TO_LINEAR``
    is set, are converted from sRGB to linear. Generated rows come first.

    Args:
        generated_csv_path: CSV containing at least the ``expected_generated`` columns.
        adobe_csv_path: CSV whose columns are exactly ``expected_adobe``.

    Returns:
        DataFrame with float32 feature columns and int64 ``law`` / ``id_palette``.

    Raises:
        FileNotFoundError: If either CSV file does not exist.
        ValueError: On a column mismatch, or if a channel value lies outside [0, 1].
    """
    # Both files must exist before anything is read (generated is checked first).
    for label, csv_path in (("Generated", generated_csv_path), ("Adobe", adobe_csv_path)):
        if not csv_path.exists():
            raise FileNotFoundError(f"{label} CSV not found: {csv_path}")

    generated = pd.read_csv(generated_csv_path)
    absent = [name for name in expected_generated if name not in generated.columns]
    if absent:
        raise ValueError(
            f"Generated column mismatch for {generated_csv_path}. "
            f"Missing {absent}, got {list(generated.columns)}"
        )

    # Optional bookkeeping columns are discarded; channel columns lose their "c" prefix.
    droppable = [name for name in ("palette_name", "batch") if name in generated.columns]
    prefixed_to_plain = {
        f"c{channel}{slot}": f"{channel}{slot}"
        for slot in range(1, COLOR_COUNT + 1)
        for channel in ("r", "g", "b")
    }
    generated = generated.drop(columns=droppable).rename(columns=prefixed_to_plain)

    adobe = pd.read_csv(adobe_csv_path)
    adobe_columns = list(adobe.columns)
    if adobe_columns != expected_adobe:
        raise ValueError(
            f"Adobe column mismatch for {adobe_csv_path}. "
            f"Expected {expected_adobe}, got {adobe_columns}"
        )

    # Adobe rows have no law / palette id; -1 is used for both
    # (inserted so that "law" ends up as the first column).
    adobe.insert(0, "id_palette", -1)
    adobe.insert(0, "law", -1)

    # Input data is already normalized; validate instead of scaling 0..255
    _validate_normalized(generated, feature_cols, f"Generated CSV ({generated_csv_path})")
    _validate_normalized(adobe, feature_cols, f"Adobe CSV ({adobe_csv_path})")

    if SRGB_TO_LINEAR:
        adobe_srgb = adobe[feature_cols].to_numpy(dtype="float32")
        adobe[feature_cols] = _srgb_to_linear(adobe_srgb)

    for frame in (generated, adobe):
        frame[feature_cols] = frame[feature_cols].astype("float32")

    combined = pd.concat([generated, adobe], ignore_index=True)
    combined[feature_cols] = combined[feature_cols].astype("float32")
    id_cols = ["law", "id_palette"]
    combined[id_cols] = combined[id_cols].astype("int64")
    return combined


def load_model(model_path: Path):
    """Load an exported learner and switch its model to evaluation mode.

    ``load_learner`` unpickles the file, so only trusted model files should be
    passed here.

    Args:
        model_path: Location of the exported learner file.

    Returns:
        The loaded learner, with ``learner.model`` in eval mode.

    Raises:
        FileNotFoundError: If ``model_path`` does not exist.
    """
    if model_path.exists():
        learner = load_learner(model_path)
        learner.model.eval()
        return learner
    raise FileNotFoundError(f"Model not found at {model_path}. Run the training notebook first.")


def encode_latents(model, data, batch_size=256):
    """Run ``model.encoder`` over ``data`` in batches and return the latents.

    The model is switched to eval mode and gradients are disabled. Batches are
    moved to the device of the model's first parameter (CPU when the model has
    no parameters) and the results are gathered back on the CPU.

    Args:
        model: Module exposing an ``encoder`` sub-module.
        data: NumPy array of input rows; sliced along its first axis.
        batch_size: Number of rows encoded per forward pass; must be positive.

    Returns:
        NumPy array with one latent row per input row. Empty input yields an
        empty result with the encoder's output width instead of an error.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    model.eval()
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    # max(..., 1) guarantees one (possibly empty) batch, so torch.cat always
    # receives at least one tensor and the latent width is preserved.
    batch_starts = range(0, max(len(data), 1), batch_size)
    with torch.no_grad():
        zs = [
            model.encoder(torch.from_numpy(data[start:start + batch_size]).to(device)).cpu()
            for start in batch_starts
        ]
    return torch.cat(zs).numpy()


In [5]:
# 4. Inference runner (latent + cosine + UMAP + save)


def run_inference(run_name: str, generated_csv_path: Path, adobe_csv_path: Path, model_path: Path, umap_out_path: Path):
    """Encode one dataset/model combination and write its analysis artifacts.

    Steps: load the combined palette frame, encode every palette to the latent
    space, compare Adobe rows (``id_palette == -1``) with generated rows via
    cosine distance, assign each Adobe row to the nearest per-law centroid,
    then project all latents with UMAP.

    Files written:
        * ``OUT_STATS_DIR / adobe_cosine_stats_<run_name>.csv`` — summary statistics
          of each Adobe row's distance to its nearest generated row.
        * ``OUT_STATS_DIR / adobe_law_assignments_<run_name>.csv`` — per-Adobe-row
          nearest distance and assigned law.
        * ``umap_out_path`` — ``law``, ``id_palette`` and the three UMAP coordinates.

    Args:
        run_name: Label used in log output and output file names.
        generated_csv_path: Generated palettes CSV.
        adobe_csv_path: Adobe palettes CSV.
        model_path: Exported learner file.
        umap_out_path: Destination CSV for the UMAP embedding.

    Returns:
        Dict summarizing the run (row count, latent size, output paths,
        mean / 95th-percentile nearest cosine distance).
    """
    print(f"\n=== Run: {run_name} ===")
    print(f"Generated CSV: {generated_csv_path}")
    print(f"Adobe CSV:     {adobe_csv_path}")
    print(f"Model:         {model_path}")
    print(f"UMAP out:      {umap_out_path}")

    palettes = load_palette_dataframe(generated_csv_path, adobe_csv_path)
    learner = load_model(model_path)

    features = palettes[feature_cols].to_numpy(dtype="float32")
    latents = encode_latents(learner.model, features, batch_size=256)
    print("Latent shape:", latents.shape)

    # Split latents and their metadata into Adobe rows and generated rows.
    meta_cols = ["law", "id_palette"]
    is_adobe = (palettes["id_palette"] == -1).to_numpy()
    adobe_latents = latents[is_adobe]
    gen_latents = latents[~is_adobe]
    adobe_meta = palettes.loc[is_adobe, meta_cols].reset_index(drop=True)
    gen_meta = palettes.loc[~is_adobe, meta_cols].reset_index(drop=True)

    # Distance from every Adobe palette to its closest generated palette.
    nearest = cosine_distances(adobe_latents, gen_latents).min(axis=1)

    stats = {
        "run_name": run_name,
        "count": int(len(nearest)),
        "mean": float(nearest.mean()),
        "median": float(np.median(nearest)),
        "p95": float(np.quantile(nearest, 0.95)),
    }

    # One centroid per law: the mean latent of that law's generated palettes.
    gen_laws = gen_meta["law"].to_numpy()
    law_ids = sorted(gen_meta["law"].unique())
    centroids = np.stack(
        [gen_latents[gen_laws == law_id].mean(axis=0) for law_id in law_ids],
        axis=0,
    )

    law_dists = cosine_distances(adobe_latents, centroids)
    closest_centroid = law_dists.argmin(axis=1)
    closest_dist = law_dists.min(axis=1)

    adobe_nn = adobe_meta.assign(
        nearest_gen_cosine_dist=nearest,
        assigned_law=[law_ids[i] for i in closest_centroid],
        assigned_law_cosine_dist=closest_dist,
        assigned_law_cosine_sim=1.0 - closest_dist,
    )

    stats_path = OUT_STATS_DIR / f"adobe_cosine_stats_{run_name}.csv"
    assign_path = OUT_STATS_DIR / f"adobe_law_assignments_{run_name}.csv"
    stats_path.parent.mkdir(parents=True, exist_ok=True)
    pd.DataFrame([stats]).to_csv(stats_path, index=False)
    adobe_nn.to_csv(assign_path, index=False)

    # UMAP is fitted on all rows together (generated and Adobe).
    embedding = umap.UMAP(**UMAP_KWARGS).fit_transform(latents)
    print("UMAP shape:", embedding.shape)

    coords = pd.DataFrame(embedding, columns=["umap_x", "umap_y", "umap_z"])
    out_df = pd.concat([palettes[meta_cols].reset_index(drop=True), coords], axis=1)

    umap_out_path.parent.mkdir(parents=True, exist_ok=True)
    out_df.to_csv(umap_out_path, index=False)

    print("Saved UMAP:", umap_out_path)
    print("Saved stats:", stats_path)
    print("Saved assignments:", assign_path)

    return {
        "run_name": run_name,
        "rows": len(palettes),
        "latent_dim": int(latents.shape[1]),
        "umap_out": str(umap_out_path),
        "stats_out": str(stats_path),
        "assign_out": str(assign_path),
        "cosine_mean": stats["mean"],
        "cosine_p95": stats["p95"],
    }


In [6]:
# 5. Run all 4 combinations automatically

# Execute every configured run in order, collecting one summary dict per run.
results = []
for run_cfg in RUN_CONFIGS:
    run_kwargs = {
        "run_name": run_cfg["name"],
        "generated_csv_path": run_cfg["generated_csv"],
        "adobe_csv_path": run_cfg["adobe_csv"],
        "model_path": run_cfg["model_path"],
        "umap_out_path": run_cfg["umap_out"],
    }
    results.append(run_inference(**run_kwargs))

# Tabular view of the summaries; left as the last expression so it is displayed.
results_df = pd.DataFrame(results)
results_df



=== Run: norm_data_norm_model ===
Generated CSV: Palettes\Generated\palette_export_generated.csv
Adobe CSV:     Palettes\Adobe\adobe_palettes.csv
Model:         trained_models\palette_autoencoder.pkl
UMAP out:      out\palettes_umap_houdini_norm_order_dataset_norm_order_model.csv
Latent shape: (8000, 16)


If you only need to load model weights and optimizer state, use the safe `Learner.load` instead.
  warn("load_learner` uses Python's insecure pickle module, which can execute malicious arbitrary code when loading. Only load files you trust.\nIf you only need to load model weights and optimizer state, use the safe `Learner.load` instead.")
  warn(


UMAP shape: (8000, 3)
Saved UMAP: out\palettes_umap_houdini_norm_order_dataset_norm_order_model.csv
Saved stats: out\adobe_cosine_stats_norm_data_norm_model.csv
Saved assignments: out\adobe_law_assignments_norm_data_norm_model.csv

=== Run: rand_data_norm_model ===
Generated CSV: Palettes\Generated\palette_export_generated_rand_order.csv
Adobe CSV:     Palettes\Adobe\adobe_palettes_randomized.csv
Model:         trained_models\palette_autoencoder.pkl
UMAP out:      out\palettes_umap_houdini_rand_order_dataset_norm_order_model.csv
Latent shape: (8000, 16)


If you only need to load model weights and optimizer state, use the safe `Learner.load` instead.
  warn("load_learner` uses Python's insecure pickle module, which can execute malicious arbitrary code when loading. Only load files you trust.\nIf you only need to load model weights and optimizer state, use the safe `Learner.load` instead.")
  warn(


UMAP shape: (8000, 3)
Saved UMAP: out\palettes_umap_houdini_rand_order_dataset_norm_order_model.csv
Saved stats: out\adobe_cosine_stats_rand_data_norm_model.csv
Saved assignments: out\adobe_law_assignments_rand_data_norm_model.csv

=== Run: norm_data_rand_model ===
Generated CSV: Palettes\Generated\palette_export_generated.csv
Adobe CSV:     Palettes\Adobe\adobe_palettes.csv
Model:         trained_models\palette_autoencoder_rand_order.pkl
UMAP out:      out\palettes_umap_houdini_norm_order_dataset_rand_order_model.csv
Latent shape: (8000, 16)


If you only need to load model weights and optimizer state, use the safe `Learner.load` instead.
  warn("load_learner` uses Python's insecure pickle module, which can execute malicious arbitrary code when loading. Only load files you trust.\nIf you only need to load model weights and optimizer state, use the safe `Learner.load` instead.")
  warn(


UMAP shape: (8000, 3)
Saved UMAP: out\palettes_umap_houdini_norm_order_dataset_rand_order_model.csv
Saved stats: out\adobe_cosine_stats_norm_data_rand_model.csv
Saved assignments: out\adobe_law_assignments_norm_data_rand_model.csv

=== Run: rand_data_rand_model ===
Generated CSV: Palettes\Generated\palette_export_generated_rand_order.csv
Adobe CSV:     Palettes\Adobe\adobe_palettes_randomized.csv
Model:         trained_models\palette_autoencoder_rand_order.pkl
UMAP out:      out\palettes_umap_houdini_rand_order_dataset_rand_order_model.csv
Latent shape: (8000, 16)


If you only need to load model weights and optimizer state, use the safe `Learner.load` instead.
  warn("load_learner` uses Python's insecure pickle module, which can execute malicious arbitrary code when loading. Only load files you trust.\nIf you only need to load model weights and optimizer state, use the safe `Learner.load` instead.")
  warn(


UMAP shape: (8000, 3)
Saved UMAP: out\palettes_umap_houdini_rand_order_dataset_rand_order_model.csv
Saved stats: out\adobe_cosine_stats_rand_data_rand_model.csv
Saved assignments: out\adobe_law_assignments_rand_data_rand_model.csv


Unnamed: 0,run_name,rows,latent_dim,umap_out,stats_out,assign_out,cosine_mean,cosine_p95
0,norm_data_norm_model,8000,16,out\palettes_umap_houdini_norm_order_dataset_norm_order_model.csv,out\adobe_cosine_stats_norm_data_norm_model.csv,out\adobe_law_assignments_norm_data_norm_model.csv,0.020997,0.041848
1,rand_data_norm_model,8000,16,out\palettes_umap_houdini_rand_order_dataset_norm_order_model.csv,out\adobe_cosine_stats_rand_data_norm_model.csv,out\adobe_law_assignments_rand_data_norm_model.csv,0.016883,0.034568
2,norm_data_rand_model,8000,16,out\palettes_umap_houdini_norm_order_dataset_rand_order_model.csv,out\adobe_cosine_stats_norm_data_rand_model.csv,out\adobe_law_assignments_norm_data_rand_model.csv,0.022072,0.04863
3,rand_data_rand_model,8000,16,out\palettes_umap_houdini_rand_order_dataset_rand_order_model.csv,out\adobe_cosine_stats_rand_data_rand_model.csv,out\adobe_law_assignments_rand_data_rand_model.csv,0.018545,0.036627


In [7]:
# 6. Quick summary

# Print a four-line recap per run: headline metrics, then the three output paths.
for entry in results:
    recap_lines = [
        (
            f"{entry['run_name']}: rows={entry['rows']}, latent_dim={entry['latent_dim']}, "
            f"cosine_mean={entry['cosine_mean']:.6f}, cosine_p95={entry['cosine_p95']:.6f}"
        ),
        f"  UMAP:   {entry['umap_out']}",
        f"  Stats:  {entry['stats_out']}",
        f"  Assign: {entry['assign_out']}",
    ]
    print("\n".join(recap_lines))


norm_data_norm_model: rows=8000, latent_dim=16, cosine_mean=0.020997, cosine_p95=0.041848
  UMAP:   out\palettes_umap_houdini_norm_order_dataset_norm_order_model.csv
  Stats:  out\adobe_cosine_stats_norm_data_norm_model.csv
  Assign: out\adobe_law_assignments_norm_data_norm_model.csv
rand_data_norm_model: rows=8000, latent_dim=16, cosine_mean=0.016883, cosine_p95=0.034568
  UMAP:   out\palettes_umap_houdini_rand_order_dataset_norm_order_model.csv
  Stats:  out\adobe_cosine_stats_rand_data_norm_model.csv
  Assign: out\adobe_law_assignments_rand_data_norm_model.csv
norm_data_rand_model: rows=8000, latent_dim=16, cosine_mean=0.022072, cosine_p95=0.048630
  UMAP:   out\palettes_umap_houdini_norm_order_dataset_rand_order_model.csv
  Stats:  out\adobe_cosine_stats_norm_data_rand_model.csv
  Assign: out\adobe_law_assignments_norm_data_rand_model.csv
rand_data_rand_model: rows=8000, latent_dim=16, cosine_mean=0.018545, cosine_p95=0.036627
  UMAP:   out\palettes_umap_houdini_rand_order_dataset_