# 05 — Attractiveness Inference (FaceStats v4)
Run the trained attractiveness regressor on CLIP embeddings and save per-image scores using the v4 paths.

## Overview
- Load CLIP embeddings from `data/processed/embeddings_clip.parquet`
- Restore the trained regressor from `models/attractiveness_regressor.pt`
- Predict an attractiveness score for each image ID
- Save results (Option A): `data/processed/metadata/attractiveness_scores.parquet`
- You can merge into other tables later if desired

In [1]:
import os, sys
from pathlib import Path

# Anchor the kernel at the project root so that relative data/model paths and
# imports from src/ resolve the same way whether Jupyter was started from the
# root or from the notebooks/ folder.
_launch_dir = Path.cwd()
PROJECT_ROOT = _launch_dir.parent if _launch_dir.name == "notebooks" else _launch_dir
os.chdir(PROJECT_ROOT)

# Put src/ at the front of the import path, but only once (safe on re-run).
SRC_PATH = PROJECT_ROOT / "src"
_src_entry = str(SRC_PATH)
if _src_entry not in sys.path:
    sys.path.insert(0, _src_entry)

print("cwd:", Path.cwd())
print("src path added:", SRC_PATH)


cwd: /Users/jayklarin/__DI/Repositories/FaceStats
src path added: /Users/jayklarin/__DI/Repositories/FaceStats/src


In [2]:
from pathlib import Path
import polars as pl
import torch
from typing import Union

from models.attractiveness_model import AttractivenessRegressor

# Paths (v4) -- all relative to the current working directory.
EMBED_PATH = Path("data/processed/embeddings_clip.parquet")
MODEL_PATH = Path("models/attractiveness_regressor.pt")
PREPROC_DIR = Path("data/processed/preproc/")
OUTPUT_PATH = Path("data/processed/metadata/attractiveness_scores.parquet")

# Echo every location as an absolute path so a wrong working directory is
# obvious at a glance.
for _label, _location in (
    ("Embeddings: ", EMBED_PATH),
    ("Model:       ", MODEL_PATH),
    ("Images dir:  ", PREPROC_DIR),
    ("Output:      ", OUTPUT_PATH),
):
    print(f"{_label}{_location.resolve()}")


Embeddings: /Users/jayklarin/__DI/Repositories/FaceStats/data/processed/embeddings_clip.parquet
Model:       /Users/jayklarin/__DI/Repositories/FaceStats/models/attractiveness_regressor.pt
Images dir:  /Users/jayklarin/__DI/Repositories/FaceStats/data/processed/preproc
Output:      /Users/jayklarin/__DI/Repositories/FaceStats/data/processed/metadata/attractiveness_scores.parquet


## Load embeddings
Reads CLIP embeddings (parquet) and extracts IDs from filenames (without extension).

In [3]:
# Stop early with a clear message when the embeddings parquet is not on disk.
if EMBED_PATH.exists():
    embeddings_df = pl.read_parquet(EMBED_PATH)
else:
    raise FileNotFoundError(f"Missing embeddings parquet: {EMBED_PATH}")

# Later cells read both of these columns, so verify the schema up front.
expected_cols = {"filename", "embedding"}
missing = expected_cols - set(embeddings_df.columns)
if len(missing) > 0:
    raise ValueError(f"Embeddings parquet missing columns: {missing}")

print(embeddings_df.head())
print(f"Loaded {embeddings_df.height} embeddings")


shape: (5, 2)
┌───────────────────────┬─────────────────────────────────┐
│ filename              ┆ embedding                       │
│ ---                   ┆ ---                             │
│ str                   ┆ list[f64]                       │
╞═══════════════════════╪═════════════════════════════════╡
│ SFHQ_pt4_00000208.jpg ┆ [0.050459, -0.042498, … -0.011… │
│ SFHQ_pt4_00002966.jpg ┆ [-0.00345, -0.025553, … -0.002… │
│ SFHQ_pt4_00003463.jpg ┆ [0.049876, -0.015998, … 0.0299… │
│ SFHQ_pt4_00002219.jpg ┆ [0.050927, -0.002582, … 0.0140… │
│ SFHQ_pt4_00003112.jpg ┆ [0.021666, -0.005994, … -0.009… │
└───────────────────────┴─────────────────────────────────┘
Loaded 80 embeddings


In [4]:
def filename_to_id(name: str) -> str:
    """Return the image ID for a filename.

    The ID is the final path component with its last extension removed,
    e.g. ``"SFHQ_pt4_00000208.jpg"`` -> ``"SFHQ_pt4_00000208"``.
    """
    image_path = Path(name)
    return image_path.stem

# Derive IDs and split out arrays.
# An explicit return_dtype spares polars from inferring the dtype of a Python
# UDF's output (which it may warn about).
df_with_id = embeddings_df.with_columns(
    pl.col("filename")
    .map_elements(filename_to_id, return_dtype=pl.Utf8)
    .alias("id")
)

ids = df_with_id["id"].to_list()
emb_array = df_with_id["embedding"].to_list()

if not emb_array:
    raise ValueError("No embeddings found in the parquet file.")

# Every row must be a non-null vector of the same length; otherwise the
# tensor conversion in the inference cell fails with a far less helpful error.
null_rows = sum(vec is None for vec in emb_array)
if null_rows:
    raise ValueError(f"{null_rows} row(s) have a null embedding.")

embedding_dim = len(emb_array[0])
ragged_rows = sum(len(vec) != embedding_dim for vec in emb_array)
if ragged_rows:
    raise ValueError(
        f"{ragged_rows} embedding(s) do not have the expected length {embedding_dim}."
    )

print(f"Embedding dim: {embedding_dim}")


Embedding dim: 512


## Load model
Restores the PyTorch regressor. Handles a pickled full model, a bare state dict, or a checkpoint dict that wraps the weights under `state_dict` / `model_state_dict`.

In [5]:
def load_regressor(path: Path, input_dim: int) -> torch.nn.Module:
    """Load the attractiveness regressor from ``path`` and put it in eval mode.

    Three checkpoint layouts are accepted:

    * a pickled ``torch.nn.Module`` (returned as-is; ``input_dim`` is unused),
    * a bare state dict,
    * a dict wrapping the state dict under ``"state_dict"`` or
      ``"model_state_dict"``.

    Args:
        path: Location of the checkpoint file.
        input_dim: Embedding dimensionality, passed to
            ``AttractivenessRegressor`` when the model has to be rebuilt
            from a state dict.

    Returns:
        The restored model, on CPU, in eval mode.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        TypeError: If the checkpoint is neither a module nor a mapping.
    """
    from collections.abc import Mapping

    if not path.exists():
        raise FileNotFoundError(f"Missing model file: {path}")

    # NOTE(security): torch.load unpickles the file, so only point this at
    # checkpoints you trust.
    # NOTE(review): recent PyTorch releases default to weights_only=True, which
    # refuses pickled full-model checkpoints -- confirm against the installed
    # version if the full-model path is needed.
    checkpoint: Union[dict, torch.nn.Module] = torch.load(path, map_location="cpu")

    if isinstance(checkpoint, torch.nn.Module):
        model = checkpoint
    else:
        if not isinstance(checkpoint, Mapping):
            raise TypeError(
                "Unsupported checkpoint type "
                f"{type(checkpoint).__name__}; expected an nn.Module or a state dict."
            )

        # Unwrap by key presence rather than truthiness: the previous
        # `get(a) or get(b)` skipped an empty "state_dict" entry and could
        # hand None to load_state_dict.
        state_dict = checkpoint
        for wrapper_key in ("state_dict", "model_state_dict"):
            if checkpoint.get(wrapper_key) is not None:
                state_dict = checkpoint[wrapper_key]
                break

        model = AttractivenessRegressor(input_dim=input_dim)
        model.load_state_dict(state_dict)

    model.eval()
    return model

# Size the regressor's input layer from the embeddings loaded above.
model = load_regressor(path=MODEL_PATH, input_dim=embedding_dim)
print(model)


FileNotFoundError: Missing model file: models/attractiveness_regressor.pt

## Run inference
Computes one attractiveness score per embedding with gradients disabled; the model and the inputs both stay on the CPU.

In [None]:
with torch.no_grad():
    X = torch.tensor(emb_array, dtype=torch.float32)
    # Flatten explicitly instead of calling .squeeze(): a bare squeeze also
    # removes the batch axis when there is exactly one row, which would turn
    # `preds` into a 0-d array and `preds.tolist()` into a bare float.
    preds = model(X).reshape(-1).cpu().numpy()

# One score per input row is required to line the scores up with the IDs.
if preds.shape[0] != X.shape[0]:
    raise ValueError(
        f"Expected {X.shape[0]} predictions (one per embedding), got {preds.shape[0]}."
    )

print(f"Pred shape: {preds.shape}")


## Build results table
Creates a tidy table with `id` and `attractiveness_score`.

In [None]:
# Pair each image ID with its predicted score, one row per image.
_score_columns = {
    "id": ids,
    "attractiveness_score": preds.tolist(),
}
results_df = pl.DataFrame(_score_columns)

print(results_df.head())
print(f"Total rows: {results_df.height}")


## Save predictions (Option A)
Saves to `data/processed/metadata/attractiveness_scores.parquet`.

In [None]:
# Create the destination folder if needed (safe to re-run), then persist.
_output_dir = OUTPUT_PATH.parent
_output_dir.mkdir(parents=True, exist_ok=True)

results_df.write_parquet(OUTPUT_PATH)
print(f"Wrote predictions to {OUTPUT_PATH}")


## Next steps
- If you want the scores merged into `attributes_with_meta.parquet`, join on `id` externally.
- Use the saved parquet in downstream notebooks (e.g., composites or clustering).