# 04 — Attractiveness Model Training (FaceStats v4)
Train the AttractivenessRegressor on CLIP embeddings using synthetic, deterministic labels (hash-based) until real labels are available.

## Overview
- Load embeddings from `data/processed/embeddings_clip.parquet`
- Derive `id` from filename stem
- Generate deterministic synthetic labels (hash of `id`, scaled to [0, 1]) for placeholder training
- Split train/validation, train the MLP regressor, report losses
- Save checkpoint to `models/attractiveness_regressor.pt`
- Swap in real labels later by replacing the synthetic-label cell

In [1]:
import os, sys
from pathlib import Path

# Ensure cwd is project root for imports/paths
PROJECT_ROOT = Path.cwd()
if PROJECT_ROOT.name == "notebooks":
    PROJECT_ROOT = PROJECT_ROOT.parent
os.chdir(PROJECT_ROOT)

SRC_PATH = PROJECT_ROOT / "src"
if str(SRC_PATH) not in sys.path:
    sys.path.insert(0, str(SRC_PATH))

print("cwd:", Path.cwd())
print("src path added:", SRC_PATH)


cwd: /Users/jayklarin/__DI/Repositories/FaceStats
src path added: /Users/jayklarin/__DI/Repositories/FaceStats/src


In [2]:
from pathlib import Path
from typing import Union
import hashlib
import numpy as np
import polars as pl
import torch
from torch.utils.data import Dataset, DataLoader

from models.attractiveness_model import AttractivenessRegressor

# Paths (v4)
EMBED_PATH = Path("data/processed/embeddings_clip.parquet")
OUTPUT_MODEL_PATH = Path("models/attractiveness_regressor.pt")

# Training hyperparameters
RANDOM_SEED = 42
EPOCHS = 5
BATCH_SIZE = 16
LR = 1e-4
VAL_FRACTION = 0.2

# Reproducibility
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)


<torch._C.Generator at 0x11ad612f0>

## Load embeddings
Reads CLIP embeddings parquet, validates columns, and derives `id` from filename stem.

In [3]:
if not EMBED_PATH.exists():
    raise FileNotFoundError(f"Missing embeddings parquet: {EMBED_PATH}")

embeddings_df = pl.read_parquet(EMBED_PATH)
expected_cols = {"filename", "embedding"}
missing = expected_cols - set(embeddings_df.columns)
if missing:
    raise ValueError(f"Embeddings parquet missing columns: {missing}")

embeddings_df = embeddings_df.with_columns(
    pl.col("filename").map_elements(lambda x: Path(x).stem).alias("id")
)

print(embeddings_df.head())
print(f"Loaded {len(embeddings_df)} embeddings")


shape: (5, 3)
┌───────────────────────┬─────────────────────────────────┬───────────────────┐
│ filename              ┆ embedding                       ┆ id                │
│ ---                   ┆ ---                             ┆ ---               │
│ str                   ┆ list[f64]                       ┆ str               │
╞═══════════════════════╪═════════════════════════════════╪═══════════════════╡
│ SFHQ_pt4_00000208.jpg ┆ [0.050459, -0.042498, … -0.011… ┆ SFHQ_pt4_00000208 │
│ SFHQ_pt4_00002966.jpg ┆ [-0.00345, -0.025553, … -0.002… ┆ SFHQ_pt4_00002966 │
│ SFHQ_pt4_00003463.jpg ┆ [0.049876, -0.015998, … 0.0299… ┆ SFHQ_pt4_00003463 │
│ SFHQ_pt4_00002219.jpg ┆ [0.050927, -0.002582, … 0.0140… ┆ SFHQ_pt4_00002219 │
│ SFHQ_pt4_00003112.jpg ┆ [0.021666, -0.005994, … -0.009… ┆ SFHQ_pt4_00003112 │
└───────────────────────┴─────────────────────────────────┴───────────────────┘
Loaded 80 embeddings


## Synthetic labels (deterministic hash)
Creates placeholder labels by hashing the `id` and scaling to [0, 1]. Replace this cell when real labels are available.

In [4]:
def hash_to_unit_float(text: str) -> float:
    h = hashlib.sha256(text.encode("utf-8")).hexdigest()
    as_int = int(h, 16)
    return as_int / ((1 << 256) - 1)

embeddings_df = embeddings_df.with_columns(
    pl.col("id").map_elements(hash_to_unit_float).alias("attractiveness_label")
)

print(embeddings_df.select(["id", "attractiveness_label"]).head())
print("Label stats:", embeddings_df["attractiveness_label"].min(), embeddings_df["attractiveness_label"].max())


shape: (5, 2)
┌───────────────────┬──────────────────────┐
│ id                ┆ attractiveness_label │
│ ---               ┆ ---                  │
│ str               ┆ f64                  │
╞═══════════════════╪══════════════════════╡
│ SFHQ_pt4_00000208 ┆ 0.496663             │
│ SFHQ_pt4_00002966 ┆ 0.044135             │
│ SFHQ_pt4_00003463 ┆ 0.421839             │
│ SFHQ_pt4_00002219 ┆ 0.18107              │
│ SFHQ_pt4_00003112 ┆ 0.864885             │
└───────────────────┴──────────────────────┘
Label stats: 0.011262744286187338 0.970123531559696


## Train/validation split

In [5]:
n = len(embeddings_df)
indices = np.arange(n)
np.random.shuffle(indices)
split = int(n * (1 - VAL_FRACTION))
train_idx, val_idx = indices[:split], indices[split:]

train_df = embeddings_df[train_idx.tolist()]
val_df = embeddings_df[val_idx.tolist()]

print(f"Train size: {len(train_df)} | Val size: {len(val_df)}")


Train size: 64 | Val size: 16


## Dataset and DataLoader

In [6]:
class EmbeddingDataset(Dataset):
    def __init__(self, df: pl.DataFrame):
        self.X = df["embedding"].to_list()
        self.y = df["attractiveness_label"].to_list()

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return (
            torch.tensor(self.X[idx], dtype=torch.float32),
            torch.tensor(self.y[idx], dtype=torch.float32),
        )

train_ds = EmbeddingDataset(train_df)
val_ds = EmbeddingDataset(val_df)

train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)


## Initialize model

In [7]:
embedding_dim = len(train_df["embedding"][0]) if len(train_df) else 0
if embedding_dim == 0:
    raise ValueError("Embedding dimension could not be determined.")

model = AttractivenessRegressor(input_dim=embedding_dim)
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
loss_fn = torch.nn.MSELoss()

print(model)


AttractivenessRegressor(
  (net): Sequential(
    (0): Linear(in_features=512, out_features=256, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=256, out_features=256, bias=True)
    (4): ReLU()
    (5): Linear(in_features=256, out_features=1, bias=True)
  )
)


## Training loop

In [8]:
def run_epoch(dl, model, optimizer=None):
    total_loss = 0.0
    count = 0
    for X, y in dl:
        if optimizer:
            optimizer.zero_grad()
            preds = model(X).squeeze()
            loss = loss_fn(preds, y)
            loss.backward()
            optimizer.step()
        else:
            with torch.no_grad():
                preds = model(X).squeeze()
                loss = loss_fn(preds, y)
        total_loss += loss.item() * len(X)
        count += len(X)
    return total_loss / max(count, 1)

for epoch in range(1, EPOCHS + 1):
    train_loss = run_epoch(train_dl, model, optimizer)
    val_loss = run_epoch(val_dl, model, optimizer=None)
    print(f"Epoch {epoch:02d} | train_loss={train_loss:.4f} | val_loss={val_loss:.4f}")


Epoch 01 | train_loss=0.3601 | val_loss=0.4366
Epoch 02 | train_loss=0.3447 | val_loss=0.4191
Epoch 03 | train_loss=0.3305 | val_loss=0.4016
Epoch 04 | train_loss=0.3151 | val_loss=0.3847
Epoch 05 | train_loss=0.3014 | val_loss=0.3690


## Save model
Saves state_dict to `models/attractiveness_regressor.pt` (creates the directory if needed).

In [9]:
OUTPUT_MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)
torch.save(model.state_dict(), OUTPUT_MODEL_PATH)
print(f"Saved model to {OUTPUT_MODEL_PATH}")


Saved model to models/attractiveness_regressor.pt


## Next steps
- Replace the synthetic label cell with real labels when available, then retrain and resave.
- Adjust hyperparameters (epochs, LR, hidden size) in the imports/config cell as needed.