```mermaid
flowchart LR

classDef notebook fill:#5c7fa6,stroke:#3f5a7b,color:#f2f6fb,font-weight:bold;
classDef python fill:#9a80b8,stroke:#6d5789,color:#f7f3fb, font-weight:bold;
classDef tools fill:#e9c48a,stroke:#b58950,color:#2d1c05;
classDef methods fill:#8cc7ab,stroke:#5e9475,color:#0f2f1f;

N04["04_attractiveness_model.ipynb"]:::notebook

N04 --> M1["train_attractiveness.py"]:::python

M1 --> T4["Tools:<br>torch<br>polars"]:::tools

T4 --> M4["Methods:<br>run_training()<br>EmbeddingDataset (instantiated, not standalone)"]:::methods
```


In [1]:
import os, sys

# 1) Set notebook working directory = FaceStats project root.
#    Later cells use paths relative to this root (data/..., models/..., src/...).
#    The FACESTATS_ROOT environment variable overrides the location so the
#    notebook can run from another checkout; when it is unset, the original
#    hard-coded location is used.
PROJECT_ROOT = os.environ.get(
    "FACESTATS_ROOT",
    "/Users/jayklarin/__DI/Repositories/FaceStats",
)
if not os.path.isdir(PROJECT_ROOT):
    # Fail with an actionable message instead of a bare chdir error.
    raise FileNotFoundError(
        f"Project root not found: {PROJECT_ROOT!r}. "
        "Set the FACESTATS_ROOT environment variable to your FaceStats checkout."
    )
os.chdir(PROJECT_ROOT)
print("cwd:", os.getcwd())

# 2) Add src/ folder to Python PATH (inserted only once, so re-running
#    this cell does not pile up duplicate entries).
SRC_PATH = os.path.join(PROJECT_ROOT, "src")
if SRC_PATH not in sys.path:
    sys.path.insert(0, SRC_PATH)

print("src path added:", SRC_PATH)

cwd: /Users/jayklarin/__DI/Repositories/FaceStats
src path added: /Users/jayklarin/__DI/Repositories/FaceStats/src


### Step 1 — Load Embeddings & Attributes

We load the CLIP embeddings and the cleaned attribute table generated in previous notebooks.

- **Embeddings:**  
  `data/processed/embeddings/embeddings_clip.parquet`

- **Attributes:**  
  `data/processed/metadata/attributes_final.parquet`

These two tables are joined on `filename`, giving us a unified training dataset for the attractiveness regression model.


In [2]:

import polars as pl
from pathlib import Path

# ------------------------------------------------------------
# Input locations (FaceStats v4).
# The embeddings file is looked up in two places: the primary
# location first, then the fallback.
# ------------------------------------------------------------
EMB_PRIMARY = Path("data/processed/embeddings/embeddings_clip.parquet")
EMB_FALLBACK = Path("data/processed/embeddings_clip.parquet")
ATTR_PATH = Path("data/processed/metadata/attributes_final.parquet")

# Take the first candidate that exists on disk; the generator is lazy, so the
# fallback is only checked when the primary file is missing.
try:
    EMB_PATH = next(p for p in (EMB_PRIMARY, EMB_FALLBACK) if p.exists())
except StopIteration:
    raise FileNotFoundError("Embeddings parquet not found; run 02_embeddings.ipynb to regenerate.") from None

if not ATTR_PATH.exists():
    raise FileNotFoundError("attributes_final.parquet not found; run 03_attributes.ipynb / merges to regenerate.")

# ------------------------------------------------------------
# Read both tables
# ------------------------------------------------------------
print("Loading embeddings…", EMB_PATH)
emb = pl.read_parquet(EMB_PATH)
print("Embeddings:", emb.shape)

print("Loading attributes…", ATTR_PATH)
attr = pl.read_parquet(ATTR_PATH)
print("Attributes:", attr.shape)

# ------------------------------------------------------------
# Inner join on `filename`: only rows present in both tables survive.
# ------------------------------------------------------------
df = emb.join(attr, on="filename", how="inner")

print("Merged dataset:", df.shape)
df.head()


Loading embeddings… data/processed/embeddings/embeddings_clip.parquet
Embeddings: (10000, 2)
Loading attributes… data/processed/metadata/attributes_final.parquet
Attributes: (10000, 5)
Merged dataset: (10000, 6)


filename,embedding,age,gender_final,ethnicity_final,attractiveness
str,list[f64],null,str,str,i64
"""SFHQ_pt4_00086092.jpg""","[0.040853, -0.001176, … -0.034655]",,"""unknown""","""unknown""",8
"""SFHQ_pt4_00065309.jpg""","[0.030933, 0.005503, … 0.001794]",,"""female""","""unknown""",7
"""SFHQ_pt4_00062466.jpg""","[0.071319, -0.005249, … 0.051021]",,"""unknown""","""unknown""",2
"""SFHQ_pt4_00090828.jpg""","[0.066199, -0.003542, … 0.020381]",,"""unknown""","""unknown""",4
"""SFHQ_pt4_00032251.jpg""","[0.021252, -0.060645, … -0.004985]",,"""unknown""","""unknown""",5


In [3]:
import torch
import numpy as np
import polars as pl

from src.models.attractiveness_model import AttractivenessRegressor

# ---------------------------------------------------------------------
# Step 2 — Score every image with the pretrained attractiveness model
# ---------------------------------------------------------------------
MODEL_PATH = "models/attractiveness_regressor.pt"

print("Loading attractiveness regressor…")
# Use Apple's Metal backend when available, otherwise fall back to CPU.
device = "mps" if torch.backends.mps.is_available() else "cpu"

reg = AttractivenessRegressor()
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all a state_dict needs; torch.load otherwise goes through pickle
# and can execute arbitrary code from a tampered checkpoint file.
reg.load_state_dict(torch.load(MODEL_PATH, map_location=device, weights_only=True))
reg.to(device)
reg.eval()  # inference mode for any train/eval-dependent layers

print("Model loaded on:", device)

# ---------------------------------------------------------------------
# Prepare embedding matrix
# ---------------------------------------------------------------------
print("\nPreparing embedding matrix…")

# Convert list[f64] → numpy matrix (one row per image)
X = np.vstack(df["embedding"].to_list())

# Convert to a float32 tensor on the same device as the model
X_tensor = torch.tensor(X, dtype=torch.float32).to(device)

print("Embedding tensor:", X_tensor.shape)

# ---------------------------------------------------------------------
# Predict attractiveness
# ---------------------------------------------------------------------
print("\nPredicting attractiveness for all images…")

# Single forward pass over the whole matrix; no gradients are needed.
with torch.no_grad():
    preds = reg(X_tensor).cpu().numpy().flatten()

print("Done. Example predictions:", preds[:5])

# ---------------------------------------------------------------------
# Attach to DataFrame
# ---------------------------------------------------------------------
# NOTE: `df` already carries an `attractiveness` column from the attributes
# table; with_columns replaces it with these float predictions, so the
# original values are no longer available on `df` after this cell.
df = df.with_columns([
    pl.Series("attractiveness", preds)
])

print("\nUpdated dataset:", df.shape)
df.head()


Loading attractiveness regressor…
Model loaded on: mps

Preparing embedding matrix…
Embedding tensor: torch.Size([10000, 512])

Predicting attractiveness for all images…
Done. Example predictions: [3.1616962 3.0371342 2.7696643 2.9870064 3.152438 ]

Updated dataset: (10000, 6)


filename,embedding,age,gender_final,ethnicity_final,attractiveness
str,list[f64],null,str,str,f32
"""SFHQ_pt4_00086092.jpg""","[0.040853, -0.001176, … -0.034655]",,"""unknown""","""unknown""",3.161696
"""SFHQ_pt4_00065309.jpg""","[0.030933, 0.005503, … 0.001794]",,"""female""","""unknown""",3.037134
"""SFHQ_pt4_00062466.jpg""","[0.071319, -0.005249, … 0.051021]",,"""unknown""","""unknown""",2.769664
"""SFHQ_pt4_00090828.jpg""","[0.066199, -0.003542, … 0.020381]",,"""unknown""","""unknown""",2.987006
"""SFHQ_pt4_00032251.jpg""","[0.021252, -0.060645, … -0.004985]",,"""unknown""","""unknown""",3.152438


In [4]:
# =========================================================
# Step 3 — Save Updated Attractiveness Table
# =========================================================
# Persist `df` in its current state (all columns, including the embeddings)
# as a single parquet file, then preview the first rows.

OUT_PATH = "data/processed/metadata/attractiveness_with_attributes.parquet"
df.write_parquet(OUT_PATH)
print("\nSaved updated attractiveness table →", OUT_PATH)

df.head()



Saved updated attractiveness table → data/processed/metadata/attractiveness_with_attributes.parquet


filename,embedding,age,gender_final,ethnicity_final,attractiveness
str,list[f64],null,str,str,f32
"""SFHQ_pt4_00086092.jpg""","[0.040853, -0.001176, … -0.034655]",,"""unknown""","""unknown""",3.161696
"""SFHQ_pt4_00065309.jpg""","[0.030933, 0.005503, … 0.001794]",,"""female""","""unknown""",3.037134
"""SFHQ_pt4_00062466.jpg""","[0.071319, -0.005249, … 0.051021]",,"""unknown""","""unknown""",2.769664
"""SFHQ_pt4_00090828.jpg""","[0.066199, -0.003542, … 0.020381]",,"""unknown""","""unknown""",2.987006
"""SFHQ_pt4_00032251.jpg""","[0.021252, -0.060645, … -0.004985]",,"""unknown""","""unknown""",3.152438


In [5]:
import numpy as np

print("Preparing model training dataset…")

# ------------------------------------------------------------
# X — feature matrix: the per-row embedding lists stacked into
#     one 2-D array (one row per image)
# ------------------------------------------------------------
X = np.vstack(df.get_column("embedding").to_list())
print("X shape:", X.shape)

# ------------------------------------------------------------
# y — regression targets, read from the `attractiveness` column
# NOTE(review): by this point `attractiveness` holds the values
# predicted by the pretrained regressor earlier in this notebook,
# not the integer values loaded from attributes_final.parquet —
# confirm that training on model outputs is intended.
# ------------------------------------------------------------
y = df.get_column("attractiveness").to_numpy()
print("y shape:", y.shape)

# Peek at the first few targets
print("\nSample y values:", y[:10])


Preparing model training dataset…
X shape: (10000, 512)
y shape: (10000,)

Sample y values: [3.1616962 3.0371342 2.7696643 2.9870064 3.152438  3.3481672 3.1513348
 3.38724   3.161443  3.1377716]


In [6]:
import torch
import torch.nn as nn
import torch.optim as optim

print("Training a new attractiveness regressor…")

# ============================================================
#  MODEL DEFINITION — MLP Regressor
# ============================================================
# NOTE: this definition shadows the `AttractivenessRegressor` imported from
# src.models.attractiveness_model earlier in the notebook; from this cell on
# the name refers to the class below.

class AttractivenessRegressor(nn.Module):
    """Fully connected regressor mapping an embedding vector to one scalar.

    With the defaults the network is
    Linear(512→256) → ReLU → Linear(256→64) → ReLU → Linear(64→1),
    stored as ``self.net`` so state-dict keys are ``net.0.*``, ``net.2.*``
    and ``net.4.*``.

    Args:
        input_dim: Size of each input vector.
        hidden_dims: Widths of the hidden layers, in order. Each hidden layer
            is followed by a ReLU. An empty sequence yields a single linear
            layer from ``input_dim`` to 1.
    """

    def __init__(self, input_dim=512, hidden_dims=(256, 64)):
        super().__init__()
        layers = []
        width = input_dim
        for hidden in hidden_dims:
            layers.append(nn.Linear(width, hidden))
            layers.append(nn.ReLU())
            width = hidden
        # Final projection to a single regression output.
        layers.append(nn.Linear(width, 1))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return predictions of shape (batch, 1) for input of shape (batch, input_dim)."""
        return self.net(x)


# ============================================================
#  PREPARE DATA
# ============================================================

# Seed torch's RNG before the model is built so the initial weights — and
# with them the loss curve and the saved checkpoint — are repeatable across
# kernel restarts instead of changing on every run.
SEED = 42
torch.manual_seed(SEED)

device = "mps" if torch.backends.mps.is_available() else "cpu"
print("Device:", device)

X_tensor = torch.tensor(X, dtype=torch.float32).to(device)
# view(-1, 1) reshapes the targets into a column so they line up with the
# model's (N, 1) output when the loss is computed.
y_tensor = torch.tensor(y, dtype=torch.float32).view(-1, 1).to(device)

model = AttractivenessRegressor().to(device)

optimizer = optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()

# ============================================================
#  TRAIN LOOP
# ============================================================
# Full-batch training: every epoch is one forward/backward pass over the
# entire dataset followed by a single optimizer step.

epochs = 20
model.train()  # nothing in the loop changes the mode, so set it once
for epoch in range(1, epochs + 1):
    optimizer.zero_grad()

    preds = model(X_tensor)
    loss = loss_fn(preds, y_tensor)

    loss.backward()
    optimizer.step()

    # Report the first epoch and every fifth one.
    if epoch % 5 == 0 or epoch == 1:
        print(f"[Epoch {epoch}/{epochs}] Loss: {loss.item():.4f}")

print("\nTraining complete.")


# ============================================================
#  SAVE MODEL
# ============================================================
# NOTE(review): the weights are written under src/models/, whereas the cell
# that loads the pretrained regressor earlier in this notebook reads
# models/attractiveness_regressor.pt — so the file saved here is not the one
# that loader picks up. Confirm which location is intended.

OUT_PATH = "src/models/attractiveness_regressor.pt"

# Make sure the destination folder exists, then store only the parameters
# (state_dict), not the whole module object.
os.makedirs(os.path.split(OUT_PATH)[0], exist_ok=True)
torch.save(model.state_dict(), OUT_PATH)

print("\nSaved new attractiveness regressor →", OUT_PATH)


Training a new attractiveness regressor…
Device: mps
[Epoch 1/20] Loss: 9.2023
[Epoch 5/20] Loss: 8.6517
[Epoch 10/20] Loss: 7.7897
[Epoch 15/20] Loss: 6.2504
[Epoch 20/20] Loss: 3.9730

Training complete.

Saved new attractiveness regressor → src/models/attractiveness_regressor.pt
