In [10]:
import os, sys

# 1) Set notebook working directory = FaceStats project root.
#    The FACESTATS_ROOT environment variable, when set, overrides the default
#    checkout location so the notebook is not tied to a single machine's
#    absolute path. Without the variable the original path is used unchanged.
PROJECT_ROOT = os.environ.get(
    "FACESTATS_ROOT",
    "/Users/jayklarin/__DI/Repositories/FaceStats",
)
os.chdir(PROJECT_ROOT)
print("cwd:", os.getcwd())

# 2) Add src/ folder to Python PATH (only once, so re-running the cell does
#    not keep prepending duplicates).
SRC_PATH = os.path.join(PROJECT_ROOT, "src")
if SRC_PATH not in sys.path:
    sys.path.insert(0, SRC_PATH)

print("src path added:", SRC_PATH)

cwd: /Users/jayklarin/__DI/Repositories/FaceStats
src path added: /Users/jayklarin/__DI/Repositories/FaceStats/src


### Step 1 — Load Embeddings & Attributes

We load the CLIP embeddings and the cleaned attribute table generated in previous notebooks.

- **Embeddings:**  
  `data/processed/embeddings/embeddings_clip.parquet`

- **Attributes:**  
  `data/processed/metadata/attributes_final.parquet`

These two tables are joined on `filename`, giving us a unified training dataset for the attractiveness regression model.


In [11]:
import polars as pl

# ------------------------------------------------------------
# Input locations (FaceStats v4), relative to the working directory
# ------------------------------------------------------------
EMB_PATH = "data/processed/embeddings/embeddings_clip.parquet"
ATTR_PATH = "data/processed/metadata/attributes_final.parquet"


def _read_table(start_msg, shape_label, path):
    """Read one parquet file with polars, printing a start message and its shape."""
    print(start_msg)
    table = pl.read_parquet(path)
    print(shape_label, table.shape)
    return table


emb = _read_table("Loading embeddings…", "Embeddings:", EMB_PATH)
attr = _read_table("Loading attributes…", "Attributes:", ATTR_PATH)

# ------------------------------------------------------------
# Inner join on `filename`: only rows present in both tables are kept
# ------------------------------------------------------------
df = emb.join(attr, on="filename", how="inner")

print("\nMerged dataset:", df.shape)
df.head()


Loading embeddings…
Embeddings: (700, 2)
Loading attributes…
Attributes: (700, 5)

Merged dataset: (700, 6)


filename,embedding,age,gender_final,ethnicity_final,attractiveness
str,list[f64],null,str,str,f32
"""SFHQ_pt4_00001843.jpg""","[0.07163, -0.028173, … -0.007507]",,"""unknown""","""unknown""",2.914631
"""SFHQ_pt4_00000591.jpg""","[0.027857, -0.008046, … -0.003737]",,"""male""","""white""",2.938567
"""SFHQ_pt4_00002437.jpg""","[0.065614, -0.018432, … -0.001973]",,"""unknown""","""unknown""",3.175124
"""SFHQ_pt4_00002345.jpg""","[0.064536, -0.023444, … -0.057561]",,"""unknown""","""unknown""",3.208315
"""SFHQ_pt4_00003073.jpg""","[0.080103, -0.03326, … 0.013459]",,"""unknown""","""unknown""",3.280991


In [12]:
import torch
import numpy as np
import polars as pl

from src.models.attractiveness_model import AttractivenessRegressor

# ---------------------------------------------------------------------
# Restore the pretrained attractiveness regressor from disk
# ---------------------------------------------------------------------
MODEL_PATH = "models/attractiveness_regressor.pt"

print("Loading attractiveness regressor…")
# Use Apple's MPS backend when torch reports it as available, otherwise CPU.
device = "mps" if torch.backends.mps.is_available() else "cpu"

state_dict = torch.load(MODEL_PATH, map_location=device)
reg = AttractivenessRegressor()
reg.load_state_dict(state_dict)
reg.to(device)
reg.eval()  # inference mode

print("Model loaded on:", device)

# ---------------------------------------------------------------------
# Build the model input from the `embedding` column
# ---------------------------------------------------------------------
print("\nPreparing embedding matrix…")

# Each row's embedding list becomes one row of a 2-D numpy array.
embedding_rows = df["embedding"].to_list()
X = np.vstack(embedding_rows)

# float32 tensor, moved to the selected device
X_tensor = torch.tensor(X, dtype=torch.float32).to(device)

print("Embedding tensor:", X_tensor.shape)

# ---------------------------------------------------------------------
# Run the regressor over every row (no gradients needed)
# ---------------------------------------------------------------------
print("\nPredicting attractiveness for all images…")

with torch.no_grad():
    raw_scores = reg(X_tensor)
    # Back to host memory as a flat 1-D numpy array
    preds = raw_scores.cpu().numpy().flatten()

print("Done. Example predictions:", preds[:5])

# ---------------------------------------------------------------------
# Store the predictions in the `attractiveness` column
# (replaces the column of the same name that came from the join)
# ---------------------------------------------------------------------
predicted_col = pl.Series("attractiveness", preds)
df = df.with_columns([predicted_col])

print("\nUpdated dataset:", df.shape)
df.head()


Loading attractiveness regressor…
Model loaded on: mps

Preparing embedding matrix…
Embedding tensor: torch.Size([700, 512])

Predicting attractiveness for all images…
Done. Example predictions: [2.914631  2.938567  3.1751235 3.2083156 3.2809908]

Updated dataset: (700, 6)


filename,embedding,age,gender_final,ethnicity_final,attractiveness
str,list[f64],null,str,str,f32
"""SFHQ_pt4_00001843.jpg""","[0.07163, -0.028173, … -0.007507]",,"""unknown""","""unknown""",2.914631
"""SFHQ_pt4_00000591.jpg""","[0.027857, -0.008046, … -0.003737]",,"""male""","""white""",2.938567
"""SFHQ_pt4_00002437.jpg""","[0.065614, -0.018432, … -0.001973]",,"""unknown""","""unknown""",3.175123
"""SFHQ_pt4_00002345.jpg""","[0.064536, -0.023444, … -0.057561]",,"""unknown""","""unknown""",3.208316
"""SFHQ_pt4_00003073.jpg""","[0.080103, -0.03326, … 0.013459]",,"""unknown""","""unknown""",3.280991


In [13]:
# =========================================================
# Step 3 — Persist the table with the attractiveness column
# =========================================================

# Destination parquet file, relative to the current working directory.
OUT_PATH = "data/processed/metadata/attractiveness_with_attributes.parquet"

df.write_parquet(OUT_PATH)
print("\nSaved updated attractiveness table →", OUT_PATH)

# Last expression: preview of the first rows as the cell output.
df.head()



Saved updated attractiveness table → data/processed/metadata/attractiveness_with_attributes.parquet


filename,embedding,age,gender_final,ethnicity_final,attractiveness
str,list[f64],null,str,str,f32
"""SFHQ_pt4_00001843.jpg""","[0.07163, -0.028173, … -0.007507]",,"""unknown""","""unknown""",2.914631
"""SFHQ_pt4_00000591.jpg""","[0.027857, -0.008046, … -0.003737]",,"""male""","""white""",2.938567
"""SFHQ_pt4_00002437.jpg""","[0.065614, -0.018432, … -0.001973]",,"""unknown""","""unknown""",3.175123
"""SFHQ_pt4_00002345.jpg""","[0.064536, -0.023444, … -0.057561]",,"""unknown""","""unknown""",3.208316
"""SFHQ_pt4_00003073.jpg""","[0.080103, -0.03326, … 0.013459]",,"""unknown""","""unknown""",3.280991


In [14]:
import numpy as np

print("Preparing model training dataset…")

# ------------------------------------------------------------
# X — feature matrix: one row per image, built by stacking the
#     per-row lists from the `embedding` column
# ------------------------------------------------------------
embedding_rows = df["embedding"].to_list()
X = np.vstack(embedding_rows)
print("X shape:", X.shape)

# ------------------------------------------------------------
# y — regression targets taken from the `attractiveness` column
# ------------------------------------------------------------
attractiveness_col = df["attractiveness"]
y = attractiveness_col.to_numpy()
print("y shape:", y.shape)

# Peek at the first ten targets
print("\nSample y values:", y[:10])


Preparing model training dataset…
X shape: (700, 512)
y shape: (700,)

Sample y values: [2.914631  2.938567  3.1751235 3.2083156 3.2809908 3.2844422 3.6335676
 2.7881405 3.2307274 3.1101334]


In [15]:
import torch
import torch.nn as nn
import torch.optim as optim

print("Training a new attractiveness regressor…")

# ============================================================
#  MODEL DEFINITION — MLP Regressor
# ============================================================

class AttractivenessRegressor(nn.Module):
    """Small MLP that maps an embedding vector to one scalar score.

    Layer widths are input_dim -> 256 -> 64 -> 1, with a ReLU after each of
    the two hidden linear layers and no activation on the output.

    Args:
        input_dim: Size of the last dimension of the input (default 512).
    """

    def __init__(self, input_dim=512):
        super().__init__()
        hidden_wide, hidden_narrow = 256, 64
        # Layers are created in forward order; their positions in the
        # Sequential (0, 2, 4 for the linear layers) define the state-dict keys.
        layers = [
            nn.Linear(input_dim, hidden_wide),
            nn.ReLU(),
            nn.Linear(hidden_wide, hidden_narrow),
            nn.ReLU(),
            nn.Linear(hidden_narrow, 1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to ``x`` and return the final linear layer's output."""
        scores = self.net(x)
        return scores


# ============================================================
#  PREPARE DATA
# ============================================================

import os  # used by os.makedirs below; imported here so the cell does not rely on an earlier cell

# Fixed seed so the model's random weight initialisation is repeatable
# from one run of this cell to the next.
SEED = 42
torch.manual_seed(SEED)

# Use Apple's MPS backend when torch reports it as available, otherwise CPU.
device = "mps" if torch.backends.mps.is_available() else "cpu"
print("Device:", device)

X_tensor = torch.tensor(X, dtype=torch.float32).to(device)
# Targets are reshaped to a single column so they line up with the
# model's one-value-per-row output when the loss is computed.
y_tensor = torch.tensor(y, dtype=torch.float32).view(-1, 1).to(device)

model = AttractivenessRegressor().to(device)

optimizer = optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()

# ============================================================
#  TRAIN LOOP
# ============================================================

# Each epoch is one full-batch gradient step over all rows of X_tensor.
epochs = 20
model.train()  # training mode is set once; nothing in the loop changes it
for epoch in range(1, epochs + 1):
    optimizer.zero_grad()

    preds = model(X_tensor)
    loss = loss_fn(preds, y_tensor)

    loss.backward()
    optimizer.step()

    # Log the first epoch and every fifth one
    if epoch % 5 == 0 or epoch == 1:
        print(f"[Epoch {epoch}/{epochs}] Loss: {loss.item():.4f}")

print("\nTraining complete.")


# ============================================================
#  SAVE MODEL
# ============================================================

# NOTE(review): the pretrained weights are loaded earlier in this notebook
# from "models/attractiveness_regressor.pt"; this cell writes to a different
# location under src/ — confirm that is intended.
OUT_PATH = "src/models/attractiveness_regressor.pt"
os.makedirs(os.path.dirname(OUT_PATH), exist_ok=True)

# Only the parameters (state dict) are saved, not the module object itself.
torch.save(model.state_dict(), OUT_PATH)

print("\nSaved new attractiveness regressor →", OUT_PATH)


Training a new attractiveness regressor…
Device: mps
[Epoch 1/20] Loss: 9.6875
[Epoch 5/20] Loss: 9.1305
[Epoch 10/20] Loss: 8.1592
[Epoch 15/20] Loss: 6.4937
[Epoch 20/20] Loss: 4.1854

Training complete.

Saved new attractiveness regressor → src/models/attractiveness_regressor.pt
