```mermaid
flowchart LR

classDef notebook fill:#5c7fa6,stroke:#3f5a7b,color:#f2f6fb,font-weight:bold;
classDef python fill:#9a80b8,stroke:#6d5789,color:#f7f3fb,font-weight:bold;
classDef tools fill:#e9c48a,stroke:#b58950,color:#2d1c05;
classDef methods fill:#8cc7ab,stroke:#5e9475,color:#0f2f1f;

N03["03_attributes.ipynb"]:::notebook

N03 --> A1["face_attributes.py"]:::python

A1 --> T3["Tools:<br>joblib<br>numpy<br>Pillow"]:::tools

T3 --> M3["Methods:<br>get_embedding()<br>infer_attributes()"]:::methods


In [1]:
import os, sys

# 1) Set notebook working directory = FaceStats project root
PROJECT_ROOT = "/Users/jayklarin/__DI/Repositories/FaceStats"
os.chdir(PROJECT_ROOT)
print("cwd:", os.getcwd())

# 2) Add src/ folder to Python PATH
SRC_PATH = os.path.join(PROJECT_ROOT, "src")
if SRC_PATH not in sys.path:
    sys.path.insert(0, SRC_PATH)

print("src path added:", SRC_PATH)


cwd: /Users/jayklarin/__DI/Repositories/FaceStats
src path added: /Users/jayklarin/__DI/Repositories/FaceStats/src


In [2]:
import torch
from transformers import CLIPModel, CLIPProcessor

# Select device: MPS if available
device = "mps" if torch.backends.mps.is_available() else "cpu"
print("Using device:", device)

# --- Wrap CLIP model loading ---
_original_clip_from_pretrained = CLIPModel.from_pretrained

def _mps_clip_from_pretrained(*args, **kwargs):
    model = _original_clip_from_pretrained(*args, **kwargs)
    return model.to(device)

CLIPModel.from_pretrained = _mps_clip_from_pretrained
print("CLIP model will now load onto MPS.")


# --- Wrap CLIPProcessor output only (no recursion risk) ---
_original_processor_from_pretrained = CLIPProcessor.from_pretrained

class MPSProcessorWrapper:
    def __init__(self, processor):
        self.processor = processor

    def __getattr__(self, name):
        return getattr(self.processor, name)

    def __call__(self, *args, **kwargs):
        out = self.processor(*args, **kwargs)
        # Move tensors in the output batch to MPS
        for k, v in out.items():
            if torch.is_tensor(v):
                out[k] = v.to(device)
        return out

def _mps_processor_from_pretrained(*args, **kwargs):
    processor = _original_processor_from_pretrained(*args, **kwargs)
    return MPSProcessorWrapper(processor)

CLIPProcessor.from_pretrained = _mps_processor_from_pretrained
print("CLIP processor outputs will now run on MPS.")


Using device: mps
CLIP model will now load onto MPS.
CLIP processor outputs will now run on MPS.


In [3]:
import os
import numpy as np
import polars as pl
import src.attributes.face_attributes as fa

INPUT_DIR = "data/processed/preproc"
OUTPUT_FILE = "data/processed/metadata/attributes.parquet"

os.makedirs("data/processed/metadata", exist_ok=True)


def infer_attributes_safe(image_path):
    emb = fa.get_embedding(image_path)          # shape (N,)
    emb = np.array(emb).reshape(1, -1)

    gender_pred = fa.GENDER_MODEL.predict(emb)[0]
    ethnicity_pred = fa.ETHNICITY_MODEL.predict(emb)[0]

    def resolve(pred, classes):
        if isinstance(pred, (int, np.integer)):
            return classes[pred]
        return str(pred)

    return {
        "gender": resolve(gender_pred, fa.GENDER_CLASSES),
        "ethnicity": resolve(ethnicity_pred, fa.ETHNICITY_CLASSES),
        "age": None,
    }

rows = []

for fname in sorted(os.listdir(INPUT_DIR)):
    if not fname.lower().endswith(".jpg"):
        continue

    path = os.path.join(INPUT_DIR, fname)
    attrs = infer_attributes_safe(path)

    rows.append({
        "filename": fname,
        "age": attrs.get("age"),
        "gender": attrs["gender"],
        "ethnicity": attrs["ethnicity"],
    })

if not rows:
    raise SystemExit("No rows produced; check input images.")

df = pl.DataFrame(rows)
df.write_parquet(OUTPUT_FILE)

print(df.head())
# 25 - 35 minutes on cpu
# 10 - 15 minutes on mps

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


shape: (5, 4)
┌───────────────────────┬──────┬─────────┬─────────────────┐
│ filename              ┆ age  ┆ gender  ┆ ethnicity       │
│ ---                   ┆ ---  ┆ ---     ┆ ---             │
│ str                   ┆ null ┆ str     ┆ str             │
╞═══════════════════════╪══════╪═════════╪═════════════════╡
│ SFHQ_pt4_00000004.jpg ┆ null ┆ male    ┆ unknown         │
│ SFHQ_pt4_00000014.jpg ┆ null ┆ unknown ┆ latino/hispanic │
│ SFHQ_pt4_00000017.jpg ┆ null ┆ unknown ┆ unknown         │
│ SFHQ_pt4_00000020.jpg ┆ null ┆ male    ┆ unknown         │
│ SFHQ_pt4_00000021.jpg ┆ null ┆ unknown ┆ unknown         │
└───────────────────────┴──────┴─────────┴─────────────────┘


### Step 3 — Load FairFace Label Structure

We avoid HuggingFace entirely and use the FairFace test output file to recover:

- Race class order (7-class)
- Race class order (4-class)
- Gender class order
- Age bucket order

These are needed to build a lightweight local classifier that maps InsightFace embeddings → attribute predictions.

This step ensures all later attributes (such as ethnicity filters in composites) behave correctly.


In [4]:
import polars as pl

LABELS = "data/processed/metadata/fairface_label_structure.parquet"
df = pl.read_parquet(LABELS)

# Columns are wide (race7, race4, gender, age_buckets), so grab the first row lists directly
row = df.row(0)
race7_labels = row[df.columns.index("race7")]
race4_labels = row[df.columns.index("race4")]
gender_labels = row[df.columns.index("gender")]
age_labels = row[df.columns.index("age_buckets")]

race7_labels, race4_labels, gender_labels, age_labels


(['white',
  'black',
  'latino/hispanic',
  'east asian',
  'southeast asian',
  'indian',
  'middle eastern'],
 ['white', 'black', 'asian', 'indian'],
 ['male', 'female'],
 ['0-2', '3-9', '10-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70+'])

### Step 4 — Validate Attribute Results

Now that we have:

- InsightFace age + gender estimations  
- FairFace race/gender/age **label order**  
- Clean attribute parquet from Step 2  

We perform a quick validation:

1. Load the first few faces  
2. Show their raw InsightFace predictions  
3. Confirm ethnicity is still `"unknown"` (expected)  
4. Confirm age values look reasonable (0–100)  
5. Confirm gender values match `male` / `female`  
6. Print the FairFace label order for final confirmation  


In [5]:
import polars as pl
from attributes.face_attributes import infer_attributes

# Load the attributes we already computed earlier
ATTR_FILE = "data/processed/metadata/attributes.parquet"
df_attr = pl.read_parquet(ATTR_FILE)

print("Loaded attributes:", df_attr.shape)

# --- Show a sample of 5 records ---
print("\nSample of attribute predictions:")
display(df_attr.head().to_pandas())

# --- Validate age range ---
print("\nAge range (min, max):")
print(df_attr["age"].min(), df_attr["age"].max())

# --- Validate gender distribution ---
print("\nGender distribution:")
print(df_attr["gender"].value_counts())

# --- Validate ethnicity (should be mostly 'unknown' for now) ---
print("\nEthnicity distribution:")
print(df_attr["ethnicity"].value_counts())

# --- Load label structure from Step 3 ---
LABELS = {
    "race7": [
        "white", "black", "latino/hispanic",
        "east asian", "southeast asian",
        "indian", "middle eastern"
    ],
    "race4": ["white", "black", "asian", "indian"],
    "gender": ["male", "female"],
    "age": [
        "0-2", "3-9", "10-19", "20-29", "30-39",
        "40-49", "50-59", "60-69", "70+"
    ]
}

print("\nFairFace Label Structure:")
for k,v in LABELS.items():
    print(f"{k}: {v}")


Loaded attributes: (10000, 4)

Sample of attribute predictions:


Unnamed: 0,filename,age,gender,ethnicity
0,SFHQ_pt4_00000004.jpg,,male,unknown
1,SFHQ_pt4_00000014.jpg,,unknown,latino/hispanic
2,SFHQ_pt4_00000017.jpg,,unknown,unknown
3,SFHQ_pt4_00000020.jpg,,male,unknown
4,SFHQ_pt4_00000021.jpg,,unknown,unknown



Age range (min, max):
None None

Gender distribution:
shape: (3, 2)
┌─────────┬───────┐
│ gender  ┆ count │
│ ---     ┆ ---   │
│ str     ┆ u32   │
╞═════════╪═══════╡
│ male    ┆ 1733  │
│ unknown ┆ 7161  │
│ female  ┆ 1106  │
└─────────┴───────┘

Ethnicity distribution:
shape: (7, 2)
┌─────────────────────────┬───────┐
│ ethnicity               ┆ count │
│ ---                     ┆ ---   │
│ str                     ┆ u32   │
╞═════════════════════════╪═══════╡
│ middle_eastern          ┆ 118   │
│ indian                  ┆ 99    │
│ black                   ┆ 267   │
│ white                   ┆ 1245  │
│ latino/hispanic         ┆ 230   │
│ unknown                 ┆ 7845  │
│ east_or_southeast_asian ┆ 196   │
└─────────────────────────┴───────┘

FairFace Label Structure:
race7: ['white', 'black', 'latino/hispanic', 'east asian', 'southeast asian', 'indian', 'middle eastern']
race4: ['white', 'black', 'asian', 'indian']
gender: ['male', 'female']
age: ['0-2', '3-9', '10-19', '20-29', '

### Step 5 — Train FairFace Attribute Classifier (InsightFace Embeddings)

In this step, we merge three data sources:

1. **InsightFace / CLIP Embeddings**  
   `data/processed/embeddings/embeddings_clip.parquet`  
   Contains:  
   - `filename`  
   - `embedding` (512-D InsightFace vector)

2. **Attributes extracted by face_attributes.py**  
   `data/processed/metadata/attributes.parquet`  
   Contains:  
   - age  
   - gender  
   - ethnicity

3. **FairFace class label structure**  
   `data/processed/metadata/fairface_label_structure.parquet`  
   Contains:  
   - race7 labels  
   - race4 labels  
   - gender labels  
   - age bucket labels  

We join on `filename` and prepare a combined training dataset.  
This merged dataset is used to train lightweight auxiliary classifiers  
(e.g., ethnicity classifier) for downstream composite filtering.


In [6]:
import polars as pl
from pathlib import Path

ATTR = Path("data/processed/metadata/attributes.parquet")
EMB_PRIMARY = Path("data/processed/embeddings/embeddings_clip.parquet")
EMB_FALLBACK = Path("data/processed/embeddings_clip.parquet")
LABELS = Path("data/processed/metadata/fairface_label_structure.parquet")

if EMB_PRIMARY.exists():
    EMB = EMB_PRIMARY
elif EMB_FALLBACK.exists():
    EMB = EMB_FALLBACK
else:
    raise FileNotFoundError("Embeddings parquet not found; run 02_embeddings.ipynb to regenerate.")

# Load all three sources
df_attr = pl.read_parquet(ATTR)
df_emb  = pl.read_parquet(EMB)
df_lab  = pl.read_parquet(LABELS)

print("Shapes:")
print("Attributes:", df_attr.shape)
print("Embeddings (using):", EMB, df_emb.shape)
print("Labels:", df_lab.shape)

# === Merge: embeddings + attributes ===
df = (
    df_emb.join(df_attr, on="filename", how="inner")
)

print("\nMerged dataset shape:", df.shape)
df.head()


Shapes:
Attributes: (10000, 4)
Embeddings (using): data/processed/embeddings/embeddings_clip.parquet (10000, 2)
Labels: (1, 4)

Merged dataset shape: (10000, 5)


filename,embedding,age,gender,ethnicity
str,list[f64],null,str,str
"""SFHQ_pt4_00000004.jpg""","[0.030962, -0.03723, … 0.008554]",,"""male""","""unknown"""
"""SFHQ_pt4_00000014.jpg""","[0.040956, 0.017153, … 0.030146]",,"""unknown""","""latino/hispanic"""
"""SFHQ_pt4_00000017.jpg""","[0.050254, -0.009216, … 0.012368]",,"""unknown""","""unknown"""
"""SFHQ_pt4_00000020.jpg""","[0.031655, -0.006061, … 0.041207]",,"""male""","""unknown"""
"""SFHQ_pt4_00000021.jpg""","[0.047193, -0.061247, … 0.003204]",,"""unknown""","""unknown"""


### Step 7 — Merge Manual Gender/Ethnicity Labels

We now combine three sources:

1. **Attributes**  
   - Age estimates  
   - Existing metadata  
   - (`data/processed/metadata/attributes.parquet`)

2. **Embeddings (CLIP)**  
   - 1280-D embeddings for each image  
   - (`data/processed/embeddings/embeddings_clip.parquet`)

3. **Manual Labels (new!)**  
   - Gender + Ethnicity assigned via the Streamlit labeling app  
   - (`data/processed/metadata/manual_labels.csv`)

This merged dataset will be the foundation for training:
- Gender classifier  
- 6-class Ethnicity classifier  
- Any future supervised models

The merged file will be stored as:

`data/processed/metadata/attributes_with_manual.parquet`

In [7]:
import polars as pl

# Load auto + manual
attr = pl.read_parquet("data/processed/metadata/attributes.parquet")
ml   = pl.read_csv("data/processed/metadata/manual_labels.csv")

print("Auto attributes:", attr.shape)
print("Manual labels:", ml.shape)

# Merge manual labels
merged = (
    attr
    .join(ml, on="filename", how="left")
)

# Final combined labels
merged = merged.with_columns([

    # Final gender
    pl.coalesce([
        pl.col("gender_right"),     # manual (new)
        pl.col("gender"),           # auto fallback
        pl.lit("unknown")
    ]).alias("gender_final"),

    # Final ethnicity
    pl.coalesce([
        pl.col("ethnicity_right"),  # manual (new)
        pl.col("ethnicity"),        # auto fallback
        pl.lit("unknown")
    ]).alias("ethnicity_final"),

])

clean = merged.select([
    "filename",
    "age",
    "gender_final",
    "ethnicity_final",
])

print("Clean attributes:", clean.shape)
clean.head()


Auto attributes: (10000, 4)
Manual labels: (200, 3)
Clean attributes: (10000, 4)


filename,age,gender_final,ethnicity_final
str,null,str,str
"""SFHQ_pt4_00000004.jpg""",,"""male""","""unknown"""
"""SFHQ_pt4_00000014.jpg""",,"""unknown""","""latino/hispanic"""
"""SFHQ_pt4_00000017.jpg""",,"""unknown""","""unknown"""
"""SFHQ_pt4_00000020.jpg""",,"""male""","""unknown"""
"""SFHQ_pt4_00000021.jpg""",,"""unknown""","""unknown"""


## Save attributes

In [8]:
OUT = "data/processed/metadata/attributes_clean.parquet"
clean.write_parquet(OUT)
print("Saved →", OUT)


Saved → data/processed/metadata/attributes_clean.parquet


### Step 8 — Train Gender & Ethnicity Classifiers (Using Clean Labels)

Now that we have **attributes_clean.parquet** with corrected gender + ethnicity:

- `gender_final`  
- `ethnicity_final`  

We can train two classifiers:

1. **Gender classifier**  
2. **Ethnicity classifier**

Both use the **CLIP embeddings** as input (64-D vectors from `embeddings_clip.parquet`).

This produces:

- `models/gender_clf.pkl`
- `models/ethnicity_clf.pkl`

These models will be used throughout the rest of the FaceStats pipeline.


In [9]:
import polars as pl
import numpy as np
import joblib
import os

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# ------------------------------------------------------------
# LOAD CLEAN LABELS + EMBEDDINGS
# ------------------------------------------------------------
ATTR = "data/processed/metadata/attributes_clean.parquet"
EMB  = "data/processed/embeddings/embeddings_clip.parquet"

df_attr = pl.read_parquet(ATTR)
df_emb  = pl.read_parquet(EMB)

print("Attributes:", df_attr.shape)
print("Embeddings:", df_emb.shape)

# ------------------------------------------------------------
# MERGE
# ------------------------------------------------------------
df = (
    df_emb
    .join(df_attr, on="filename", how="inner")
)

print("Merged:", df.shape)
df.head()

# ------------------------------------------------------------
# PREPARE INPUT MATRICES
# ------------------------------------------------------------
X = np.vstack(df["embedding"].to_list())

gender_y    = df["gender_final"].to_list()
ethnicity_y = df["ethnicity_final"].to_list()

print("Unique genders:", set(gender_y))
print("Unique ethnicities:", set(ethnicity_y))

# Must have ≥ 2 classes to train
assert len(set(gender_y)) >= 2, "Not enough gender classes!"
assert len(set(ethnicity_y)) >= 2, "Not enough ethnicity classes!"

# ------------------------------------------------------------
# PIPELINES
# ------------------------------------------------------------
def make_pipeline():
    return Pipeline([
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(max_iter=500))
    ])

gender_clf    = make_pipeline()
ethnicity_clf = make_pipeline()

# ------------------------------------------------------------
# TRAIN
# ------------------------------------------------------------
print("Training gender classifier…")
gender_clf.fit(X, gender_y)

print("Training ethnicity classifier…")
ethnicity_clf.fit(X, ethnicity_y)

# ------------------------------------------------------------
# SAVE
# ------------------------------------------------------------
OUT_DIR = "src/models"
os.makedirs(OUT_DIR, exist_ok=True)

gender_path    = os.path.join(OUT_DIR, "gender_clf.pkl")
ethnicity_path = os.path.join(OUT_DIR, "ethnicity_clf.pkl")

joblib.dump(gender_clf, gender_path)
joblib.dump(ethnicity_clf, ethnicity_path)

print("\nSaved models:")
print("  →", gender_path)
print("  →", ethnicity_path)


Attributes: (10000, 4)
Embeddings: (10000, 2)
Merged: (10000, 5)
Unique genders: {'female', 'male', 'unknown'}
Unique ethnicities: {'latino/hispanic', 'black', 'white', 'unknown', 'middle_eastern', 'indian', 'east_or_southeast_asian'}
Training gender classifier…
Training ethnicity classifier…

Saved models:
  → src/models/gender_clf.pkl
  → src/models/ethnicity_clf.pkl


In [10]:
# ================================================================
# Step 9 — Apply Gender & Ethnicity Classifiers to All Embeddings
# ================================================================
import polars as pl
import numpy as np
import joblib
import os

# ----------------------------------------------------------
# Load embeddings + existing attributes
# ----------------------------------------------------------
emb = pl.read_parquet("data/processed/embeddings/embeddings_clip.parquet")
attr = pl.read_parquet("data/processed/metadata/attributes.parquet")

print("Embeddings:", emb.shape)
print("Attributes:", attr.shape)

# Merge
df = (
    emb
    .join(attr, on="filename", how="inner")
)

# Convert embeddings → numpy matrix
X = np.vstack(df["embedding"].to_list())

# ----------------------------------------------------------
# Load trained classifiers
# ----------------------------------------------------------
gender_clf = joblib.load("src/models/gender_clf.pkl")
ethnicity_clf = joblib.load("src/models/ethnicity_clf.pkl")

# ----------------------------------------------------------
# Run predictions
# ----------------------------------------------------------
gender_pred = gender_clf.predict(X)
ethnicity_pred = ethnicity_clf.predict(X)

# ----------------------------------------------------------
# Build prediction table
# ----------------------------------------------------------
df_pred = pl.DataFrame({
    "filename": df["filename"],
    "gender_pred": gender_pred,
    "ethnicity_pred": ethnicity_pred,
})

print(df_pred.head())

# ----------------------------------------------------------
# Merge with original attributes
# ----------------------------------------------------------
full = (
    df
    .join(df_pred, on="filename", how="inner")
)

print("\nFinal merged shape:", full.shape)
full.head()

# ----------------------------------------------------------
# Save final table
# ----------------------------------------------------------
OUT = "data/processed/metadata/attributes_with_predictions.parquet"
full.write_parquet(OUT)

print("\nSaved final attribute table →", OUT)


Embeddings: (10000, 2)
Attributes: (10000, 4)
shape: (5, 3)
┌───────────────────────┬─────────────┬─────────────────┐
│ filename              ┆ gender_pred ┆ ethnicity_pred  │
│ ---                   ┆ ---         ┆ ---             │
│ str                   ┆ str         ┆ str             │
╞═══════════════════════╪═════════════╪═════════════════╡
│ SFHQ_pt4_00000004.jpg ┆ male        ┆ unknown         │
│ SFHQ_pt4_00000014.jpg ┆ unknown     ┆ latino/hispanic │
│ SFHQ_pt4_00000017.jpg ┆ unknown     ┆ unknown         │
│ SFHQ_pt4_00000020.jpg ┆ male        ┆ unknown         │
│ SFHQ_pt4_00000021.jpg ┆ unknown     ┆ unknown         │
└───────────────────────┴─────────────┴─────────────────┘

Final merged shape: (10000, 7)

Saved final attribute table → data/processed/metadata/attributes_with_predictions.parquet


In [11]:
import joblib
import numpy as np
from PIL import Image
import polars as pl
import os

# ============================================================
# Step 10 — Update face_attributes.py with new classifier logic
# ============================================================

updated_code = r"""
import os
import joblib
import numpy as np
from PIL import Image

# ------------------------------------------------------------
# Load trained models
# ------------------------------------------------------------
MODEL_DIR = os.path.join(os.path.dirname(__file__), "..", "models")
MODEL_DIR = os.path.abspath(MODEL_DIR)

GENDER_MODEL = joblib.load(os.path.join(MODEL_DIR, "gender_clf.pkl"))
ETHNICITY_MODEL = joblib.load(os.path.join(MODEL_DIR, "ethnicity_clf.pkl"))

# Mapping (must match training order)
GENDER_CLASSES = ["female", "male"]
ETHNICITY_CLASSES = [
    "white",
    "black",
    "latino/hispanic",
    "east_or_southeast_asian",
    "indian",
    "middle_eastern"
]

# ------------------------------------------------------------
# Extract CLIP embedding for a single image
# (Used at inference time on new images)
# ------------------------------------------------------------
def get_embedding(image_path):
    from src.embeddings.embed_clip import get_clip_embedding
    return get_clip_embedding(image_path)

# ------------------------------------------------------------
# Main inference function used by pipelines
# ------------------------------------------------------------
def infer_attributes(image_path):
    emb = get_embedding(image_path)          # shape (N,)
    emb = np.array(emb).reshape(1, -1)

    gender_pred = GENDER_MODEL.predict(emb)[0]
    ethnicity_pred = ETHNICITY_MODEL.predict(emb)[0]

    return {
        "gender": GENDER_CLASSES[gender_pred],
        "ethnicity": ETHNICITY_CLASSES[ethnicity_pred],
    }
"""

# Write the updated file
ATTR_PATH = "src/attributes/face_attributes.py"

with open(ATTR_PATH, "w") as f:
    f.write(updated_code)

print(f"Updated file → {ATTR_PATH}")


Updated file → src/attributes/face_attributes.py


In [12]:
import os
import random
import numpy as np
from PIL import Image
import src.attributes.face_attributes as fa  # use the src path so imports work consistently

IMG_DIR = "data/processed/preproc"

def infer_attributes_safe(image_path):
    emb = fa.get_embedding(image_path)
    emb = np.array(emb).reshape(1, -1)

    gender_pred = fa.GENDER_MODEL.predict(emb)[0]
    ethnicity_pred = fa.ETHNICITY_MODEL.predict(emb)[0]

    def resolve(pred, classes):
        if isinstance(pred, (int, np.integer)):
            return classes[pred]
        return str(pred)

    return {
        "gender": resolve(gender_pred, fa.GENDER_CLASSES),
        "ethnicity": resolve(ethnicity_pred, fa.ETHNICITY_CLASSES),
    }

# pick 5 random images
sample_files = random.sample(
    [f for f in os.listdir(IMG_DIR) if f.lower().endswith(".jpg")],
    5
)

print("Testing images:", sample_files, "\n")

results = []
for fname in sample_files:
    path = os.path.join(IMG_DIR, fname)
    attrs = infer_attributes_safe(path)
    results.append((fname, attrs))

# Display results
for fname, attrs in results:
    print(f"--- {fname} ---")
    print("Gender:    ", attrs["gender"])
    print("Ethnicity: ", attrs["ethnicity"])
    print()


Testing images: ['SFHQ_pt4_00000276.jpg', 'SFHQ_pt4_00085615.jpg', 'SFHQ_pt4_00019638.jpg', 'SFHQ_pt4_00083112.jpg', 'SFHQ_pt4_00010782.jpg'] 

--- SFHQ_pt4_00000276.jpg ---
Gender:     male
Ethnicity:  black

--- SFHQ_pt4_00085615.jpg ---
Gender:     unknown
Ethnicity:  unknown

--- SFHQ_pt4_00019638.jpg ---
Gender:     unknown
Ethnicity:  unknown

--- SFHQ_pt4_00083112.jpg ---
Gender:     unknown
Ethnicity:  white

--- SFHQ_pt4_00010782.jpg ---
Gender:     unknown
Ethnicity:  unknown



### Step 13 — Batch Inference for All Images (Full Dataset)

This step applies our trained gender & ethnicity classifiers to **all embeddings**, merges manual labels, and produces the final clean attribute table.

- Loads embeddings  
- Loads raw attributes  
- Loads manual labels  
- Applies classifiers  
- Merges everything together  
- Saves `attributes_final.parquet`


In [13]:
import os
import polars as pl
import joblib
import numpy as np

# --------------------------------------------------
# Paths
# --------------------------------------------------
EMB = "data/processed/embeddings/embeddings_clip.parquet"
ATTR = "data/processed/metadata/attributes.parquet"
MANUAL = "data/processed/metadata/manual_labels.csv"

GENDER_MODEL = "src/models/gender_clf.pkl"
ETH_MODEL = "src/models/ethnicity_clf.pkl"

# --------------------------------------------------
# Load models
# --------------------------------------------------
gender_clf = joblib.load(GENDER_MODEL)
ethnicity_clf = joblib.load(ETH_MODEL)

# --------------------------------------------------
# Load data
# --------------------------------------------------
df_emb = pl.read_parquet(EMB)
df_attr = pl.read_parquet(ATTR)
df_manual = pl.read_csv(MANUAL) if os.path.exists(MANUAL) else pl.DataFrame()

print("Embeddings:", df_emb.shape)
print("Attributes:", df_attr.shape)
print("Manual:", df_manual.shape)

# --------------------------------------------------
# Prepare classifier input
# --------------------------------------------------
X = np.vstack(df_emb["embedding"].to_list())

# --------------------------------------------------
# Predict
# --------------------------------------------------
gender_pred = gender_clf.predict(X)
eth_pred = ethnicity_clf.predict(X)

df_pred = pl.DataFrame({
    "filename": df_emb["filename"],
    "gender_pred": gender_pred,
    "ethnicity_pred": eth_pred,
})

# --------------------------------------------------
# Merge everything
# --------------------------------------------------
merged = (
    df_attr
    .join(df_pred, on="filename", how="inner")
 )

if df_manual.height > 0:
    merged = (
        merged
        .join(df_manual, on="filename", how="left", suffix="_manual")
    )

# --------------------------------------------------
# Final label logic:
# 1. manual label > model prediction > original > unknown
# --------------------------------------------------
merged = merged.with_columns([
    pl.coalesce(["gender_manual", "gender_pred", "gender", pl.lit("unknown")])
      .alias("gender_final"),
    pl.coalesce(["ethnicity_manual", "ethnicity_pred", "ethnicity", pl.lit("unknown")])
      .alias("ethnicity_final"),
])

# Keep only useful columns (attractiveness merged in next step)
final = merged.select([
    "filename",
    "age",
    "gender_final",
    "ethnicity_final",
])

print("Final attribute table (before attractiveness):", final.shape)
final.head()


Embeddings: (10000, 2)
Attributes: (10000, 4)
Manual: (200, 3)
Final attribute table (before attractiveness): (10000, 4)


filename,age,gender_final,ethnicity_final
str,null,str,str
"""SFHQ_pt4_00086092.jpg""",,"""unknown""","""unknown"""
"""SFHQ_pt4_00065309.jpg""",,"""female""","""unknown"""
"""SFHQ_pt4_00062466.jpg""",,"""unknown""","""unknown"""
"""SFHQ_pt4_00090828.jpg""",,"""unknown""","""unknown"""
"""SFHQ_pt4_00032251.jpg""",,"""unknown""","""unknown"""


In [14]:
# =============================================================
# Compute canonical attractiveness (deciles 1–10) and merge
# =============================================================
from pathlib import Path

from src.attractiveness.scoring import AttractivenessScorer

MODEL_PATH = Path("src/models/attractiveness_regressor.pt")
SCORES_PARQUET = Path("data/processed/metadata/attractiveness_scores.parquet")
SCORES_NUMPY = Path("data/processed/attractiveness_scores.npy")
OUT_PATH = Path("data/processed/metadata/attributes_final.parquet")

print("Computing attractiveness scores from embeddings…")
scorer = AttractivenessScorer(MODEL_PATH)
df_scores = scorer.score_embeddings(df_emb.select(["filename", "embedding"]))
scorer.save_scores(df_scores, SCORES_PARQUET, numpy_path=SCORES_NUMPY)
print("Saved canonical scores →", SCORES_PARQUET)

# Merge deciled score into final attributes
df_with_attr = final.join(
    df_scores.select(["filename", "attractiveness"]),
    on="filename",
    how="left",
)

print("Final with attractiveness:", df_with_attr.shape)
df_with_attr.write_parquet(OUT_PATH)

print("✔ Saved merged file →", OUT_PATH)
df_with_attr.head(5)


Computing attractiveness scores from embeddings…
Saved canonical scores → data/processed/metadata/attractiveness_scores.parquet
Final with attractiveness: (10000, 5)
✔ Saved merged file → data/processed/metadata/attributes_final.parquet


filename,age,gender_final,ethnicity_final,attractiveness
str,null,str,str,i64
"""SFHQ_pt4_00086092.jpg""",,"""unknown""","""unknown""",8
"""SFHQ_pt4_00065309.jpg""",,"""female""","""unknown""",7
"""SFHQ_pt4_00062466.jpg""",,"""unknown""","""unknown""",2
"""SFHQ_pt4_00090828.jpg""",,"""unknown""","""unknown""",4
"""SFHQ_pt4_00032251.jpg""",,"""unknown""","""unknown""",5
