In [1]:
import os, sys

# 1) Set notebook working directory = FaceStats project root
PROJECT_ROOT = "/Users/jayklarin/__DI/Repositories/FaceStats"
os.chdir(PROJECT_ROOT)
print("cwd:", os.getcwd())

# 2) Add src/ folder to Python PATH
SRC_PATH = os.path.join(PROJECT_ROOT, "src")
if SRC_PATH not in sys.path:
    sys.path.insert(0, SRC_PATH)

print("src path added:", SRC_PATH)


cwd: /Users/jayklarin/__DI/Repositories/FaceStats
src path added: /Users/jayklarin/__DI/Repositories/FaceStats/src


In [2]:
from attributes.face_attributes import infer_attributes

import os
import polars as pl

INPUT_DIR = "data/processed/preproc"
OUTPUT_FILE = "data/processed/metadata/attributes.parquet"

os.makedirs("data/processed/metadata", exist_ok=True)

rows = []

for fname in sorted(os.listdir(INPUT_DIR)):
    if not fname.lower().endswith(".jpg"):
        continue

    path = os.path.join(INPUT_DIR, fname)

    attrs = infer_attributes(path)

    rows.append({
        "filename": fname,
        "age": attrs["age"],
        "gender": attrs["gender"],
        "ethnicity": attrs["ethnicity"],
    })

df = pl.DataFrame(rows)
df.write_parquet(OUTPUT_FILE)

df.head()




Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: /Users/jayklarin/.insightface/models/buffalo_l/1k3d68.onnx landmark_3d_68 ['None', 3, 192, 192] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: /Users/jayklarin/.insightface/models/buffalo_l/2d106det.onnx landmark_2d_106 ['None', 3, 192, 192] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: /Users/jayklarin/.insightface/models/buffalo_l/det_10g.onnx detection [1, 3, '?', '?'] 127.5 128.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: /Users/jayklarin/.insightface/models/buffalo_l/genderage.onnx genderage ['None', 3, 96, 96] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: /Users/jayklarin/.insightface/models/buffalo_l/w600k_r50.onnx recognition ['None', 3, 112,

filename,age,gender,ethnicity
str,f64,str,str
"""SFHQ_pt4_00000058.jpg""",,"""unknown""","""unknown"""
"""SFHQ_pt4_00000072.jpg""",81.0,"""female""","""Asian"""
"""SFHQ_pt4_00000090.jpg""",46.0,"""female""","""Asian"""
"""SFHQ_pt4_00000095.jpg""",30.0,"""female""","""Asian"""
"""SFHQ_pt4_00000151.jpg""",,"""unknown""","""unknown"""


### Step 4 — Validate Attribute Results

Now that we have age, gender, and ethnicity extracted using `face_attributes.py`, we need to validate the results.

InsightFace may return:

- `None` for age (if no face detected)
- `"unknown"` for gender
- `"unknown"` for ethnicity

This step adds:
- face detection confidence  
- number of faces detected  
- quality checks  
- flags for low-quality results  


In [3]:
from attributes.face_attributes import app


In [None]:
import numpy as np
from PIL import Image

def infer_attributes_with_meta(image_path):
    img = np.array(Image.open(image_path).convert("RGB"))
    faces = app.get(img)

    # No faces
    if len(faces) == 0:
        return {
            "age": None,
            "gender": "unknown",
            "ethnicity": "unknown",
            "face_count": 0,
            "confidence": 0.0,
        }

    face = faces[0]

    # Map gender
    gender = "male" if face.sex == 1 else "female"

    # Ethnicity with confidence gating
    race_probs = face.race
    race_labels = ["Asian", "White", "Black", "Indian", "Middle Eastern", "Latino", "Other"]

    race_idx = int(np.argmax(race_probs))
    race_conf = float(race_probs[race_idx])

    ETHNICITY_THRESHOLD = 0.55

    if race_conf >= ETHNICITY_THRESHOLD:
        ethnicity = race_labels[race_idx]
    else:
        ethnicity = "unknown"


    return {
        "age": float(face.age),
        "gender": gender,
        "ethnicity": ethnicity,
        "face_count": len(faces),
        "confidence": float(face.det_score),   # Important: detection confidence
    }


In [5]:
import os
import polars as pl

INPUT_DIR = "data/processed/preproc"
OUTPUT_FILE = "data/processed/metadata/attributes_with_meta.parquet"

rows = []

for fname in sorted(os.listdir(INPUT_DIR)):
    if not fname.lower().endswith(".jpg"):
        continue

    path = os.path.join(INPUT_DIR, fname)
    attrs = infer_attributes_with_meta(path)

    rows.append({
        "filename": fname,
        "age": attrs["age"],
        "gender": attrs["gender"],
        "ethnicity": attrs["ethnicity"],
        "face_count": attrs["face_count"],
        "confidence": attrs["confidence"],
    })

df = pl.DataFrame(rows)
df.write_parquet(OUTPUT_FILE)

df.head()


filename,age,gender,ethnicity,face_count,confidence
str,f64,str,str,i64,f64
"""SFHQ_pt4_00000058.jpg""",,"""unknown""","""unknown""",0,0.0
"""SFHQ_pt4_00000072.jpg""",81.0,"""female""","""Asian""",1,0.532229
"""SFHQ_pt4_00000090.jpg""",46.0,"""female""","""Asian""",1,0.526885
"""SFHQ_pt4_00000095.jpg""",30.0,"""female""","""Asian""",1,0.814608
"""SFHQ_pt4_00000151.jpg""",,"""unknown""","""unknown""",0,0.0


In [6]:
df_clean = (
    df
    .filter(pl.col("face_count") == 1)
    .filter(pl.col("confidence") >= 0.40)
    .filter(pl.col("age").is_not_null())
    .filter((pl.col("age") >= 5) & (pl.col("age") <= 100))
)

df_clean.head()


filename,age,gender,ethnicity,face_count,confidence
str,f64,str,str,i64,f64
"""SFHQ_pt4_00000072.jpg""",81.0,"""female""","""Asian""",1,0.532229
"""SFHQ_pt4_00000090.jpg""",46.0,"""female""","""Asian""",1,0.526885
"""SFHQ_pt4_00000095.jpg""",30.0,"""female""","""Asian""",1,0.814608
"""SFHQ_pt4_00000182.jpg""",34.0,"""female""","""Asian""",1,0.696127
"""SFHQ_pt4_00000208.jpg""",50.0,"""female""","""Asian""",1,0.55539


In [7]:
df_clean.write_parquet("data/processed/metadata/attributes_clean.parquet")

In [8]:
df_clean.describe()

statistic,filename,age,gender,ethnicity,face_count,confidence
str,str,f64,str,str,f64,f64
"""count""","""44""",44.0,"""44""","""44""",44.0,44.0
"""null_count""","""0""",0.0,"""0""","""0""",0.0,0.0
"""mean""",,44.272727,,,1.0,0.623791
"""std""",,16.219947,,,0.0,0.087865
"""min""","""SFHQ_pt4_00000072.jpg""",22.0,"""female""","""Asian""",1.0,0.505435
"""25%""",,31.0,,,1.0,0.55539
"""50%""",,41.0,,,1.0,0.61063
"""75%""",,56.0,,,1.0,0.696127
"""max""","""SFHQ_pt4_00003657.jpg""",83.0,"""female""","""Asian""",1.0,0.814608


In [9]:
print("Counts by gender:")
print(df_clean["gender"].value_counts())

print("\nCounts by ethnicity:")
print(df_clean["ethnicity"].value_counts())


Counts by gender:
shape: (1, 2)
┌────────┬───────┐
│ gender ┆ count │
│ ---    ┆ ---   │
│ str    ┆ u32   │
╞════════╪═══════╡
│ female ┆ 44    │
└────────┴───────┘

Counts by ethnicity:
shape: (1, 2)
┌───────────┬───────┐
│ ethnicity ┆ count │
│ ---       ┆ ---   │
│ str       ┆ u32   │
╞═══════════╪═══════╡
│ Asian     ┆ 44    │
└───────────┴───────┘


In [10]:
df_flags = df.filter(
    (pl.col("face_count") != 1) |
    (pl.col("confidence") < 0.40) |
    (pl.col("age").is_null()) |
    (pl.col("gender") == "unknown") |
    (pl.col("ethnicity") == "unknown")
)

df_flags.write_parquet("data/processed/metadata/attributes_flags.parquet")
df_flags.head()


filename,age,gender,ethnicity,face_count,confidence
str,f64,str,str,i64,f64
"""SFHQ_pt4_00000058.jpg""",,"""unknown""","""unknown""",0,0.0
"""SFHQ_pt4_00000151.jpg""",,"""unknown""","""unknown""",0,0.0
"""SFHQ_pt4_00000223.jpg""",,"""unknown""","""unknown""",0,0.0
"""SFHQ_pt4_00000251.jpg""",,"""unknown""","""unknown""",0,0.0
"""SFHQ_pt4_00000373.jpg""",,"""unknown""","""unknown""",0,0.0
