In [None]:
# Goal: run an existing model (DeepFace) on my balanced dataset to produce baseline predictions.

In [None]:
import os, pandas as pd, numpy as np
from tqdm import tqdm
from deepface import DeepFace

# Every path the notebook needs is derived from ROOT, the absolute path of the
# parent of the current working directory.
ROOT = os.path.abspath("..")

# Inputs: the balanced image folders (one sub-folder per split) ...
DATA_DIR = os.path.join(ROOT, "data", "processed", "balanced_images")
TRAIN_DIR, VAL_DIR = (
    os.path.join(DATA_DIR, split_name) for split_name in ("train", "val")
)

# ... and the label CSV that goes with each split.
TRAIN_CSV, VAL_CSV = (
    os.path.join(ROOT, "data", "processed", csv_name)
    for csv_name in ("balanced_train.csv", "balanced_val.csv")
)

# Outputs: baseline predictions are written here; create the folder up front.
OUT_DIR = os.path.join(ROOT, "results", "baseline")
os.makedirs(OUT_DIR, exist_ok=True)

# Echo the key paths (one per line) so a wrong working directory is spotted early.
print(TRAIN_DIR, TRAIN_CSV, OUT_DIR, sep="\n")

In [2]:
def attach_path(df, img_dir):
    """Return a copy of ``df`` with the image file name and full path added.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``file`` column holding a (possibly prefixed) image
        path such as ``train/60423.jpg``.
    img_dir : str
        Directory the images of this split live in.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` (the input is not modified) with two extra columns:
        ``fname``, the base name of ``file``, and ``img_path``, which is
        ``img_dir`` joined with ``fname``.
    """
    # Derive both columns first, then attach them to a fresh copy.
    basenames = df["file"].apply(os.path.basename)
    resolved = basenames.apply(lambda name: os.path.join(img_dir, name))

    enriched = df.copy()
    enriched["fname"] = basenames
    enriched["img_path"] = resolved
    return enriched

# Load each split's label CSV and link every row to its actual image path on disk.
train_df = pd.read_csv(TRAIN_CSV).pipe(attach_path, TRAIN_DIR)
val_df = pd.read_csv(VAL_CSV).pipe(attach_path, VAL_DIR)

# Cell output: the two split sizes plus a two-row preview of the train frame.
(len(train_df), len(val_df), train_df.head(2))

(7000,
 2100,
               file    age  gender   race  service_test      fname  \
 0  train/60423.jpg  20-29  Female  Black          True  60423.jpg   
 1  train/45029.jpg  30-39  Female  Black         False  45029.jpg   
 
                                             img_path  
 0  /Users/leobideau/Desktop/fairface-project/data...  
 1  /Users/leobideau/Desktop/fairface-project/data...  )

In [3]:
# Label alignment between FairFace ground truth and DeepFace predictions.
#
# FAIR2COARSE maps a FairFace race label to the lowercase label used on the
# prediction side; both FairFace Asian groups collapse into the single "asian" label.
FAIR2COARSE = {
    "White": "white",
    "Black": "black",
    "Indian": "indian",
    "Middle Eastern": "middle eastern",
    "Latino_Hispanic": "latino hispanic",
    "East Asian": "asian",
    "Southeast Asian": "asian",
}

# GENDER_MAP_PRED2GT goes the other way: predicted gender label -> FairFace gender label.
GENDER_MAP_PRED2GT = dict(Man="Male", Woman="Female")

In [6]:
def analyze_one(img_path, detector="retinaface"):
    """Run DeepFace gender/race analysis on one image and flatten the result.

    The DeepFace call is kept deliberately minimal (only ``actions`` and
    ``detector_backend`` are passed) because its API changes between versions.

    Parameters
    ----------
    img_path : str
        Path of the image to analyse.
    detector : str, default "retinaface"
        Forwarded to DeepFace as ``detector_backend``.

    Returns
    -------
    dict
        Always has the keys ``pred_gender``, ``pred_gender_score``,
        ``pred_race``, ``pred_race_score`` and ``error`` (in that order).
        On success ``error`` is ``None`` and each score is the value stored
        under the dominant label in the result's ``gender`` / ``race``
        mapping. If anything raises, the four prediction fields are ``None``
        and ``error`` holds ``str(exception)``; the function itself never
        raises an ``Exception`` subclass.
    """
    try:
        analysis = DeepFace.analyze(
            img_path,
            actions=["gender", "race"],
            detector_backend=detector,
        )

        # When a list comes back (several faces), only the first entry is kept.
        face = analysis[0] if isinstance(analysis, list) else analysis

        top_gender = face.get("dominant_gender")
        top_race = face.get("dominant_race")
        gender_scores = face.get("gender", {})
        race_scores = face.get("race", {})

        return {
            "pred_gender": top_gender,
            "pred_gender_score": gender_scores.get(top_gender),
            "pred_race": top_race,
            "pred_race_score": race_scores.get(top_race),
            "error": None,
        }

    except Exception as exc:
        # Keep the batch running: record the failure instead of propagating it.
        failed = dict.fromkeys(
            ("pred_gender", "pred_gender_score", "pred_race", "pred_race_score")
        )
        failed["error"] = str(exc)
        return failed

In [6]:
%pip install pyarrow
# Install pyarrow so the predictions can be written to (and read from) parquet files.
# NOTE(review): the version is not pinned (the recorded run installed pyarrow 22.0.0), and
# pip's own output warns that the kernel may need a restart before the package is usable.

Collecting pyarrow
  Downloading pyarrow-22.0.0-cp311-cp311-macosx_12_0_arm64.whl.metadata (3.1 kB)
Downloading pyarrow-22.0.0-cp311-cp311-macosx_12_0_arm64.whl (34.3 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.3/34.3 MB[0m [31m3.4 MB/s[0m  [33m0:00:10[0m[0m eta [36m0:00:01[0m[36m0:00:01[0m
[?25hInstalling collected packages: pyarrow
Successfully installed pyarrow-22.0.0
Note: you may need to restart the kernel to use updated packages.


In [None]:
pred_train = run_split(train_df, os.path.join(OUT_DIR, "pred_train.parquet"))
pred_val   = run_split(val_df,   os.path.join(OUT_DIR, "pred_val.parquet"))
pred_train.head(3)
# Run DeepFace on the full train and val splits and save the results under OUT_DIR.
# NB: this takes a while because the images are processed one by one
# (7,000 train + 2,100 val rows in the recorded run).
# NOTE(review): run_split is defined in a later cell of this notebook, so on a fresh
# kernel this cell raises NameError unless that cell has been executed first.

  0%|                                                                                          | 0/7000 [00:00<?, ?it/s]

25-11-14 18:03:31 - retinaface.h5 will be downloaded from the url https://github.com/serengil/deepface_models/releases/download/v1.0/retinaface.h5


Downloading...
From: https://github.com/serengil/deepface_models/releases/download/v1.0/retinaface.h5
To: /Users/leobideau/.deepface/weights/retinaface.h5

  0%|                                                                                        | 0.00/119M [00:00<?, ?B/s][A
  1%|█                                                                              | 1.57M/119M [00:00<00:08, 13.5MB/s][A
  4%|███▍                                                                           | 5.24M/119M [00:00<00:04, 24.1MB/s][A
  7%|█████▌                                                                         | 8.39M/119M [00:00<00:04, 25.0MB/s][A
  9%|███████▎                                                                       | 11.0M/119M [00:00<00:04, 22.4MB/s][A
 11%|█████████                                                                      | 13.6M/119M [00:00<00:04, 21.5MB/s][A
 14%|██████████▊                                                                    | 16.3M/119M [00

In [9]:
import os, pandas as pd

# Quick sanity check: was the train predictions file created, and can it be read back?
# The path is built from OUT_DIR because that is where run_split is told to write;
# a bare "results/baseline/..." string resolves against the current working directory,
# whereas ROOT (and therefore OUT_DIR) is defined as its parent.
p = os.path.join(OUT_DIR, "pred_train.parquet")
print("exists:", os.path.exists(p))

if os.path.exists(p):
    try:
        df = pd.read_parquet(p)
        print("rows:", len(df))
        print(df.head())
    except Exception as exc:
        # Report the real cause (e.g. a missing parquet engine) instead of guessing
        # that the file is corrupted; a bare `except:` would also hide Ctrl-C.
        print(f"could not read {p}: {type(exc).__name__}: {exc}")

exists: False


In [12]:
import os
# Make sure the output directory exists before predictions are written to it
# (exist_ok=True makes this a no-op when the directory is already there).
os.makedirs(OUT_DIR, exist_ok=True)

In [13]:
# Debug check: show which directory OUT_DIR currently points to and whether it exists on disk.
print("OUT_DIR =", OUT_DIR)
print("exists =", os.path.exists(OUT_DIR))

OUT_DIR = results/baseline
exists = True


In [14]:
def _flush_checkpoint(out_df, rows, out_path):
    """Append ``rows`` to ``out_df``, write the result to ``out_path``, return it."""
    batch_df = pd.DataFrame(rows)
    if out_df is not None:
        merged = pd.concat([out_df, batch_df], ignore_index=True)
    else:
        merged = batch_df

    # Write to a sibling temp file and swap it in with os.replace, so a crash in the
    # middle of a write cannot leave a truncated checkpoint at out_path.
    tmp_path = f"{os.fspath(out_path)}.tmp"
    merged.to_parquet(tmp_path, index=False)
    os.replace(tmp_path, out_path)
    return merged


def run_split(df, out_path, detector="retinaface", save_every=50):
    """Run ``analyze_one`` on every row of a split, checkpointing to parquet.

    The function supports restarting after a crash: if ``out_path`` already
    holds a readable parquet file, it is reloaded and every row whose ``file``
    value is already present is skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Split to process; rows must expose ``file``, ``img_path``, ``race``
        and ``gender``.
    out_path : str
        Parquet file used both as the checkpoint and as the final result.
    detector : str, default "retinaface"
        Forwarded to ``analyze_one``.
    save_every : int, default 50
        A checkpoint is written whenever the 1-based position of the row just
        processed in ``df`` (skipped rows included in the count) is a multiple
        of this value.

    Returns
    -------
    pandas.DataFrame or None
        All predictions (reloaded + new) with the ``analyze_one`` fields plus
        ``file``, ``race_true`` and ``gender_true``. ``None`` when there was no
        usable checkpoint and no row was processed.

    Notes
    -----
    Rows for which ``analyze_one`` reported an error are stored as well, so
    they count as done and are not retried on a later run.
    """
    out_df = None
    done_files = set()

    if os.path.exists(out_path):
        try:
            out_df = pd.read_parquet(out_path)
            done_files = set(out_df["file"].tolist())
        except Exception as exc:
            # Best effort, as before: an unusable checkpoint means starting over,
            # but say so instead of silently throwing the earlier progress away.
            print(
                f"[run_split] could not reuse checkpoint {out_path!r} "
                f"({type(exc).__name__}: {exc}); starting from scratch"
            )
            out_df = None
            done_files = set()

    rows = []
    for i, r in enumerate(df.itertuples(), start=1):
        if r.file in done_files:
            continue

        pred = analyze_one(r.img_path, detector)
        pred.update({
            "file": r.file,
            "race_true": r.race,
            "gender_true": r.gender,
        })
        rows.append(pred)

        if i % save_every == 0:
            out_df = _flush_checkpoint(out_df, rows, out_path)
            rows = []

    # Persist whatever is left after the last full batch.
    if rows:
        out_df = _flush_checkpoint(out_df, rows, out_path)

    return out_df

In [None]:
# Run (or resume) the DeepFace predictions for the train split; progress is
# checkpointed to pred_train.parquet inside OUT_DIR.
pred_train = run_split(train_df, os.path.join(OUT_DIR, "pred_train.parquet"))

Action: race: 100%|███████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 12.38it/s]
Action: race: 100%|███████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 50.15it/s]
Action: race: 100%|███████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 48.77it/s]
Action: race: 100%|███████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 47.95it/s]
Action: race: 100%|███████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 48.37it/s]
Action: race: 100%|███████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 48.55it/s]
Action: race: 100%|███████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 48.36it/s]
Action: race: 100%|███████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 46.20it/s]
Action: race: 100%|█████████████

In [16]:
import os

# Check whether both prediction files exist.
# The paths are built from OUT_DIR, the directory run_split is told to write to; a bare
# "results/baseline/..." string resolves against the current working directory,
# whereas ROOT (and therefore OUT_DIR) is defined as its parent.
print("pred_train exists:", os.path.exists(os.path.join(OUT_DIR, "pred_train.parquet")))
print("pred_val exists:",   os.path.exists(os.path.join(OUT_DIR, "pred_val.parquet")))

pred_train exists: True
pred_val exists: False


In [None]:
# Run (or resume) the DeepFace predictions for the val split; progress is
# checkpointed to pred_val.parquet inside OUT_DIR.
pred_val = run_split(val_df, os.path.join(OUT_DIR, "pred_val.parquet"))

Action: race: 100%|███████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 48.26it/s]
Action: race: 100%|███████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 45.19it/s]
Action: race: 100%|███████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 48.25it/s]
Action: race: 100%|███████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 46.16it/s]
Action: race: 100%|███████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 48.19it/s]
Action: race: 100%|███████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 44.42it/s]
Action: race: 100%|███████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 46.59it/s]
Action: race: 100%|███████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 48.64it/s]
Action: race: 100%|█████████████

In [22]:
import os

# Quick check that both prediction files exist now that the val split has been run.
# The paths are built from OUT_DIR, the directory run_split is told to write to; a bare
# "results/baseline/..." string resolves against the current working directory,
# whereas ROOT (and therefore OUT_DIR) is defined as its parent.
print("pred_train exists:", os.path.exists(os.path.join(OUT_DIR, "pred_train.parquet")))
print("pred_val exists:",   os.path.exists(os.path.join(OUT_DIR, "pred_val.parquet")))

pred_train exists: True
pred_val exists: True


In [23]:
import pandas as pd

# Reload the prediction files for train and val to check that everything is there
# and that the row counts look right.
# The files are read from OUT_DIR, the directory run_split is told to write to; a bare
# "results/baseline/..." string resolves against the current working directory,
# whereas ROOT (and therefore OUT_DIR) is defined as its parent.
train_pred = pd.read_parquet(os.path.join(OUT_DIR, "pred_train.parquet"))
val_pred   = pd.read_parquet(os.path.join(OUT_DIR, "pred_val.parquet"))

print(len(train_pred), len(val_pred))
train_pred.head()

7000 2100


Unnamed: 0,pred_gender,pred_gender_score,pred_race,pred_race_score,error,file,race_true,gender_true
0,Woman,88.418669,black,43.71421,,train/60423.jpg,Black,Female
1,Man,97.433734,black,78.286773,,train/45029.jpg,Black,Female
2,Woman,99.483669,latino hispanic,41.124514,,train/81730.jpg,Black,Female
3,Man,89.559507,indian,58.37732,,train/72069.jpg,Black,Female
4,Man,64.608073,black,34.814405,,train/37655.jpg,Black,Female
