In [1]:
# Goal of this notebook:
# extract visual face embeddings from all images using DeepFace.represent().
# These embeddings will later be merged with my existing ML features to improve the models, since my other features didn't help much.
# NB: I do NOT touch any of my previous parquet files;
# I simply create new embedding files, which keeps everything modular and safe.

In [2]:
import os
import pandas as pd
from tqdm import tqdm
from deepface import DeepFace

root = os.path.abspath("..")

train_parquet = os.path.join(root, "data", "ml_ready", "train_ml_ready.parquet")
val_parquet   = os.path.join(root, "data", "ml_ready", "val_ml_ready.parquet")

out_dir = os.path.join(root, "data", "embeddings")
os.makedirs(out_dir, exist_ok=True)

print("train parquet:", train_parquet)
print("val parquet:", val_parquet)
print("output dir:", out_dir)
# loading the 'ml_ready' datasets i built earlier and preparing a new folder where i will store clean embedding parquet files

train parquet: /Users/leobideau/Desktop/fairface-project/data/ml_ready/train_ml_ready.parquet
val parquet: /Users/leobideau/Desktop/fairface-project/data/ml_ready/val_ml_ready.parquet
output dir: /Users/leobideau/Desktop/fairface-project/data/embeddings


In [3]:
train_df = pd.read_parquet(train_parquet)
val_df   = pd.read_parquet(val_parquet)

print(train_df.shape, val_df.shape)
train_df.head()
# reading the ml-ready train/val datasets that contain all metadata, deepface predictions and image paths
# i will only use the img_path column here to extract embeddings

(7000, 12) (2100, 12)


Unnamed: 0,pred_gender,pred_gender_score,pred_race,pred_race_score,error,file,race_true,gender_true,img_path,brightness,contrast,saturation
0,Woman,88.418669,black,43.71421,,train/60423.jpg,Black,Female,/Users/leobideau/Desktop/fairface-project/data...,48.98708,59.403837,167.363665
1,Man,97.433734,black,78.286773,,train/45029.jpg,Black,Female,/Users/leobideau/Desktop/fairface-project/data...,141.144018,61.018735,126.112693
2,Woman,99.483669,latino hispanic,41.124514,,train/81730.jpg,Black,Female,/Users/leobideau/Desktop/fairface-project/data...,32.576097,43.355361,46.195073
3,Man,89.559507,indian,58.37732,,train/72069.jpg,Black,Female,/Users/leobideau/Desktop/fairface-project/data...,106.053985,67.849858,49.254235
4,Man,64.608073,black,34.814405,,train/37655.jpg,Black,Female,/Users/leobideau/Desktop/fairface-project/data...,55.268659,29.317591,129.966129


In [4]:
def get_embedding(path):
    try:
        res = DeepFace.represent(
            img_path = path,
            model_name = "Facenet512",
            enforce_detection = False
        )
        return res[0]["embedding"]
    except:
        return None
# This is the core function: it extracts a 512-dimensional embedding vector from each image.
# It will not crash the notebook: any error raised during extraction is caught and None is returned instead.

In [5]:
train_df["embedding"] = [
    get_embedding(p) for p in tqdm(train_df["img_path"], desc="train embeddings")
]

train_emb_out = os.path.join(out_dir, "train_embeddings.parquet")
train_df.to_parquet(train_emb_out, index=False)

print("saved:", train_emb_out)
# running face embedding extraction on the train set
# each embedding is stored inside the dataframe, then exported to a dedicated parquet file.

train embeddings: 100%|█████████████████████████████████████████████████████████████| 7000/7000 [12:38<00:00,  9.23it/s]


saved: /Users/leobideau/Desktop/fairface-project/data/embeddings/train_embeddings.parquet


In [6]:
val_df["embedding"] = [
    get_embedding(p) for p in tqdm(val_df["img_path"], desc="val embeddings")
]

val_emb_out = os.path.join(out_dir, "val_embeddings.parquet")
val_df.to_parquet(val_emb_out, index=False)

print("saved:", val_emb_out)
# same for the validation set
# i have two brand-new files without touching previous work: train_embeddings.parquet nd val_embeddings.parquet

val embeddings: 100%|███████████████████████████████████████████████████████████████| 2100/2100 [03:37<00:00,  9.67it/s]

saved: /Users/leobideau/Desktop/fairface-project/data/embeddings/val_embeddings.parquet





In [8]:
train_df["embedding"].head()
# confirming that embeddings were correctly added

0    [0.24172157049179077, -0.7326277494430542, -1....
1    [-1.2366782426834106, 1.1582697629928589, -0.8...
2    [-0.12026417255401611, -0.3230658769607544, -2...
3    [-0.16084472835063934, -0.668351411819458, -0....
4    [0.6065793037414551, -0.4668246805667877, -1.1...
Name: embedding, dtype: object

In [9]:
# next step will happen in notebook 4BIS : merging embeddings + existing ML features