In [None]:
# Goal of this notebook:
# merge the visual embeddings I extracted in notebook 2bis with the ML-ready features from notebook 4.
# NB: no existing parquet files are modified.
# I'll create new datasets, train_ml_final and val_ml_final, that include:
# the DeepFace score features, brightness, contrast, saturation, and the face embeddings.
# These final datasets will be used in notebook 5BIS to train stronger ML models.

In [1]:
import os
import pandas as pd
import numpy as np

# Locations of every source parquet file, plus the output directory for the
# merged datasets. Everything hangs off the parent of the working directory.
root = os.path.abspath("..")

ml_ready_dir = os.path.join(root, "data", "ml_ready")
embeddings_dir = os.path.join(root, "data", "embeddings")
out_dir = os.path.join(root, "data", "ml_final")

train_ml_path = os.path.join(ml_ready_dir, "train_ml_ready.parquet")
val_ml_path = os.path.join(ml_ready_dir, "val_ml_ready.parquet")

train_emb_path = os.path.join(embeddings_dir, "train_embeddings.parquet")
val_emb_path = os.path.join(embeddings_dir, "val_embeddings.parquet")

# Create the output directory up front; existing directories are left alone.
os.makedirs(out_dir, exist_ok=True)

# One resolved path per line, in the same order as they are defined above.
print(train_ml_path, val_ml_path, train_emb_path, val_emb_path, out_dir, sep="\n")

/Users/leobideau/Desktop/fairface-project/data/ml_ready/train_ml_ready.parquet
/Users/leobideau/Desktop/fairface-project/data/ml_ready/val_ml_ready.parquet
/Users/leobideau/Desktop/fairface-project/data/embeddings/train_embeddings.parquet
/Users/leobideau/Desktop/fairface-project/data/embeddings/val_embeddings.parquet
/Users/leobideau/Desktop/fairface-project/data/ml_final


In [2]:
# Load the ML-ready feature tables and the embedding tables for both splits.
train_ml, val_ml = pd.read_parquet(train_ml_path), pd.read_parquet(val_ml_path)
train_emb, val_emb = pd.read_parquet(train_emb_path), pd.read_parquet(val_emb_path)

# Shapes per split: feature table first, embedding table second.
print(*(frame.shape for frame in (train_ml, train_emb)))
print(*(frame.shape for frame in (val_ml, val_emb)))

(7000, 12) (7000, 13)
(2100, 12) (2100, 13)


In [3]:
def expand_embeddings(df):
    """Spread the "embedding" column of ``df`` into one column per dimension.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an "embedding" column whose cells are equal-length
        numeric sequences (one vector per row).

    Returns
    -------
    pandas.DataFrame
        One row per input row, with columns "emb_0", "emb_1", ... in
        dimension order. The result has a fresh 0..n-1 index; the index of
        ``df`` is not carried over.
    """
    # Stack the per-row vectors into a single (rows, dims) matrix.
    stacked = np.vstack(df["embedding"].values)
    n_dims = stacked.shape[1]
    return pd.DataFrame(
        stacked,
        columns=[f"emb_{dim}" for dim in range(n_dims)],
    )

# Turn the per-row embedding vectors of each split into a wide numeric frame
# (one column per embedding dimension).
train_emb_expanded, val_emb_expanded = map(expand_embeddings, (train_emb, val_emb))

# Row counts should match the inputs; the column count is the embedding size.
print(train_emb_expanded.shape, val_emb_expanded.shape, sep="\n")

(7000, 512)
(2100, 512)


In [4]:
def check_row_alignment(features, embeddings, expanded, split, key="file"):
    """Raise if a row-by-row merge for ``split`` would pair mismatched rows.

    The merge below is purely positional: both indexes are reset before
    ``pd.concat(axis=1)``, so pandas never verifies that row i of the feature
    table and row i of the embedding table belong together. With different
    row counts the concat would silently pad the shorter side with NaN.

    Parameters
    ----------
    features : pandas.DataFrame
        ML-ready feature table for the split.
    embeddings : pandas.DataFrame
        Table the embeddings were expanded from.
    expanded : pandas.DataFrame
        Output of ``expand_embeddings(embeddings)``.
    split : str
        Label used in the error messages (e.g. "train").
    key : str, default "file"
        Identifier column compared row by row when both tables have it.
        NOTE(review): assumes both parquet files store this column in the
        same format -- confirm against notebooks 2bis and 4.

    Raises
    ------
    ValueError
        If the row counts differ, or the ``key`` columns disagree.
    """
    if not (len(features) == len(embeddings) == len(expanded)):
        raise ValueError(
            f"{split}: row count mismatch "
            f"(features={len(features)}, embeddings={len(embeddings)}, "
            f"expanded={len(expanded)})"
        )
    if key in features.columns and key in embeddings.columns:
        mismatched = features[key].to_numpy() != embeddings[key].to_numpy()
        if mismatched.any():
            raise ValueError(
                f"{split}: '{key}' differs between features and embeddings "
                f"on {int(mismatched.sum())} rows; a positional merge would "
                f"attach embeddings to the wrong rows"
            )


# Verify alignment first: the concat below trusts row order completely.
check_row_alignment(train_ml, train_emb, train_emb_expanded, "train")
check_row_alignment(val_ml, val_emb, val_emb_expanded, "val")

# Merge the original ML features with the expanded embeddings, side by side.
train_final = pd.concat([train_ml.reset_index(drop=True),
                         train_emb_expanded.reset_index(drop=True)], axis=1)

val_final = pd.concat([val_ml.reset_index(drop=True),
                       val_emb_expanded.reset_index(drop=True)], axis=1)

print(train_final.shape)
print(val_final.shape)

(7000, 524)
(2100, 524)


In [5]:
# Export the merged datasets; these files are the inputs for notebook 5BIS.
train_final_path = os.path.join(out_dir, "train_ml_final.parquet")
val_final_path = os.path.join(out_dir, "val_ml_final.parquet")

# Write both splits before reporting, without storing the DataFrame index.
train_final.to_parquet(path=train_final_path, index=False)
val_final.to_parquet(path=val_final_path, index=False)

print(f"saved: {train_final_path}")
print(f"saved: {val_final_path}")

saved: /Users/leobideau/Desktop/fairface-project/data/ml_final/train_ml_final.parquet
saved: /Users/leobideau/Desktop/fairface-project/data/ml_final/val_ml_final.parquet


In [6]:
# Sanity check: show the first five merged rows (features followed by emb_* columns).
train_final.head(n=5)

Unnamed: 0,pred_gender,pred_gender_score,pred_race,pred_race_score,error,file,race_true,gender_true,img_path,brightness,...,emb_502,emb_503,emb_504,emb_505,emb_506,emb_507,emb_508,emb_509,emb_510,emb_511
0,Woman,88.418669,black,43.71421,,train/60423.jpg,Black,Female,../data/processed/balanced_images/train/60423.jpg,48.98708,...,-0.38336,0.31017,-1.224776,0.447964,0.245348,0.083696,-0.843259,0.022875,0.841157,0.046678
1,Man,97.433734,black,78.286773,,train/45029.jpg,Black,Female,../data/processed/balanced_images/train/45029.jpg,141.144018,...,-0.61093,-0.407818,0.366365,0.419494,-1.338491,0.949222,-0.272494,0.577557,2.479701,-0.574154
2,Woman,99.483669,latino hispanic,41.124514,,train/81730.jpg,Black,Female,../data/processed/balanced_images/train/81730.jpg,32.576097,...,-0.11148,0.934989,-0.916757,0.991272,0.842678,0.484414,0.006311,-0.357479,-0.067248,0.240468
3,Man,89.559507,indian,58.37732,,train/72069.jpg,Black,Female,../data/processed/balanced_images/train/72069.jpg,106.053985,...,-0.538655,-0.026991,-0.15006,0.86862,0.127137,-0.058055,-0.14864,-0.224628,0.565578,0.105068
4,Man,64.608073,black,34.814405,,train/37655.jpg,Black,Female,../data/processed/balanced_images/train/37655.jpg,55.268659,...,-0.441455,0.952773,-1.095437,0.918637,0.361822,-0.097846,-0.071504,-0.272044,0.446135,-0.287135


In [None]:
# final dataset ready