In [None]:
# The goal of this notebook is to:
# - extract simple image features: brightness, contrast and saturation
# - merge them with the DeepFace predictions from notebook 3
# - save a clean dataset that I will use for the ML model in notebook 5

In [1]:
import os
import warnings

import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm

# Load the DeepFace prediction tables produced by notebook 3; the image
# features computed further down get attached on top of them.
root = ".."
baseline_dir = os.path.join(root, "results", "baseline")

train_pred_path = os.path.join(baseline_dir, "pred_train.parquet")
val_pred_path = os.path.join(baseline_dir, "pred_val.parquet")

train_pred, val_pred = (
    pd.read_parquet(parquet_path)
    for parquet_path in (train_pred_path, val_pred_path)
)

print(train_pred.shape, val_pred.shape)
train_pred.head()

(7000, 8) (2100, 8)


Unnamed: 0,pred_gender,pred_gender_score,pred_race,pred_race_score,error,file,race_true,gender_true
0,Woman,88.418669,black,43.71421,,train/60423.jpg,Black,Female
1,Man,97.433734,black,78.286773,,train/45029.jpg,Black,Female
2,Woman,99.483669,latino hispanic,41.124514,,train/81730.jpg,Black,Female
3,Man,89.559507,indian,58.37732,,train/72069.jpg,Black,Female
4,Man,64.608073,black,34.814405,,train/37655.jpg,Black,Female


In [2]:
def extract_features(img_path):
    """Compute global brightness, contrast and saturation for one image.

    Parameters
    ----------
    img_path : str
        Path of the image file; it is read with ``cv2.imread``.

    Returns
    -------
    dict
        Keys ``"brightness"`` (mean of the grayscale image), ``"contrast"``
        (standard deviation of the grayscale image) and ``"saturation"``
        (mean of the HSV saturation channel), each a ``float``. All three
        values are ``None`` when the image cannot be read or processed.

    Notes
    -----
    Processing failures are reported with a ``RuntimeWarning`` instead of
    being swallowed silently. ``KeyboardInterrupt`` and ``SystemExit`` are
    not caught, so a long extraction loop can still be interrupted.
    """
    # Fresh dict per call: callers add their own keys to the returned dict.
    empty = {"brightness": None, "contrast": None, "saturation": None}

    try:
        img = cv2.imread(img_path)

        # cv2.imread returns None (rather than raising) for unreadable files.
        if img is None:
            return empty

        # imread yields BGR data, so convert directly from BGR; an
        # intermediate BGR -> RGB step gives the same gray/HSV values.
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)

        return {
            "brightness": float(gray.mean()),
            "contrast": float(gray.std()),
            "saturation": float(hsv[:, :, 1].mean()),
        }
    except Exception as exc:
        # Best effort: keep the batch going, but make the failure visible.
        warnings.warn(
            f"extract_features: could not process {img_path!r}: {exc}",
            RuntimeWarning,
            stacklevel=2,
        )
        return empty
# this function extracts simple image stats: i want to test if deepface fails more on dark or low contrast pictures

In [3]:
def add_img_path(df, root):
    """Return a copy of ``df`` with an ``img_path`` column added.

    Each path is built as
    ``<root>/data/processed/balanced_images/<split>/<basename of file>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``file`` column of relative paths such as
        ``"train/60423.jpg"``. The input frame is not modified.
    root : str
        Project root that the image folder is resolved against.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the extra ``img_path`` column.
    """
    images_dir = os.path.join(root, "data", "processed", "balanced_images")

    def _resolve(rel_file):
        # Take the split from the parent folder of the relative path, so a
        # validation file whose *name* contains "train" is not misrouted.
        split = os.path.basename(os.path.dirname(rel_file))
        if split not in ("train", "val"):
            # No recognizable split folder: keep the previous substring rule.
            split = "train" if "train" in rel_file else "val"
        return os.path.join(images_dir, split, os.path.basename(rel_file))

    out = df.copy()
    out["img_path"] = out["file"].apply(_resolve)
    return out

# Rebuild the on-disk path of every image so its pixels can be read in the
# next step.
train_pred, val_pred = (
    add_img_path(pred_frame, root) for pred_frame in (train_pred, val_pred)
)

train_pred.head(3)

Unnamed: 0,pred_gender,pred_gender_score,pred_race,pred_race_score,error,file,race_true,gender_true,img_path
0,Woman,88.418669,black,43.71421,,train/60423.jpg,Black,Female,../data/processed/balanced_images/train/60423.jpg
1,Man,97.433734,black,78.286773,,train/45029.jpg,Black,Female,../data/processed/balanced_images/train/45029.jpg
2,Woman,99.483669,latino hispanic,41.124514,,train/81730.jpg,Black,Female,../data/processed/balanced_images/train/81730.jpg


In [4]:
def compute_features(df):
    """Build a feature table with one row per row of ``df``.

    For every row, ``extract_features`` is called on ``img_path`` and the
    row's ``file`` value is stored next to the returned features. Progress
    is shown with ``tqdm``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``img_path`` and ``file``.

    Returns
    -------
    pandas.DataFrame
        The extracted features followed by a ``file`` column.
    """
    records = [
        {**extract_features(row.img_path), "file": row.file}
        for row in tqdm(df.itertuples(), total=len(df))
    ]
    return pd.DataFrame(records)

# Extract brightness, contrast and saturation for every image of both splits.
train_feats, val_feats = (
    compute_features(pred_frame) for pred_frame in (train_pred, val_pred)
)

train_feats.head()

100%|█████████████████████████████████████████████████████████████████████████████| 7000/7000 [00:06<00:00, 1130.50it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2100/2100 [00:01<00:00, 1140.68it/s]


Unnamed: 0,brightness,contrast,saturation,file
0,48.98708,59.403837,167.363665,train/60423.jpg
1,141.144018,61.018735,126.112693,train/45029.jpg
2,32.576097,43.355361,46.195073,train/81730.jpg
3,106.053985,67.849858,49.254235,train/72069.jpg
4,55.268659,29.317591,129.966129,train/37655.jpg


In [6]:
# Persist the extracted features so they can be reused later without
# recomputing them.
out_dir = os.path.join(root, "results", "features")
os.makedirs(out_dir, exist_ok=True)

train_feats_path = os.path.join(out_dir, "train_features.parquet")
val_feats_path = os.path.join(out_dir, "val_features.parquet")

for feats_frame, feats_path in (
    (train_feats, train_feats_path),
    (val_feats, val_feats_path),
):
    feats_frame.to_parquet(feats_path, index=False)

train_feats.shape, val_feats.shape

((7000, 4), (2100, 4))

In [7]:
# Combine the DeepFace outputs with the image features to get one table per
# split, ready for the ML part.
#
# compute_features emits one feature row per prediction row, so a `file` that
# occurs more than once in the predictions is repeated on both sides and an
# unchecked join would multiply those rows. Keeping a single feature row per
# file makes the join many-to-one (the row count of the predictions is
# preserved), and `validate` makes pandas raise if that ever stops holding.
# NOTE(review): this assumes repeated `file` values carry identical features
# (same image path) -- confirm if the prediction tables can contain duplicates.
train_full = train_pred.merge(
    train_feats.drop_duplicates(subset="file"),
    on="file",
    how="left",
    validate="many_to_one",
)
val_full = val_pred.merge(
    val_feats.drop_duplicates(subset="file"),
    on="file",
    how="left",
    validate="many_to_one",
)

train_full.head()

Unnamed: 0,pred_gender,pred_gender_score,pred_race,pred_race_score,error,file,race_true,gender_true,img_path,brightness,contrast,saturation
0,Woman,88.418669,black,43.71421,,train/60423.jpg,Black,Female,../data/processed/balanced_images/train/60423.jpg,48.98708,59.403837,167.363665
1,Man,97.433734,black,78.286773,,train/45029.jpg,Black,Female,../data/processed/balanced_images/train/45029.jpg,141.144018,61.018735,126.112693
2,Woman,99.483669,latino hispanic,41.124514,,train/81730.jpg,Black,Female,../data/processed/balanced_images/train/81730.jpg,32.576097,43.355361,46.195073
3,Man,89.559507,indian,58.37732,,train/72069.jpg,Black,Female,../data/processed/balanced_images/train/72069.jpg,106.053985,67.849858,49.254235
4,Man,64.608073,black,34.814405,,train/37655.jpg,Black,Female,../data/processed/balanced_images/train/37655.jpg,55.268659,29.317591,129.966129


In [8]:
# Write the merged tables (predictions + image features) for both splits
# into the ml_ready folder.
ml_dir = os.path.join(root, "data", "ml_ready")
os.makedirs(ml_dir, exist_ok=True)

for full_frame, parquet_name in (
    (train_full, "train_ml_ready.parquet"),
    (val_full, "val_ml_ready.parquet"),
):
    full_frame.to_parquet(os.path.join(ml_dir, parquet_name), index=False)

print("done")

done


In [None]:
#clean dataset ready for notebook 5 where i will train the model that predicts deepface errors