In [None]:
# the goal of this notebook is to use 4 machine learning models:
# logistic regression, KNN, random forest, SVM
# to predict deepface race-classification errors and compare their performance to the baseline classifier

In [4]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Read the train and validation feature tables (paths are relative to this notebook).
train, val = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "../data/ml_ready/train_ml_ready.parquet",
        "../data/ml_ready/val_ml_ready.parquet",
    )
)

# Sanity check on (rows, columns) for both splits, then preview the first training rows.
print(train.shape, val.shape)
train.head()
# loading the feature datasets i built previously so i can train ML models on them

(7000, 12) (2100, 12)


Unnamed: 0,pred_gender,pred_gender_score,pred_race,pred_race_score,error,file,race_true,gender_true,img_path,brightness,contrast,saturation
0,Woman,88.418669,black,43.71421,,train/60423.jpg,Black,Female,../data/processed/balanced_images/train/60423.jpg,48.98708,59.403837,167.363665
1,Man,97.433734,black,78.286773,,train/45029.jpg,Black,Female,../data/processed/balanced_images/train/45029.jpg,141.144018,61.018735,126.112693
2,Woman,99.483669,latino hispanic,41.124514,,train/81730.jpg,Black,Female,../data/processed/balanced_images/train/81730.jpg,32.576097,43.355361,46.195073
3,Man,89.559507,indian,58.37732,,train/72069.jpg,Black,Female,../data/processed/balanced_images/train/72069.jpg,106.053985,67.849858,49.254235
4,Man,64.608073,black,34.814405,,train/37655.jpg,Black,Female,../data/processed/balanced_images/train/37655.jpg,55.268659,29.317591,129.966129


In [7]:
def _error_target(frame):
    """Add normalised label columns to ``frame`` and return the binary error target.

    Two columns are written onto ``frame`` in place:
    ``race_true_clean`` (lower-cased ``race_true`` with "_" replaced by " ") and
    ``pred_race_clean`` (lower-cased ``pred_race``).

    Returns an integer Series: 1 where the two cleaned labels differ, 0 where they match.
    """
    frame["race_true_clean"] = frame["race_true"].str.lower().str.replace("_", " ")
    frame["pred_race_clean"] = frame["pred_race"].str.lower()
    # NOTE(review): this is exact string equality after normalisation - confirm that
    # both label columns use the same category names, otherwise any category that is
    # spelled differently on the two sides is always counted as an error.
    return (frame["race_true_clean"] != frame["pred_race_clean"]).astype(int)


y_train = _error_target(train)
y_val = _error_target(val)

# Numeric inputs shared by every model trained on these splits.
feat_cols = [
    "pred_race_score",
    "pred_gender_score",
    "brightness",
    "contrast",
    "saturation",
]

X_train, X_val = train[feat_cols], val[feat_cols]

# Scaling statistics are learned from the training split only and then applied
# unchanged to the validation split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
# cleaning the labels and building the target 0 = correct, 1 = error
# then selecting and scaling the numeric features for all future ML models

# NB: to make sure accuracy and the other metrics actually reflect the model's ability to detect deepface errors,
# i defined the ML target here as 1 = error and 0 = correct.
# notebook 3 used the opposite convention but only for descriptive analysis

In [None]:
# logistic regression 

In [8]:
# Fit on the scaled training features; the iteration cap is set to 300 and every
# other hyper-parameter is left at the library default.
log_model = LogisticRegression(max_iter=300).fit(X_train_scaled, y_train)

# Hard 0/1 predictions for the validation split.
log_pred = log_model.predict(X_val_scaled)

# Accuracy is printed next to the validation class counts ([correct, error]) so it
# can be read against the majority-class share.
# NOTE(review): per-class precision/recall (e.g. classification_report, already
# imported) would show directly whether class 0 is ever predicted.
print("logistic acc:", accuracy_score(y_val, log_pred))
print("class balance val:", np.bincount(y_val))

logistic acc: 0.6671428571428571
class balance val: [ 698 1402]


In [9]:
# Class counts of the training target, ordered [correct (0), error (1)].
train_class_counts = np.bincount(y_train)
print(train_class_counts)

[2440 4560]


In [None]:
# the logistic regression achieves ~67% accuracy, but this is misleading:
# the validation set has ~67% errors (majority class), so the model is essentially
# learning to always predict "error" without capturing meaningful patterns.

# this suggests that with the current features, simple linear models cannot
# reliably distinguish between correct and incorrect deepface predictions

# either the features do not contain strong predictive signals,
# or the problem requires more complex feature engineering or non-linear models.

In [13]:
# Proof: score a constant "always error" prediction (label 1 for every validation
# row) so its accuracy can be compared with the logistic regression above.
naive_predictions = np.full(len(y_val), 1.0)
naive_accuracy = accuracy_score(y_val, naive_predictions)
print(f"accuracy if i always predcit 'error': {naive_accuracy}")

accuracy if i always predcit 'error': 0.6676190476190477
