In [None]:
# the goal of this notebook is to use 4 machine learning models
# (logistic regression, KNN, random forest, SVM)
# to predict deepface race-classification errors and to compare their performance to the baseline classifier

In [4]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# loading the feature datasets i built previously so i can train ML models on them:
# one parquet file for the training split and one for the validation split
train_path = "../data/ml_ready/train_ml_ready.parquet"
val_path = "../data/ml_ready/val_ml_ready.parquet"

train = pd.read_parquet(train_path)
val = pd.read_parquet(val_path)

# quick sanity check: (rows, columns) of each split, then a preview of the training rows
print(train.shape, val.shape)
train.head()

(7000, 12) (2100, 12)


Unnamed: 0,pred_gender,pred_gender_score,pred_race,pred_race_score,error,file,race_true,gender_true,img_path,brightness,contrast,saturation
0,Woman,88.418669,black,43.71421,,train/60423.jpg,Black,Female,../data/processed/balanced_images/train/60423.jpg,48.98708,59.403837,167.363665
1,Man,97.433734,black,78.286773,,train/45029.jpg,Black,Female,../data/processed/balanced_images/train/45029.jpg,141.144018,61.018735,126.112693
2,Woman,99.483669,latino hispanic,41.124514,,train/81730.jpg,Black,Female,../data/processed/balanced_images/train/81730.jpg,32.576097,43.355361,46.195073
3,Man,89.559507,indian,58.37732,,train/72069.jpg,Black,Female,../data/processed/balanced_images/train/72069.jpg,106.053985,67.849858,49.254235
4,Man,64.608073,black,34.814405,,train/37655.jpg,Black,Female,../data/processed/balanced_images/train/37655.jpg,55.268659,29.317591,129.966129


In [7]:
# cleaning the labels and building the target, then selecting and scaling
# the numeric features for all future ML models
#
# target convention: 1 = error (deepface's race prediction differs from the true race), 0 = correct
# Nb: i defined the ML target this way to make sure accuracy and the other metrics actually
# reflect the model's ability to detect deepface errors.
# notebook 3 used the opposite convention but only for descriptive analysis

# normalise both label columns so they can be compared as plain strings:
# lower-case everything, and turn underscores in the ground-truth labels into spaces
# NOTE(review): only case and underscores are normalised here; if the two label vocabularies
# differ in any other way, those rows are always counted as errors -- confirm the vocabularies match
for split in (train, val):
    split["race_true_clean"] = split["race_true"].str.lower().str.replace("_", " ")
    split["pred_race_clean"] = split["pred_race"].str.lower()

# binary target: 1 where the cleaned prediction differs from the cleaned ground truth
y_train = train["race_true_clean"].ne(train["pred_race_clean"]).astype(int)
y_val = val["race_true_clean"].ne(val["pred_race_clean"]).astype(int)

# numeric features: deepface confidence scores plus simple image statistics
feat_cols = ["pred_race_score", "pred_gender_score", "brightness", "contrast", "saturation"]

X_train = train[feat_cols]
X_val = val[feat_cols]

# the scaler is fitted on the training split only and then applied to both splits,
# so no statistics from the validation data are used when standardising
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [None]:
# logistic regression 

In [18]:
# fit a logistic regression on the scaled training features
# (max_iter is set to 300 iterations for the solver)
log_model = LogisticRegression(max_iter=300).fit(X_train_scaled, y_train)

# evaluate on the validation split
log_pred = log_model.predict(X_val_scaled)
log_acc = accuracy_score(y_true=y_val, y_pred=log_pred)

# accuracy has to be read next to the class balance (counts of class 0 and class 1)
print("logistic acc:", log_acc)
print("class balance val:", np.bincount(y_val))

logistic acc: 0.6671428571428571
class balance val: [ 698 1402]


In [19]:
# class balance of the training target: [count of class 0 (correct), count of class 1 (error)]
train_class_counts = np.bincount(y_train)
print(train_class_counts)

[2440 4560]


In [20]:
# the logistic regression achieves 66.71% accuracy but this is misleading: 
# the validation set contains 66.76% errors (the majority class), so the model does no better than
# always predicting "error" and is not capturing meaningful patterns.

# this suggests that with current features, simple linear models cannot
# reliably distinguish between correct and incorrect deepface predictions

# either the features do not contain strong predictive signals,
# or the problem requires more complex feature engineering or non-linear models.

In [21]:
# proof: score a naive classifier that labels every validation image as an error (class 1)
# and compare its accuracy with the models; this is the majority-class baseline
naive_predictions = np.ones(shape=len(y_val))
naive_accuracy = accuracy_score(y_true=y_val, y_pred=naive_predictions)
print(f"accuracy if i always predcit 'error': {naive_accuracy}")

accuracy if i always predcit 'error': 0.6676190476190477


In [22]:
# KNN:

In [23]:
# evaluate k-nearest-neighbours on the scaled features for a few neighbourhood sizes
# (KNeighborsClassifier is already imported in the imports cell at the top of the notebook,
#  so it is not re-imported here)
k_values = [3, 5, 7]
knn_results = {}  # maps k -> validation accuracy; read again when building the results table

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    pred = knn.predict(X_val_scaled)

    acc = accuracy_score(y_val, pred)
    knn_results[k] = acc

    print(f"KNN (k={k}) accuracy:", acc)

# class balance printed again so the accuracies can be compared with the majority-class rate
print("\nclass balance val:", np.bincount(y_val))

KNN (k=3) accuracy: 0.5814285714285714
KNN (k=5) accuracy: 0.5919047619047619
KNN (k=7) accuracy: 0.6033333333333334

class balance val: [ 698 1402]


In [24]:
# across k = 3, 5, and 7, KNN reaches accuracies of 58–60%, which remain below the majority-class baseline of 66.76%
# this shows that KNN does not learn meaningful predictive structure with the current features
# this provides further evidence that more expressive models or richer feature sets are needed to capture deepface error patterns

In [25]:
# random forest: 

In [26]:
# random forest with 200 trees and a fixed seed so the result is reproducible
# (RandomForestClassifier is already imported in the imports cell at the top of the notebook,
#  so it is not re-imported here)
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,      # no depth limit on the trees
    random_state=42,
    class_weight=None    # no re-weighting of the two classes
)

# note: unlike the other models, the forest is fitted on the unscaled features
# (X_train / X_val, not the *_scaled arrays); scaling is not needed for tree-based models
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_val)

rf_acc = accuracy_score(y_val, rf_pred)

print("Random Forest accuracy:", rf_acc)
print("class balance val:", np.bincount(y_val))

Random Forest accuracy: 0.6747619047619048
class balance val: [ 698 1402]


In [27]:
# random forest reaches 67.48% accuracy, slightly above the majority-class baseline of 66.76% ->
# this means it's the first model that actually seems to learn a bit of structure from the features, even if the improvement is quite small
# (about 0.7 percentage points, i.e. 15 more correct validation images out of 2100).
# it seems to capture some non-linear patterns that logistic regression and KNN did not exploit.
# However the gain is limited, suggesting the feature space is still too weak to model deepface failures

In [28]:
# support vector machine -> SVM :

In [29]:
# train two support vector classifiers on the scaled features, one per kernel,
# and report the validation accuracy of each

# --- linear kernel ---
svm_linear = SVC(kernel="linear").fit(X_train_scaled, y_train)
svm_linear_pred = svm_linear.predict(X_val_scaled)
svm_linear_acc = accuracy_score(y_true=y_val, y_pred=svm_linear_pred)
print("SVM (linear) accuracy:", svm_linear_acc)

# --- RBF kernel ---
svm_rbf = SVC(kernel="rbf").fit(X_train_scaled, y_train)
svm_rbf_pred = svm_rbf.predict(X_val_scaled)
svm_rbf_acc = accuracy_score(y_true=y_val, y_pred=svm_rbf_pred)
print("SVM (RBF) accuracy:", svm_rbf_acc)

# class balance printed again so both accuracies can be compared with the majority-class rate
print("class balance val:", np.bincount(y_val))

SVM (linear) accuracy: 0.6676190476190477
SVM (RBF) accuracy: 0.6638095238095238
class balance val: [ 698 1402]


In [30]:
# Both SVM models (linear and RBF) reach accuracies close to the majority-class baseline of 66.76%
# the linear SVM matches the baseline exactly (66.76%), which suggests it predicts "error" for (almost) every image, much like logistic regression;
# the RBF kernel can model non-linear structure, but here it brings no improvement (66.38%, slightly below the baseline)
# this confirms that the current feature set does not contain strong predictive signals about when deepface will fail

In [33]:
# summary table: validation accuracy of every model next to the majority-class baseline
# (pandas is already imported in the imports cell at the top of the notebook)
#
# the KNN rows are generated from knn_results (k -> accuracy) instead of being hard-coded,
# so the table stays in sync with whatever k values were evaluated in the KNN cell
model_accuracies = {
    "Logistic Regression": log_acc,
    **{f"KNN (k={k})": acc for k, acc in knn_results.items()},
    "Random Forest": rf_acc,
    "SVM (linear)": svm_linear_acc,
    "SVM (RBF)": svm_rbf_acc,
    "Baseline (always error)": naive_accuracy,
}

df_results = pd.DataFrame({
    "Model": list(model_accuracies.keys()),
    "Accuracy": list(model_accuracies.values()),
})
# round for display only; the unrounded values remain available in the *_acc variables
df_results["Accuracy"] = df_results["Accuracy"].round(4)

df_results

Unnamed: 0,Model,Accuracy
0,Logistic Regression,0.6671
1,KNN (k=3),0.5814
2,KNN (k=5),0.5919
3,KNN (k=7),0.6033
4,Random Forest,0.6748
5,SVM (linear),0.6676
6,SVM (RBF),0.6638
7,Baseline (always error),0.6676


In [None]:
# final conclusion: 
# unfortunately across all four models performance stays close to the majority-class baseline
# KNN performs the worst, while Random Forest is the only model that does slightly better than the baseline
# this shows that the available features do not contain strong predictive signals about when deepface will fail...
# I'll try to achieve higher performance with richer feature representations by updating previous notebooks