In [1]:
import os
import cv2
import random
import numpy as np

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go


from deepface import DeepFace

from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score


from collections import Counter




# Step 1: Evaluate DeepFace Models

- Facenet
- Facenet512
- GhostFaceNet
- ArcFace
- SFace
- Dlib

In [2]:
def evaluate_multiclass_deepface_2images(
    data_dir,
    model_name="Facenet",
    enforce_detection=False,
    title="DeepFace - Multi-class Evaluation (2-images Test)"
):
    """
    Evaluates a DeepFace model in a multi-class setting where each class folder
    in `data_dir` has exactly 2 images.

    1. Compute the embedding of every image once (folders that do not contain
       exactly 2 images are ignored).
    2. Compute a reference embedding per class by averaging the embeddings of its images.
    3. For each image, find the class whose reference embedding is closest (Euclidean distance).
       Images without an embedding are predicted as "Unknown".
    4. Print overall metrics and a multi-class classification report
       (precision, recall, F1) for each class.

    Note
    ----
    Every evaluated image also contributes to the reference embedding of its own
    class, so the reported scores are an optimistic estimate rather than a
    held-out measurement.

    Parameters
    ----------
    data_dir : str
        Path to the main folder containing subfolders (each subfolder is a class).
    model_name : str, optional
        DeepFace model to use (e.g., 'Facenet', 'VGG-Face', 'OpenFace', 'ArcFace', 'Dlib').
    enforce_detection : bool, optional
        Forwarded to `DeepFace.represent`: whether to enforce face detection (True)
        or allow images with no face (False).
    title : str, optional
        Title for printing the evaluation results.

    Returns
    -------
    dict
        Dictionary with the keys "accuracy", "precision", "recall" and "f1_score"
        (precision/recall/F1 are macro-averaged). An empty dict is returned when no
        class folder with exactly 2 images is found.
    """
    image_extensions = ('.jpg', '.jpeg', '.png')

    def _embed(img_path):
        """Return the first embedding DeepFace yields for `img_path`, or None on failure."""
        try:
            rep = DeepFace.represent(
                img_path,
                model_name=model_name,
                enforce_detection=enforce_detection
            )
        except Exception as e:
            # Best-effort evaluation: report the failure and keep going.
            print(f"Error generating embedding for {img_path}: {e}")
            return None
        if rep:
            return np.asarray(rep[0]["embedding"], dtype=float)
        return None

    # 1. Gather all subfolders that have exactly 2 images.
    #    Directory listings come back in arbitrary order, so sort them to make
    #    the evaluation (including distance tie-breaking) reproducible.
    with os.scandir(data_dir) as entries:
        subfolders = sorted(entry.path for entry in entries if entry.is_dir())

    class_images = {}
    for subfolder in subfolders:
        images = sorted(
            os.path.join(subfolder, img)
            for img in os.listdir(subfolder)
            if img.lower().endswith(image_extensions)
        )
        # Only store classes that have exactly 2 images
        if len(images) == 2:
            class_images[os.path.basename(subfolder)] = images

    if not class_images:
        print("No classes found with exactly 2 images.")
        return {}

    # 2. Embed every image exactly once; the same vectors serve both as
    #    reference material and as test samples, so there is no need to run
    #    the (expensive) model twice per image.
    embeddings = {
        img_path: _embed(img_path)
        for images in class_images.values()
        for img_path in images
    }

    # 3. Compute the reference embedding for each class
    class_reference_embeddings = {}
    for class_name, images in class_images.items():
        emb_list = [embeddings[p] for p in images if embeddings[p] is not None]
        if emb_list:
            class_reference_embeddings[class_name] = np.mean(emb_list, axis=0)
        else:
            print(f"No embeddings found for class '{class_name}'. Skipping.")

    ref_classes = list(class_reference_embeddings)
    ref_matrix = (
        np.stack([class_reference_embeddings[c] for c in ref_classes])
        if ref_classes else None
    )

    # 4. Classify each image in the dataset by nearest reference embedding
    true_labels = []
    pred_labels = []
    for class_name, images in class_images.items():
        for img_path in images:
            true_labels.append(class_name)
            emb = embeddings[img_path]
            if emb is None or ref_matrix is None:
                # If no embedding found, predict "Unknown"
                pred_labels.append("Unknown")
                continue

            # Euclidean distance to every class reference at once.
            dists = np.linalg.norm(ref_matrix - emb, axis=1)
            # A NaN distance can never be "the closest" one.
            dists = np.where(np.isnan(dists), np.inf, dists)
            best_idx = int(np.argmin(dists))  # first minimum wins on ties
            if np.isfinite(dists[best_idx]):
                pred_labels.append(ref_classes[best_idx])
            else:
                pred_labels.append("Unknown")

    # 5. Generate a multi-class classification report.
    #    List every evaluated class (even one without a reference embedding) so
    #    the report covers the same label set as the macro-averaged metrics.
    all_classes = sorted(class_images)
    # Include "Unknown" only if some image was actually labeled that way
    if "Unknown" in pred_labels:
        all_classes = all_classes + ["Unknown"]

    print(title)
    print("=" * len(title))

    # Print a summary of overall metrics
    accuracy = accuracy_score(true_labels, pred_labels)
    precision = precision_score(true_labels, pred_labels, average="macro", zero_division=0)
    recall = recall_score(true_labels, pred_labels, average="macro", zero_division=0)
    f1 = f1_score(true_labels, pred_labels, average="macro", zero_division=0)
    print(f"Accuracy : {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall   : {recall:.4f}")
    print(f"F1-score : {f1:.4f}")

    # Print the full classification report
    print("\nFull Classification Report:")
    report = classification_report(true_labels, pred_labels, labels=all_classes, zero_division=0)
    print(report)

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1
    }


In [3]:
# Test dataset directory: one sub-folder per identity.
test_data_dir = "lfw-deepfunneled-2images-test"

# Baseline run with the 128-d Facenet model; the returned metrics dict is the cell output.
evaluate_multiclass_deepface_2images(
    test_data_dir,
    model_name="Facenet",
    enforce_detection=False,
    title="DeepFace - Facenet Multi-class Evaluation",
)

DeepFace - Facenet Multi-class Evaluation
Accuracy : 0.9450
Precision: 0.9915
Recall   : 0.9450
F1-score : 0.9560

Full Classification Report:
                      precision    recall  f1-score   support

    Bertrand_Bonello       1.00      1.00      1.00         2
       Bill_Parcells       1.00      1.00      1.00         2
        Billy_Graham       1.00      1.00      1.00         2
      Blythe_Hartley       1.00      1.00      1.00         2
             Bo_Ryan       1.00      1.00      1.00         2
     Bobby_Goldwater       1.00      1.00      1.00         2
        Bobby_Robson       1.00      0.50      0.67         2
       Boris_Yeltsin       1.00      1.00      1.00         2
      Brendan_Hansen       1.00      1.00      1.00         2
 Brigitte_Boisselier       1.00      1.00      1.00         2
         Carl_Reiner       1.00      1.00      1.00         2
      Carlos_Bianchi       1.00      1.00      1.00         2
         Carson_Daly       1.00      0.50      0.6

{'accuracy': 0.945,
 'precision': 0.9915384615384616,
 'recall': 0.945,
 'f1_score': 0.9560000000000001}

In [4]:
# Same evaluation protocol, Facenet512 model.
evaluate_multiclass_deepface_2images(
    test_data_dir,
    model_name="Facenet512",
    enforce_detection=False,
    title="DeepFace - Facenet512 Multi-class Evaluation",
)


DeepFace - Facenet512 Multi-class Evaluation
Accuracy : 0.9900
Precision: 0.9933
Recall   : 0.9900
F1-score : 0.9893

Full Classification Report:
                      precision    recall  f1-score   support

    Bertrand_Bonello       1.00      1.00      1.00         2
       Bill_Parcells       1.00      1.00      1.00         2
        Billy_Graham       1.00      1.00      1.00         2
      Blythe_Hartley       1.00      1.00      1.00         2
             Bo_Ryan       1.00      1.00      1.00         2
     Bobby_Goldwater       1.00      1.00      1.00         2
        Bobby_Robson       1.00      1.00      1.00         2
       Boris_Yeltsin       1.00      1.00      1.00         2
      Brendan_Hansen       1.00      1.00      1.00         2
 Brigitte_Boisselier       1.00      1.00      1.00         2
         Carl_Reiner       1.00      1.00      1.00         2
      Carlos_Bianchi       1.00      1.00      1.00         2
         Carson_Daly       1.00      1.00      

{'accuracy': 0.99,
 'precision': 0.9933333333333334,
 'recall': 0.99,
 'f1_score': 0.9893333333333334}

In [5]:
# Same evaluation protocol, GhostFaceNet model.
evaluate_multiclass_deepface_2images(
    test_data_dir,
    model_name="GhostFaceNet",
    enforce_detection=False,
    title="DeepFace - GhostFaceNet Multi-class Evaluation",
)

DeepFace - GhostFaceNet Multi-class Evaluation
Accuracy : 1.0000
Precision: 1.0000
Recall   : 1.0000
F1-score : 1.0000

Full Classification Report:
                      precision    recall  f1-score   support

    Bertrand_Bonello       1.00      1.00      1.00         2
       Bill_Parcells       1.00      1.00      1.00         2
        Billy_Graham       1.00      1.00      1.00         2
      Blythe_Hartley       1.00      1.00      1.00         2
             Bo_Ryan       1.00      1.00      1.00         2
     Bobby_Goldwater       1.00      1.00      1.00         2
        Bobby_Robson       1.00      1.00      1.00         2
       Boris_Yeltsin       1.00      1.00      1.00         2
      Brendan_Hansen       1.00      1.00      1.00         2
 Brigitte_Boisselier       1.00      1.00      1.00         2
         Carl_Reiner       1.00      1.00      1.00         2
      Carlos_Bianchi       1.00      1.00      1.00         2
         Carson_Daly       1.00      1.00    

{'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1_score': 1.0}

In [6]:
# Same evaluation protocol, ArcFace model.
evaluate_multiclass_deepface_2images(
    test_data_dir,
    model_name="ArcFace",
    enforce_detection=False,
    title="DeepFace - ArcFace Multi-class Evaluation",
)

DeepFace - ArcFace Multi-class Evaluation
Accuracy : 0.8950
Precision: 0.8830
Recall   : 0.8950
F1-score : 0.8811

Full Classification Report:
                      precision    recall  f1-score   support

    Bertrand_Bonello       1.00      1.00      1.00         2
       Bill_Parcells       1.00      1.00      1.00         2
        Billy_Graham       1.00      1.00      1.00         2
      Blythe_Hartley       1.00      1.00      1.00         2
             Bo_Ryan       0.67      1.00      0.80         2
     Bobby_Goldwater       1.00      1.00      1.00         2
        Bobby_Robson       0.00      0.00      0.00         2
       Boris_Yeltsin       0.67      1.00      0.80         2
      Brendan_Hansen       1.00      1.00      1.00         2
 Brigitte_Boisselier       1.00      1.00      1.00         2
         Carl_Reiner       1.00      1.00      1.00         2
      Carlos_Bianchi       1.00      1.00      1.00         2
         Carson_Daly       0.00      0.00      0.0

{'accuracy': 0.895,
 'precision': 0.883,
 'recall': 0.895,
 'f1_score': 0.881095238095238}

In [7]:
# Same evaluation protocol, SFace model.
evaluate_multiclass_deepface_2images(
    test_data_dir,
    model_name="SFace",
    enforce_detection=False,
    title="DeepFace - SFace Multi-class Evaluation",
)

DeepFace - SFace Multi-class Evaluation
Accuracy : 0.9900
Precision: 0.9933
Recall   : 0.9900
F1-score : 0.9893

Full Classification Report:
                      precision    recall  f1-score   support

    Bertrand_Bonello       1.00      1.00      1.00         2
       Bill_Parcells       1.00      1.00      1.00         2
        Billy_Graham       1.00      1.00      1.00         2
      Blythe_Hartley       1.00      1.00      1.00         2
             Bo_Ryan       1.00      1.00      1.00         2
     Bobby_Goldwater       1.00      1.00      1.00         2
        Bobby_Robson       1.00      1.00      1.00         2
       Boris_Yeltsin       1.00      1.00      1.00         2
      Brendan_Hansen       1.00      1.00      1.00         2
 Brigitte_Boisselier       1.00      1.00      1.00         2
         Carl_Reiner       1.00      1.00      1.00         2
      Carlos_Bianchi       1.00      1.00      1.00         2
         Carson_Daly       1.00      0.50      0.67 

{'accuracy': 0.99,
 'precision': 0.9933333333333334,
 'recall': 0.99,
 'f1_score': 0.9893333333333334}