In [1]:
import os
import cv2

import hashlib
import chromadb
import numpy as np

from deepface import DeepFace






In [2]:
def get_image_hash(image_path: str, chunk_size: int = 1 << 20) -> str:
    """
    Compute the MD5 hex digest of a file's contents.

    The file is streamed in fixed-size chunks, so memory use stays bounded
    regardless of file size. The digest is used as a content fingerprint for
    duplicate detection.

    Args:
        image_path (str): Path to the file to hash.
        chunk_size (int): Number of bytes to read per iteration. Must be positive.

    Returns:
        str: Hexadecimal MD5 digest of the file's bytes.

    Raises:
        ValueError: If chunk_size is not a positive integer.
        OSError: If the file cannot be opened or read.
    """
    if chunk_size <= 0:
        # A zero-sized read returns b"" immediately and would silently hash nothing.
        raise ValueError("chunk_size must be a positive integer")

    digest = hashlib.md5()
    with open(image_path, "rb") as f:
        # iter() with a sentinel stops at the first empty read, i.e. end of file.
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


In [3]:
def _embed_first_face(image_path: str, image_size: tuple, model_name: str):
    """
    Detect the first face in an image and return its embedding.

    Failures are reported with print() and signalled by returning None so the
    caller can skip the image and keep processing the rest of the folder.

    Args:
        image_path (str): Path to the image file.
        image_size (tuple): Size passed to cv2.resize for the extracted face.
        model_name (str): DeepFace model used to compute the embedding.

    Returns:
        The embedding of the first detected face, or None if detection or
        embedding failed.
    """
    # Face detection, alignment, and normalization.
    try:
        # Extract faces; returns a list of dicts with keys "face" (aligned image) and "facial_area".
        faces = DeepFace.extract_faces(
            img_path=image_path,
            detector_backend="opencv",
            enforce_detection=True
        )
    except Exception as e:
        print(f"Face detection failed for {image_path}: {e}")
        return None

    if not faces:
        print(f"No face detected in {image_path}")
        return None

    # Use the first detected face and resize it to the target size.
    aligned_face = cv2.resize(faces[0]["face"], image_size)

    try:
        # The face is already cropped, so the detector is skipped instead of
        # running a second detection pass on the crop.
        # NOTE(review): the pixel scale / channel order returned by extract_faces
        # differs between DeepFace versions — confirm it matches what represent
        # expects for pre-cropped input in the installed version.
        representation = DeepFace.represent(
            aligned_face,
            model_name=model_name,
            detector_backend="skip",
            enforce_detection=False
        )
    except Exception as e:
        print(f"Error generating embedding for {image_path}: {e}")
        return None

    if not representation:
        return None
    return representation[0]["embedding"]


def vectorize_class_folders(
    base_path: str = "augmented-lfw-deepfunneled",
    image_size: tuple = (160, 160),
    model_name: str = "Facenet",
    persist_directory: str = "chroma_db"  # Directory where the DB will be saved.
):
    """
    Build one averaged face embedding per person folder and store it in a persistent Chroma collection.

    For each sub-directory of base_path, every unique image (duplicates are detected
    by file hash within that folder) is run through DeepFace face extraction and
    embedding. The per-image embeddings are averaged and written to the
    "face_embeddings" collection under the folder name as id. Entries are upserted,
    so re-running against an existing database replaces a person's stored embedding
    instead of colliding with the existing id.

    Folders and files are processed in sorted order so logs and results are
    reproducible across platforms.

    Args:
        base_path (str): Path to the dataset with one folder per person.
        image_size (tuple): Target size for the aligned face images.
        model_name (str): DeepFace model to use (e.g., "Facenet").
        persist_directory (str): Directory where the Chroma database will be stored.

    Returns:
        collection: The Chroma collection containing face embeddings.

    Raises:
        OSError: If base_path cannot be listed (e.g. it does not exist).
    """
    # Initialize the persistent Chroma client.
    client = chromadb.PersistentClient(path=persist_directory)

    # (Optional) Check the heartbeat to ensure the client is connected.
    print("Chroma client heartbeat (ns):", client.heartbeat())

    collection = client.get_or_create_collection(name="face_embeddings")

    # Loop over each person's folder.
    for person in sorted(os.listdir(base_path)):
        person_dir = os.path.join(base_path, person)
        if not os.path.isdir(person_dir):
            continue

        embeddings = []  # Collect embeddings for this person.
        seen_hashes = set()  # Track image hashes to avoid duplicate embeddings.

        # Process each image in the person's folder.
        for image_file in sorted(os.listdir(person_dir)):
            image_path = os.path.join(person_dir, image_file)

            # Compute hash and check for duplicates.
            try:
                file_hash = get_image_hash(image_path)
            except Exception as e:
                print(f"Error reading {image_path}: {e}")
                continue

            if file_hash in seen_hashes:
                print(f"Duplicate image skipped: {image_path}")
                continue
            seen_hashes.add(file_hash)

            embedding = _embed_first_face(image_path, image_size, model_name)
            if embedding is not None:
                embeddings.append(embedding)

        # If unique embeddings were found for this person, average them and store in the collection.
        if embeddings:
            avg_embedding = np.mean(embeddings, axis=0).tolist()
            # upsert (not add) keeps the function safe to re-run on an existing database.
            collection.upsert(
                ids=[person],
                embeddings=[avg_embedding],
                metadatas=[{"name": person, "num_images": len(embeddings)}]
            )

            print(f"Added {person} with {len(embeddings)} unique images.")
        else:
            print(f"No embeddings computed for {person}.")

    # Data is automatically persisted by PersistentClient.
    return collection


In [4]:
# Build the persistent face-embedding collection for the 50-class LFW subset.
collection = vectorize_class_folders(
    persist_directory="chroma_db",
    model_name="Facenet",
    image_size=(160, 160),
    base_path="lfw-deepfunneled-50class",
)

Chroma client heartbeat (ns): 1743173315462164100
Added Aaron_Eckhart with 1 unique images.
Added Aaron_Guiel with 1 unique images.
Added Aaron_Patterson with 1 unique images.
Added Aaron_Peirsol with 4 unique images.
Added Aaron_Pena with 1 unique images.
Added Aaron_Sorkin with 2 unique images.
Added Aaron_Tippin with 1 unique images.
Added Abbas_Kiarostami with 1 unique images.
Added Abba_Eban with 1 unique images.
Added Abdel_Aziz_Al-Hakim with 1 unique images.
Added Abdel_Madi_Shabneh with 1 unique images.
Added Abdel_Nasser_Assidi with 2 unique images.
Added Abdoulaye_Wade with 4 unique images.
Added Abdulaziz_Kamilov with 1 unique images.
Added Abdullah with 4 unique images.
Added Abdullah_Ahmad_Badawi with 1 unique images.
Added Abdullah_al-Attiyah with 3 unique images.
Added Abdullah_Nasseef with 1 unique images.
Added Abdullatif_Sener with 2 unique images.
Added Abdul_Majeed_Shobokshi with 1 unique images.
Added Abdul_Rahman with 1 unique images.
Added Abel_Aguilar with 1 uni

In [5]:
# count() asks Chroma for the number of stored records directly, instead of
# fetching every record with get() just to measure the length of the id list.
print("Number of embeddings in the collection:", collection.count())



Number of embeddings in the collection: 50
