In [8]:
# Import necessary libraries
import os
import face_recognition
import numpy as np
from sklearn.cluster import DBSCAN
from PIL import Image
from shutil import copy2
from sklearn.preprocessing import normalize


In [9]:
# This function loads the images and encodes the faces found in them
def load_and_encode_faces(image_paths):
    """Detect and encode every face in the given image files.

    Each path is loaded with ``face_recognition.load_image_file``; the faces
    are located with ``face_recognition.face_locations`` and encoded with
    ``face_recognition.face_encodings``. Images that yield no encodings
    contribute nothing to the result.

    Args:
        image_paths: Iterable of image file paths.

    Returns:
        A 3-tuple of parallel collections with one entry per encoded face:
        ``(encodings, paths, locations)`` where ``encodings`` is a NumPy array
        built from the collected encodings, ``paths`` repeats the source image
        path once per face found in it, and ``locations`` holds the matching
        face locations.
    """
    all_encodings, owning_paths, all_locations = [], [], []

    for image_path in image_paths:
        pixels = face_recognition.load_image_file(image_path)
        boxes = face_recognition.face_locations(pixels)
        vectors = face_recognition.face_encodings(pixels, boxes)

        # Skip images in which no face could be encoded
        if not len(vectors) > 0:
            continue

        all_encodings += vectors
        # One path entry per face keeps the three collections index-aligned
        owning_paths += [image_path] * len(vectors)
        all_locations += boxes

    return np.array(all_encodings), owning_paths, all_locations

In [10]:
# This function clusters the encoded faces using the DBSCAN clustering algorithm
def cluster_faces(encoded_faces, similarity_threshold):
    """Cluster face encodings with DBSCAN and return one label per face.

    Args:
        encoded_faces: Array-like of face encodings, one row per face. May be
            empty (e.g. when no face was found in any image).
        similarity_threshold: DBSCAN is run with ``eps = 1 - similarity_threshold``.

    Returns:
        NumPy array of integer cluster labels, one per input row. An empty
        input yields an empty label array instead of raising.
    """
    encoded_faces = np.asarray(encoded_faces)

    # Nothing to cluster: normalize()/DBSCAN reject empty input, so
    # short-circuit with an empty label array.
    if encoded_faces.size == 0:
        return np.empty(0, dtype=int)

    # Normalize the face encodings
    normalized_faces = normalize(encoded_faces)

    # NOTE(review): eps is applied as a Euclidean radius on the normalized
    # vectors, which is not the same quantity as (1 - cosine similarity);
    # confirm that the threshold was tuned with this in mind.
    clustering_model = DBSCAN(metric='euclidean', eps=1 - similarity_threshold, min_samples=2, n_jobs=-1)

    # Return the cluster labels
    return clustering_model.fit_predict(normalized_faces)

In [11]:
# Function to find all indices of a value in a list
def find_indices(l, value):
    """Return, in ascending order, every position in ``l`` whose item equals ``value``."""
    matches = []
    for position, element in enumerate(l):
        if element == value:
            matches.append(position)
    return matches

In [12]:
# This function saves the clustered photos in separate folders
def save_clustered_photos(image_paths, labels, encoded_faces, face_locations,
                          max_reassign_distance=0.55, output_dir='output'):
    """Write one folder per face cluster plus a folder of unclustered face crops.

    ``image_paths``, ``labels``, ``encoded_faces`` and ``face_locations`` are
    parallel collections with one entry per detected face (an image that
    contains several faces appears several times in ``image_paths``).

    For every cluster label other than -1 a folder ``cluster_<label>`` is
    created containing ``person_<label>.png`` (a crop of the first face that
    carries that label) and a copy of every image assigned to the cluster.
    Faces labelled -1 (unassigned by DBSCAN) are compared against the mean
    encoding of each cluster; when the closest mean is nearer than
    ``max_reassign_distance`` the face's image is also copied into that
    cluster. Every -1 face, reassigned or not, is additionally saved as a crop
    in the ``doubtful (possibly clustered)`` folder.

    Args:
        image_paths: Sequence of source image paths, one per face.
        labels: Cluster label per face; -1 marks an unassigned face. Lists and
            NumPy arrays are both accepted.
        encoded_faces: Face encoding per face.
        face_locations: ``(top, right, bottom, left)`` box per face.
        max_reassign_distance: Upper bound (exclusive) on the Euclidean
            distance between an unassigned face and a cluster mean for the
            face to be attached to that cluster.
        output_dir: Directory under which all folders are created.

    Returns:
        None. Results are written to disk.
    """
    # Accept plain lists as well as arrays; element-wise work below needs arrays.
    labels = np.asarray(labels)
    encoded_faces = np.asarray(encoded_faces)

    # Create an output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Group FACE INDICES (not paths) by label, so that an image holding several
    # faces contributes the right encoding and location to each cluster.
    cluster_face_indices = {}
    noise_face_indices = []
    for face_index, raw_label in enumerate(labels):
        label = int(raw_label)
        if label == -1:
            noise_face_indices.append(face_index)
        else:
            cluster_face_indices.setdefault(label, []).append(face_index)

    # Image paths per cluster, and each cluster's mean encoding computed once
    # from the faces that DBSCAN itself put into the cluster.
    image_clusters = {
        label: [image_paths[i] for i in indices]
        for label, indices in cluster_face_indices.items()
    }
    cluster_means = {
        label: np.mean(encoded_faces[indices], axis=0)
        for label, indices in cluster_face_indices.items()
    }

    # Try to attach each unassigned face to the closest cluster
    for face_index in noise_face_indices:
        best_match_label = None
        # Acts as the acceptance threshold: only strictly closer clusters win
        best_match_distance = max_reassign_distance

        for label, mean_encoding in cluster_means.items():
            distance = np.linalg.norm(encoded_faces[face_index] - mean_encoding)
            if distance < best_match_distance:
                best_match_distance = distance
                best_match_label = label

        if best_match_label is not None:
            image_clusters[best_match_label].append(image_paths[face_index])

    # Save the photos in the updated clusters
    for label, paths in image_clusters.items():
        cluster_folder = os.path.join(output_dir, f'cluster_{label}')
        os.makedirs(cluster_folder, exist_ok=True)

        # The first face carrying this label represents the cluster
        representative_index = cluster_face_indices[label][0]
        _save_face_crop(
            image_paths[representative_index],
            face_locations[representative_index],
            os.path.join(cluster_folder, f'person_{label}.png'),
        )

        # Copy each image once, even if several of its faces are in the cluster
        for path in dict.fromkeys(paths):
            copy2(path, os.path.join(cluster_folder, os.path.basename(path)))

    # Save the doubtful faces in a separate folder, reusing the stored face
    # boxes so the crop shows the face that was actually left unassigned.
    doubtful_folder = os.path.join(output_dir, 'doubtful (possibly clustered)')
    os.makedirs(doubtful_folder, exist_ok=True)
    crops_per_image = {}
    for face_index in noise_face_indices:
        doubtful_image_path = image_paths[face_index]
        base_name = os.path.basename(doubtful_image_path)
        already_saved = crops_per_image.get(doubtful_image_path, 0)
        crops_per_image[doubtful_image_path] = already_saved + 1
        # First crop keeps the plain name; further faces from the same image
        # get a counter so they do not overwrite one another.
        if already_saved == 0:
            crop_name = f'doubtful_{base_name}'
        else:
            crop_name = f'doubtful_{already_saved}_{base_name}'
        _save_face_crop(
            doubtful_image_path,
            face_locations[face_index],
            os.path.join(doubtful_folder, crop_name),
        )


def _save_face_crop(image_path, face_location, output_path):
    """Crop the ``(top, right, bottom, left)`` box out of an image file and save it."""
    top, right, bottom, left = face_location
    image = face_recognition.load_image_file(image_path)
    Image.fromarray(image[top:bottom, left:right]).save(output_path)

In [13]:
if __name__ == "__main__":
    # Define the path of the folder containing the images
    image_folder = 'path' #enter your image folder path here
    image_extensions = ('.jpg', '.png', '.jpeg')
    # Match extensions case-insensitively (e.g. 'IMG_001.JPG') and sort the
    # listing so that runs over the same folder process images in the same order.
    image_paths = [
        os.path.join(image_folder, f)
        for f in sorted(os.listdir(image_folder))
        if f.lower().endswith(image_extensions)
    ]

    # Load and encode the faces in the images
    encoded_faces, image_path_list, face_locations = load_and_encode_faces(image_paths)
    similarity_threshold = 0.68

    if len(image_path_list) == 0:
        # Without any encoded face there is nothing to cluster or save
        print("No faces were found in the images; nothing to cluster.")
    else:
        # Cluster the encoded faces
        labels = cluster_faces(encoded_faces, similarity_threshold)
        # Save the clustered photos in separate folders
        save_clustered_photos(image_path_list, labels, encoded_faces, face_locations)
        print("Photos have been successfully clustered based on faces and saved in the 'output' folder.")


Photos have been successfully clustered based on faces and saved in the 'output' folder.
