<a href="https://colab.research.google.com/github/johnhewi/face_recognition_score_filter/blob/main/ImageScore.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<font size="6" color="orange">Facial Recognition Filtering Tool</font>

This tool allows the user to create a facial recognition model from a set of images. The model can then be used to analyze another set of images and score each photo on its similarity to the model. The list of scores can then be used to filter the image files by score.

In [None]:
#@title <font size="4" color="orange">Mount Drive and Install Dependencies (GPU Runtime Required)</font>
#@markdown #### Mount Google Drive and then Install Necessary Packages (GPU Runtime Required)
#@markdown
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')


# install necessary packages

!pip install dlib
!pip install face_recognition
!apt-get -qq install -y libsm6 libxext6 && pip install -q -U opencv-python




In [None]:
#@title <font size="4" color="orange">Create Model</font>
#@markdown Create a facial recognition model from a set of photos. (If you already have a .npy file you'd like to use, you can skip this step) You will need to enter the path to the folder that contains the images you want to use to train your facial recognition model. Additionally, you will need to select the path to the folder where you want the model to be stored. Finally, you must pick a name for the model file itself.
# Import necessary libraries
#@markdown <br><br>
#@markdown <br>


import os
import numpy as np
import face_recognition
from google.colab import drive
from shutil import copy
from PIL import Image



#@markdown ##### training_dir: folder of images with known subject

# Define the paths to the training and test dataset directories
training_dir = '' #@param {type:"string"}
#@markdown ##### model_dir: folder where model is to be stored
model_dir = '' #@param {type:"string"}
#@markdown ##### model_name: name your model
model_name = 'facial_recognition_model_1' #@param {type:"string"}



# Get the list of training images. The extension check is case-insensitive so
# files such as "IMG_0001.JPG" are picked up, and the list is sorted so the
# processing order is stable between runs.
training_images_paths = sorted(
    os.path.join(training_dir, f)
    for f in os.listdir(training_dir)
    if f.lower().endswith(('.jpg', '.jpeg', '.png'))
)

# Load and encode each training image
model_encodings = []
for path in training_images_paths:
    # Load the image from file path; the context manager closes the file
    # handle once the pixel data has been copied into a NumPy array
    with Image.open(path) as image:
        # Normalise every non-RGB mode (RGBA, palette, greyscale, CMYK, ...)
        # to RGB instead of handling only RGBA, so the detector always
        # receives a 3-channel array
        if image.mode != 'RGB':
            print(f"converting {image.mode} image to RGB...")
            image = image.convert('RGB')
        print(f"Loaded {path}, checking for faces...")

        # Convert the image to a NumPy array
        image_array = np.array(image)

    # Check if the image contains any faces
    face_locations = face_recognition.face_locations(image_array)

    # If the image contains faces, encode the first detected face only
    if face_locations:
        print(f"Found {len(face_locations)} faces in {path}, adding to model...")
        encoding = face_recognition.face_encodings(image_array, known_face_locations=face_locations)[0]
        model_encodings.append(encoding)

# Without at least one encoding np.mean would return NaN and a useless model
# would be saved silently, so stop with an explicit error instead
if not model_encodings:
    raise ValueError(
        f"No faces were found in any image in {training_dir!r}; cannot create a model."
    )

# Calculate the average encoding
print(f"Calculating average encoding for {os.path.basename(training_dir)}...")
average_encoding = np.mean(model_encodings, axis=0)
print(f"Average encoding for {model_name}: {average_encoding}")

# Create the model folder if needed (an empty model_dir means the current
# working directory, which always exists)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)

# Save the average encoding to a file
np.save(os.path.join(model_dir, f"{model_name}.npy"), average_encoding)


In [None]:
import os
import face_recognition
import numpy as np
import time

#@title <font size="4" color="orange">Test Images Against Model</font>
#@markdown Create the similarity scores for a set of images using the facial recognition model created in a previous step. You will need to input the filepath to your facial recognition model, the filepath to the directory containing the images you want to generate similarity scores for, whether or not you want to search all subdirectories (select "search_recursively"), your desired name for the file containing the similarity scores, and the filepath to the directory where you want the similarity score file saved. <br> If you search the directory recursively, a similarity score file will be created in every subdirectory containing image files (.jpg or .png). A file containing the similarity scores for all the images will also be created and placed in the directory you specify.


#@markdown ##### model_file_path: Path of model to be tested against
model_file_path = '' #@param {type:"string"}

# Load the model (average encoding)
average_encoding = np.load(model_file_path)

#@markdown ##### test_dir: folder of images with test images to be scored
test_dir = '' #@param {type:"string"}

#@markdown ##### search_recursively: Search test_dir recursively
search_recursively = False #@param {type:"boolean"}

# Initialize a dictionary to store the similarity scores
similarity_scores = {}

#@markdown ##### output_file_name: Name of image score file
output_file_name = 'similarity_scores' #@param {type:"string"}

#@markdown ##### output_file_path: Path to store image score file
output_file_path = '' #@param {type:"string"}

# Join the output_file_path and output_file_name to create the full path
full_output_file_path = os.path.join(output_file_path, output_file_name)

# Function to recursively get all image files from a directory
def get_all_image_files(directory, recursive):
    """Return the paths of the image files found under ``directory``.

    Args:
        directory: Folder to scan.
        recursive: When true, every subfolder is scanned as well; otherwise
            only the files directly inside ``directory`` are considered.

    Returns:
        A list of file paths. A file is included when its name ends with
        ``.jpg``, ``.jpeg`` or ``.png`` (compared case-insensitively). Files
        are listed folder by folder, in sorted order within each folder.
    """
    image_extensions = ('.jpg', '.jpeg', '.png')
    image_files = []
    for root, dirs, files in os.walk(directory):
        # Sorting in place makes os.walk descend into subfolders in a stable order
        dirs.sort()
        for file in sorted(files):
            if file.lower().endswith(image_extensions):
                image_files.append(os.path.join(root, file))
        if not recursive:
            # os.walk yields the top-level folder first, so stopping after the
            # first iteration restricts the scan to that folder
            break
    return image_files

def get_processed_files(directory):
    """
    Return the set of file names already listed in a directory's score file.

    Looks for ``similarity_scores.txt`` inside ``directory``. Each non-blank
    line is expected to look like ``<path>: <score>``; the part before the
    last colon is collected. If the file does not exist an empty set is
    returned.
    """
    processed_files = set()
    scores_file_path = os.path.join(directory, "similarity_scores.txt")

    if os.path.exists(scores_file_path):
        with open(scores_file_path, 'r') as scores_file:
            for line in scores_file:
                line = line.strip()
                if not line:
                    # Blank lines carry no entry; skipping them keeps "" out of the set
                    continue
                # Split on the LAST colon only, so a path that itself contains
                # ':' is kept intact
                filename = line.rsplit(":", 1)[0].strip()
                processed_files.add(filename)

    return processed_files

def _append_subdirectory_scores(subdirectory, scores):
    """Append ``scores`` to the similarity_scores.txt file inside ``subdirectory``.

    The file is opened in append mode: entries written by an earlier run are
    what make the loop below skip already-processed images, so overwriting the
    file would erase the scores of exactly those skipped images. Nothing is
    written when ``scores`` is empty.
    """
    if not scores:
        return
    subdirectory_file_path = os.path.join(subdirectory, "similarity_scores.txt")
    with open(subdirectory_file_path, 'a') as subdirectory_file:
        for image_path, score in scores.items():
            subdirectory_file.write(f"{image_path}: {score}\n")
    print(f"Similarity scores for {subdirectory} saved to {subdirectory_file_path}")


# Get the list of test images
test_images_paths = get_all_image_files(test_dir, search_recursively)

# Display the number of image files found
num_images = len(test_images_paths)
print(f"Number of image files found: {num_images}")

# Initialize the start time
start_time = time.time()

# Similarity scores produced in this run, grouped by subdirectory
subdirectory_scores = {}

# Subdirectory whose images are currently being processed
current_subdirectory = None

# Names of files already scored in the current subdirectory by an earlier run
processed_files = set()

# Compare each test image with the average encoding and calculate similarity scores
for idx, path in enumerate(test_images_paths, start=1):

    # Skip files that disappeared after the directory listing was taken
    if not os.path.exists(path):
        print(f"File not found: {path}, skipping...")
        continue

    # Extract the subdirectory from the path
    subdirectory = os.path.dirname(path)

    # On entering a new subdirectory: flush the previous one, then load the
    # list of already-processed files once for the new one
    if subdirectory != current_subdirectory:
        if current_subdirectory is not None:
            _append_subdirectory_scores(current_subdirectory, subdirectory_scores[current_subdirectory])
        current_subdirectory = subdirectory
        subdirectory_scores[current_subdirectory] = {}
        processed_files = get_processed_files(subdirectory)

    # Check if the file has been processed before and skip it if it has
    if path in processed_files:
        print(f"File {path} has already been processed, skipping...")
        continue

    # Load a test image
    test_image = face_recognition.load_image_file(path)

    # Encode the test image to get the facial features
    encodings = face_recognition.face_encodings(test_image)

    # Check if any face is detected
    if len(encodings) == 0:
        print(f"No faces found in {path}, skipping...")
        continue

    # Only the first detected face is scored
    test_encoding = encodings[0]

    # Compute the distance between the average encoding and the test encoding
    distance = face_recognition.face_distance([average_encoding], test_encoding)
    similarity_score = 1 / (1 + distance)  # Convert distance to similarity score

    # Print the similarity score
    print(f"Similarity score for {path}: {similarity_score[0]}")

    # Store the similarity score, keyed by full file path, in both the global
    # and the per-subdirectory dictionaries
    similarity_scores[path] = similarity_score[0]
    subdirectory_scores[current_subdirectory][path] = similarity_score[0]

    # Calculate and print the estimated time remaining
    elapsed_time = time.time() - start_time
    avg_time_per_image = elapsed_time / idx
    remaining_images = num_images - idx
    estimated_time_remaining = avg_time_per_image * remaining_images
    print(f"Estimated time remaining: {estimated_time_remaining:.2f} seconds")

# After the loop, write the similarity scores of the last subdirectory to a file
if current_subdirectory is not None:
    _append_subdirectory_scores(current_subdirectory, subdirectory_scores[current_subdirectory])


# Write the similarity scores of this run to the combined output file
with open(full_output_file_path, 'w') as output_file:
    for image_path, score in similarity_scores.items():
        output_file.write(f"{image_path}: {score}\n")

print(f"Similarity scores saved to {full_output_file_path}. (Similiarity scores for each subdirectory saved in each subdirectory as well)")

In [None]:
import os
import numpy as np
from scipy import stats

#@title <font size="4" color="orange">Compile and Analyze Similarity Scores</font>

#@markdown ##### <b>directory</b>: Path of directory containing photos or subfolders. Will look for similarity_scores.txt in all subdirectories. These will have been created automatically if you tested images against model recursively in previous step.
directory = '' #@param {type:"string"}

#@markdown ##### search_recursively: Search all subfolders for folders with a similarity_scores.txt file
search_recursively = True #@param {type:"boolean"}

#@markdown ##### similarity_scores_file_path: If you are not searching recursively, specify a path to the similarity scores file
similarity_scores_file_path = '' #@param {type:"string"}

#@markdown ##### output_file_path: The directory where the analysis file will be written
output_file_path = '' #@param {type:"string"}

#@markdown ##### analysis_file_name: The name of the file with the analysis
analysis_file_name = 'analysis_file' #@param {type:"string"}



# Function to recursively get all similarity scores files and read them
def get_all_similarity_scores(directory):
    """Collect the scores from every similarity_scores.txt under ``directory``.

    The folder tree is walked with ``os.walk``; each non-blank line of every
    ``similarity_scores.txt`` found is expected to look like
    ``<path>: <score>``.

    Args:
        directory: Root folder to search.

    Returns:
        A dict mapping the text before the last colon of each line (the image
        path) to the score parsed as a float.

    Raises:
        ValueError: If a non-blank line has no colon or its score is not a number.
    """
    scores = {}
    for root, dirs, files in os.walk(directory):
        if "similarity_scores.txt" in files:
            scores_path = os.path.join(root, "similarity_scores.txt")
            with open(scores_path, 'r') as scores_file:
                for line in scores_file:
                    line = line.strip()
                    if not line:
                        continue
                    # Split on the LAST colon only, so a path that itself
                    # contains ':' does not shift the score out of position
                    parts = line.rsplit(":", 1)
                    if len(parts) != 2:
                        raise ValueError(f"Malformed line in {scores_path}: {line!r}")
                    scores[parts[0].strip()] = float(parts[1].strip())
    return scores

if search_recursively:
    # Fetch all the similarity scores found under `directory`
    similarity_scores = get_all_similarity_scores(directory)
else:
    # Read the single score file chosen in the form. Each non-blank line looks
    # like "<path>: <score>"; the entry must actually be stored, otherwise the
    # statistics below run on an empty dictionary.
    similarity_scores = {}
    with open(similarity_scores_file_path, 'r') as scores_file:
        for line in scores_file:
            line = line.strip()
            if not line:
                continue
            # Split on the last colon so paths containing ':' stay intact
            parts = line.rsplit(":", 1)
            similarity_scores[parts[0].strip()] = float(parts[1].strip())


# Materialise the scores once; every statistic below works on this array
score_values = np.array(list(similarity_scores.values()), dtype=float)
if score_values.size == 0:
    # max()/min() on an empty collection would fail with an unhelpful message
    raise ValueError(
        "No similarity scores were loaded; check the directory and "
        "similarity_scores_file_path settings."
    )

# Calculate various statistics
max_score = max(similarity_scores.values())
min_score = min(similarity_scores.values())
average_score = np.mean(score_values)
median_score = np.median(score_values)
std_dev_score = np.std(score_values)
number_of_scores = len(similarity_scores)

# Calculate the 0th..100th percentiles in a single call
percentile_ranks = list(range(0, 101))
percentiles = dict(zip(percentile_ranks, np.percentile(score_values, percentile_ranks)))

# Preparing the statistics string
stats_str = f"Statistics for file {directory} \n \n"
stats_str += f"Number of Files/Scores: {number_of_scores}\n"
stats_str += f"Max Score: {max_score}\nMin Score: {min_score}\nAverage Score: {average_score}\n"
stats_str += f"Median Score: {median_score}\nStandard Deviation: {std_dev_score}\n"
stats_str += "\nPercentiles:\n"

for percentile, score in percentiles.items():
    # Counting how many scores are above or equal to, and below or equal to the current percentile value
    above_or_equal_count = int(np.count_nonzero(score_values >= score))
    below_or_equal_count = int(np.count_nonzero(score_values <= score))

    # How many standard deviations the percentile value lies from the mean.
    # With a single score (or all scores equal) the deviation is zero, so
    # report 0.0 rather than dividing by zero.
    if std_dev_score > 0:
        z_score = (score - average_score) / std_dev_score
    else:
        z_score = 0.0

    # Adding the percentile value, score, and counts to the statistics string
    stats_str += f"{percentile}th Percentile: {score}, Above or Equal: {above_or_equal_count}, Below or Equal: {below_or_equal_count}, {z_score} std dv from mean\n"

# Print the statistics
print(stats_str)

# Save the statistics to a file
analysis_file_path = os.path.join(output_file_path, analysis_file_name)
with open(analysis_file_path, 'w') as file:
    file.write(stats_str)

print(f"Statistics saved to {output_file_path}")

In [None]:

#@title <font size="4" color="orange">Filter Files by Similarity Score</font>

#@markdown ##### <b>directory</b>: Path of directory containing photos or subfolders. Will look for similarity_scores.txt in all subdirectories. These will have been created automatically if you tested images against model recursively in previous step.
directory = '' #@param {type:"string"}

#@markdown ##### search_recursively: Search all subfolders for folders with a similarity_scores.txt file
search_recursively = True #@param {type:"boolean"}

#@markdown ##### similarity_scores_file_path: If you are not searching recursively, specify a path to the similarity scores file
similarity_scores_file_path = '' #@param {type:"string"}

#@markdown ##### minimum_percentile: the minimum percentile for filepaths in the list
minimum_percentile = 90 #@param {type:"number"}
minimum_percentile = float(minimum_percentile)

#@markdown ##### maximum_percentile: the maximum percentile for filepaths in the list
maximum_percentile = 100 #@param {type:"number"}
maximum_percentile = float(maximum_percentile)

#@markdown ##### filter_by_score: Select to filter files by similarity score. If unchecked, images will be filtered by percentile.
filter_by_score = False #@param {type:"boolean"}

#@markdown ##### minimum_similarity_score: the minimum similarity score for filepaths in the list
minimum_similarity_score = 0 #@param {type:"number"}
minimum_similarity_score = float(minimum_similarity_score)

#@markdown ##### maximum_similarity_score: the maximum similarity score for filepaths in the list
maximum_similarity_score = 1 #@param {type:"number"}
maximum_similarity_score = float(maximum_similarity_score)



#@markdown ##### output_file_path: The directory where the file listing filtered images will be written
output_file_path = '' #@param {type:"string"}

#@markdown ##### output_file_name: The name of the file with the list of top percentile files
output_file_name = 'filtered_filepaths_and_scores' #@param {type:"string"}

# Function to recursively get all similarity scores files and read them
def get_all_similarity_scores(directory):
    """Collect the scores from every similarity_scores.txt under ``directory``.

    The folder tree is walked with ``os.walk``; each non-blank line of every
    ``similarity_scores.txt`` found is expected to look like
    ``<path>: <score>``.

    Args:
        directory: Root folder to search.

    Returns:
        A dict mapping the text before the last colon of each line (the image
        path) to the score parsed as a float.

    Raises:
        ValueError: If a non-blank line has no colon or its score is not a number.
    """
    scores = {}
    for root, dirs, files in os.walk(directory):
        if "similarity_scores.txt" in files:
            scores_path = os.path.join(root, "similarity_scores.txt")
            with open(scores_path, 'r') as scores_file:
                for line in scores_file:
                    line = line.strip()
                    if not line:
                        continue
                    # Split on the LAST colon only, so a path that itself
                    # contains ':' does not shift the score out of position
                    parts = line.rsplit(":", 1)
                    if len(parts) != 2:
                        raise ValueError(f"Malformed line in {scores_path}: {line!r}")
                    scores[parts[0].strip()] = float(parts[1].strip())
    return scores

# Imported here so this cell also runs on a fresh runtime in which the
# analysis cell (which imports these modules) was skipped
import os
import numpy as np

if search_recursively:
    # Fetch all the similarity scores found under `directory`
    similarity_scores = get_all_similarity_scores(directory)
else:
    # Read the single score file chosen in the form. Each non-blank line looks
    # like "<path>: <score>"; the entry must actually be stored, otherwise
    # there is nothing to filter.
    similarity_scores = {}
    with open(similarity_scores_file_path, 'r') as scores_file:
        for line in scores_file:
            line = line.strip()
            if not line:
                continue
            # Split on the last colon so paths containing ':' stay intact
            parts = line.rsplit(":", 1)
            similarity_scores[parts[0].strip()] = float(parts[1].strip())

# Check whether to filter by score or by percentile
if filter_by_score:
    # Filter images whose similarity scores are within the score boundaries
    filtered_images = {
        image_path: score for image_path, score in similarity_scores.items()
        if minimum_similarity_score <= score <= maximum_similarity_score
    }
else:
    if not similarity_scores:
        # Percentiles of an empty collection are undefined
        raise ValueError(
            "No similarity scores were loaded; check the directory and "
            "similarity_scores_file_path settings."
        )

    # Calculate the minimum and maximum percentile thresholds
    score_values = list(similarity_scores.values())
    min_percentile_threshold = np.percentile(score_values, minimum_percentile)
    max_percentile_threshold = np.percentile(score_values, maximum_percentile)

    # Filter images whose similarity scores are within the percentile boundaries
    filtered_images = {
        image_path: score for image_path, score in similarity_scores.items()
        if min_percentile_threshold <= score <= max_percentile_threshold
    }

# Save this list to a file
top_images_file_path = os.path.join(output_file_path, output_file_name)

with open(top_images_file_path, 'w') as file:
    for image_path, score in filtered_images.items():
        file.write(f"{image_path}: {score}\n")

print(f"Filtered files saved to {top_images_file_path}")

Filtered files saved to /content/drive/MyDrive/ImageScore/avgncle/avgscores


In [None]:
import os
import shutil

#@title <font size="4" color="orange">Move Filtered Images</font>

#@markdown ##### filtered_images_file_path: Path of the file containing the list of filtered images
filtered_images_file_path = '' #@param {type:"string"}

#@markdown ##### destination_dir: Path of the directory to move the images to
destination_dir = '' #@param {type:"string"}

# Ensure the destination directory exists
os.makedirs(destination_dir, exist_ok=True)

# Read the list of filtered images from the file selected in the form above
# (not from a variable left behind by the filtering cell)
with open(filtered_images_file_path, 'r') as file:
    lines = file.readlines()

# Iterate through each line in the file
for line in lines:
    line = line.strip()
    if not line:
        # Blank lines carry no entry
        continue

    # Lines look like "<path>: <score>"; split on the last colon so a path
    # that itself contains ':' is kept intact
    source_path = line.rsplit(':', 1)[0].strip()

    # Check if the source file exists
    if not os.path.exists(source_path):
        print(f"Warning: Source file does not exist at {source_path}")
        continue

    # Define the destination path of the image
    destination_path = os.path.join(destination_dir, os.path.basename(source_path))

    # Images from different folders can share a file name; never overwrite a
    # file that is already in the destination
    if os.path.exists(destination_path):
        print(f"Warning: {destination_path} already exists, skipping {source_path}")
        continue

    # Move the image from the source path to the destination path
    shutil.move(source_path, destination_path)

    print(f"{os.path.basename(source_path)} has been moved to {destination_dir}")
