In [4]:
import json
import os
import shutil
import copy
import numpy as np
from tqdm import tqdm
import cv2

## Import json data

In [5]:
# Every dataset lives in a `data` folder inside the current working directory
data_path = f"{os.getcwd()}/data"

# Datasets that can be selected below
mldata_demo_bob_path = f"{data_path}/mldata_demo_bob"  # demo bob
mldata_full_laurens_blazeface_path = f"{data_path}/ente_mldata_full_laurens_blazeface"  # blazeface
mldata_full_laurens_yolo_path = f"{data_path}/ente_mldata_full_laurens_yolo_70"  # yolo
# mldata_full_laurens_yolo_path = data_path + '/ente-mldata-yolo-70-38000_FILES'
lfw_demonstration_data_path = f"{data_path}/lfw_demonstration_data"

# Define the data to use: every later cell reads its inputs from this folder
data_to_use = mldata_full_laurens_yolo_path

In [6]:
# DONT CHANGE ANYTHING HERE
# Both JSON files are read from the `indexeddb` folder of the selected dataset.
json_path = data_to_use + '/indexeddb/mldata.json'
json_metadata_path = data_to_use + '/indexeddb/meta_data_10000025.json'

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open(json_path, 'r', encoding='utf-8') as file:
    mldata_json = json.load(file)

with open(json_metadata_path, 'r', encoding='utf-8') as file:
    mldata_metadata_json = json.load(file)

# Top-level sections of mldata.json used by the rest of the notebook
indexed_files_json = mldata_json['files']
clusters_web_json = mldata_json['library']
people_web_json = mldata_json['people']

In [7]:
# Minimum detection probability a face needs in order to be kept for clustering;
# faces scoring below this are counted as "low score" and skipped.
face_detection_threshold = 0.8

In [None]:
# Collect one (embedding, file id, face id) record per face that passes the detection threshold
low_score_faces = 0
kept_faces = []
for file in indexed_files_json.values():
    # skip files with errors (they have no 'faces' entry)
    if 'faces' not in file:
        continue
    for face in file['faces']:
        if face['detection']['probability'] < face_detection_threshold:
            # detection confidence too low: count the face and leave it out
            low_score_faces += 1
            continue
        # ids are stored as strings so they can be used as keys into the metadata json
        kept_faces.append((face['embedding'], str(file['fileId']), str(face['id'])))

# Order the records by the creationTime of their file (ascending, stable), then split
# them back into three parallel tuples
kept_faces.sort(key=lambda kept_face: mldata_metadata_json[kept_face[1]]['creationTime'])
embeddings, file_ids_of_embeddings, face_ids_of_embeddings = zip(*kept_faces)

print(f"Amount of files: {len(indexed_files_json.values())}")
print(f"Amount of embeddings/faces: {len(embeddings)}")
print(f"Amount of low score faces: {low_score_faces}")

Amount of files: 40577
Amount of embeddings/faces: 42739
Amount of low score faces: 11806


In [6]:
# # check if all embeddings are unique
# for idx, embedding in enumerate(embeddings):
#     if embeddings.count(embedding) > 1:
#         print(f"embedding of index {idx}: [{embedding[0]}, {embedding[1]}...{embedding[-1]}] is not unique")

# check if all entries in face_ids_of_embeddings are unique
# for id in face_ids_of_embeddings:
#     if face_ids_of_embeddings.count(id) > 1:
#         print(f"face_id {id} is not unique")

In [7]:
# Read the clustering that the web app stored in the export
web_clustering_results = clusters_web_json['data']['faceClusteringResults']
clusters_int_web = list(web_clustering_results['clusters'])
noise_web = web_clustering_results['noise']
# one list of files per entry in the `people` section
clusters_fileids_web = [person['files'] for person in people_web_json.values()]

# Total entries = noise entries + every file listed under a person
cluster_entries_number = len(noise_web) + sum(len(cluster) for cluster in clusters_fileids_web)
print(f"Amount of cluster entries: {cluster_entries_number}")
print(f"Amount of clusters: {len(clusters_int_web)}")

Amount of cluster entries: 54545
Amount of clusters: 520


In [8]:
# Number of per-person file lists read from the web export
len(clusters_fileids_web)

520

In [9]:
# One file id per kept face; should match the number of embeddings
len(file_ids_of_embeddings)

42739

In [10]:
# One face id per kept face; should match the number of embeddings
len(face_ids_of_embeddings)

42739

## Linear clustering

This is based on simply comparing cosine distances, similar to Immich. 

In [11]:
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity
# embeddings_cosine_distances = cosine_distances(embeddings)

In [12]:
# A face joins the cluster of its nearest earlier face only when their cosine
# distance is strictly below this value; otherwise it starts a new cluster.
threshold_cosine_distance = 0.3

In [13]:
def calculate_normalized_embeddings(embeddings_matrix):
    """Scale every row of a 2-D embeddings matrix to unit L2 norm.

    Args:
        embeddings_matrix: 2-D array-like (one embedding per row). Lists and
            tuples of rows are accepted.

    Returns:
        np.ndarray of the same shape whose rows have L2 norm 1. Rows that are
        entirely zero are returned unchanged (all zeros) instead of NaN, so
        they cannot poison later `argmin`/distance computations.
    """
    embeddings_matrix = np.asarray(embeddings_matrix)
    embeddings_matrix_norm = np.linalg.norm(embeddings_matrix, axis=1, keepdims=True)
    # Dividing a zero row by its zero norm would give NaN; divide by 1 instead
    # so the row simply stays zero.
    safe_norm = embeddings_matrix_norm.copy()
    safe_norm[safe_norm == 0] = 1
    normalized_embeddings_matrix = embeddings_matrix / safe_norm
    return normalized_embeddings_matrix

def calculate_cosine_distance_using_normalized_embedding(normalized_embedding, normalized_embeddings_matrix):
    """Cosine distance from one embedding to every row of a matrix.

    Both inputs are expected to be L2-normalized already, so the dot product
    of a row with the embedding is their cosine similarity.

    Returns:
        1 - (normalized_embeddings_matrix . normalized_embedding), i.e. one
        distance per matrix row.
    """
    return 1 - np.dot(normalized_embeddings_matrix, normalized_embedding)

def calculate_cosine_distance_using_normalized_embedding_matrix(normalized_embeddings_matrix):
    """All-pairs cosine distances between the rows of one matrix.

    The rows are expected to be L2-normalized already, so the matrix product
    with its own transpose holds the pairwise cosine similarities.

    Returns:
        Square matrix where entry (i, j) is 1 - (row i . row j).
    """
    pairwise_similarity = normalized_embeddings_matrix.dot(normalized_embeddings_matrix.T)
    return 1 - pairwise_similarity

def calculate_cosine_distance_between_two_normalized_embedding_matrices(normalized_embeddings_matrix_1, normalized_embeddings_matrix_2):
    """Cosine distances between every row of one matrix and every row of another.

    Both matrices are expected to hold L2-normalized rows, so the product of
    the first with the transpose of the second holds the cosine similarities.

    Returns:
        Matrix where entry (i, j) is 1 - (row i of matrix 1 . row j of matrix 2).
    """
    cross_similarity = np.dot(normalized_embeddings_matrix_1, normalized_embeddings_matrix_2.T)
    return np.subtract(1, cross_similarity)

In [3]:
# Scratch check: candidate thresholds from 0.10 to 0.50 in steps of 0.05
test = [percent / 100.0 for percent in range(10, 55, 5)]
print(test)
print(type(test))

[0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]
<class 'list'>


In [14]:
# Greedy, order-dependent clustering: every face is compared against all faces
# that came before it and either joins the cluster of its nearest earlier face
# or starts a new cluster.
normalized_embeddings = calculate_normalized_embeddings(embeddings)

# cluster_results[i] is the 1-based cluster number of face i
cluster_results = np.zeros(len(embeddings), dtype=int)

# The first face has nothing to compare against, so it seeds cluster 1
cluster_counter = 1
cluster_results[0] = cluster_counter

for idx, norm_embedding in tqdm(enumerate(normalized_embeddings[1:], start=1)):
    # Cosine distances from this face to every earlier face
    distances = calculate_cosine_distance_using_normalized_embedding(norm_embedding, normalized_embeddings[:idx])

    # Nearest earlier face and its distance
    closest_idx = np.argmin(distances)
    closest_distance = distances[closest_idx]

    if closest_distance < threshold_cosine_distance:
        # Close enough: inherit the cluster of the nearest earlier face
        cluster_results[idx] = cluster_results[closest_idx]
    else:
        # Nothing close enough: open a new cluster for this face
        cluster_counter += 1
        cluster_results[idx] = cluster_counter

# fill in the clusters list: clusters[k] holds the embedding indices assigned to
# cluster number k + 1, in ascending index order.
# Built in a single pass over cluster_results instead of rescanning every
# embedding once per cluster (which is O(faces * clusters)).
clusters = [[] for _ in range(cluster_counter)]
for idx, cluster_result in enumerate(cluster_results):
    # cluster numbers are 1-based; anything outside 1..cluster_counter is ignored
    if 1 <= cluster_result <= cluster_counter:
        clusters[cluster_result - 1].append(idx)

clusters_larger_than_1 = [cluster for cluster in clusters if len(cluster) > 1]
clusters_larger_than_2 = [cluster for cluster in clusters if len(cluster) > 2]
# clusters_larger_than_1

# Translate the embedding indices of every cluster into file ids and face ids
clusters_fileids = [[file_ids_of_embeddings[member] for member in cluster] for cluster in clusters]
clusters_faceids = [[face_ids_of_embeddings[member] for member in cluster] for cluster in clusters]

# file-id clusters above a minimum size
clusters_fileids_larger_than_1 = [members for members in clusters_fileids if len(members) > 1]
clusters_fileids_larger_than_2 = [members for members in clusters_fileids if len(members) > 2]

# face-id clusters above a minimum size
clusters_faceids_larger_than_1 = [members for members in clusters_faceids if len(members) > 1]
clusters_faceids_larger_than_2 = [members for members in clusters_faceids if len(members) > 2]

# thumbnail file names: the face id with a `.jpg` suffix
clusters_thumbnails = [[face_id + '.jpg' for face_id in cluster] for cluster in clusters_faceids]
clusters_thumbnails_larger_than_1 = [members for members in clusters_thumbnails if len(members) > 1]
clusters_thumbnails_larger_than_2 = [members for members in clusters_thumbnails if len(members) > 2]
clusters_thumbnails_larger_than_10 = [members for members in clusters_thumbnails if len(members) > 10]
clusters_thumbnails_larger_than_20 = [members for members in clusters_thumbnails if len(members) > 20]
clusters_thumbnails_larger_than_30 = [members for members in clusters_thumbnails if len(members) > 30]
clusters_thumbnails_larger_than_40 = [members for members in clusters_thumbnails if len(members) > 40]
clusters_thumbnails_larger_than_50 = [members for members in clusters_thumbnails if len(members) > 50]

42738it [00:59, 717.25it/s]  


In [15]:
# Final clusters: only clusters with more than 2 faces, largest first.
# sorted() is stable and all inputs share the same cluster order, so the
# resulting lists stay aligned index by index.
largest_first = dict(key=len, reverse=True)
final_clusters = sorted(clusters_larger_than_2, **largest_first)
final_clusters_fileids = sorted(clusters_fileids_larger_than_2, **largest_first)
final_clusters_faceids = sorted(clusters_faceids_larger_than_2, **largest_first)
final_clusters_thumbnails = sorted(clusters_thumbnails_larger_than_2, **largest_first)
# NOTE: holds embedding *indices* (same content as final_clusters), not embedding vectors
final_clusters_embeddings = sorted(clusters_larger_than_2, **largest_first)

# face id -> position of its cluster in the final (size-sorted) lists
face_ids_to_cluster_map = {
    face_id: idx
    for idx, cluster in enumerate(final_clusters_faceids)
    for face_id in cluster
}

In [16]:
# Report how many clusters exceed each size bound
print(f"Total number of clusters: {len(clusters_thumbnails)}")
thumbnail_clusters_by_size_bound = [
    (1, clusters_thumbnails_larger_than_1),
    (2, clusters_thumbnails_larger_than_2),
    (10, clusters_thumbnails_larger_than_10),
    (20, clusters_thumbnails_larger_than_20),
    (30, clusters_thumbnails_larger_than_30),
    (40, clusters_thumbnails_larger_than_40),
    (50, clusters_thumbnails_larger_than_50),
]
for size_bound, larger_clusters in thumbnail_clusters_by_size_bound:
    print(f"Number of clusters with more than {size_bound} face_id: {len(larger_clusters)}")

Total number of clusters: 12899
Number of clusters with more than 1 face_id: 3464
Number of clusters with more than 2 face_id: 1405
Number of clusters with more than 10 face_id: 175
Number of clusters with more than 20 face_id: 91
Number of clusters with more than 30 face_id: 67
Number of clusters with more than 40 face_id: 62
Number of clusters with more than 50 face_id: 53


In [17]:
# Sizes of the final clusters, in the same (largest first) order as final_clusters
cluster_size = [len(cluster) for cluster in final_clusters]
# cluster_size.sort(reverse=True)
print(f"Cluster sizes of first 200 clusters: {np.array(cluster_size[:200], dtype=int)}")

Cluster sizes of first 200 clusters: [6362 2167 1557 1311 1099  856  773  599  519  501  489  472  461  291
  279  259  256  181  178  170  169  156  140  138  122  107  106  102
  100   99   99   98   97   85   84   80   77   76   75   72   68   65
   64   64   63   62   60   57   56   55   55   53   51   50   49   47
   47   46   44   42   42   41   37   37   34   33   32   30   29   29
   28   28   27   27   27   26   26   25   25   25   24   23   23   23
   22   21   21   21   21   21   21   20   20   20   19   19   19   19
   18   18   18   18   18   17   17   17   17   16   16   16   16   16
   16   16   16   16   15   15   15   15   15   15   15   15   14   14
   14   14   14   14   14   14   14   14   13   13   13   13   13   13
   13   13   13   13   13   13   12   12   12   12   12   12   12   12
   12   12   12   12   12   11   11   11   11   11   11   11   11   11
   11   11   11   11   11   11   11   10   10   10   10   10   10   10
   10   10   10   10   10   10   10   10

## Creating folders with thumbnails of the new clusters

In [18]:
new_clusters_path = f"{data_to_use}/caches/new_clusters"
new_clusters_folder = f"{new_clusters_path}/thumbnails_clusters"
thumbnails_path = f"{data_to_use}/caches/face-crops"

# Always start from an empty output folder: remove the one left by a previous run first
if os.path.exists(new_clusters_folder):
    shutil.rmtree(new_clusters_folder)
os.makedirs(new_clusters_folder)

# One sub-folder per final cluster, named with its 1-based rank and its size
for cluster_number, cluster in enumerate(final_clusters_thumbnails, start=1):
    cluster_folder = os.path.join(new_clusters_folder, f'cluster_{cluster_number}_size_{len(cluster)}')
    os.makedirs(cluster_folder)

    # Copy every face crop of the cluster into its folder
    for thumbnail_filename in cluster:
        thumbnail_path = os.path.join(thumbnails_path, thumbnail_filename)
        shutil.copy(thumbnail_path, cluster_folder)

## Second stage improvements

### Finding trash clusters

Approaches to try:
- [x] Computing the distances among the entries in a cluster, flag cluster if the average is high
- [ ] Computing the 10 most extreme distances in a cluster, flag cluster if the sum is high

In [43]:
# Switch for the trash-cluster analysis cell below; set to True to run it.
# (The name is misspelled but is referenced as-is by that cell.)
find_trash_clustes = False

In [44]:
# Lookup table: face id -> its row in normalized_embeddings
face_ids_to_normalized_embeddings = {
    face_id: normalized_embeddings[idx]
    for idx, face_id in enumerate(face_ids_of_embeddings)
}

In [45]:
# Finding trash clusters
#
# Two scores are computed per cluster from the cosine distances between its members:
#   1. the average distance over all member pairs
#   2. the sum of the 10 largest pair distances
# Each unordered pair is counted exactly once and the zero self-distances on the
# diagonal are excluded. Using the full symmetric matrix instead would bias the
# average towards 0 for small clusters and count every extreme pair twice.
#
# NOTE(review): the printed indices refer to positions in clusters_faceids_larger_than_2,
# which is not sorted by size; they may not match the numbering of the exported
# thumbnail folders — confirm before cross-referencing.
if find_trash_clustes:
    average_intra_cluster_distances = []
    sum_of_extreme_distances = []
    for cluster in tqdm(clusters_faceids_larger_than_2):
        cluster_normalized_embeddings = np.array(
            [face_ids_to_normalized_embeddings[face_id] for face_id in cluster]
        )
        # The distance matrix is computed once and reused for both scores
        cluster_embeddings_distances = calculate_cosine_distance_using_normalized_embedding_matrix(cluster_normalized_embeddings)

        # Strict upper triangle = every unordered pair once, without the diagonal
        pair_mask = np.triu(np.ones(cluster_embeddings_distances.shape, dtype=bool), k=1)
        pair_distances = cluster_embeddings_distances[pair_mask]

        # first approach: average pair distance
        average_intra_cluster_distances.append(np.mean(pair_distances))
        # second approach: sum of the (up to) 10 largest pair distances
        sum_of_extreme_distances.append(np.sum(np.sort(pair_distances)[::-1][:10]))

    sorted_average_intra_cluster_distances_indices = np.argsort(average_intra_cluster_distances)[::-1]
    print(f"First approach: computing the distances among the entries in a cluster, and flag if the average is above a threshold:")
    print(f"The 10 clusters with the highest average intra cluster distances are: {sorted_average_intra_cluster_distances_indices[:10]} \n with average intra cluster distances: {np.array(average_intra_cluster_distances)[sorted_average_intra_cluster_distances_indices[:10]]} \n \n \n")

    sorted_sum_of_extreme_distances_indices = np.argsort(sum_of_extreme_distances)[::-1]
    print(f"Second approach: computing the 10 most extreme distances in a cluster, and flag if the sum is above a threshold:")
    print(f"The 10 clusters with the highest sum of extreme distances are: {sorted_sum_of_extreme_distances_indices[:10]} \n with sum of extreme distances: {np.array(sum_of_extreme_distances)[sorted_sum_of_extreme_distances_indices[:10]]} \n \n \n")


## Detecting and filtering low-quality faces

Filter rules for faces we should consider here:
- [X] set threshold for yolo confidence to `0.8`
- [X] sideways faces using landmarks
- [X] rotated faces
- [X] low resolution faces using bounding box plus image size
- [X] blur using FFT on aligned face

In [None]:
# Switch for the face-filter cells below; set to True to run them
# (they copy face crops into folders under face_filters_path).
filter_faces = False

In [46]:
# Root folder that receives one sub-folder per face filter
face_filters_path = f"{data_to_use}/caches/face_filters"

# Only create it when missing; existing content is left untouched
if not os.path.exists(face_filters_path):
    os.makedirs(face_filters_path)

In [47]:
if filter_faces:
    # Copy the crops of faces whose detection probability lies in [0.7, 0.75)
    # and [0.75, 0.8) into two separate folders for visual inspection.
    score_70_75_path = face_filters_path + '/score_70_75'
    score_75_80_path = face_filters_path + '/score_75_80'

    # Both folders start out empty: wipe whatever a previous run left behind
    for score_folder in (score_70_75_path, score_75_80_path):
        if os.path.exists(score_folder):
            shutil.rmtree(score_folder)
        os.makedirs(score_folder)

    score_70_75_counter = 0
    score_75_80_counter = 0
    for file in indexed_files_json.values():
        # skip files with errors
        if 'faces' not in file:
            continue
        for face in file['faces']:
            probability = face['detection']['probability']
            if 0.7 <= probability < 0.75:
                face_path = os.path.join(thumbnails_path, face['id'] + '.jpg')
                shutil.copy(face_path, score_70_75_path)
                score_70_75_counter += 1
            elif 0.75 <= probability < 0.8:
                face_path = os.path.join(thumbnails_path, face['id'] + '.jpg')
                shutil.copy(face_path, score_75_80_path)
                score_75_80_counter += 1
    score_70_80_counter = score_70_75_counter + score_75_80_counter
    print(f"Amount of faces with detection score between 70 and 75: {score_70_75_counter}")
    print(f"Amount of faces with detection score between 75 and 80: {score_75_80_counter}")
    print(f"Amount of faces with detection score between 70 and 80: {score_70_80_counter}")

Amount of faces with detection score between 70 and 75: 5023
Amount of faces with detection score between 75 and 80: 6783
Amount of faces with detection score between 70 and 80: 11806


In [48]:
def detect_sideways_naive(facial_landmarks, ratio_threshold=0.5):
    """Flag a face as sideways from the geometry of its eye and nose landmarks.

    The horizontal distance between the two eyes is compared with the vertical
    distance between the eye line and the nose tip. The face is flagged when
    the eye distance is less than `ratio_threshold` times the eyes-to-nose
    distance.

    Args:
        facial_landmarks: sequence of dicts with 'x' and 'y' keys; index 0 and 1
            are read as the eyes and index 2 as the nose tip. Further entries
            are ignored.
        ratio_threshold: flag the face when
            eyes_width / eyes_to_nose_height is below this value (default 0.5).

    Returns:
        True when the face is flagged as sideways, False otherwise. When the
        nose tip is level with the eyes (eyes-to-nose distance of 0) the face
        is not flagged, instead of raising ZeroDivisionError.
    """
    left_eye = facial_landmarks[0]
    right_eye = facial_landmarks[1]
    nose_tip = facial_landmarks[2]

    eyes_height = (left_eye['y'] + right_eye['y']) / 2
    eyes_to_nose_height = abs(eyes_height - nose_tip['y'])
    eyes_width = abs(left_eye['x'] - right_eye['x'])

    # Same test as `eyes_width / eyes_to_nose_height < ratio_threshold`, written
    # without the division so a zero eyes-to-nose distance cannot crash.
    return bool(eyes_width < ratio_threshold * eyes_to_nose_height)

In [66]:
if filter_faces:
    # Copy the crops of all sideways faces (with detection probability >= 0.8)
    # into their own folder and remember their face and file ids.
    sideways = face_filters_path + '/sideways'

    # Start from an empty folder
    if os.path.exists(sideways):
        shutil.rmtree(sideways)
    os.makedirs(sideways)

    sidways_counter = 0
    sideways_face_ids = []
    sideways_file_ids = []
    for file in indexed_files_json.values():
        # skip files with errors
        if 'faces' not in file:
            continue
        for face in file['faces']:
            detection = face['detection']
            # the sideways check runs first, the score check only for flagged faces
            if not (detect_sideways_naive(detection['landmarks']) and detection['probability'] >= 0.8):
                continue
            face_path = os.path.join(thumbnails_path, face['id'] + '.jpg')
            sideways_face_ids.append(face['id'])
            sideways_file_ids.append(file['fileId'])
            shutil.copy(face_path, sideways)
            sidways_counter += 1
    print(f"Amount of sideways faces: {sidways_counter}")

Amount of sideways faces: 1804


In [67]:
if filter_faces:
    # Count how many of the sideways faces ended up in the 200 largest final clusters
    sideways_faces_in_clusters = []
    useful_sideways_faces_counter = 0
    check_clusters = final_clusters_faceids[:200]
    useful_sideways_faces_in_clusters = np.zeros(len(check_clusters), dtype=int)
    # Set lookup keeps the membership test O(1) instead of scanning the id list per face
    sideways_face_id_set = set(sideways_face_ids)
    for idx, cluster in enumerate(check_clusters):
        sideways_faces = [face_id for face_id in cluster if face_id in sideways_face_id_set]
        useful_sideways_faces_counter += len(sideways_faces)
        useful_sideways_faces_in_clusters[idx] = len(sideways_faces)
        sideways_faces_in_clusters.append(sideways_faces)
    # Guard the percentage against a run in which no sideways face was found
    useful_sideways_faces_percentage = (useful_sideways_faces_counter / sidways_counter * 100) if sidways_counter else 0.0
    print(f"Amount of useful sideways faces: {useful_sideways_faces_counter} out of {sidways_counter}, which is {useful_sideways_faces_percentage}%")
    print(f"Amount of useful sideways faces in the checked clusters: {useful_sideways_faces_in_clusters}")

Amount of useful sideways faces: 496 out of 1804, which is 27.494456762749447%
Amount of useful sideways faces in the checked clusters: [34 16 22  1  2  3  0  6  1  1  0  6  3 51  0  0  0 13  1  0 59  2 43  0
  0  2  0  0  0  1  0  0  0  0  0  0  0 57  0  0  0  0  8  2  0  0  0 46
  0  0  3  0  0  0  0  0  0  0 12  0  0  0  0  0  0  0  0  0  0  0  0  1
  0 18  3  0  0  0  0  0  0  7  0  0  0  0  0  0  0  0  0  0  0  0  3  0
  0  0 16  0  0  0  0  0 11  0  0  0  0  0  0  0  0  0  0  0  0  0  8  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  5  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  7  3  0  0  9  9  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0]


In [75]:
# Boxes covering less area than a threshold_side x threshold_side square count as small
threshold_side = 40
threshold_surface = threshold_side * threshold_side

def detect_small_faces(box, threshold_surface = threshold_surface):
    """Tell whether a bounding box covers less area than `threshold_surface`.

    Args:
        box: dict with 'width' and 'height' keys.
        threshold_surface: area bound; defaults to the module-level
            threshold_surface at definition time.

    Returns:
        True when width * height is strictly below the bound, else False.
    """
    return bool(box['width'] * box['height'] < threshold_surface)

In [76]:
# Filter faces based on small bounding box size
if filter_faces:
    # Copy the crops of all faces with a small bounding box (and detection
    # probability >= 0.8) into their own folder and remember their ids.
    small_bounding_box = face_filters_path + '/small_bounding_box'

    # Start from an empty folder
    if os.path.exists(small_bounding_box):
        shutil.rmtree(small_bounding_box)
    os.makedirs(small_bounding_box)

    small_bounding_box_counter = 0
    small_bounding_box_face_ids = []
    small_bounding_box_file_ids = []
    for file in indexed_files_json.values():
        # skip files with errors
        if 'faces' not in file:
            continue
        for face in file['faces']:
            detection = face['detection']
            # the box size check runs first, the score check only for small boxes
            if not (detect_small_faces(detection['box']) and detection['probability'] >= 0.8):
                continue
            face_path = os.path.join(thumbnails_path, face['id'] + '.jpg')
            small_bounding_box_face_ids.append(face['id'])
            small_bounding_box_file_ids.append(file['fileId'])
            shutil.copy(face_path, small_bounding_box)
            small_bounding_box_counter += 1
    print(f"Amount of small faces (bounding box): {small_bounding_box_counter}")

Amount of small faces (bounding box): 1827


In [77]:
if filter_faces:
    # Count how many of the small-bounding-box faces ended up in the 200 largest final clusters
    small_bounding_box_faces_in_clusters = []
    useful_small_bounding_box_faces_counter = 0
    check_clusters = final_clusters_faceids[:200]
    useful_small_bounding_box_faces_in_clusters = np.zeros(len(check_clusters), dtype=int)
    # Set lookup keeps the membership test O(1) instead of scanning the id list per face
    small_bounding_box_face_id_set = set(small_bounding_box_face_ids)
    for idx, cluster in enumerate(check_clusters):
        small_bounding_box_faces = [face_id for face_id in cluster if face_id in small_bounding_box_face_id_set]
        useful_small_bounding_box_faces_counter += len(small_bounding_box_faces)
        useful_small_bounding_box_faces_in_clusters[idx] = len(small_bounding_box_faces)
        small_bounding_box_faces_in_clusters.append(small_bounding_box_faces)
    # Guard the percentage against a run in which no small face was found
    useful_small_bounding_box_faces_percentage = (useful_small_bounding_box_faces_counter / small_bounding_box_counter * 100) if small_bounding_box_counter else 0.0
    print(f"Amount of useful small_bounding_box faces: {useful_small_bounding_box_faces_counter} out of {small_bounding_box_counter}, which is {useful_small_bounding_box_faces_percentage}%")
    print(f"Amount of useful small_bounding_box faces in the checked clusters: {useful_small_bounding_box_faces_in_clusters}")

Amount of useful small_bounding_box faces: 263 out of 1827, which is 14.395183360700603%
Amount of useful small_bounding_box faces in the checked clusters: [33  1  9 10  1  1  3 10  1  0  0  2  1 45  0  0  0 17  0  2  6  1 14  1
  0  1  1  2  8  1  1  0  0  0  0  0  0  4  0  2  1  1  0 17  1  1  0  3
  0  0 10  0  1  0  4  0  0  0  0  0  6  0  0  0  0  0  1  0  0  0  3  1
  0  2  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  3  0
  0  1  0  0  0  0  1  0  1  2  0  0  0  0  0  0  0  0  0  0  0  0  3  0
  0  0  1  0  0  0  0  0 10  0  0  0  0  0  4  0  0  0  0  0  0  0  0  1
  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  3  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0]


In [82]:
def calculate_cluster_average_image_size(clusters_fileids, metadata_fileids):
    """Average the 'size' metadata of the files in every cluster.

    Args:
        clusters_fileids: sequence of clusters, each a sequence of file ids.
        metadata_fileids: mapping file id -> dict with a 'size' entry.

    Returns:
        Integer np.ndarray with one entry per cluster: the mean of the non-zero
        sizes, truncated to an integer by the array dtype. Clusters without
        any non-zero size keep the value 0.
    """
    cluster_average_image_size = np.zeros(len(clusters_fileids), dtype=int)
    for idx, cluster in enumerate(clusters_fileids):
        sizes = [metadata_fileids[file_id]['size'] for file_id in cluster]
        # files whose size is 0 are left out of the average
        known_sizes = [size for size in sizes if size != 0]
        if len(known_sizes) != 0:
            cluster_average_image_size[idx] = sum(known_sizes) / len(known_sizes)

    return cluster_average_image_size

In [8]:
# Average file 'size' per final cluster (same order as final_clusters_fileids)
clusters_average_image_size = calculate_cluster_average_image_size(final_clusters_fileids, mldata_metadata_json)
# Show the first 200 averages divided by 1,000,000
# (presumably bytes -> MB; TODO confirm the unit of 'size' in the metadata)
clusters_average_image_size[:200]/1000000

In [91]:
# Filter faces based on low image size
if filter_faces:
    # Copy the crops of all faces (detection probability >= 0.8) that come from a
    # file whose metadata 'size' is below 100000 into their own folder.
    small_image = face_filters_path + '/small_image'

    # Start from an empty folder
    if os.path.exists(small_image):
        shutil.rmtree(small_image)
    os.makedirs(small_image)

    small_image_counter = 0
    small_image_face_ids = []
    small_image_file_ids = []
    for file in indexed_files_json.values():
        # skip files with errors
        if 'faces' not in file:
            continue
        file_size = mldata_metadata_json[str(file['fileId'])]['size']
        if file_size < 100000:
            for face in file['faces']:
                # only faces that pass the detection score threshold are collected
                if face['detection']['probability'] >= 0.8:
                    face_path = os.path.join(thumbnails_path, face['id'] + '.jpg')
                    small_image_face_ids.append(face['id'])
                    small_image_file_ids.append(file['fileId'])
                    shutil.copy(face_path, small_image)
                    small_image_counter += 1
    print(f"Amount of faces from small images: {small_image_counter}")

Amount of faces from small images: 8162


In [None]:
# Filter faces based on low image size and bounding box size ratio


## Second stage merging of clusters

In [110]:
# Size of the largest final cluster
len(final_clusters_embeddings[0])

6362

In [121]:
# Size of the second largest final cluster
len(final_clusters_embeddings[1])

2167

In [106]:
# (number of kept faces, embedding dimension)
normalized_embeddings.shape

(42739, 192)

In [130]:
# Let's investigate which clusters can be merged with the first cluster.
# final_clusters_embeddings holds embedding indices, so indexing normalized_embeddings
# with a cluster selects the normalized embeddings of its members.
embeddings_first_cluster = normalized_embeddings[final_clusters_embeddings[0]]
embeddings_second_cluster = normalized_embeddings[final_clusters_embeddings[1]]
inner_cluster_distances_first_cluster = calculate_cosine_distance_using_normalized_embedding_matrix(embeddings_first_cluster)

# Median cosine distance between the first cluster and each of the next 199 clusters
median_distances = []
for idx, cluster in tqdm(enumerate(final_clusters_embeddings[1:200])):
    embeddings_second_cluster = normalized_embeddings[cluster]
    all_distances_between_two_clusters = calculate_cosine_distance_between_two_normalized_embedding_matrices(embeddings_first_cluster, embeddings_second_cluster)
    median_distance = np.median(all_distances_between_two_clusters.ravel())
    median_distances.append(median_distance)

# Rank the compared clusters from closest to farthest
sorted_median_distances_indices = np.argsort(median_distances)
sorted_median_distances = np.array(median_distances)[sorted_median_distances_indices]
# +2 turns a position in median_distances into the 1-based cluster number used in the
# thumbnail folder names (position 0 is the second largest cluster)
print(f"The 10 clusters with the lowest median distances to the first cluster are: {sorted_median_distances_indices[:10] +2} \n with median distances: {sorted_median_distances[:10]} \n \n \n")

199it [00:02, 90.74it/s] 

The 10 clusters with the lowest median distances to the first cluster are: [ 86  10   8 131 108 184 137 102  96 141] 
 with median distances: [0.44778493 0.45415174 0.47975667 0.50488278 0.54223017 0.58516732
 0.59617362 0.62022092 0.62487244 0.62797093] 
 
 




