In [1]:
import json
import os
import shutil
import copy
import numpy as np
from tqdm import tqdm
import cv2

## Import JSON data

In [2]:
# Root folder holding every dataset, resolved against the current working directory
data_path = f"{os.getcwd()}/data"

# Available datasets
mldata_demo_bob_path = f"{data_path}/mldata_demo_bob"  # demo bob
mldata_full_laurens_blazeface_path = f"{data_path}/ente_mldata_full_laurens_blazeface"  # blazeface
mldata_full_laurens_yolo_path = f"{data_path}/ente_mldata_full_laurens_yolo_70"  # yolo
# mldata_full_laurens_yolo_path = data_path + '/ente-mldata-yolo-70-38000_FILES'
lfw_demonstration_data_path = f"{data_path}/lfw_demonstration_data"

# Define the data to use
data_to_use = lfw_demonstration_data_path

In [3]:
# DONT CHANGE ANYTHING HERE
# Locations of the exported IndexedDB dumps inside the selected dataset folder
json_path = data_to_use + '/indexeddb/mldata.json'
# NOTE(review): this path is defined but never opened in the visible cells - confirm it is still needed
json_metadata_path = data_to_use + '/indexeddb/meta_data_10000025.json'

# JSON text is UTF-8; without an explicit encoding open() falls back to the
# platform locale, which can fail or garble non-ASCII content on some systems
with open(json_path, 'r', encoding='utf-8') as file:
    mldata_json = json.load(file)

# Top-level sections of the dump used by the rest of the notebook
indexed_files_json = mldata_json['files']
clusters_web_json = mldata_json['library']
people_web_json = mldata_json['people']

In [4]:
# Minimum detection probability a face needs in order to be clustered; faces
# scoring below this value are counted as low-score faces and skipped.
face_detection_threshold = 0.8

In [5]:
# Collect the embedding, file id and face id of every sufficiently confident face.
# The three lists are index-aligned: position i in each one describes the same face.
low_score_faces = 0
embeddings = []
file_ids_of_embeddings = []
face_ids_of_embeddings = []
for indexed_file in indexed_files_json.values():
    # files without a 'faces' entry (files with errors) contribute nothing
    for face in indexed_file.get('faces', []):
        if face['detection']['probability'] < face_detection_threshold:
            # detection confidence too low: count the face, but do not cluster it
            low_score_faces += 1
        else:
            embeddings.append(face['embedding'])
            file_ids_of_embeddings.append(str(indexed_file['fileId']))  # ids are kept as strings
            face_ids_of_embeddings.append(str(face['id']))

# # Sort the embeddings, file_ids_of_embeddings, and face_ids_of_embeddings based on creationTime
# if data_to_use != lfw_demonstration_data_path:
#     embeddings, file_ids_of_embeddings, face_ids_of_embeddings = zip(*sorted(zip(embeddings, file_ids_of_embeddings, face_ids_of_embeddings), key=lambda x: mldata_metadata_json[x[1]]['creationTime'], reverse=False))

print(f"Amount of files: {len(indexed_files_json)}")
print(f"Amount of embeddings/faces: {len(embeddings)}")
print(f"Amount of low score faces: {low_score_faces}")

Amount of files: 4350
Amount of embeddings/faces: 4835
Amount of low score faces: 603


In [6]:
# # check if all embeddings are unique
# for idx, embedding in enumerate(embeddings):
#     if embeddings.count(embedding) > 1:
#         print(f"embedding of index {idx}: [{embedding[0]}, {embedding[1]}...{embedding[-1]}] is not unique")

# check if all entries in face_ids_of_embeddings are unique
# for id in face_ids_of_embeddings:
#     if face_ids_of_embeddings.count(id) > 1:
#         print(f"face_id {id} is not unique")

In [7]:
# Read the clustering stored in the dump: the clusters and the entries listed
# under 'noise' come from the 'library' section, the per-person file lists
# from the 'people' section.
web_clustering_results = clusters_web_json['data']['faceClusteringResults']
clusters_int_web = list(web_clustering_results['clusters'])
noise_web = web_clustering_results['noise']
clusters_fileids_web = [person['files'] for person in people_web_json.values()]

# Total entries = noise entries + every file listed for every person
cluster_entries_number = len(noise_web) + sum(len(person_files) for person_files in clusters_fileids_web)
print(f"Amount of cluster entries: {cluster_entries_number}")
print(f"Amount of clusters: {len(clusters_int_web)}")

Amount of cluster entries: 5438
Amount of clusters: 168


In [8]:
# Number of per-person file lists collected from people_web_json
len(clusters_fileids_web)

168

In [9]:
# Number of file ids collected (one entry per kept face)
len(file_ids_of_embeddings)

4835

In [10]:
# Number of face ids collected (one entry per kept face)
len(face_ids_of_embeddings)

4835

## Linear clustering

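Faces are processed one at a time: each face joins the cluster of its nearest previously processed face when their cosine distance is below a threshold, and otherwise starts a new cluster. This simple cosine-distance approach is similar to the one used by Immich.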

In [11]:
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity
# embeddings_cosine_distances = cosine_distances(embeddings)

In [12]:
# Cosine-distance cut-off for the linear clustering: a face joins the cluster of
# its nearest earlier face only when their distance is strictly below this value;
# otherwise it starts a new cluster.
threshold_cosine_distance = 0.3

In [13]:
def calculate_normalized_embeddings(embeddings_matrix):
    """Scale every row of a 2-D embeddings matrix to unit L2 norm.

    Args:
        embeddings_matrix: 2-D array-like (list of lists or ndarray) with one
            embedding per row.

    Returns:
        ndarray of the same shape whose non-zero rows have L2 norm 1.
        All-zero rows are returned unchanged (all zeros) instead of NaN.
    """
    embeddings_matrix = np.asarray(embeddings_matrix)

    # One norm per row, kept as a column so it broadcasts against the rows
    embeddings_matrix_norm = np.linalg.norm(embeddings_matrix, axis=1, keepdims=True)

    # A zero-norm row would divide by zero and turn into NaN; NaN distances are
    # then picked by np.argmin downstream. Dividing such rows by 1 keeps them zero.
    safe_norm = np.where(embeddings_matrix_norm == 0, 1.0, embeddings_matrix_norm)

    return embeddings_matrix / safe_norm

def calculate_cosine_distance_using_normalized_embedding(normalized_embedding, normalized_embeddings_matrix):
    """Cosine distance from one embedding to every row of a matrix.

    Both inputs are expected to be L2-normalised already; this function does
    not normalise or check them. Under that assumption the dot product is the
    cosine similarity.

    Returns:
        ``1 - dot(normalized_embeddings_matrix, normalized_embedding)``, i.e.
        one distance per matrix row.
    """
    similarity_per_row = np.dot(normalized_embeddings_matrix, normalized_embedding)
    return 1 - similarity_per_row

def calculate_cosine_distance_using_normalized_embedding_matrix(normalized_embeddings_matrix):
    """Pairwise cosine distances between all rows of one matrix.

    The rows are expected to be L2-normalised already; this function does not
    normalise or check them.

    Returns:
        Square matrix ``1 - M . M^T`` where entry (i, j) is the distance
        between row i and row j.
    """
    transposed = normalized_embeddings_matrix.T
    pairwise_similarity = np.dot(normalized_embeddings_matrix, transposed)
    return 1 - pairwise_similarity

def calculate_cosine_distance_between_two_normalized_embedding_matrices(normalized_embeddings_matrix_1, normalized_embeddings_matrix_2):
    """Cosine distances between every row of one matrix and every row of another.

    Both matrices are expected to be L2-normalised already; this function does
    not normalise or check them.

    Returns:
        Matrix ``1 - M1 . M2^T`` where entry (i, j) is the distance between
        row i of the first matrix and row j of the second.
    """
    cross_similarity = np.dot(normalized_embeddings_matrix_1, normalized_embeddings_matrix_2.T)
    return 1 - cross_similarity

In [14]:
# Greedy single-pass clustering: every embedding is compared only with the
# embeddings that precede it and either joins the cluster of its nearest
# predecessor or opens a new cluster.
# cluster_results[i] holds the 1-based cluster number assigned to embedding i.
cluster_counter = 0
cluster_results = np.zeros(len(embeddings), dtype=int)

if len(embeddings) > 0:
    normalized_embeddings = calculate_normalized_embeddings(embeddings)

    # The first embedding has no predecessors, so it always opens cluster 1
    cluster_counter += 1
    cluster_results[0] = cluster_counter

    # enumerate() has no length, so pass total= to give tqdm a real progress bar
    for idx, norm_embedding in tqdm(enumerate(normalized_embeddings[1:], start=1), total=len(embeddings) - 1):
        # Cosine distances between this embedding and all earlier embeddings
        distances = calculate_cosine_distance_using_normalized_embedding(norm_embedding, normalized_embeddings[:idx])

        # Nearest earlier embedding and its distance
        closest_idx = np.argmin(distances)
        closest_distance = distances[closest_idx]

        if closest_distance < threshold_cosine_distance:
            # Close enough: inherit the cluster of the nearest earlier embedding
            cluster_results[idx] = cluster_results[closest_idx]
        else:
            # Too far from everything seen so far: open a new cluster
            cluster_counter += 1
            cluster_results[idx] = cluster_counter
else:
    # Nothing to cluster (e.g. every face was below the detection threshold):
    # keep the name defined and leave cluster_counter at 0
    normalized_embeddings = np.empty((0, 0))

# fill in the clusters list: clusters[k] holds, in ascending order, the
# embedding indices assigned to cluster number k + 1
clusters = [[] for _ in range(cluster_counter)]
for idx, cluster_result in enumerate(cluster_results):
    # One bucketing pass instead of rescanning cluster_results once per cluster.
    # Numbers outside 1..cluster_counter are ignored, exactly as before.
    if 1 <= cluster_result <= cluster_counter:
        clusters[cluster_result - 1].append(idx)

clusters_larger_than_1 = [cluster for cluster in clusters if len(cluster) > 1]
clusters_larger_than_2 = [cluster for cluster in clusters if len(cluster) > 2]
# clusters_larger_than_1

# Translate every cluster of embedding indices into the matching file ids and
# face ids; the nesting and ordering of `clusters` are preserved
clusters_fileids = [
    [file_ids_of_embeddings[embedding_idx] for embedding_idx in members]
    for members in clusters
]
clusters_faceids = [
    [face_ids_of_embeddings[embedding_idx] for embedding_idx in members]
    for members in clusters
]

# file-id clusters with more than 1 / more than 2 entries
clusters_fileids_larger_than_1 = [members for members in clusters_fileids if len(members) > 1]
clusters_fileids_larger_than_2 = [members for members in clusters_fileids if len(members) > 2]

# face-id clusters with more than 1 / more than 2 entries
clusters_faceids_larger_than_1 = [members for members in clusters_faceids if len(members) > 1]
clusters_faceids_larger_than_2 = [members for members in clusters_faceids if len(members) > 2]

# Thumbnail file names per cluster: every face id with `.jpg` appended
clusters_thumbnails = [
    [face_id + '.jpg' for face_id in members]
    for members in clusters_faceids
]


def _thumbnail_clusters_larger_than(min_size):
    """Return the thumbnail clusters holding more than `min_size` entries."""
    return [members for members in clusters_thumbnails if len(members) > min_size]


clusters_thumbnails_larger_than_1 = _thumbnail_clusters_larger_than(1)
clusters_thumbnails_larger_than_2 = _thumbnail_clusters_larger_than(2)
clusters_thumbnails_larger_than_10 = _thumbnail_clusters_larger_than(10)
clusters_thumbnails_larger_than_20 = _thumbnail_clusters_larger_than(20)
clusters_thumbnails_larger_than_30 = _thumbnail_clusters_larger_than(30)
clusters_thumbnails_larger_than_40 = _thumbnail_clusters_larger_than(40)
clusters_thumbnails_larger_than_50 = _thumbnail_clusters_larger_than(50)

4834it [00:00, 34123.96it/s]


In [15]:
# Taking final clusters with more than 2 faces, largest cluster first.
# sorted() is stable, so the four views below stay index-aligned with each other.
def _largest_first(cluster_list):
    """Return a new list of the clusters ordered by descending size."""
    return sorted(cluster_list, key=len, reverse=True)


final_clusters = _largest_first(clusters_larger_than_2)
final_clusters_fileids = _largest_first(clusters_fileids_larger_than_2)
final_clusters_faceids = _largest_first(clusters_faceids_larger_than_2)
final_clusters_thumbnails = _largest_first(clusters_thumbnails_larger_than_2)
# NOTE(review): despite its name this holds embedding *indices* (same source
# and order as final_clusters), not embedding vectors - confirm this is intended
final_clusters_embeddings = _largest_first(clusters_larger_than_2)

# Map every face id to the position of its cluster in final_clusters_faceids
face_ids_to_cluster_map = {
    face_id: idx
    for idx, cluster in enumerate(final_clusters_faceids)
    for face_id in cluster
}

In [16]:
# Summary: total cluster count, then the count above each size threshold
print(f"Total number of clusters: {len(clusters_thumbnails)}")
for min_size, thumbnail_clusters in (
    (1, clusters_thumbnails_larger_than_1),
    (2, clusters_thumbnails_larger_than_2),
    (10, clusters_thumbnails_larger_than_10),
    (20, clusters_thumbnails_larger_than_20),
    (30, clusters_thumbnails_larger_than_30),
    (40, clusters_thumbnails_larger_than_40),
    (50, clusters_thumbnails_larger_than_50),
):
    print(f"Number of clusters with more than {min_size} face_id: {len(thumbnail_clusters)}")

Total number of clusters: 1000
Number of clusters with more than 1 face_id: 237
Number of clusters with more than 2 face_id: 188
Number of clusters with more than 10 face_id: 115
Number of clusters with more than 20 face_id: 49
Number of clusters with more than 30 face_id: 28
Number of clusters with more than 40 face_id: 15
Number of clusters with more than 50 face_id: 9


In [17]:
# Size of every final cluster, in the order of final_clusters
cluster_size = [len(cluster) for cluster in final_clusters]
# cluster_size.sort(reverse=True)
print(f"Cluster sizes of first 200 clusters: {np.array(cluster_size[:200], dtype=int)}")

Cluster sizes of first 200 clusters: [521 219 127 104  96  70  69  59  51  50  48  47  47  45  44  39  39  39
  37  35  34  34  34  33  33  32  31  31  30  29  28  27  26  26  26  26
  25  25  23  23  23  22  22  22  21  21  21  21  21  20  20  20  19  19
  18  17  17  17  17  16  16  16  16  16  16  16  16  15  15  15  15  15
  15  14  14  14  14  14  14  14  14  14  14  14  13  13  13  13  13  13
  13  13  12  12  12  12  12  12  12  12  12  12  12  12  11  11  11  11
  11  11  11  11  11  11  11  10  10  10  10  10  10  10  10  10   9   9
   9   9   9   9   9   9   9   9   9   9   9   9   8   8   8   8   8   8
   8   8   8   8   7   7   7   7   7   7   7   7   7   7   6   6   6   6
   6   6   5   5   5   5   4   4   4   4   4   4   4   4   4   4   4   3
   3   3   3   3   3   3   3   3]


## Creating folders with thumbnails of the new clusters

In [18]:
# Output location for the per-cluster folders and source folder of the face crops
new_clusters_path = f"{data_to_use}/caches/new_clusters"
new_clusters_folder = f"{new_clusters_path}/thumbnails_clusters"
thumbnails_path = f"{data_to_use}/caches/face-crops"

# Always start from an empty output folder: drop the results of a previous run
# (if any) and recreate the folder
if os.path.exists(new_clusters_folder):
    shutil.rmtree(new_clusters_folder)
os.makedirs(new_clusters_folder)

# One folder per final cluster, named after its 1-based rank and its size
for cluster_idx, cluster in enumerate(final_clusters_thumbnails):
    cluster_folder = os.path.join(new_clusters_folder, f'cluster_{cluster_idx+1}_size_{len(cluster)}')
    os.makedirs(cluster_folder)

    # Copy each face crop of the cluster into its folder
    for thumbnail_filename in cluster:
        thumbnail_path = os.path.join(thumbnails_path, thumbnail_filename)
        shutil.copy(thumbnail_path, cluster_folder)