#Install DeepFace

In [1]:
from IPython.display import clear_output
%pip install deepface
clear_output()
print("DeepFace installed")

DeepFace installed


#Import Libraries

In [2]:
import os
import csv
import sqlite3
import zipfile
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm
from deepface import DeepFace
from sklearn.metrics import pairwise_distances

24-04-07 23:41:29 - Directory /root/.deepface created
24-04-07 23:41:29 - Directory /root/.deepface/weights created


#Connect to Google Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

#Specify Paths

In [6]:
# Change the paths to get/store files in your Google Drive
train_zip_path = '/content/drive/MyDrive/Colab Notebooks/data/train_224.zip'
test_zip_path = '/content/drive/MyDrive/Colab Notebooks/data/test_224.zip'
train_images_path = '/content/train_224'
test_images_path = '/content/test_224'
train_database_path = '/content/drive/MyDrive/Colab Notebooks/database/traindb.db'
test_database_path = '/content/drive/MyDrive/Colab Notebooks/database/testdb.db'
crops_num_csv = '/content/drive/MyDrive/Colab Notebooks/csv/crops_num.csv'
category_csv = '/content/drive/MyDrive/Colab Notebooks/csv/category.csv'
predicted_labels_csv = '/content/drive/MyDrive/Colab Notebooks/csv/predicted_labels.csv'

#Unzip Cropped Train and Test

In [5]:
with zipfile.ZipFile(train_zip_path) as z:
    z.extractall('')
with zipfile.ZipFile(test_zip_path) as z:
    z.extractall('')

#Generate Train Embeddings


In [8]:
# Connect to database
conn = sqlite3.connect(train_database_path)
cursor = conn.cursor()

# Create tables if they don't exist
cursor.execute('''CREATE TABLE IF NOT EXISTS face_meta (ID INTEGER PRIMARY KEY, IMG_NAME TEXT, EMBEDDING BLOB)''')

# Iterate through image paths and store embeddings
facial_img_paths = []
for root, directory, files in os.walk(train_images_path):
    for f in files:
        if '.jpg' in f:
            facial_img_paths.append(os.path.join(root, f))


for i, facial_img_path in enumerate(tqdm(facial_img_paths)):
    embedding = DeepFace.represent(img_path=facial_img_path, model_name="SFace", detector_backend='skip')[0]["embedding"]
    img_name = os.path.basename(facial_img_path).split('.')[0]
    embedding_array = np.array(embedding)
    embedding_blob = embedding_array.tobytes()

    # Insert data into database
    cursor.execute('INSERT INTO face_meta (ID, IMG_NAME, EMBEDDING) VALUES (?, ?, ?)', (i, img_name, embedding_blob))

conn.commit()
conn.close()

100%|██████████| 68592/68592 [13:13<00:00, 86.47it/s]


#Generate Test Embeddings

In [9]:
# Connect to database
conn = sqlite3.connect(test_database_path)
cursor = conn.cursor()

# Create tables if they don't exist
cursor.execute('''CREATE TABLE IF NOT EXISTS face_meta (ID INTEGER PRIMARY KEY, IMG_NAME TEXT, EMBEDDING BLOB)''')

# Iterate through image paths and store embeddings
filenames = [f for f in os.listdir(test_images_path) if f.endswith('.jpg')]
facial_img_paths = []
for i in range(len(filenames)):
    facial_img_paths.append(f'/content/test_224/{i}.jpg')

for i, facial_img_path in enumerate(tqdm(facial_img_paths)):
    embedding = DeepFace.represent(img_path=facial_img_path, model_name="SFace", detector_backend='skip')[0]["embedding"]
    img_name = os.path.basename(facial_img_path).split('.')[0]
    embedding_array = np.array(embedding)
    embedding_blob = embedding_array.tobytes()
    cursor.execute('INSERT INTO face_meta (ID, IMG_NAME, EMBEDDING) VALUES (?, ?, ?)', (i, img_name, embedding_blob))

conn.commit()
conn.close()

100%|██████████| 7737/7737 [01:26<00:00, 89.24it/s]


#Load Facial Database into Pandas Data Frame

In [10]:
conn = sqlite3.connect(train_database_path)
cursor = conn.cursor()

select_statement = 'select img_name, embedding from face_meta'
results = cursor.execute(select_statement)

instances = []
for result in results:
    img_name = result[0]
    embedding_bytes = result[1]
    embedding = np.frombuffer(embedding_bytes, dtype = 'float64')

    instance = []
    instance.append(img_name)
    instance.append(embedding)
    instances.append(instance)

train_df = pd.DataFrame(instances, columns = ['img_name', 'embedding'])
conn.commit()
conn.close()

print(train_df['embedding'])

0        [-0.7855247259140015, 0.822955310344696, 1.848...
1        [-0.6390532851219177, 0.9139964580535889, -0.6...
2        [-0.36544084548950195, 0.05464779958128929, -0...
3        [-1.0158979892730713, 0.23610854148864746, 0.5...
4        [0.5652865171432495, -1.86287260055542, 0.2145...
                               ...                        
68587    [-0.8820852041244507, -1.7068018913269043, -0....
68588    [-0.8508542776107788, 0.0733473151922226, -0.5...
68589    [-0.9522559642791748, -0.2867908179759979, 0.2...
68590    [-1.2297043800354004, 0.49487999081611633, 0.1...
68591    [-0.44406208395957947, -0.39389684796333313, 0...
Name: embedding, Length: 68592, dtype: object


#Load Test Database into Pandas Data Frame

In [11]:
conn = sqlite3.connect(test_database_path)
cursor = conn.cursor()

select_statement = 'select img_name, embedding from face_meta'
results = cursor.execute(select_statement)

instances = []
for result in results:
    img_name = result[0]
    embedding_bytes = result[1]
    embedding = np.frombuffer(embedding_bytes, dtype = 'float64')

    instance = []
    instance.append(img_name)
    instance.append(embedding)
    instances.append(instance)

test_df = pd.DataFrame(instances, columns = ['img_name', 'embedding'])
conn.commit()
conn.close()

print(test_df['embedding'])

0       [-0.47111958265304565, -0.07002589106559753, 0...
1       [-0.35302120447158813, -0.7184938192367554, 0....
2       [0.3281221091747284, -0.14927883446216583, 0.3...
3       [0.04830002784729004, -0.11965912580490112, 0....
4       [-0.4777531921863556, -0.2088930606842041, 0.4...
                              ...                        
7732    [-1.2303379774093628, -0.1131342202425003, -0....
7733    [-0.9727636575698853, -0.7996971011161804, -1....
7734    [1.0379232168197632, 0.26702556014060974, 0.02...
7735    [0.42745551466941833, -0.02320195734500885, 0....
7736    [-0.695655107498169, 0.14255203306674957, 0.98...
Name: embedding, Length: 7737, dtype: object


#Calculate Best Matches

In [12]:
test_embeddings = np.stack(test_df['embedding'].values)
train_embeddings = np.stack(train_df['embedding'].values)

# Calculate all pairwise Euclidean distances
distances = pairwise_distances(test_embeddings, train_embeddings, metric="cosine")

# Find the index and value of the minimum distance for each test_embedding
min_distance_indices = np.argmin(distances, axis=1)
min_distances = np.min(distances, axis=1)

# Retrieve the corresponding best match names from train_df
best_match_names = train_df.iloc[min_distance_indices]['img_name'].values

# Pair each test image name with its best match name and the minimum distance
best_matches = list(zip(test_df['img_name'], best_match_names, min_distances))

# Print the first 10 best matches
print(best_matches[:10])

[('0', '20_265', 0.4842519443673172), ('1', '21_431', 0.014795872828496148), ('2', '70_191', 0.4696896136210913), ('3', '83_572', 0.5008485286484988), ('4', '52_111', 0.3931523582888281), ('5', '3_603', 0.5146734006722695), ('6', '52_421', 0.0), ('7', '76_412', 0.40352681518166145), ('8', '37_602', 0.4048441933128196), ('9', '60_265', 0.4639686816705615)]


#Extract Best Matches for Test Images

In [13]:
num_crops = []
with open(crops_num_csv, newline='') as csvfile:
    reader = csv.reader(csvfile)
    next(reader)
    for row in reader:
        if len(row) >= 2:
            num_crops.append(int(row[1]))

print(num_crops)
print(len(num_crops))

new_best_matches = []
current_start = 0
for image_idx in range(0, len(num_crops)):
    shortest_distance = np.inf
    best_predicted_label_idx = None

    for crop_idx in range(current_start, current_start + num_crops[image_idx]):
        distance = best_matches[crop_idx][2]
        if distance < shortest_distance:
            shortest_distance = distance
            best_predicted_label_idx = crop_idx

    new_best_matches.append(best_matches[best_predicted_label_idx])
    current_start += num_crops[image_idx]

[1, 5, 4, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 4, 1, 1, 1, 4, 1, 2, 1, 2, 1, 2, 3, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 7, 1, 1, 1, 1, 4, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 2, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 52, 1, 1, 3, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 5, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 7, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 3, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 2, 1, 2, 1, 1, 2, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 4, 1, 1, 1, 1, 1, 1, 3, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 11, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 3, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 5, 6, 1, 1, 1, 2, 1, 5, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 4, 1, 1, 4, 2, 4, 1, 1, 7, 1, 1, 1, 1

#Write Predicted Labels to CSV File

In [14]:
# create index to label map from category csv
category_df = pd.read_csv(category_csv)
num_classes = category_df['Category'].nunique()
index_to_label = {row[0]: row[1] for row in category_df.itertuples(index=False)}

# extract prediected labels
pred_labels = []
for best_match in new_best_matches:
    pred_label = index_to_label[int(best_match[1].split('_')[0])]
    pred_labels.append(pred_label)

# print out the predicted labels for reference
print(len(new_best_matches))
for i, pred_label in enumerate(pred_labels):
    print(f"{i:<4} {pred_label}")

# write csv file
with open(predicted_labels_csv, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Id', 'Category'])
    cnt = 0
    for label in pred_labels:
        writer.writerow([cnt, label])
        cnt += 1

print(f'CSV file "{predicted_labels_csv}" has been written successfully.')

4977
0    Ashley Judd
1    Amy Ryan
2    Adrien Brody
3    Adriana Barraza
4    Albert Brooks
5    Aaron Taylor-Johnson
6    Amber Heard
7    Adele
8    Amber Heard
9    Alex Pettyfer
10   Anna Faris
11   Anthony Perkins
12   Bill Hader
13   Barabara Palvin
14   Benjamin Bratt
15   Berenice Bejo
16   Alessandra Ambrosio
17   Alan Arkin
18   Akemi Darenogare
19   Anderson Cooper
20   Ashley Graham
21   Ashley Greene
22   Benedict Cumberbatch
23   Adele
24   Ashley Greene
25   Alan Rickman
26   Adam Sandler
27   Ben Stiller
28   Adrien Brody
29   Barry Pepper
30   Angela Bassett
31   Ava Gardner
32   Amy Adams
33   Al Pacino
34   Andrew Garfield
35   Andreea Diaconu
36   Andrew Garfield
37   Ashton Kutcher
38   Anna Paquin
39   Alessandra Ambrosio
40   Bill O Reilly
41   bella thorne
42   Anderson Cooper
43   Anne Baxter
44   Betty White
45   Amy Ryan
46   Barbra Streisand
47   Adriana Barraza
48   Alan Arkin
49   Alessandra Ambrosio
50   Aaron Paul
51   Ava Gardner
52   Anna Faris
53   