# Demonstration of Extraction Process

In [1]:
%load_ext autoreload
%autoreload 2  

import os
import numpy as np
import pandas as pd
from scipy.spatial.distance import minkowski, cosine
from IPython.display import display
from PIL import Image
import torch

print("PyTorch Version:", torch.__version__)
print("MPS Available:", torch.backends.mps.is_available())
print("MPS Built:", torch.backends.mps.is_built())

import sys
sys.path.append("..")

from modules.extraction.preprocessing import Preprocessing
from modules.extraction.embedding import Embedding 

PyTorch Version: 2.2.2
MPS Available: True
MPS Built: True


### Helper Functions

In [2]:
# Function to generate embedding with give image path
def generate_embedding(model_name, image_path, target_image_size=160):
    # Execute preprocessing
    preprocessing = Preprocessing(image_size=target_image_size, device='mps')
    image = Image.open(image_path)
    image = preprocessing.process(image) # preprocessed image

    # Generate embedding for given image
    model = Embedding(pretrained=model_name, device='mps') # Use Metal (Apple GPU)
    embedding_vector = model.encode(image)

    return embedding_vector

# Function to Compute Distance
def compute_distance(embedding1, embedding2, distance_type='euclidean'):
    if distance_type == 'euclidean':
        return np.linalg.norm(embedding1 - embedding2)
    
    elif distance_type == 'dot_product':
        return np.dot(embedding1, embedding2)
    
    elif distance_type == 'cosine':
        return cosine(embedding1, embedding2) 
    
    elif distance_type == 'minkowski':
        return minkowski(embedding1, embedding2, p=3)

# Function to get ranking sequence based on given distance for each personnel
def get_ranking(df, sortby, groupby='personnel', ascending=True):
    ranked_df = df[[groupby, 'gallery_image', sortby]]
    ranked_df = ranked_df.rename(columns={'gallery_image': f'gallery_image_by_{sortby}'})

    # Rank the distance for each group
    ranked_df[f'rank_by_{sortby}'] = ranked_df.groupby(groupby)[sortby].rank(ascending=ascending).astype(int)

    # Reset index 
    ranked_df = ranked_df.sort_values([groupby, f'rank_by_{sortby}'], ascending=True).reset_index(drop=True)

    return ranked_df


# Function to find the nearest neighbors based on ranking of similarities
def get_NN(df, sortby, groupby='personnel', ascending=True, num_records=10):
    ranked_df = get_ranking(df, sortby, groupby, ascending)

    return ranked_df.loc[ranked_df[f'rank_by_{sortby}'] <= num_records]


#### **Task 1: Assignment Instructions:**

Demonstrate the embedding capability of the service for 'cassia-webface' and 'vggface2'  by calculating the Euclidean distance of the following five probe images and their corresponding gallery images:

- Drew Barrymore
- Warren Buffet
- Owen Wilson
- Nelson Mandela
- Ian Thorpe

In [3]:
# Initialize Default Params
MODEL_NAME_1 = 'casia-webface'
MODEL_NAME_2 = 'vggface2'
PROBE_IMAGE_DIR = '../storage/probe/'
GALLERY_IMAGE_DIR = '../storage/multi_image_gallery/'
PERSONNEL = sorted(['Drew_Barrymore', 'Warren_Buffett', 'Owen_Wilson', 'Nelson_Mandela', 'Ian_Thorpe'])


In [4]:
# Get distances between the probe image and the gallery image for specified personnel
task1_result = []

for name in PERSONNEL:
    # Get probe image and generate embedding for each model
    probe_image_dir = os.path.join(PROBE_IMAGE_DIR, name)
    probe_image_path= os.path.join(probe_image_dir, os.listdir(probe_image_dir)[0]) # only one probe image per person
    probe_embedding_1 = generate_embedding(MODEL_NAME_1, probe_image_path)
    probe_embedding_2 = generate_embedding(MODEL_NAME_2, probe_image_path)

    gallery_image_dir = os.path.join(GALLERY_IMAGE_DIR, name)
    gallery_image_files = os.listdir(gallery_image_dir)
    for img_file in gallery_image_files:
        # Get gallery images and generate embedding for each model
        gallery_image_path = os.path.join(gallery_image_dir, img_file)
        gallery_embedding_1 = generate_embedding(MODEL_NAME_1, gallery_image_path)
        gallery_embedding_2 = generate_embedding(MODEL_NAME_2, gallery_image_path)

        # calculate distance between probe image and gallery images
        distance_1 = compute_distance(probe_embedding_1, gallery_embedding_1, 'euclidean')
        distance_2 = compute_distance(probe_embedding_2, gallery_embedding_2, 'euclidean')

        task1_result.append({
            "personnel": name,
            "probe_embedding_1": probe_embedding_1, 
            "probe_embedding_2": probe_embedding_2,
            "gallery_embedding_1": gallery_embedding_1,
            "gallery_embedding_2": gallery_embedding_2,
            "distance_casia_webface": distance_1,
            "distance_vggface2": distance_2
        })

task1_result = pd.DataFrame(task1_result)


In [5]:
# Show result of distances
display(task1_result[["personnel", "distance_casia_webface", "distance_vggface2"]])

# Calculate the average of distance_casia_webface
average_distance_casia_webface = task1_result['distance_casia_webface'].mean() 
print(f"average_distance_casia_webface: {average_distance_casia_webface}")  

# Calculate the average of distance_vggface2
average_distance_vggface2 = task1_result['distance_vggface2'].mean()
print(f"average_distance_vggface2: {average_distance_vggface2}")  

# Count the records when distance_casia_webface < distance_vggface2
print(f"Records when distance_casia_webface < distance_vggface2: {task1_result[task1_result.distance_casia_webface < task1_result.distance_vggface2].shape[0]}")

Unnamed: 0,personnel,distance_casia_webface,distance_vggface2
0,Drew_Barrymore,0.583682,1.09345
1,Ian_Thorpe,0.850744,1.038967
2,Ian_Thorpe,0.633799,1.311603
3,Ian_Thorpe,0.818589,1.139788
4,Ian_Thorpe,0.940338,0.984165
5,Ian_Thorpe,0.656061,1.189985
6,Ian_Thorpe,0.749804,0.862273
7,Ian_Thorpe,0.805604,1.070395
8,Ian_Thorpe,0.608957,0.800957
9,Ian_Thorpe,0.801862,0.978482


average_distance_casia_webface: 0.7361924648284912
average_distance_vggface2: 0.9625166654586792
Records when distance_casia_webface < distance_vggface2: 14


#### Observations & Analyses

* On average, CASIA-WebFace has a lower Euclidean distance than VGGFace2
* Out of the 16 pairs of probe image and gallery image, 14 pairs get smaller distances with CASIA-WebFace model. 
    * Two exceptions are Owen Wilson and Nelson Mandela (for the 3rd gallery image), where VGGFace2 has a lower distance. 

This suggests that CASIA-WebFace may be better at feature extraction for the given cases, as a lower distance generally indicates better similarity between the probe image and the gallery image.

However, to evaluate model performance, more metrics need to be considered as listed examples below. The distances alone do not account for these metrics.

* True Positive Rate (TPR): Correctly identifying the same person.

* False Positive Rate (FPR): Incorrectly identifying different people as the same person.

* Accuracy: Overall correctness of the model.

* ROC-AUC: Area under the Receiver Operating Characteristic curve.

---

#### **Task 2: Assignment Instructions:**

Precompute the embeddings of ALL images stored in storage/gallery/*. 

For each of the five probe images, calculate the following distance against all the images in the gallery. 
* euclidean
* dot_product
* cosine
* minkowski

Sort the embeddings from shortest to longest distance and print the images of the ten nearest neighbors and the name associated with each image. Note your observations.

In [6]:
# Precompute gallery embeddings
gallery_embedding_rows = []

gallery_personnel = sorted(os.listdir(GALLERY_IMAGE_DIR))
for name in gallery_personnel:
    # ignore system files
    if name == '.DS_Store': 
        continue 
    
    # Iterate subdirectories and generate embeddings
    gallery_image_files = os.listdir(os.path.join(GALLERY_IMAGE_DIR, name))
    for img_file in gallery_image_files:
        gallery_image_path = os.path.join(GALLERY_IMAGE_DIR, name, img_file)
        gallery_embedding = generate_embedding(MODEL_NAME_1, gallery_image_path)
        gallery_embedding_rows.append({
            "personnel": name,
            "image_name": img_file,
            "image_path": gallery_image_path,
            "embedding": gallery_embedding,
        })

gallery_embeddings = pd.DataFrame(gallery_embedding_rows) 
print(f"gallery_embeddings.shape: {gallery_embeddings.shape}")


gallery_embeddings.shape: (2265, 4)


In [10]:
# Get probe embeddings
probe_embeddings = task1_result.drop_duplicates(subset=["personnel"], keep='first').reset_index(drop=True)
probe_embeddings = probe_embeddings[["personnel", "probe_embedding_1"]]
assert probe_embeddings.shape[0] == 5 

# For each of the five probe images, calculate the distances against all the images in the gallery. 
distances = []
for _, probe_row in probe_embeddings.iterrows():
    probe_personnel = probe_row['personnel']
    probe_embedding = probe_row['probe_embedding_1']

    for _, gallery_row in gallery_embeddings.iterrows():
        gallery_embedding = gallery_row["embedding"]

        distances.append({
            "personnel": probe_personnel,
            "gallery_image": gallery_row["image_name"],
            "euclidean_distance": compute_distance(probe_embedding, gallery_embedding, 'euclidean'),
            "dot_product_distance": compute_distance(probe_embedding, gallery_embedding, 'dot_product'),
            "cosine_distance": compute_distance(probe_embedding, gallery_embedding, 'cosine'),
            "minkowski_distance": compute_distance(probe_embedding, gallery_embedding, 'minkowski')
        })

distances = pd.DataFrame(distances) 
print(f"distances.shape: {distances.shape}")

distances.shape: (11325, 6)


In [20]:
# Get the 10 nearest neighbors based on euclidean distance. 
ten_NN_by_euclidean = get_NN(distances, sortby='euclidean_distance', groupby='personnel', ascending=True) # sort distances from low to high (smallest distance = most similar)

# Get the 10 nearest neighbors based on  dot_product_distance
ten_NN_by_dot_product = get_NN(distances, sortby='dot_product_distance', groupby='personnel', ascending=False) # sort distances from high to low (largest distance = most similar)
ten_NN_by_dot_product = ten_NN_by_dot_product.drop(columns=["personnel"])

# Get the 10 nearest neighbors based on cosine_distance
ten_NN_by_cosine = get_NN(distances, sortby='cosine_distance', groupby='personnel', ascending=True) # sort distances from low to high (smallest distance = most similar)
ten_NN_by_cosine = ten_NN_by_cosine.drop(columns=["personnel"])

# Get the 10 nearest neighbors based on minkowski_distance
ten_NN_by_minkowski = get_NN(distances, sortby='minkowski_distance', groupby='personnel', ascending=True) # sort distances from low to high (smallest distance = most similar)
ten_NN_by_minkowski = ten_NN_by_minkowski.drop(columns=["personnel"])

# Combine the results
task2_result = ten_NN_by_euclidean.join([ten_NN_by_dot_product, ten_NN_by_cosine, ten_NN_by_minkowski]).reset_index(drop=True) 

for name in PERSONNEL:
    task2_result_by_name = task2_result.loc[task2_result['personnel'] == name]
    
    print(f"Top 10 nearest neighbors for {name}:")
    display(task2_result_by_name.style.hide())

Top 10 nearest neighbors for Drew_Barrymore:


personnel,gallery_image_by_euclidean_distance,euclidean_distance,rank_by_euclidean_distance,gallery_image_by_dot_product_distance,dot_product_distance,rank_by_dot_product_distance,gallery_image_by_cosine_distance,cosine_distance,rank_by_cosine_distance,gallery_image_by_minkowski_distance,minkowski_distance,rank_by_minkowski_distance
Drew_Barrymore,Julie_Gerberding_0007.jpg,0.395337,1,Julie_Gerberding_0007.jpg,0.921854,1,Julie_Gerberding_0007.jpg,0.078146,1,Julie_Gerberding_0007.jpg,0.162327,1
Drew_Barrymore,Geoff_Hoon_0006.jpg,0.398338,2,Geoff_Hoon_0006.jpg,0.920664,2,Geoff_Hoon_0006.jpg,0.079336,2,Geoff_Hoon_0006.jpg,0.163742,2
Drew_Barrymore,Oscar_De_La_Hoya_0005.jpg,0.402496,3,Oscar_De_La_Hoya_0005.jpg,0.918999,3,Oscar_De_La_Hoya_0005.jpg,0.081001,3,Oscar_De_La_Hoya_0005.jpg,0.165154,3
Drew_Barrymore,Dwayne_Johnson_0001.jpg,0.404644,4,Dwayne_Johnson_0001.jpg,0.918132,4,Dwayne_Johnson_0001.jpg,0.081868,4,Dwayne_Johnson_0001.jpg,0.166716,4
Drew_Barrymore,Julie_Gerberding_0005.jpg,0.406382,5,Julie_Gerberding_0005.jpg,0.917427,5,Julie_Gerberding_0005.jpg,0.082573,5,Julie_Gerberding_0005.jpg,0.16677,5
Drew_Barrymore,Julie_Gerberding_0003.jpg,0.412095,6,Julie_Gerberding_0003.jpg,0.915089,6,Julie_Gerberding_0003.jpg,0.084911,6,Julie_Gerberding_0003.jpg,0.169179,6
Drew_Barrymore,Taufik_Hidayat_0001.jpg,0.417285,7,Taufik_Hidayat_0001.jpg,0.912937,7,Taufik_Hidayat_0001.jpg,0.087063,7,Nancy_Pelosi_0002.jpg,0.171805,7
Drew_Barrymore,Nancy_Pelosi_0002.jpg,0.42159,8,Nancy_Pelosi_0002.jpg,0.911131,8,Nancy_Pelosi_0002.jpg,0.088869,8,Taufik_Hidayat_0001.jpg,0.172848,8
Drew_Barrymore,Hillary_Clinton_0007.jpg,0.42417,9,Hillary_Clinton_0007.jpg,0.91004,9,Hillary_Clinton_0007.jpg,0.08996,9,Hillary_Clinton_0007.jpg,0.175937,9
Drew_Barrymore,Jefferson_Perez_0001.jpg,0.431717,10,Jefferson_Perez_0001.jpg,0.90681,10,Jefferson_Perez_0001.jpg,0.09319,10,Ian_Thorpe_0001.jpg,0.17655,10


Top 10 nearest neighbors for Ian_Thorpe:


personnel,gallery_image_by_euclidean_distance,euclidean_distance,rank_by_euclidean_distance,gallery_image_by_dot_product_distance,dot_product_distance,rank_by_dot_product_distance,gallery_image_by_cosine_distance,cosine_distance,rank_by_cosine_distance,gallery_image_by_minkowski_distance,minkowski_distance,rank_by_minkowski_distance
Ian_Thorpe,Alvaro_Uribe_0005.jpg,0.489364,1,Alvaro_Uribe_0005.jpg,0.880261,1,Alvaro_Uribe_0005.jpg,0.119739,1,Alvaro_Uribe_0005.jpg,0.202377,1
Ian_Thorpe,Gary_Williams_0001.jpg,0.534547,2,Gary_Williams_0001.jpg,0.85713,2,Gary_Williams_0001.jpg,0.14287,2,Gary_Williams_0001.jpg,0.218757,2
Ian_Thorpe,Kim_Ryong-sung_0001.jpg,0.539373,3,Kim_Ryong-sung_0001.jpg,0.854539,3,Kim_Ryong-sung_0001.jpg,0.145461,3,Kim_Ryong-sung_0001.jpg,0.221552,3
Ian_Thorpe,John_McEnroe_0001.jpg,0.542252,4,John_McEnroe_0001.jpg,0.852981,4,John_McEnroe_0001.jpg,0.147019,4,John_McEnroe_0001.jpg,0.222375,4
Ian_Thorpe,Michael_Phelps_0003.jpg,0.545856,5,Michael_Phelps_0003.jpg,0.851021,5,Michael_Phelps_0003.jpg,0.148979,5,Michael_Phelps_0003.jpg,0.223999,5
Ian_Thorpe,Mikhail_Youzhny_0002.jpg,0.552381,6,Mikhail_Youzhny_0002.jpg,0.847438,6,Mikhail_Youzhny_0002.jpg,0.152562,6,Mikhail_Youzhny_0002.jpg,0.228062,6
Ian_Thorpe,Mike_Weir_0004.jpg,0.553614,7,Mike_Weir_0004.jpg,0.846756,7,Mike_Weir_0004.jpg,0.153244,7,Mike_Weir_0004.jpg,0.228226,7
Ian_Thorpe,Mikhail_Kasyanov_0003.jpg,0.55436,8,Mikhail_Kasyanov_0003.jpg,0.846342,8,Mikhail_Kasyanov_0003.jpg,0.153658,8,Mikhail_Kasyanov_0003.jpg,0.229014,8
Ian_Thorpe,Geoff_Hoon_0005.jpg,0.554689,9,Geoff_Hoon_0005.jpg,0.84616,9,Geoff_Hoon_0005.jpg,0.15384,9,Geoff_Hoon_0005.jpg,0.229284,9
Ian_Thorpe,Meghann_Shaughnessy_0001.jpg,0.564643,10,Meghann_Shaughnessy_0001.jpg,0.840589,10,Meghann_Shaughnessy_0001.jpg,0.159411,10,Meghann_Shaughnessy_0001.jpg,0.229941,10


Top 10 nearest neighbors for Nelson_Mandela:


personnel,gallery_image_by_euclidean_distance,euclidean_distance,rank_by_euclidean_distance,gallery_image_by_dot_product_distance,dot_product_distance,rank_by_dot_product_distance,gallery_image_by_cosine_distance,cosine_distance,rank_by_cosine_distance,gallery_image_by_minkowski_distance,minkowski_distance,rank_by_minkowski_distance
Nelson_Mandela,Nelson_Mandela_0003.jpg,0.516682,1,Nelson_Mandela_0003.jpg,0.86652,1,Nelson_Mandela_0003.jpg,0.13348,1,Nelson_Mandela_0003.jpg,0.215894,1
Nelson_Mandela,Jimmy_Carter_0006.jpg,0.56942,2,Jimmy_Carter_0006.jpg,0.83788,2,Jimmy_Carter_0006.jpg,0.16212,2,Jimmy_Carter_0006.jpg,0.232043,2
Nelson_Mandela,Dick_Cheney_0006.jpg,0.583526,3,Dick_Cheney_0006.jpg,0.829749,3,Dick_Cheney_0006.jpg,0.170251,3,Dick_Cheney_0010.jpg,0.239135,3
Nelson_Mandela,Dick_Cheney_0010.jpg,0.585275,4,Dick_Cheney_0010.jpg,0.828726,4,Dick_Cheney_0010.jpg,0.171274,4,Dick_Cheney_0006.jpg,0.241892,4
Nelson_Mandela,Ferenc_Madl_0001.jpg,0.605685,5,Ferenc_Madl_0001.jpg,0.816573,5,Ferenc_Madl_0001.jpg,0.183427,5,Peter_Struck_0001.jpg,0.248114,5
Nelson_Mandela,Peter_Struck_0001.jpg,0.610387,6,Peter_Struck_0001.jpg,0.813714,6,Peter_Struck_0001.jpg,0.186286,6,Jimmy_Carter_0005.jpg,0.250283,6
Nelson_Mandela,Jimmy_Carter_0005.jpg,0.613321,7,Jimmy_Carter_0005.jpg,0.811918,7,Jimmy_Carter_0005.jpg,0.188082,7,Ferenc_Madl_0001.jpg,0.252426,7
Nelson_Mandela,Boris_Becker_0005.jpg,0.61551,8,Boris_Becker_0005.jpg,0.810574,8,Boris_Becker_0005.jpg,0.189426,8,John_Manley_0002.jpg,0.252491,8
Nelson_Mandela,John_Manley_0002.jpg,0.616622,9,John_Manley_0002.jpg,0.809889,9,John_Manley_0002.jpg,0.190111,9,Boris_Becker_0005.jpg,0.253374,9
Nelson_Mandela,George_Pataki_0003.jpg,0.619571,10,George_Pataki_0003.jpg,0.808066,10,George_Pataki_0003.jpg,0.191934,10,Nelson_Mandela_0001.jpg,0.254702,10


Top 10 nearest neighbors for Owen_Wilson:


personnel,gallery_image_by_euclidean_distance,euclidean_distance,rank_by_euclidean_distance,gallery_image_by_dot_product_distance,dot_product_distance,rank_by_dot_product_distance,gallery_image_by_cosine_distance,cosine_distance,rank_by_cosine_distance,gallery_image_by_minkowski_distance,minkowski_distance,rank_by_minkowski_distance
Owen_Wilson,Paul_Bremer_0001.jpg,0.613407,1,Paul_Bremer_0001.jpg,0.811866,1,Paul_Bremer_0001.jpg,0.188134,1,Larry_Ellison_0001.jpg,0.250159,1
Owen_Wilson,Larry_Ellison_0001.jpg,0.614226,2,Larry_Ellison_0001.jpg,0.811363,2,Larry_Ellison_0001.jpg,0.188637,2,Bill_Graham_0004.jpg,0.253546,2
Owen_Wilson,Dennis_Erickson_0001.jpg,0.614437,3,Dennis_Erickson_0001.jpg,0.811234,3,Dennis_Erickson_0001.jpg,0.188767,3,Paul_Bremer_0001.jpg,0.25437,3
Owen_Wilson,Kevin_Spacey_0001.jpg,0.61656,4,Kevin_Spacey_0001.jpg,0.809927,4,Kevin_Spacey_0001.jpg,0.190073,4,Martin_Verkerk_0002.jpg,0.254421,4
Owen_Wilson,Bill_Graham_0004.jpg,0.619955,5,Bill_Graham_0004.jpg,0.807828,5,Bill_Graham_0004.jpg,0.192172,5,Kevin_Spacey_0001.jpg,0.254612,5
Owen_Wilson,Martin_Verkerk_0002.jpg,0.622717,6,Martin_Verkerk_0002.jpg,0.806111,6,Martin_Verkerk_0002.jpg,0.193889,6,Dennis_Erickson_0001.jpg,0.255366,6
Owen_Wilson,Aleksander_Kwasniewski_0003.jpg,0.626062,7,Aleksander_Kwasniewski_0003.jpg,0.804023,7,Aleksander_Kwasniewski_0003.jpg,0.195977,7,Matthew_Perry_0001.jpg,0.255763,7
Owen_Wilson,Matthew_Perry_0001.jpg,0.627598,8,Matthew_Perry_0001.jpg,0.80306,8,Matthew_Perry_0001.jpg,0.19694,8,Aleksander_Kwasniewski_0003.jpg,0.256952,8
Owen_Wilson,Eric_Hinske_0001.jpg,0.628641,9,Eric_Hinske_0001.jpg,0.802405,9,Eric_Hinske_0001.jpg,0.197595,9,Bill_Graham_0002.jpg,0.258177,9
Owen_Wilson,George_Pataki_0003.jpg,0.632184,10,George_Pataki_0003.jpg,0.800171,10,George_Pataki_0003.jpg,0.199829,10,George_Pataki_0003.jpg,0.258242,10


Top 10 nearest neighbors for Warren_Buffett:


personnel,gallery_image_by_euclidean_distance,euclidean_distance,rank_by_euclidean_distance,gallery_image_by_dot_product_distance,dot_product_distance,rank_by_dot_product_distance,gallery_image_by_cosine_distance,cosine_distance,rank_by_cosine_distance,gallery_image_by_minkowski_distance,minkowski_distance,rank_by_minkowski_distance
Warren_Buffett,Nancy_Pelosi_0004.jpg,0.454554,1,Nancy_Pelosi_0004.jpg,0.89669,1,Nancy_Pelosi_0004.jpg,0.10331,1,Nancy_Pelosi_0004.jpg,0.186353,1
Warren_Buffett,Gordon_Campbell_0001.jpg,0.455278,2,Gordon_Campbell_0001.jpg,0.896361,2,Gordon_Campbell_0001.jpg,0.103639,2,Gordon_Campbell_0001.jpg,0.187053,2
Warren_Buffett,Paul_Bremer_0002.jpg,0.463189,3,Paul_Bremer_0002.jpg,0.892728,3,Paul_Bremer_0002.jpg,0.107272,3,George_Voinovich_0002.jpg,0.189259,3
Warren_Buffett,Alejandro_Toledo_0007.jpg,0.463751,4,Alejandro_Toledo_0007.jpg,0.892467,4,Alejandro_Toledo_0007.jpg,0.107533,4,Alejandro_Toledo_0007.jpg,0.189575,4
Warren_Buffett,George_Voinovich_0002.jpg,0.465213,5,George_Voinovich_0002.jpg,0.891788,5,George_Voinovich_0002.jpg,0.108212,5,Paul_Bremer_0002.jpg,0.192028,5
Warren_Buffett,Gloria_Macapagal_Arroyo_0005.jpg,0.474772,6,Gloria_Macapagal_Arroyo_0005.jpg,0.887296,6,Gloria_Macapagal_Arroyo_0005.jpg,0.112704,6,Mike_Tyson_0003.jpg,0.192879,6
Warren_Buffett,Zoran_Djindjic_0001.jpg,0.477569,7,Zoran_Djindjic_0001.jpg,0.885964,7,Zoran_Djindjic_0001.jpg,0.114036,7,Gloria_Macapagal_Arroyo_0005.jpg,0.19424,7
Warren_Buffett,Mike_Tyson_0003.jpg,0.477971,8,Mike_Tyson_0003.jpg,0.885772,8,Mike_Tyson_0003.jpg,0.114228,8,Zoran_Djindjic_0001.jpg,0.196608,8
Warren_Buffett,Abdullah_Gul_0008.jpg,0.483009,9,Abdullah_Gul_0008.jpg,0.883351,9,Abdullah_Gul_0008.jpg,0.116649,9,Kurt_Busch_0001.jpg,0.19888,9
Warren_Buffett,Kurt_Busch_0001.jpg,0.483095,10,Kurt_Busch_0001.jpg,0.883309,10,Kurt_Busch_0001.jpg,0.116691,10,Abdullah_Gul_0008.jpg,0.200085,10


#### Observations and Analyses:

1. **Different Distance Metrics Yield Similar Rankings**  
   - The ranking of nearest neighbors are similar across Euclidean, Dot Product, Cosine, and Minkowski distances.
   - Particularly, Euclidean, Cosine and Minkowski distances are ranked with scores from low to high; while Dot Product is from high to low.

2. **Euclidean and Minkowski Distance**  
   - The nearest images under Euclidean and Minkowski distance are often the same or very similar.
   - This is expected since Minkowski distance is a generalization of Euclidean distance.

3. **Dot Product and Cosine Distance**  
   - Since Cosine distance normalizes embeddings, it focuses on angular similarity rather than absolute magnitude.
   - Dot Product may favor higher magnitude vectors, influencing the ranking.

4. **Identity Recognitions (# of matches between probe and gallery in top 10 nearest neighbors)**  
   - Nelson Mandela has the closest match with corresponding gallery image of himself. 
   - For the others, the top 10 ranked gallery images are rarely the ones of themselves
   - Further investigation into dataset biases, embedding normalization, and metric-specific behavior could refine matching accuracy.

---

#### **Task 3: Assignment Instructions:**

Report the rank positions of the five probe's associated gallery images. Note your observations.

If Person A is your probe, get the "rank position" of all A's images in the gallery. 

In [19]:
# GEt associated gallery images for the personnel
def get_gallery_personnel(s):
    return '_'.join(s.split('_')[:-1])

# Get ranking of all gallery images based on euclidean_distance and cosine_distance
rank_by_euclidean = get_ranking(distances, sortby='euclidean_distance', groupby='personnel', ascending=True)
rank_by_cosine = get_ranking(distances, sortby='cosine_distance', groupby='personnel', ascending=True)

# Retrieve ranking result for associated gallery images
rank_by_euclidean = rank_by_euclidean.loc[rank_by_euclidean['gallery_image_by_euclidean_distance'].apply(get_gallery_personnel) == rank_by_euclidean['personnel']].reset_index(drop=True)
rank_by_cosine = rank_by_cosine.loc[rank_by_cosine['gallery_image_by_cosine_distance'].apply(get_gallery_personnel) == rank_by_cosine['personnel']].reset_index(drop=True)

print(f"Euclidean distance ranking result for associated gallery images:")
display(rank_by_euclidean)

print(f"Cosine distance ranking result for associated gallery images:")
display(rank_by_cosine)

Euclidean distance ranking result for associated gallery images:


Unnamed: 0,personnel,gallery_image_by_euclidean_distance,euclidean_distance,rank_by_euclidean_distance
0,Drew_Barrymore,Drew_Barrymore_0001.jpg,0.583682,360
1,Ian_Thorpe,Ian_Thorpe_0006.jpg,0.608957,33
2,Ian_Thorpe,Ian_Thorpe_0008.jpg,0.633799,76
3,Ian_Thorpe,Ian_Thorpe_0001.jpg,0.656061,149
4,Ian_Thorpe,Ian_Thorpe_0005.jpg,0.749804,772
5,Ian_Thorpe,Ian_Thorpe_0007.jpg,0.801862,1214
6,Ian_Thorpe,Ian_Thorpe_0004.jpg,0.805604,1246
7,Ian_Thorpe,Ian_Thorpe_0003.jpg,0.818589,1366
8,Ian_Thorpe,Ian_Thorpe_0009.jpg,0.850744,1584
9,Ian_Thorpe,Ian_Thorpe_0002.jpg,0.940338,1959


Cosine distance ranking result for associated gallery images:


Unnamed: 0,personnel,gallery_image_by_cosine_distance,cosine_distance,rank_by_cosine_distance
0,Drew_Barrymore,Drew_Barrymore_0001.jpg,0.170342,360
1,Ian_Thorpe,Ian_Thorpe_0006.jpg,0.185414,33
2,Ian_Thorpe,Ian_Thorpe_0008.jpg,0.20085,76
3,Ian_Thorpe,Ian_Thorpe_0001.jpg,0.215208,149
4,Ian_Thorpe,Ian_Thorpe_0005.jpg,0.281103,772
5,Ian_Thorpe,Ian_Thorpe_0007.jpg,0.321491,1214
6,Ian_Thorpe,Ian_Thorpe_0004.jpg,0.324499,1246
7,Ian_Thorpe,Ian_Thorpe_0003.jpg,0.335044,1366
8,Ian_Thorpe,Ian_Thorpe_0009.jpg,0.361882,1584
9,Ian_Thorpe,Ian_Thorpe_0002.jpg,0.442117,1959


#### Observations and Analyses
Here Euclidean Distance and Cosine Distance are applied for ranking. The smaller the distances, the better the match (i.e., the lower the ranking result).

**Similar Ranking Positions Between Different Distance Metrics**
* The ranking positions of associated gallery images are same across Euclidean and Cosine distances, though with different distance values.

**Ranking and Distance observations**
* Nelson Mandela’s images are consistently well-matched, showing lower Euclidean distances overall.
    * Particularly, he has the best-ranked match (rank = 1, distance = 0.516682) among all personnel, indicating a strong match between the probe and the gallery image.
* On the other hand, Warren Buffett’s gallery image ("Warren_Buffett_0001.jpg") has a very poor rank (rank = 1974, distance = 0.926653), indicating a weak match.
* Ian Thorpe has multiple matches but with a significant variation in ranks (from 33 to 1959), suggesting that some images are much more representative of his features than others.
* Both Drew Barrymore’s ranking (360) and Owen Wilson's ranking (437) suggest a not-so-good match, indicating possible issues with representation or similarity in the dataset.

**Possible Issues in Recognition**
* The high variance in rankings for the same personnel (e.g., Ian Thorpe, Nelson Mandela, Warren Buffett) suggests pose variations, lighting conditions, or image quality differences may significantly affect Euclidean distances.
* If the best match is still poorly ranked (e.g., Owen Wilson's best match is at rank 437), it could indicate dataset imbalance, model bias, or suboptimal feature representation for that individual.
