# Analyze the AgeDB Dataset

### Requirements

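The files needed are "AgeDB_Cleaned_Images.csv" and "AgeDB_Cleaned_Individual.csv", which are generated by the notebook file "AgeDB_Code". The cells below also load "CASIA_Cleaned_Images.csv" and "CASIA_Cleaned_Individual.csv" from the same "createdCSV" folder.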

### Get both dataframes  

In [48]:
import pandas as pd
import numpy as np

# Load the cleaned AgeDB tables (one row per image / one row per individual).
# index_col=0 makes the first CSV column the DataFrame index.
AgeDB_Images = pd.read_csv('createdCSV/AgeDB_Cleaned_Images.csv', index_col=0)
AgeDB_Individual =  pd.read_csv('createdCSV/AgeDB_Cleaned_Individual.csv', index_col=0)

# Load the corresponding CASIA tables in the same way.
Casia_Images = pd.read_csv('createdCSV/CASIA_Cleaned_Images.csv', index_col=0)
Casia_Individual = pd.read_csv('createdCSV/CASIA_Cleaned_Individual.csv', index_col=0)

In [49]:
# Preview the per-image AgeDB table.
AgeDB_Images

Unnamed: 0,ID,File Name,Name,Age,Gender,Age Range
0,3,10058_HelenHunt_45_f.jpg,HelenHunt,45,0,41-50
1,3,10044_HelenHunt_32_f.jpg,HelenHunt,32,0,31-40
2,3,10045_HelenHunt_33_f.jpg,HelenHunt,33,0,31-40
3,3,10046_HelenHunt_34_f.jpg,HelenHunt,34,0,31-40
4,3,10057_HelenHunt_44_f.jpg,HelenHunt,44,0,41-50
...,...,...,...,...,...,...
9884,566,9970_ElkeSommer_61_f.jpg,ElkeSommer,61,0,61-70
9885,566,9971_ElkeSommer_62_f.jpg,ElkeSommer,62,0,61-70
9886,566,9973_ElkeSommer_64_f.jpg,ElkeSommer,64,0,61-70
9887,566,9960_ElkeSommer_42_f.jpg,ElkeSommer,42,0,41-50


In [50]:
# Preview the per-individual AgeDB table.
AgeDB_Individual

Unnamed: 0,ID,Name,Min Age,Max Age,Age Span,Number of Age Ranges,Number of Images,Gender
0,3,HelenHunt,10,51,41,6,38,0.0
1,4,JaneBirkin,19,68,49,6,39,0.0
2,5,PaulAnka,15,72,57,7,41,1.0
3,6,JaneFonda,25,74,49,6,30,0.0
4,13,MarietteHartley,21,73,52,6,33,0.0
...,...,...,...,...,...,...,...,...
269,556,MegRyan,7,77,70,7,34,0.0
270,557,DorisDay,17,82,65,7,49,0.0
271,559,JaneAsher,6,67,61,7,34,0.0
272,565,AliMacGraw,25,74,49,6,34,0.0


In [51]:
# Preview the per-image CASIA table.
Casia_Images

Unnamed: 0,ID,Filename,Age,Gender,Age Range
0,2,000002_00000273.jpg,44.5,1,41-50
1,2,000002_00000274.jpg,46.0,1,41-50
2,2,000002_00000275.jpg,56.0,1,51-60
3,2,000002_00000276.jpg,31.5,1,31-40
4,2,000002_00000277.jpg,34.5,1,31-40
...,...,...,...,...,...
66078,10553,010553_00490111.jpg,36.0,1,31-40
66079,10553,010553_00490113.jpg,34.0,1,31-40
66080,10553,010553_00490114.jpg,34.5,1,31-40
66081,10553,010553_00490115.jpg,26.0,1,21-30


In [52]:
# Preview the per-individual CASIA table.
Casia_Individual

Unnamed: 0,ID,Min Age,Max Age,Age Span,Number of Age Ranges,Number of Images,Gender
0,2,31.0,69.0,38.0,4,60,1.0
1,17,22.5,47.5,25.0,3,36,0.0
2,24,17.0,83.0,66.0,7,187,1.0
3,29,21.5,74.0,52.5,6,88,1.0
4,30,20.0,77.0,57.0,6,64,1.0
...,...,...,...,...,...,...,...
1269,10497,9.5,45.0,35.5,4,37,1.0
1270,10541,24.5,56.0,31.5,4,30,1.0
1271,10545,23.0,60.0,37.0,4,30,1.0
1272,10546,27.0,43.5,16.5,3,42,1.0


### Face recognition

Now we need to analyze the similarity between images of the same person across different age ranges. In order to compare the images and find similarities/differences, we need to compute the feature vector for each image.

In [54]:
# Help functions

def cosine_similarity(feat1, feat2):
    """Return the cosine similarity of the first feature vector of each input.

    Each argument may be a 1-D feature vector or a 2-D array holding one
    feature vector per row; only row 0 of each input is compared.

    The norms are taken over the compared vectors only. Normalising by the
    norm of the whole array would scale the result incorrectly whenever an
    input holds more than one row.

    Args:
        feat1: Feature vector(s) of the first image.
        feat2: Feature vector(s) of the second image.

    Returns:
        The cosine similarity as a numpy scalar. The value is nan when one
        of the compared vectors has zero norm (plain numpy division).

    Raises:
        ValueError: If either input contains no feature vector at all.
    """
    first = np.atleast_2d(feat1)
    second = np.atleast_2d(feat2)

    if first.size == 0 or second.size == 0:
        raise ValueError(
            "cosine_similarity needs at least one feature vector per input"
        )

    first_vec = first[0]
    second_vec = second[0]
    norm_product = np.linalg.norm(first_vec) * np.linalg.norm(second_vec)
    return np.dot(first_vec, second_vec) / norm_product

def get_row_information(df, filename):
    """Fetch the metadata stored for one image.

    Selects the rows of ``df`` whose "Filename" column equals ``filename``
    and reads the values of the first matching row.

    Args:
        df: Image table with the columns "Filename", "ID", "Age",
            "Age Range" and "Gender".
        filename: File name of the image to look up.

    Returns:
        Tuple ``(ID, Age, Age Range, Gender)`` of the first matching row.

    Raises:
        IndexError: If no row matches ``filename``.
    """
    matching_rows = df[df["Filename"] == filename]

    # Casia has no "Name" column. For AgeDB, read it here as well and
    # return it at the second position.
    person_id, age, age_range, gender = (
        matching_rows[column].values[0]
        for column in ("ID", "Age", "Age Range", "Gender")
    )

    return person_id, age, age_range, gender

def get_path(filename, path_to_folder='/home/ivarbl/FaceProject/FaceProject/Images/casia-webface-valid-remote/'):
    """Build the image path (without file extension) for ``filename``.

    Args:
        filename: Image file name. Its last four characters (e.g. ".jpg")
            are dropped, so a four-character extension is assumed.
        path_to_folder: Folder prefix prepended to the stripped file name.
            It must end with a path separator because plain string
            concatenation is used. Defaults to the Casia image folder.

    Returns:
        ``path_to_folder`` followed by ``filename`` without its last four
        characters.
    """
    return path_to_folder + filename[:-4]

In [55]:
# Function to gather all similarities and features

#from insightface.app import FaceAnalysis
#from insightface.data import get_image as ins_get_image
#from PIL import Image

def face_comparison_dataframe(df):
    """Build a table with every pair of images that share the same ID.

    The needed columns are read from ``df`` once and pairs are formed by
    row position, so the DataFrame is not searched again for every pair.
    Each unordered pair of rows ``(i, j)`` with ``i < j`` and equal "ID"
    yields one output row, in the order of a nested ``i``/``j`` loop.

    Both dataset layouts are handled:
    * the file name is read from "Filename" (Casia) or, when that column
      is absent, from "File Name" (AgeDB);
    * when ``df`` has a "Name" column (AgeDB) it is carried over as a
      "Name" column placed directly after "ID".

    NOTE: rows are paired by position. If ``df`` contained duplicate file
    names, each row still contributes its own metadata rather than the
    metadata of the first row with that file name.

    Args:
        df: Image table with the columns "ID", "Age", "Age Range",
            "Gender", a file-name column and optionally "Name".

    Returns:
        DataFrame with the columns "ID", ("Name",) "Gender", "First Image",
        "First Age", "First Age Range", "Second Image", "Second Age" and
        "Second Age Range". It has no rows when ``df`` contains no two
        images with the same ID.
    """
    from itertools import combinations

    # "Filename" is the Casia column name, "File Name" the AgeDB one.
    filename_col = "Filename" if "Filename" in df.columns else "File Name"
    has_name = "Name" in df.columns

    # Read each column once instead of filtering the DataFrame per pair.
    list_of_images = df[filename_col].to_list()
    ids = df["ID"].to_list()
    ages = df["Age"].to_list()
    age_ranges = df["Age Range"].to_list()
    genders = df["Gender"].to_list()
    names = df["Name"].to_list() if has_name else []

    ID_individual = []
    Name_individual = []
    Gender_individual = []
    firstFilename = []
    secondFilename = []
    firstAge = []
    secondAge = []
    firstAgeRange = []
    secondAgeRange = []
    #firstImageFeature = []
    #secondImageFeature = []
    #similarities = []

    # Feature extraction is currently disabled; it needs the insightface
    # imports that are commented out above this function.
    #app = FaceAnalysis(name='buffalo_l')
    #app.prepare(ctx_id=0, det_size=(256,256)) # Need 128x128 for Casia

    for first, second in combinations(range(len(list_of_images)), 2):

        # Only images of the same individual form a pair.
        if ids[first] != ids[second]:
            continue

        # Disabled embedding / similarity step:
        #first_image = ins_get_image(get_path(list_of_images[first]))
        #second_image = ins_get_image(get_path(list_of_images[second]))
        #first_feature = [face.normed_embedding for face in app.get(first_image)]
        #second_feature = [face.normed_embedding for face in app.get(second_image)]
        #sims = cosine_similarity(np.array(first_feature), np.array(second_feature))

        ID_individual.append(ids[first])
        if has_name:
            Name_individual.append(names[first])
        Gender_individual.append(genders[first])
        firstFilename.append(list_of_images[first])
        secondFilename.append(list_of_images[second])
        firstAge.append(ages[first])
        secondAge.append(ages[second])
        firstAgeRange.append(age_ranges[first])
        secondAgeRange.append(age_ranges[second])
        #firstImageFeature.append(first_feature[0])
        #secondImageFeature.append(second_feature[0])
        #similarities.append(sims)

    # Insertion order of the dict fixes the column order of the result.
    result_col = {"ID": ID_individual}
    if has_name:
        result_col["Name"] = Name_individual
    result_col.update({
        "Gender": Gender_individual,
        "First Image": firstFilename,
        "First Age": firstAge,
        "First Age Range": firstAgeRange,
        #"First Image Features" : firstImageFeature,
        "Second Image": secondFilename,
        "Second Age": secondAge,
        "Second Age Range": secondAgeRange,
        #"Second Image Features" : secondImageFeature,
        #"Cosine Similarity" : similarities
    })

    return pd.DataFrame(result_col)

## Use external resources

The commands below run the function above; this takes around 60 hours to execute on a "regular" computer.

In [37]:
from multiprocessing import Pool

# One sub-table of AgeDB images per individual, in the order of the IDs
# listed in AgeDB_Individual; each sub-table is one work item for the pool.
unique_id_list_AgeDB = AgeDB_Individual["ID"].to_list()
input_df_id_AgeDB = [
    AgeDB_Images.loc[AgeDB_Images["ID"] == person_id]
    for person_id in unique_id_list_AgeDB
]

# Number of work items (one per individual).
len(input_df_id_AgeDB)

274

In [38]:
# Build the pair table of every individual in parallel worker processes.
# pool.map already returns the results as a list, in input order, so no
# manual copy loop is needed.
with Pool() as pool:
    output_df_AgeDB = pool.map(face_comparison_dataframe, input_df_id_AgeDB)

In [39]:
# Stack the per-individual pair tables into one DataFrame.
# NOTE: ignore_index is not passed, so every individual's rows keep their
# own index labels and labels repeat across individuals.
AgeDB_pairs = pd.concat(output_df_AgeDB)
AgeDB_pairs

Unnamed: 0,ID,Name,Gender,First Image,First Age,First Age Range,Second Image,Second Age,Second Age Range
0,3,HelenHunt,0,10058_HelenHunt_45_f.jpg,45,41-50,10044_HelenHunt_32_f.jpg,32,31-40
1,3,HelenHunt,0,10058_HelenHunt_45_f.jpg,45,41-50,10045_HelenHunt_33_f.jpg,33,31-40
2,3,HelenHunt,0,10058_HelenHunt_45_f.jpg,45,41-50,10046_HelenHunt_34_f.jpg,34,31-40
3,3,HelenHunt,0,10058_HelenHunt_45_f.jpg,45,41-50,10057_HelenHunt_44_f.jpg,44,41-50
4,3,HelenHunt,0,10058_HelenHunt_45_f.jpg,45,41-50,10047_HelenHunt_34_f.jpg,34,31-40
...,...,...,...,...,...,...,...,...,...
523,566,ElkeSommer,0,9971_ElkeSommer_62_f.jpg,62,61-70,9960_ElkeSommer_42_f.jpg,42,41-50
524,566,ElkeSommer,0,9971_ElkeSommer_62_f.jpg,62,61-70,9946_ElkeSommer_18_f.jpg,18,11-20
525,566,ElkeSommer,0,9973_ElkeSommer_64_f.jpg,64,61-70,9960_ElkeSommer_42_f.jpg,42,41-50
526,566,ElkeSommer,0,9973_ElkeSommer_64_f.jpg,64,61-70,9946_ElkeSommer_18_f.jpg,18,11-20


In [40]:
# Saving work: write the AgeDB pair table (including its index) to CSV.
AgeDB_pairs.to_csv("createdCSV/AgeDB_pairs.csv")

In [56]:
# One sub-table of CASIA images per individual, in the order of the IDs
# listed in Casia_Individual; each sub-table is one work item for the pool.
unique_id_list_CASIA = Casia_Individual["ID"].to_list()
input_df_id_CASIA = [
    Casia_Images.loc[Casia_Images["ID"] == person_id]
    for person_id in unique_id_list_CASIA
]

# Number of work items (one per individual).
len(input_df_id_CASIA)

1274

In [57]:
# Build the pair table of every individual in parallel worker processes.
# pool.map already returns the results as a list, in input order, so no
# manual copy loop is needed.
with Pool() as pool:
    output_df_CASIA = pool.map(face_comparison_dataframe, input_df_id_CASIA)

In [58]:
# Stack the per-individual pair tables into one DataFrame.
# NOTE: ignore_index is not passed, so every individual's rows keep their
# own index labels and labels repeat across individuals.
CASIA_pairs = pd.concat(output_df_CASIA)
CASIA_pairs

Unnamed: 0,ID,Gender,First Image,First Age,First Age Range,Second Image,Second Age,Second Age Range
0,2,1,000002_00000273.jpg,44.5,41-50,000002_00000274.jpg,46.0,41-50
1,2,1,000002_00000273.jpg,44.5,41-50,000002_00000275.jpg,56.0,51-60
2,2,1,000002_00000273.jpg,44.5,41-50,000002_00000276.jpg,31.5,31-40
3,2,1,000002_00000273.jpg,44.5,41-50,000002_00000277.jpg,34.5,31-40
4,2,1,000002_00000273.jpg,44.5,41-50,000002_00000279.jpg,46.0,41-50
...,...,...,...,...,...,...,...,...
523,10553,1,010553_00490113.jpg,34.0,31-40,010553_00490115.jpg,26.0,21-30
524,10553,1,010553_00490113.jpg,34.0,31-40,010553_00490116.jpg,36.5,31-40
525,10553,1,010553_00490114.jpg,34.5,31-40,010553_00490115.jpg,26.0,21-30
526,10553,1,010553_00490114.jpg,34.5,31-40,010553_00490116.jpg,36.5,31-40


In [59]:
# Saving work: write the CASIA pair table (including its index) to CSV.
CASIA_pairs.to_csv("createdCSV/CASIA_pairs.csv")