## Casia dataset

Preprocessing of Casia dataset.

Note: Since we are running this notebook on UPPMAX and are not able to transfer the complete CASIA dataset onto the remote server, we preprocessed the data locally and will analyse the result here. For the local preprocessing, please see "CASIA_Local_Preprocess.ipynb".



In [3]:
import pandas as pd
import numpy as np

In [11]:
# Load the per-image metadata table; the first CSV column is used as the row index.
df_casia_images = pd.read_csv("MetaData/df_casia_images.csv", index_col=0)
df_casia_images

Unnamed: 0,ID,Filename,Age,Gender,Age Range
0,2,000002_00000273.jpg,44.5,1,41-50
1,2,000002_00000274.jpg,46.0,1,41-50
2,2,000002_00000275.jpg,56.0,1,51-60
3,2,000002_00000276.jpg,31.5,1,31-40
4,2,000002_00000277.jpg,34.5,1,31-40
...,...,...,...,...,...
66078,10553,010553_00490111.jpg,36.0,1,31-40
66079,10553,010553_00490113.jpg,34.0,1,31-40
66080,10553,010553_00490114.jpg,34.5,1,31-40
66081,10553,010553_00490115.jpg,26.0,1,21-30


In [12]:
# Load the per-person summary table; the first CSV column is used as the row index.
df_casia_individual = pd.read_csv("MetaData/df_casia_individual.csv", index_col=0)
df_casia_individual

Unnamed: 0,ID,Min Age,Max Age,Age Span,Number of Age Ranges,Number of Images,Gender
0,2,31.0,69.0,38.0,4,62,1.0
1,17,22.5,47.5,25.0,3,36,0.0
2,24,17.0,83.0,66.0,7,189,1.0
3,29,21.5,74.0,52.5,6,88,1.0
4,30,20.0,77.0,57.0,6,66,1.0
...,...,...,...,...,...,...,...
1269,10497,9.5,45.0,35.5,4,37,1.0
1270,10541,24.5,56.0,31.5,4,30,1.0
1271,10545,23.0,60.0,37.0,4,30,1.0
1272,10546,27.0,43.5,16.5,3,42,1.0


In [7]:
ls Images/casia-webface-valid-remote >> MetaData/casia_filename.txt

In [8]:
# Read the filename listing into a list of strings, one entry per line of the file
with open('MetaData/casia_filename.txt') as f:
    lines = f.read().splitlines()

In [10]:
len(lines)  # Length: 66083 - which implies that all images have been transferred to the remote server

66083

In [18]:
# Create one DataFrame with all information of each person
# with the following columns:
# ID, Min Age, Max Age, Age Span, Number of Age Ranges, Number of Images, Gender

# Defined as a function so we can reuse it later on

def creating_individual_dataframe(df_image):
    """
    Build a per-person summary DataFrame from a per-image DataFrame.

    Parameters
    ----------
    df_image : pandas.DataFrame
        One row per image. The following columns are required:
            ID: Specific ID for each person
            Age: Ground truth age of the person
            Gender: 0 if female and 1 if male
            Age Range: One of the ranges 0-10, 11-20, 21-30, 31-40, 41-50, 51-60, 61-70, 70+
        Any other column (e.g. "Filename") is ignored.

    Returns
    -------
    pandas.DataFrame
        One row per unique ID, sorted by ID, with the columns
        ID, Min Age, Max Age, Age Span, Number of Age Ranges, Number of Images, Gender.
        "Gender" is the mean of the per-image gender values of that person, so a
        value that is neither 0 nor 1 signals inconsistent (noisy) gender labels.

    Raises
    ------
    KeyError
        If one of the required columns is missing from df_image.

    Notes
    -----
    Rows whose ID is missing (NaN) are skipped, because groupby drops NaN keys.
    """

    required_columns = ("ID", "Age", "Gender", "Age Range")
    missing_columns = [col for col in required_columns if col not in df_image.columns]
    if missing_columns:
        raise KeyError(f"df_image is missing required column(s): {missing_columns}")

    ID_indiv = []
    MinAge = []
    MaxAge = []
    AgeSpan = []
    NumAgeRanges = []
    NumImages = []
    Gender_Unique = []

    # A single groupby pass hands us the rows of each person directly, instead of
    # re-scanning the whole frame several times per ID. sort=True keeps the
    # ascending ID order of the result.
    for person_id, person_rows in df_image.groupby("ID", sort=True):

        ages = person_rows["Age"].to_numpy()
        gender_indiv = person_rows["Gender"].to_numpy()

        # np.max / np.min propagate NaN, so a missing age stays visible in the summary.
        max_age = np.max(ages)
        min_age = np.min(ages)

        ID_indiv.append(person_id)
        MinAge.append(min_age)
        MaxAge.append(max_age)
        AgeSpan.append(max_age - min_age)
        NumImages.append(len(ages))

        NumAgeRanges.append(len(np.unique(np.array(person_rows["Age Range"]))))

        # Average of the per-image gender labels: anything other than 0 or 1
        # means the labels of this person disagree (noise).
        Gender_Unique.append(np.average(gender_indiv))

    # Create Individual DataFrame

    data_indiv = {"ID" : ID_indiv,
                "Min Age" : MinAge,
                "Max Age" : MaxAge,
                "Age Span" : AgeSpan,
                "Number of Age Ranges" : NumAgeRanges,
                "Number of Images" : NumImages,
                "Gender": Gender_Unique}

    df_individual = pd.DataFrame(data_indiv)

    return df_individual

In [16]:
from insightface.app import FaceAnalysis
from insightface.data import get_image as ins_get_image
from PIL import Image

def valid_faces_insightface(df_images, path_to_data):

    """
    Keep only the images in which insightface detects exactly one face.

    Parameters
    ----------
    df_images : pandas.DataFrame
        One row per image. Must contain a "Filename" column holding the image
        file name including its extension (e.g. "000002_00000273.jpg").
    path_to_data : str
        Directory that contains the image files.

    Returns
    -------
    pandas.DataFrame
        The rows of df_images (same columns, original index and row order) whose
        image yielded exactly one detected face. The frame is empty, not an
        error, when no image qualifies.
    """
    import os  # local import so the function does not depend on another cell importing it

    model = 'buffalo_l'

    imageList = df_images["Filename"].to_list()
    print(len(imageList))

    app = FaceAnalysis(name=model)
    app.prepare(ctx_id=0, det_size=(128, 128)) # All images are 112x112

    # NOTE(review): ins_get_image appears to resolve names relative to insightface's
    # own sample-image directory, so an absolute path is handed over - confirm
    # against the installed insightface version.
    image_dir = os.path.abspath(path_to_data)

    # One boolean per row of df_images: True when exactly one face was detected.
    single_face_mask = []

    for i, filename in enumerate(imageList):

        # ins_get_image expects the name WITHOUT extension; splitext also copes
        # with extensions that are not exactly four characters long (e.g. ".jpeg").
        image_name = os.path.splitext(str(filename))[0]

        path_to_image = os.path.join(image_dir, image_name)

        image = ins_get_image(path_to_image)

        faces = app.get(image)

        single_face_mask.append(len(faces) == 1)

        if i % 10000 == 0:
            print(i)

    # Select all kept rows in one step with the row-aligned mask. This avoids one
    # full-frame filename comparison per kept image and also works when no image
    # (or no row at all) qualifies.
    df_found_faces = df_images.loc[np.asarray(single_face_mask, dtype=bool)].copy()

    return df_found_faces


# TODO: record the approximate runtime of this cell (the "X min" placeholder was never filled in)
# Filter the image table with valid_faces_insightface. Note that this rebinds
# df_casia_images, so the table loaded from CSV earlier is replaced by the filtered one.
df_casia_images =  valid_faces_insightface(df_casia_images, '/home/ivarbl/FaceProject/FaceProject/Images/casia-webface-valid-remote/')
df_casia_images

66083
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: /home/ivarbl/.insightface/models/buffalo_l/1k3d68.onnx landmark_3d_68 ['None', 3, 192, 192] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: /home/ivarbl/.insightface/models/buffalo_l/2d106det.onnx landmark_2d_106 ['None', 3, 192, 192] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: /home/ivarbl/.insightface/models/buffalo_l/det_10g.onnx detection [1, 3, '?', '?'] 127.5 128.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: /home/ivarbl/.insightface/models/buffalo_l/genderage.onnx genderage ['None', 3, 96, 96] 0.0 1.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: /home/ivarbl/.insightface/models/buffalo_l/w600k_r50.onnx recognition ['None', 3, 112, 112] 127.5 12

  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4


0
10000
20000
30000
40000
50000
60000


Unnamed: 0,ID,Filename,Age,Gender,Age Range
0,2,000002_00000273.jpg,44.5,1,41-50
1,2,000002_00000274.jpg,46.0,1,41-50
2,2,000002_00000275.jpg,56.0,1,51-60
3,2,000002_00000276.jpg,31.5,1,31-40
4,2,000002_00000277.jpg,34.5,1,31-40
...,...,...,...,...,...
66078,10553,010553_00490111.jpg,36.0,1,31-40
66079,10553,010553_00490113.jpg,34.0,1,31-40
66080,10553,010553_00490114.jpg,34.5,1,31-40
66081,10553,010553_00490115.jpg,26.0,1,21-30


In [19]:
# Recompute the per-person summary from the current (filtered) image table;
# this rebinds df_casia_individual, replacing the table loaded from CSV earlier.
df_casia_individual = creating_individual_dataframe(df_casia_images)
df_casia_individual 

Unnamed: 0,ID,Min Age,Max Age,Age Span,Number of Age Ranges,Number of Images,Gender
0,2,31.0,69.0,38.0,4,60,1.0
1,17,22.5,47.5,25.0,3,36,0.0
2,24,17.0,83.0,66.0,7,187,1.0
3,29,21.5,74.0,52.5,6,88,1.0
4,30,20.0,77.0,57.0,6,64,1.0
...,...,...,...,...,...,...,...
1269,10497,9.5,45.0,35.5,4,37,1.0
1270,10541,24.5,56.0,31.5,4,30,1.0
1271,10545,23.0,60.0,37.0,4,30,1.0
1272,10546,27.0,43.5,16.5,3,42,1.0


In [20]:
# Saving work so we don't lose anything: write both tables to CSV files in "createdCSV/"
df_casia_images.to_csv("createdCSV/CASIA_Cleaned_Images.csv")
df_casia_individual.to_csv("createdCSV/CASIA_Cleaned_Individual.csv")

In [21]:
# Checking host: print the hostname of the machine this kernel is running on

import socket
print(socket.gethostname())

r1002.uppmax.uu.se
