## Casia dataset

Preprocessing of Casia dataset.

Goal: Get 2 dataframes.

First dataframe (one row per image): ID, Filename, Age, Gender, Age Range

Second dataframe (one row per person): ID, Min Age, Max Age, Age Span, Number of Age Ranges, Number of Images, Gender


In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the metadata file into a list holding one string per line (line endings removed).
with open('MetaData/casia-webface.txt') as f:
    lines = f.read().splitlines()

In [4]:
# Every line holds space-separated fields in the order: ID, filename, age, gender.
# Split each line once and convert the numeric fields; the filename stays a string.
list_of_rows = [
    [int(fields[0]), fields[1], float(fields[2]), int(fields[3])]
    for fields in (line.split(" ") for line in lines)
]
df_casia = pd.DataFrame(list_of_rows, columns=["ID", "Filename", "Age", "Gender"])
df_casia

Unnamed: 0,ID,Filename,Age,Gender
0,0,casia-webface/000000/00000001.jpg,26.0,1
1,0,casia-webface/000000/00000002.jpg,30.0,1
2,0,casia-webface/000000/00000003.jpg,31.0,1
3,0,casia-webface/000000/00000004.jpg,28.0,1
4,0,casia-webface/000000/00000005.jpg,32.0,1
...,...,...,...,...
462329,10571,casia-webface/010571/00490619.jpg,29.5,0
462330,10571,casia-webface/010571/00490620.jpg,22.0,0
462331,10571,casia-webface/010571/00490621.jpg,35.5,1
462332,10571,casia-webface/010571/00490622.jpg,28.0,0


In [5]:
# Create age-range function
def age_range_func(x):
    """
        Return the age-range label for the age x.

        The ranges are checked from youngest to oldest and the first range whose
        upper limit is greater than or equal to x is returned. Anything that does
        not fit a range up to 70 is labelled '70+'.
    """
    upper_limits_and_labels = (
        (10, '0-10'),
        (20, '11-20'),
        (30, '21-30'),
        (40, '31-40'),
        (50, '41-50'),
        (60, '51-60'),
        (70, '61-70'),
    )

    for upper_limit, label in upper_limits_and_labels:
        if x <= upper_limit:
            return label

    # Nothing matched: x is above 70 (or failed every comparison)
    return '70+'

# Label every image with the age range its age falls into
df_casia["Age Range"] = df_casia["Age"].map(age_range_func)
df_casia

Unnamed: 0,ID,Filename,Age,Gender,Age Range
0,0,casia-webface/000000/00000001.jpg,26.0,1,21-30
1,0,casia-webface/000000/00000002.jpg,30.0,1,21-30
2,0,casia-webface/000000/00000003.jpg,31.0,1,31-40
3,0,casia-webface/000000/00000004.jpg,28.0,1,21-30
4,0,casia-webface/000000/00000005.jpg,32.0,1,31-40
...,...,...,...,...,...
462329,10571,casia-webface/010571/00490619.jpg,29.5,0,21-30
462330,10571,casia-webface/010571/00490620.jpg,22.0,0,21-30
462331,10571,casia-webface/010571/00490621.jpg,35.5,1,31-40
462332,10571,casia-webface/010571/00490622.jpg,28.0,0,21-30


In [7]:
def creating_individual_dataframe(df_image):
    """
        Create a dataframe with one row per person from a dataframe (called "df_image") that holds one row per image.

        Criteria: The dataframe df_image needs to have the columns
            ID: Specific ID for each person
            Age: Ground truth age of the person
            Gender: 0 if female and 1 if male
            Age Range: One of the ranges 0-10, 11-20, 21-30, 31-40, 41-50, 51-60, 61-70, 70+

        Any other column of df_image is ignored.

        Returns: A dataframe ordered by ID with the columns
            ID: The ID of the person
            Min Age / Max Age: Lowest and highest age among the person's images
            Age Span: Max Age minus Min Age
            Number of Age Ranges: Number of distinct age ranges among the person's images
            Number of Images: Number of rows the person has in df_image
            Gender: Average of the person's gender labels
    """

    ID_indiv = []
    MinAge = []
    MaxAge = []
    AgeSpan = []
    NumAgeRanges = []
    NumImages = []
    Gender_Unique = []

    # Group the rows once instead of filtering the whole dataframe again for every ID.
    # groupby yields the IDs in sorted order; rows without an ID are left out.
    for person_id, person_rows in df_image.groupby("ID"):

        ages = person_rows["Age"].to_numpy()
        gender_indiv = person_rows["Gender"].to_numpy()
        age_ranges = person_rows["Age Range"].to_numpy()

        max_age = np.max(ages)
        min_age = np.min(ages)

        ID_indiv.append(person_id)
        MinAge.append(min_age)
        MaxAge.append(max_age)
        AgeSpan.append(max_age - min_age)
        NumImages.append(len(ages))
        NumAgeRanges.append(len(np.unique(age_ranges)))

        # The gender label is given per image, so it can differ between images of the same person.
        # The average exposes this noise: 0.0 or 1.0 means all labels agree, values close to 0.5
        # mean the labels are split and the gender of the person is unreliable.
        Gender_Unique.append(np.average(gender_indiv))

    # Create Individual DataFrame

    data_indiv = {"ID" : ID_indiv,
                "Min Age" : MinAge,
                "Max Age" : MaxAge,
                "Age Span" : AgeSpan,
                "Number of Age Ranges" : NumAgeRanges,
                "Number of Images" : NumImages,
                "Gender": Gender_Unique}

    df_individual = pd.DataFrame(data_indiv)

    return df_individual

# Build the per-person summary table from the per-image table and show it
df_casia_individual = creating_individual_dataframe(df_casia)
df_casia_individual

Unnamed: 0,ID,Min Age,Max Age,Age Span,Number of Age Ranges,Number of Images,Gender
0,0,20.0,36.0,16.0,3,12,1.000000
1,1,5.0,67.5,62.5,7,254,0.019685
2,2,31.0,69.0,38.0,4,62,1.000000
3,3,13.0,76.0,63.0,7,364,0.964286
4,4,7.0,66.0,59.0,5,72,0.027778
...,...,...,...,...,...,...,...
10567,10567,11.5,39.0,27.5,3,52,0.038462
10568,10568,22.5,56.0,33.5,4,16,0.937500
10569,10569,20.5,36.5,16.0,2,16,0.062500
10570,10570,5.0,49.0,44.0,5,27,0.000000
