In [1]:
import shutil
import os
import zipfile

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from PIL import Image, ImageOps
from sklearn.model_selection import train_test_split

In [8]:
images = []
nation = []
genders = []

# Dataset is from the following paper: https://arxiv.org/pdf/1702.08423
for i in os.listdir(os.path.expanduser("~/Person-feature-detection/Datasets/utkface-dataset/utkface_aligned_cropped/crop_part1/"))[0:10000]:
    if i.count('_') >= 3:
        split = i.split('_')
        genders.append(int(split[1]))
        nation.append(int(split[2]))
        images.append(Image.open(os.path.expanduser("~/Person-feature-detection/Datasets/utkface-dataset/utkface_aligned_cropped/crop_part1/" + i)))

In [9]:
images = pd.Series(list(images), name = 'Images')
nation = pd.Series(list(nation), name = 'Nation')
genders = pd.Series(list(genders), name = 'Genders')

df = pd.concat([images, nation, genders], axis=1)
print(df)

                                                 Images  Nation  Genders
0     <PIL.JpegImagePlugin.JpegImageFile image mode=...       3        0
1     <PIL.JpegImagePlugin.JpegImageFile image mode=...       0        0
2     <PIL.JpegImagePlugin.JpegImageFile image mode=...       1        0
3     <PIL.JpegImagePlugin.JpegImageFile image mode=...       3        1
4     <PIL.JpegImagePlugin.JpegImageFile image mode=...       0        0
...                                                 ...     ...      ...
9773  <PIL.JpegImagePlugin.JpegImageFile image mode=...       3        0
9774  <PIL.JpegImagePlugin.JpegImageFile image mode=...       3        1
9775  <PIL.JpegImagePlugin.JpegImageFile image mode=...       0        0
9776  <PIL.JpegImagePlugin.JpegImageFile image mode=...       0        0
9777  <PIL.JpegImagePlugin.JpegImageFile image mode=...       3        1

[9778 rows x 3 columns]


In [10]:
df['Nation'].unique()

array([3, 0, 1, 2, 4])

In [11]:
nation_labels = {
    0: "White",
    1: "Black",
    2: "Asian",
    3: "Indian",
    4: "Others"
}

df['NationLabel'] = df['Nation'].map(nation_labels)

df

Unnamed: 0,Images,Nation,Genders,NationLabel
0,<PIL.JpegImagePlugin.JpegImageFile image mode=...,3,0,Indian
1,<PIL.JpegImagePlugin.JpegImageFile image mode=...,0,0,White
2,<PIL.JpegImagePlugin.JpegImageFile image mode=...,1,0,Black
3,<PIL.JpegImagePlugin.JpegImageFile image mode=...,3,1,Indian
4,<PIL.JpegImagePlugin.JpegImageFile image mode=...,0,0,White
...,...,...,...,...
9773,<PIL.JpegImagePlugin.JpegImageFile image mode=...,3,0,Indian
9774,<PIL.JpegImagePlugin.JpegImageFile image mode=...,3,1,Indian
9775,<PIL.JpegImagePlugin.JpegImageFile image mode=...,0,0,White
9776,<PIL.JpegImagePlugin.JpegImageFile image mode=...,0,0,White


In [12]:
base_dir = os.path.expanduser("~/Person-feature-detection/nation/")
splits = ['Train', 'Validate', 'Test']

for split in splits:
    for nation in nation_labels.values():
        path = os.path.join(base_dir, split, nation)
        os.makedirs(path, exist_ok=True)

train_ratio = 0.8
val_ratio = 0.05
test_ratio = 0.15

train_df, temp_df = train_test_split(df, test_size=val_ratio + test_ratio, stratify=df['Nation'], random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=test_ratio / (val_ratio + test_ratio), stratify=temp_df['Nation'], random_state=42)

def copy_images_by_nation(dataframe, split):
    for _, row in dataframe.iterrows():
        image = row['Images']
        nation_label = nation_labels[row['Nation']]
        dest_dir = os.path.join(base_dir, split, nation_label)
        image_path = f"{dest_dir}/{os.path.basename(image.filename)}"
        image.save(image_path)

copy_images_by_nation(train_df, "Train")
copy_images_by_nation(val_df, "Validate")
copy_images_by_nation(test_df, "Test")