#### <b>Load Dataset</b>

In [None]:
!wget https://postechackr-my.sharepoint.com/:u:/g/personal/dongbinna_postech_ac_kr/Eb37jNPPA7hHl0fmktYqcV8B-qmPLx-ZKYQ1eFk4UPBV_A?download=1 -O CelebAMask-HQ.zip
!wget https://postechackr-my.sharepoint.com/:t:/g/personal/dongbinna_postech_ac_kr/EVRoUY8_txRFv56-KWvZrksBDWbD6adkjBxwwRN7qAC6bg?download=1 -O CelebA-HQ-identity.txt
!wget https://postechackr-my.sharepoint.com/:t:/g/personal/dongbinna_postech_ac_kr/EVrdIrPOkR1OlEWBVK8lE3AB9bFh741GnKBkNgPa8trNuA?download=1 -O CelebA-HQ-attribute.txt

In [None]:
%%capture
# %%capture hides whatever the commands below print.
# Delete any previous ./CelebAMask-HQ folder before unpacking the archive again.
!rm -rf ./CelebAMask-HQ
# Unpack the downloaded archive into the current working directory.
!unzip CelebAMask-HQ.zip

#### <b>Generate Facial Identity Recognition Dataset</b>

In [1]:
# Build a mapping from image file name to identity label.
# Every non-empty line of the annotation file holds two whitespace-separated
# fields: "<file_name> <identity>".
# The file is read from the working directory, which is where the download cell
# saves it (and where the attribute annotation file is read from as well).
identities = {}
with open('./CelebA-HQ-identity.txt') as f:
    for line in f:
        fields = line.split()
        if not fields:
            # Tolerate blank lines (e.g. extra newlines at the end of the file).
            continue
        file_name, identity = fields
        identities[file_name] = identity

print(f'There are {len(set(identities.values()))} identities.')
print(f'There are {len(identities)} images.')

There are 6217 identities.
There are 30000 images.


In [5]:
import os
from shutil import copyfile
# Copy every image into a per-identity folder: <target_root>/<identity>/<file name>.
# NOTE(review): both roots are absolute paths — confirm they match where the
# extracted images actually live before running.
source_root = '/root/data/temp/CelebA-HQ-img'
target_root = '/root/data/identity_celebahq'
file_list = os.listdir(source_root)

for image_name in file_list:
    # A file without an entry in `identities` raises KeyError here.
    identity_dir = os.path.join(target_root, str(identities[image_name]))
    # Create the identity folder the first time one of its images is seen.
    os.makedirs(identity_dir, exist_ok=True)
    copyfile(
        os.path.join(source_root, image_name),
        os.path.join(identity_dir, image_name),
    )

In [4]:
import os

# Dry run over the per-identity folders: count the identities that hold at least
# `threshold` images and the number of images those identities contain.
# No file is moved here; the code that would split the images into 'whole' /
# 'test' folders is kept below, commented out.
source_root = '/root/data/temp/CelebA-HQ-img'
target_root = '/root/data/identity_celebahq'
folder_root = target_root
folder_list = os.listdir(folder_root)

threshold = 5
identity_cnt = 0

train_images = 0
test_images = 0
train_ratio = 0.8  # only referenced by the disabled split code below

for folder in folder_list:
    folder_path = os.path.join(folder_root, folder)
    if not os.path.isdir(folder_path):
        # Skip stray files; os.listdir below would raise NotADirectoryError on them.
        continue
    file_list = os.listdir(folder_path)
    if len(file_list) >= threshold:
        identity_cnt += 1
        # The train/test split is disabled, so every image of a qualifying
        # identity is counted as a train image and test_images stays 0.
#         num_train = int(train_ratio * len(file_list))
#         for file in file_list[:num_train]:
        for file in file_list:
            train_images += 1
#             source = os.path.join(folder_root, folder, file)
#             target = os.path.join(folder_root, 'whole', folder, file)
#             if not os.path.exists(os.path.join(folder_root, 'whole', folder)):
#                 os.makedirs(os.path.join(folder_root, 'whole', folder))
#             os.rename(source, target)
#         for file in file_list[num_train:]:
#             test_images += 1
#             source = os.path.join(folder_root, folder, file)
#             target = os.path.join(folder_root, 'test', folder, file)
#             if not os.path.exists(os.path.join(folder_root, 'test', folder)):
#                 os.makedirs(os.path.join(folder_root, 'test', folder))
#             os.rename(source, target)

# The filter above is `>=`, so the message says "at least" rather than "more than".
print(f'There are {identity_cnt} identities that have at least {threshold} images.')
print(f'There are {train_images} train images.')
print(f'There are {test_images} test images.')

There are 1 identities that have more than 5 images.
There are 3819 train images.
There are 0 test images.


In [None]:
# Create the destination folders, then move the train/test folders into them.
# NOTE(review): no visible cell creates './identity_dataset/train' or
# './identity_dataset/test' — the identity cells write to
# '/root/data/identity_celebahq' and their split code is commented out — so
# confirm these source paths before running.
!mkdir -p ./facial_identity_dataset/train
!mkdir -p ./facial_identity_dataset/test
# The rename targets already exist (created just above, empty on a first run).
# NOTE(review): renaming onto an existing empty directory is POSIX behaviour —
# confirm the target platform.
os.rename('./identity_dataset/train', './facial_identity_dataset/train')
os.rename('./identity_dataset/test', './facial_identity_dataset/test')

#### <b>Generate Face Gender Recognition Dataset</b>

In [None]:
# Build a mapping from image file name to the raw value of its gender attribute.
gender_map = {}

# File layout relied on by this parser — TODO confirm against the annotation file:
# two header lines, then one row per image made of the file name followed by
# whitespace-separated attribute values.
HEADER_LINES = 2
MALE_COLUMN = 21  # field read as the gender flag; presumably the "Male" attribute

with open('./CelebA-HQ-attribute.txt') as f:
    for line_number, line in enumerate(f):
        if line_number < HEADER_LINES:
            continue
        splited = line.split()
        if not splited:
            # Tolerate blank lines (e.g. extra newlines at the end of the file).
            continue
        file_name, male = splited[0], splited[MALE_COLUMN]
        gender_map[file_name] = male

print(f'There are {len(set(gender_map.values()))} classes.')
print(f'There are {len(gender_map)} images.')

In [None]:
import os
from shutil import copyfile


# Copy every image into ./gender_dataset/male or ./gender_dataset/female
# according to its entry in `gender_map`.
source_root = './CelebAMask-HQ/CelebA-HQ-img/'
target_root = './gender_dataset/'
file_list = os.listdir(source_root)

for image_name in file_list:
    # A value of '1' is filed under 'male'; any other value goes to 'female'.
    gender = 'male' if gender_map[image_name] == '1' else 'female'
    class_dir = os.path.join(target_root, gender)
    # Create the class folder the first time one of its images is seen.
    os.makedirs(class_dir, exist_ok=True)
    copyfile(
        os.path.join(source_root, image_name),
        os.path.join(class_dir, image_name),
    )

In [None]:
# Split every class folder of ./gender_dataset/ into train and test subsets by
# moving its files to <folder_root>/train/<class>/ and <folder_root>/test/<class>/.
# The first `train_ratio` share of the sorted file names goes to train, the rest
# to test.
folder_root = './gender_dataset/'
split_names = ('train', 'test')
# Only class folders are processed. The split folders created by this very cell
# (present when it is re-run) and stray files are ignored, so a second run does
# not treat 'train' / 'test' as classes.
folder_list = [
    folder
    for folder in sorted(os.listdir(folder_root))
    if folder not in split_names
    and os.path.isdir(os.path.join(folder_root, folder))
]

male_cnt = 0
female_cnt = 0

train_images = 0
test_images = 0
train_ratio = 0.8

for folder in folder_list:
    # os.listdir returns names in arbitrary order; sorting makes the split
    # reproducible across runs and file systems.
    file_list = sorted(os.listdir(os.path.join(folder_root, folder)))
    if folder == 'male':
        male_cnt += len(file_list)
    else:
        female_cnt += len(file_list)
    num_train = int(train_ratio * len(file_list))
    splits = (
        ('train', file_list[:num_train]),
        ('test', file_list[num_train:]),
    )
    for split, split_files in splits:
        if not split_files:
            # Nothing to move: do not create an empty split folder.
            continue
        split_dir = os.path.join(folder_root, split, folder)
        os.makedirs(split_dir, exist_ok=True)
        for file in split_files:
            source = os.path.join(folder_root, folder, file)
            target = os.path.join(split_dir, file)
            os.rename(source, target)
        if split == 'train':
            train_images += len(split_files)
        else:
            test_images += len(split_files)

print(f'There are {male_cnt} male images.')
print(f'There are {female_cnt} female images.')
print(f'There are {train_images} train images.')
print(f'There are {test_images} test images.')

In [None]:
# Create the destination folders, then move the train/test folders produced under
# ./gender_dataset/ into ./face_gender_dataset/.
!mkdir -p ./face_gender_dataset/train
!mkdir -p ./face_gender_dataset/test
# The rename targets already exist (created just above, empty on a first run).
# NOTE(review): renaming onto an existing empty directory is POSIX behaviour —
# confirm the target platform.
os.rename('./gender_dataset/train', './face_gender_dataset/train')
os.rename('./gender_dataset/test', './face_gender_dataset/test')

#### <b>Save Processed Dataset</b>

In [None]:
%%capture
# %%capture hides whatever the commands below print.
# Recursively archive each processed dataset folder into a zip file in the
# current working directory.
!zip -r facial_identity_dataset.zip ./facial_identity_dataset/
!zip -r face_gender_dataset.zip ./face_gender_dataset/