# Data Collection

The purpose of this notebook is to download the data from Kaggle.
The dataset contains about 200 000 images, but we only need a fraction of them, so this notebook is also responsible for removing the unnecessary data and structuring the files.

_The dataset is already split into train, test and validation sets; however, we merge them all into one and split the data ourselves at a later stage._

In [1]:
import os
import shutil

# --- Configuration ---
# Kaggle dataset identifier handed to the download call
DATASET = 'ashishjangra27/gender-recognition-200k-images-celeba'
# Total number of images kept for the working dataset
SUBSET_IMGS = 3000
# Total number of images set aside for live prediction
LIVE_PREDICTION_SUBSET = 60

# Project root: the parent of the current working directory
ROOT_DIR = os.path.split(os.getcwd())[0]
# Folder that receives the live prediction images
LIVE_PREDICTION_DIR = os.path.join(os.path.join(ROOT_DIR, 'assets'), 'live_prediction_images')

# Point the Kaggle client at the folder that holds kaggle.json
os.environ.update(KAGGLE_CONFIG_DIR=ROOT_DIR)

In [2]:
# NOTE(review): this import sits in its own cell, after KAGGLE_CONFIG_DIR has been set in the
# configuration cell — presumably because the kaggle package looks up its credentials when it
# is imported. Confirm before moving it to the top-level import cell.
import kaggle

# Authenticate against the Kaggle API (credentials are expected in ROOT_DIR via KAGGLE_CONFIG_DIR)
kaggle.api.authenticate()

# Download the dataset archive into ROOT_DIR and unzip it there.
# The following cell expects the extracted archive to provide a 'Dataset' folder in ROOT_DIR.
kaggle.api.dataset_download_files(DATASET, path=ROOT_DIR, unzip=True)



In [3]:
dataset_dir = os.path.join(ROOT_DIR, 'Dataset')  # Dataset folder path

# Split folders inside the dataset, e.g. ['Test', 'Train', 'Validation'].
# Sorted because os.listdir returns entries in arbitrary order, and limited to
# real directories so a stray file (e.g. .DS_Store) cannot break the listing below.
dirs = sorted(
    entry for entry in os.listdir(dataset_dir)
    if os.path.isdir(os.path.join(dataset_dir, entry))
)
male_dirs = []
female_dirs = []

# Create lists of all directory paths.
# The loop variable is deliberately not called `dir`, so the builtin dir()
# stays usable in every later cell of the notebook.
for split_name in dirs:
    split_path = os.path.join(dataset_dir, split_name)
    children = os.listdir(split_path)
    if 'Male' in children:
        male_dirs.append(os.path.join(split_path, 'Male'))
    if 'Female' in children:
        female_dirs.append(os.path.join(split_path, 'Female'))

In [4]:
# Inspect the collected 'Male' directories (one path per dataset split that contains one)
male_dirs

['/workspace/genderpredictor/Dataset/Test/Male',
 '/workspace/genderpredictor/Dataset/Train/Male',
 '/workspace/genderpredictor/Dataset/Validation/Male']

In [29]:
# Create new directories
# os.makedirs(..., exist_ok=True) also creates the parent folder and tolerates re-running
# this cell, whereas os.mkdir raises FileExistsError once the folders exist.

main_dir = 'temp_dataset'

male_dir = 'male'
female_dir = 'female'

for class_dir in (male_dir, female_dir):
    os.makedirs(os.path.join(ROOT_DIR, main_dir, class_dir), exist_ok=True)

In [30]:
# Copy SUBSET_IMGS images to the new dataset directories and LIVE_PREDICTION_SUBSET images to the live prediction directory
def copy_files(male_sources=None, female_sources=None, dataset_target=None,
               live_target=None, subset_total=None, live_total=None):
    """Copy a gender-balanced subset of images into the dataset and live prediction folders.

    The requested totals are divided evenly over all source directories. Source
    directories are processed in male/female pairs; within each pair the sorted
    file listings are walked in lockstep, so the same number of male and female
    images is copied. The first chunk of every pair goes to the dataset folder,
    the chunk right after it goes to the live prediction folder, so the two
    selections never overlap.

    Every argument is optional and falls back to the notebook-level value:

    Args:
        male_sources: Directories holding male images (default: ``male_dirs``).
        female_sources: Directories holding female images (default: ``female_dirs``).
        dataset_target: Folder receiving the dataset images in ``male``/``female``
            sub-folders (default: ``ROOT_DIR/main_dir``).
        live_target: Folder receiving the live prediction images in ``male``/``female``
            sub-folders (default: ``LIVE_PREDICTION_DIR``).
        subset_total: Total number of dataset images (default: ``SUBSET_IMGS``).
        live_total: Total number of live prediction images
            (default: ``LIVE_PREDICTION_SUBSET``).

    Raises:
        ValueError: If no source directories are given at all.
    """
    # Fall back to the notebook-level configuration for anything not passed in
    if male_sources is None:
        male_sources = male_dirs
    if female_sources is None:
        female_sources = female_dirs
    if dataset_target is None:
        dataset_target = os.path.join(ROOT_DIR, main_dir)
    if live_target is None:
        # LIVE_PREDICTION_DIR is already built from ROOT_DIR, so it is used as is
        live_target = LIVE_PREDICTION_DIR
    if subset_total is None:
        subset_total = SUBSET_IMGS
    if live_total is None:
        live_total = LIVE_PREDICTION_SUBSET

    dir_count = len(male_sources) + len(female_sources)
    if dir_count == 0:
        raise ValueError('No source directories to copy images from')

    # Images to take from each single source directory
    per_dir = round(subset_total / dir_count)
    live_per_dir = round(live_total / dir_count)

    # Make sure every destination is a real directory; otherwise shutil.copy2
    # would write each image to a plain file named 'male' / 'female'.
    for target in (dataset_target, live_target):
        for gender in ('male', 'female'):
            os.makedirs(os.path.join(target, gender), exist_ok=True)

    for male_src, female_src in zip(male_sources, female_sources):
        # List each directory once and sort it: os.listdir order is arbitrary,
        # and a single sorted listing keeps the selection reproducible.
        paired = list(zip(sorted(os.listdir(male_src)), sorted(os.listdir(female_src))))
        batches = (
            (dataset_target, paired[:per_dir]),
            # Starts right after the dataset chunk, so no image is skipped or reused
            (live_target, paired[per_dir:per_dir + live_per_dir]),
        )
        for target, batch in batches:
            for male_file, female_file in batch:
                shutil.copy2(os.path.join(male_src, male_file), os.path.join(target, 'male'))
                shutil.copy2(os.path.join(female_src, female_file), os.path.join(target, 'female'))

# Run the copy using the notebook-level settings (male_dirs, female_dirs, SUBSET_IMGS, LIVE_PREDICTION_SUBSET)
copy_files()

In [31]:
# Remove original dataset
# Irreversible: deletes the whole downloaded 'Dataset' folder (dataset_dir) from disk.
# The images copied by copy_files() were written to other folders and are not affected.

shutil.rmtree(dataset_dir)

In [33]:
# Check size of new data
def _count_entries(*path_parts):
    """Return how many entries the folder ROOT_DIR/<path_parts...> contains."""
    return len(os.listdir(os.path.join(ROOT_DIR, *path_parts)))

male_size, female_size = (
    _count_entries(main_dir, gender) for gender in ('male', 'female')
)
live_prediction_male_size, live_prediction_female_size = (
    _count_entries(LIVE_PREDICTION_DIR, gender) for gender in ('male', 'female')
)

# Report for the working dataset
print(f'Male dir in dataset: {male_size} images')
print(f'Female dir in dataset: {female_size} images')
print(f'Total in dataset: {male_size + female_size} images')

# Report for the live prediction folder
print(f'Male dir in live prediction folder: {live_prediction_male_size} images')
print(f'Female dir in live prediction folder: {live_prediction_female_size} images')
print(f'Total in live prediction folder: {live_prediction_male_size + live_prediction_female_size} images')

Male dir in dataset: 1500 images
Female dir in dataset: 1500 images
Total in dataset: 3000 images
Male dir in live prediction folder: 30 images
Female dir in live prediction folder: 30 images
Total in live prediction folder: 60 images


In [34]:
# Give the temporary folder its final name: ROOT_DIR/<main_dir> becomes ROOT_DIR/dataset
renamed_from = os.path.join(ROOT_DIR, main_dir)
renamed_to = os.path.join(ROOT_DIR, 'dataset')
os.rename(renamed_from, renamed_to)