# Data Collection

The purpose of this notebook is to download the data from Kaggle.
The dataset contains about 200 000 images, but we only need a fraction of them, so this notebook is also responsible for removing unnecessary data and structuring the files.

_The dataset is already split into train, test and validation; however, we merge everything into one set and split the data ourselves at a later stage._

In [2]:
import os
import shutil
from sklearn.model_selection import train_test_split
import pandas as pd

# Paths
# Project root is the parent folder of the current working directory
ROOT_DIR = os.path.dirname(os.getcwd())
# Folder that receives the images set aside for live prediction
LIVE_PREDICTION_DIR = os.path.join(ROOT_DIR, 'assets', 'live_prediction_images')

# Dataset settings
# Kaggle dataset identifier passed to the download call
DATASET = 'ashwingupta3012/male-and-female-faces-dataset/data'
# Total number of images kept for the dataset (half of them per gender)
SUBSET_IMGS = 3_000
# Total number of images set aside for live prediction
LIVE_PREDICTION_SUBSET = 60

# Point the Kaggle client at the folder holding kaggle.json (the project root)
os.environ['KAGGLE_CONFIG_DIR'] = ROOT_DIR

In [9]:
# NOTE(review): the kaggle package appears to look up its credentials when it is
# imported, which would explain why this import sits below the cell that sets
# KAGGLE_CONFIG_DIR instead of in the import cell - confirm before moving it.
import kaggle

kaggle_api = kaggle.api

# Log in with the credentials from kaggle.json
kaggle_api.authenticate()

# Fetch the dataset archive into the project root and unpack it there
kaggle_api.dataset_download_files(DATASET, path=ROOT_DIR, unzip=True)



In [11]:
dataset_dir = os.path.join(ROOT_DIR, 'Dataset')  # Dataset folder path

# os.listdir returns entries in arbitrary order and also lists plain files.
# Sorting and keeping only sub-folders gives every later cell the same,
# reproducible folder order and avoids calling os.listdir on a stray file.
dirs = sorted(
    entry for entry in os.listdir(dataset_dir)
    if os.path.isdir(os.path.join(dataset_dir, entry))
)  # e.g. ['Test', 'Train', 'Validation']

# Paths of the 'Male' / 'Female' folder of every split folder that has one.
# Both lists follow the order of `dirs`, so equal positions belong to the same split
# as long as each split folder contains both gender folders.
male_dirs = [
    os.path.join(dataset_dir, split_dir, 'Male')
    for split_dir in dirs
    if os.path.isdir(os.path.join(dataset_dir, split_dir, 'Male'))
]
female_dirs = [
    os.path.join(dataset_dir, split_dir, 'Female')
    for split_dir in dirs
    if os.path.isdir(os.path.join(dataset_dir, split_dir, 'Female'))
]

In [4]:
# Sanity check: display the collected male folders (one path per split folder that has a 'Male' child)
male_dirs

['/workspace/genderpredictor/Dataset/Test/Male',
 '/workspace/genderpredictor/Dataset/Train/Male',
 '/workspace/genderpredictor/Dataset/Validation/Male']

In [86]:
# Create new directories: temp_dataset/<train|test|val>/<male|female>
main_dir = 'temp_dataset'
dirs = ['train', 'test', 'val']

# os.makedirs creates the missing parent folders (temp_dataset and the split
# folder) together with the gender folder. exist_ok=True makes the cell safe to
# run again: os.mkdir raised FileExistsError as soon as the tree already existed.
for split_name in dirs:
    for gender_name in ('male', 'female'):
        os.makedirs(os.path.join(ROOT_DIR, main_dir, split_name, gender_name), exist_ok=True)

In [87]:
# Read all file names and build one labelled, shuffled dataframe
# (gender: 1 = male, 0 = female). The train/test/val split is not done here.
male_files = []
female_files = []

# Walk the male and female folder of each split in pairs and collect file names.
# NOTE: zip stops at the shorter listing, so within a split the surplus files of
# the larger gender folder are not listed. Only names are stored, not full paths.
for male_dir, female_dir in zip(male_dirs, female_dirs):
    for male_file, female_file in zip(os.listdir(male_dir), os.listdir(female_dir)):
        male_files.append(male_file)
        female_files.append(female_file)

male_labels = [1] * len(male_files)
female_labels = [0] * len(female_files)

# All male rows first, then all female rows
df_all = pd.DataFrame({'file': male_files + female_files, 'gender': male_labels + female_labels})

# Keep at most SUBSET_IMGS // 2 images of each gender.
# Rows are selected by label rather than by position: a plain head()/tail() on
# the combined frame picks rows of the other gender (and duplicates rows) as
# soon as fewer than SUBSET_IMGS // 2 files per gender were listed.
images_per_gender = SUBSET_IMGS // 2
df_male = df_all[df_all['gender'] == 1].head(images_per_gender)
df_female = df_all[df_all['gender'] == 0].tail(images_per_gender)
df_subset = pd.concat([df_male, df_female], axis=0)

# Shuffle with a fixed seed so the order is the same on every run
df = df_subset.sample(frac=1.0, random_state=42).reset_index(drop=True)


In [84]:
# Share of the data that goes to each split
train_ratio = 0.6
val_ratio = 0.2
test_ratio = 0.2

# Step 1: separate the training rows; everything else is held out for val + test
holdout_share = 1 - train_ratio
df_train, df_temp = train_test_split(df, test_size=holdout_share, random_state=42)

# Step 2: divide the held-out rows between val and test
test_share_of_holdout = test_ratio / (test_ratio + val_ratio)
df_val, df_test = train_test_split(df_temp, test_size=test_share_of_holdout, random_state=42)

# Store each split's file list as <split>/<split>.csv inside the new dataset folder
for split_name, split_df in (('train', df_train), ('test', df_test), ('val', df_val)):
    split_df.to_csv(os.path.join(ROOT_DIR, main_dir, split_name, f'{split_name}.csv'), index=False)

In [88]:
# Copy the images listed in df_train / df_test / df_val into
# <main_dir>/<split>/<gender>, then set aside LIVE_PREDICTION_SUBSET other images
# for live prediction.
gender_labels = {'male': 1, 'female': 0}  # value used in the 'gender' column
gender_dirs = {'male': male_dirs, 'female': female_dirs}

# Per gender: file name -> split the file was assigned to.
# Built once so each lookup is a dict access instead of a dataframe scan per file.
# Rows are matched on name AND gender, so a male file is never copied just because
# a female file with the same name was selected (and vice versa).
# setdefault keeps the first split (train, then test, then val) for a repeated name.
split_of = {gender: {} for gender in gender_labels}
for split_name, split_df in (('train', df_train), ('test', df_test), ('val', df_val)):
    for gender, label in gender_labels.items():
        for file_name in split_df.loc[split_df['gender'] == label, 'file']:
            split_of[gender].setdefault(file_name, split_name)

# Each gender is processed on its own: pairing male and female files in a single
# if/elif chain copies at most one file per pair and skips the partner.
# NOTE: the dataframes hold file names only, so files with the same name in
# several source folders end up at the same destination path.
for gender, source_dirs in gender_dirs.items():
    for source_dir in source_dirs:
        for file_name in os.listdir(source_dir):
            split_name = split_of[gender].get(file_name)
            if split_name is not None:
                shutil.copy2(os.path.join(source_dir, file_name),
                             os.path.join(ROOT_DIR, main_dir, split_name, gender))

# Number of live prediction images taken from each source folder
subset_size_live_prediction = round(LIVE_PREDICTION_SUBSET / len(male_dirs + female_dirs))
# Copy live prediction images
for gender, source_dirs in gender_dirs.items():
    live_target = os.path.join(ROOT_DIR, LIVE_PREDICTION_DIR, gender)
    # Without an existing target folder shutil.copy2 would write a file named like the folder
    os.makedirs(live_target, exist_ok=True)
    for source_dir in source_dirs:
        # Only files that were not selected for the dataset are eligible, so the
        # live prediction images never overlap with train/test/val.
        unused_files = [name for name in os.listdir(source_dir) if name not in split_of[gender]]
        for file_name in unused_files[:subset_size_live_prediction]:
            shutil.copy2(os.path.join(source_dir, file_name), live_target)

In [90]:
# Remove original dataset
# Guarded so that running the cell again after the folder is gone does not raise.
# NOTE(review): on a case-insensitive filesystem a folder named 'dataset' would be
# the same path as 'Dataset' - confirm before re-running this cell late in the notebook.
if os.path.isdir(dataset_dir):
    shutil.rmtree(dataset_dir)

In [4]:
# Check size of new data

def count_entries(*path_parts):
    """Return the number of directory entries in ROOT_DIR joined with path_parts."""
    return len(os.listdir(os.path.join(ROOT_DIR, *path_parts)))


report_genders = ('male', 'female')
report_splits = ('train', 'test', 'val')

# Entry counts per (gender, split) folder of the new dataset and per gender
# folder of the live prediction images
dataset_sizes = {
    (gender, split): count_entries(main_dir, split, gender)
    for gender in report_genders
    for split in report_splits
}
live_prediction_sizes = {
    gender: count_entries(LIVE_PREDICTION_DIR, gender)
    for gender in report_genders
}

# Dataset report: all male splits first, then all female splits, then the total
for gender in report_genders:
    for split in report_splits:
        print(f'{gender.capitalize()} dir in {split}: {dataset_sizes[gender, split]} images')
print(f'Total in dataset: {sum(dataset_sizes.values())} images')

# Live prediction report
for gender in report_genders:
    print(f'{gender.capitalize()} dir in live prediction folder: {live_prediction_sizes[gender]} images')
print(f'Total in live prediction folder: {sum(live_prediction_sizes.values())} images')

Male dir in train: 895 images
Male dir in test: 310 images
Male dir in val: 295 images
Female dir in train: 905 images
Female dir in test: 290 images
Female dir in val: 305 images
Total in dataset: 3000 images
Male dir in live prediction folder: 30 images
Female dir in live prediction folder: 30 images
Total in live prediction folder: 60 images


In [91]:
# Give the finished folder its final name: temp_dataset -> dataset
temp_dataset_path = os.path.join(ROOT_DIR, main_dir)
final_dataset_path = os.path.join(ROOT_DIR, 'dataset')
os.rename(temp_dataset_path, final_dataset_path)