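**Attention:** This notebook relies on Linux shell commands (`unzip`, `mv`, `rm`, `zip` and a bash script) and on hard-coded paths under `/content`, so it needs a Linux environment to create the dataset. If you do not have a Linux PC, please execute this code in Google Colab and download the final result.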

In [None]:
# Loading and unzipping the raw data set
import gdown
import zipfile

# Raw data
archive_url = 'https://drive.google.com/file/d/1YDZ2XB2Jdbot1SDptyaD36gobWKSl0oi/view?usp=share_link'

output_archive = '/content/archive.zip'
gdown.download(archive_url, output_archive, quiet=False, fuzzy = True)
!unzip /content/archive.zip
!mv /content/kaggle/kaggle/train /content
!rm -rf kaggle
!rm archive.zip

In [None]:
import numpy as np
import pandas as pd
import os
from PIL import Image, ImageEnhance, ImageFilter, ImageOps
from skimage import exposure
from skimage.color import rgb2gray
import cv2
import math

In [None]:
# Data locations: `data_path` is the working root (label csv-files and generated
# output), `data_dir` is the folder holding the raw training images.

data_path = '/content/'
data_dir = data_path + 'train/'

In [None]:
# The raw data set contains a few corrupted images. This cell iterates over all
# files in data_dir and lets Pillow validate each one. Files that cannot be
# opened or verified are collected in `errors` so that they can be omitted
# from the final dataset.

errors = []

for file_name in os.listdir(data_dir):
  try:
    # The context manager releases the file handle even when verify() raises,
    # so scanning a large folder does not leak open files.
    with Image.open(os.path.join(data_dir, file_name)) as img:
      img.verify() # verify that it is, in fact, an image
  except (IOError, SyntaxError):
    print('Bad file:', file_name, data_dir) # print out the names of corrupt files
    errors.append(file_name)

In [None]:
# Load the two label tables shipped with the dataset (age and gender) from
# the working root.

label_files = ('train_age.csv', 'train_gender.csv')
train_age, train_gender = [pd.read_csv(data_path + file_name) for file_name in label_files]

In [None]:
# Create a csv-file that contains the combined label for each image.
# For the combined label, the age cohort and the gender are joined as a string
# using '-' as a separator (cohort 25 and gender g give '25-g').

# Work on a copy: the frame is modified below, and aliasing train_age would
# leave it with string ages, so re-running this cell would fail in round().
train_full_cohort = train_age.copy()
train_full_cohort['age'] = train_full_cohort['age'].apply(lambda x: round(x))

# Group age into cohorts of 5 years starting from 15: every age in [15, 100)
# is replaced by the lower bound of its cohort (15, 20, ..., 95). Ages outside
# that range are left unchanged.
rounded_age = train_full_cohort['age']
in_cohort_range = (rounded_age >= 15) & (rounded_age < 100)
train_full_cohort['age'] = np.where(in_cohort_range, (rounded_age // 5) * 5, rounded_age)

# NOTE(review): gender is attached by row index, which assumes both label files
# list the images in the same order -- confirm against the csv-files.
train_full_cohort['gender'] = train_gender['gender'].astype(str)
train_full_cohort['age'] = train_full_cohort['age'].astype(str)
train_full_cohort['combine'] = train_full_cohort[['age', 'gender']].agg('-'.join, axis=1)
train_full_cohort.to_csv(data_path + 'train_full_cohort.csv', index=False)

In [None]:
# Create one csv-file per combined label, listing the image file names that
# belong to that class (one name per line, no header, no index).

# Build the output folder from data_path so it always matches the location the
# csv-files are written to; exist_ok keeps the cell re-runnable.
cohort_dir = os.path.join(data_path, 'cohorts_5')
os.makedirs(cohort_dir, exist_ok=True)

# Set lookup for the corrupt files recorded in `errors` by the validation cell
corrupt_files = set(errors)

for label in train_full_cohort['combine'].unique():
    label_rows = train_full_cohort[train_full_cohort['combine'] == label]
    file_names = []
    for image_id in label_rows['imageId']:
        # Left-pad the image id with zeros to 6 characters, as the image names on
        # disk have this format and the numeric ids lose their leading zeros;
        # then add the .png extension.
        file_name = str(image_id).zfill(6) + '.png'
        if file_name in corrupt_files:
            # corrupt image: report it and leave it out of the class list
            print(file_name)
        else:
            file_names.append(file_name)
    # create a csv file with the image file names for this label
    pd.DataFrame(file_names).to_csv(os.path.join(cohort_dir, str(label) + '.csv'), index=False, header=False)

In [None]:
# Fetch the bash script that creates the folder structure and moves all the
# images into their respective folders. The script is stored on Google Drive
# and saved locally as /content/split.sh.

script_url = 'https://drive.google.com/file/d/1EcMuDdsfpoEQypps03EUcBpceWhN4eQ8/view?usp=share_link'
script_output = '/content/split.sh'
gdown.download(url=script_url, output=script_output, quiet=False, fuzzy=True)

In [None]:
# make the script executable, create the data directory and run the script
!chmod +x /content/split.sh
!mkdir data
!/content/split.sh

In [None]:
# Create a zip-file of the final dataset: recursively pack the data/ folder
# (relative to the current working directory) into /content/cohorts_5.zip.
# NOTE(review): if the archive already exists, zip presumably updates it in place
# rather than recreating it -- delete it first for a clean rebuild; confirm.
!zip -r /content/cohorts_5.zip data/