In [56]:
import shutil
import os
import zipfile

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from PIL import Image, ImageOps
from sklearn.model_selection import train_test_split

In [67]:
images = []
ages = []
genders = []

# Dataset is from the following paper: https://arxiv.org/pdf/1702.08423

for i in os.listdir('../Datasets/utkface_aligned_cropped/crop_part1/')[0:10000]:
    split = i.split('_')
    ages.append(int(split[0]))
    genders.append(int(split[1]))
    images.append(Image.open('../Datasets/utkface_aligned_cropped/crop_part1/' + i))

In [68]:
images = pd.Series(list(images), name = 'Images')
ages = pd.Series(list(ages), name = 'Ages')
genders = pd.Series(list(genders), name = 'Genders')

df = pd.concat([images, ages, genders], axis=1)
print(df)

                                                 Images  Ages  Genders
0     <PIL.JpegImagePlugin.JpegImageFile image mode=...    27        0
1     <PIL.JpegImagePlugin.JpegImageFile image mode=...    75        0
2     <PIL.JpegImagePlugin.JpegImageFile image mode=...     1        0
3     <PIL.JpegImagePlugin.JpegImageFile image mode=...    23        1
4     <PIL.JpegImagePlugin.JpegImageFile image mode=...    36        0
...                                                 ...   ...      ...
9775  <PIL.JpegImagePlugin.JpegImageFile image mode=...    63        0
9776  <PIL.JpegImagePlugin.JpegImageFile image mode=...    24        1
9777  <PIL.JpegImagePlugin.JpegImageFile image mode=...     9        0
9778  <PIL.JpegImagePlugin.JpegImageFile image mode=...    53        0
9779  <PIL.JpegImagePlugin.JpegImageFile image mode=...    36        1

[9780 rows x 3 columns]


In [69]:
df['Ages'].unique()

array([ 27,  75,   1,  23,  36,  58,  48,  15,  21,  16,  72,  14,   6,
         4,  19,  39,  45,  17,  62,  18,   3,  65,   2,  49,  51,  29,
        30,  56,  63,  28,  22,  70,  69,   8,  11,  80,  10,  34,  54,
         5,  26,  24,  13,  67,  93,  32,  78,   9,  55,  52,  35,  31,
        41,  64,   7,  47,  12,  68,  77,  59,  37,  20,  90,  95,  38,
        25,  40,  82,  66,  44,  43,  46,  92,  53,  89,  60,  50,  85,
        42,  57,  73,  61,  86,  79,  96,  33,  74,  76,  71,  84,  99,
       101,  88, 110,  81,  83,  91, 100,  87])

In [70]:
bins = [0, 4, 14, 20, 30, 40, 50, 60, 100]
labels = ['0-4', '5-14', '15-20', '21-30', '31-40', '41-50', '51-60', '60-100']

df['AgeRange'] = pd.cut(df['Ages'], bins=bins, labels=labels, right=True)

# df.info()

print(df.describe())

# Entfernen von Zeilen mit NaN in AgeRange
df = df.dropna(subset=['AgeRange'])

print(df.describe())

              Ages      Genders
count  9780.000000  9780.000000
mean     29.421575     0.553170
std      24.777333     0.497807
min       1.000000     0.000000
25%       7.000000     0.000000
50%      25.000000     1.000000
75%      49.000000     1.000000
max     110.000000     3.000000
              Ages      Genders
count  9777.000000  9777.000000
mean     29.397770     0.553033
std      24.743719     0.497822
min       1.000000     0.000000
25%       7.000000     0.000000
50%      25.000000     1.000000
75%      49.000000     1.000000
max     100.000000     3.000000


In [71]:
base_dir = "."
splits = ['Train', 'Validate', 'Test']

train_ratio = 0.8
val_ratio = 0.05
test_ratio = 0.15

for split in splits:
    for age_range in df['AgeRange'].unique():
        path = os.path.join(base_dir, split, str(age_range))
        os.makedirs(path, exist_ok=True)

train_df, temp_df = train_test_split(df, test_size=val_ratio + test_ratio, stratify=df['AgeRange'], random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=test_ratio / (val_ratio + test_ratio), stratify=temp_df['AgeRange'], random_state=42)

# Funktion, um Bilder in die entsprechenden Ordner zu kopieren
def copy_images(dataframe, split):
    for _, row in dataframe.iterrows():
        image = row['Images']  # Bild aus der DataFrame-Zeile
        age_range = str(row['AgeRange'])
        dest_dir = os.path.join(base_dir, split, age_range)
        image_path = f"{dest_dir}/{os.path.basename(image.filename)}"  # Dateiname erhalten
        image.save(image_path)  # Bild speichern

# Bilder in die entsprechenden Ordner kopieren
copy_images(train_df, "Train")
copy_images(val_df, "Validate")
copy_images(test_df, "Test")

print("Daten wurden erfolgreich in die Ordnerstruktur aufgeteilt!")

Daten wurden erfolgreich in die Ordnerstruktur aufgeteilt!
