In [1]:
import os
from pathlib import Path

import pandas as pd
from sklearn.model_selection import StratifiedKFold

In [2]:
def generate_metadata_celeba(data_dir):
    """Build a balanced, 10-fold metadata table for the CelebA dataset.

    Reads ``list_eval_partition.txt`` and ``list_attr_celeba.txt`` from
    ``<data_dir>/CelebA``, pairs their lines by position, and derives for
    every image a binary label ``y`` (last attribute column) and a gender
    (attribute column 20, ``"1"`` meaning male). Partitions 0 and 1 are merged
    into one "train" pool and partition 2 forms the "test" pool; inside each
    pool every (label, gender) group is down-sampled to the size of the
    smallest group. Finally each row gets a fold number in 1..10, stratified
    on gender.

    Args:
        data_dir: Root directory that contains the ``CelebA`` folder.

    Returns:
        pandas.DataFrame with the columns ``path`` (relative image path),
        ``y`` (0/1 label), ``a`` (0 = Male, 1 = Female) and ``fold`` (int,
        1..10), sorted by ``path``.

    Raises:
        AssertionError: If one of the two CelebA list files is missing.
    """
    celeba_dir = Path(data_dir) / "CelebA"
    assert (celeba_dir / "list_attr_celeba.txt").is_file()
    assert (celeba_dir / "list_eval_partition.txt").is_file()

    # Blank lines (e.g. a trailing newline at the end of the file) are dropped
    # so that they cannot break the two-field unpacking below.
    with open(celeba_dir / "list_eval_partition.txt", "r") as partition_file:
        partitions = [line for line in partition_file if line.strip()]

    with open(celeba_dir / "list_attr_celeba.txt", "r") as attr_file:
        # The first two lines of the attribute file are skipped as header rows.
        attrs = [line for line in attr_file.readlines()[2:] if line.strip()]

    paths = []
    splits = []
    labels = []
    genders = []

    # The two files are paired line by line, i.e. they are assumed to list the
    # images in the same order.
    for partition, attr in zip(partitions, attrs):
        file_name, split = partition.strip().split()
        # Drop the leading file-name token; the rest are the attribute flags.
        attributes = attr.strip().split()[1:]

        # NOTE(review): the last attribute is presumably "Young" in the
        # standard CelebA attribute file - confirm.
        labels.append(1 if attributes[-1] == "1" else 0)
        genders.append("Male" if attributes[20] == "1" else "Female")
        paths.append(os.path.join("CelebA", "img_align_celeba", file_name))
        splits.append(int(split))

    data = pd.DataFrame(
        {
            "id": list(range(1, len(paths) + 1)),
            "path": paths,
            "split": splits,
            "y": labels,
            "gender": genders,
        }
    )

    # "g" encodes the (label, gender) combination used for balancing.
    attr_mapping = {"0_Male": 0, "0_Female": 1, "1_Male": 2, "1_Female": 3}
    data["g"] = (data["y"].astype(str) + "_" + data["gender"]).map(attr_mapping)
    # Partitions 0 and 1 become pool 0, partition 2 becomes pool 1.
    data["split"] = data["split"].map({0: 0, 1: 0, 2: 1})

    def _balance_groups(frame):
        """Down-sample every "g" group of ``frame`` to the smallest group size."""
        if frame.empty:
            return frame
        group_size = int(frame["g"].value_counts().min())
        # A fixed seed keeps the generated metadata reproducible, matching the
        # other dataset generators. Sampling each group explicitly (instead of
        # groupby.apply) keeps all columns on every pandas version.
        return pd.concat(
            [group.sample(group_size, random_state=42) for _, group in frame.groupby("g")]
        )

    train_data = _balance_groups(data[data["split"] == 0].reset_index(drop=True))
    test_data = _balance_groups(data[data["split"] == 1].reset_index(drop=True))
    data = pd.concat([train_data, test_data]).reset_index(drop=True)

    data = data.sort_values(by=["path"]).reset_index(drop=True)
    data["a"] = data["gender"].map({"Male": 0, "Female": 1})

    # Each row lands in exactly one test split; that split's number (1-based)
    # becomes the row's fold.
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    for i, (_, test_idx) in enumerate(skf.split(X=data.index, y=data["a"])):
        data.loc[test_idx, "fold"] = i + 1
    data["fold"] = data["fold"].astype(int)

    return data[["path", "y", "a", "fold"]]


def generate_metadata_chestxray14(data_dir):
    """Build a gender-balanced, 10-fold metadata table for ChestX-ray14.

    Reads ``Data_Entry_2017_v2020.csv`` from ``<data_dir>/ChestXray-NIHCC``,
    keeps the first row of every ``Patient ID``, down-samples each
    ``Patient Gender`` group to the size of the smallest one (seed 42) and
    assigns each remaining row a fold number in 1..10, stratified on gender.

    Args:
        data_dir: Root directory that contains the ``ChestXray-NIHCC`` folder.

    Returns:
        pandas.DataFrame with the columns ``path`` (relative image path),
        ``y`` (``Patient Age`` as int), ``a`` (0 for "M", 1 for "F") and
        ``fold`` (int, 1..10), sorted by ``path``.

    Raises:
        AssertionError: If the data-entry CSV file is missing.
    """
    chest14_dir = Path(os.path.join(data_dir, "ChestXray-NIHCC"))
    assert (chest14_dir / "Data_Entry_2017_v2020.csv").is_file()

    data = pd.read_csv(chest14_dir / "Data_Entry_2017_v2020.csv")

    # Optional filter, currently disabled: restrict to rows whose
    # "Finding Labels" value equals "No Finding".
    # One image per patient: keep only the first row of each patient id.
    data["id"] = data["Patient ID"].astype(str)
    data = data.drop_duplicates(subset="id", keep="first", ignore_index=True)

    # Down-sample every gender group to the smallest group size. Sampling each
    # group explicitly (instead of groupby.apply) guarantees that the
    # "Patient Gender" column survives on every pandas version, while picking
    # exactly the same rows for a given seed.
    group_size = int(data.groupby("Patient Gender").size().min())
    data = pd.concat(
        [
            group.sample(group_size, random_state=42)
            for _, group in data.groupby("Patient Gender")
        ]
    ).reset_index(drop=True)

    data["path"] = data["Image Index"].apply(lambda x: os.path.join("ChestXray-NIHCC/images", x))
    data = data.sort_values(by=["path"]).reset_index(drop=True)
    data["a"] = data["Patient Gender"].map({"M": 0, "F": 1})
    data["y"] = data["Patient Age"].astype(int)

    # Each row lands in exactly one test split; that split's number (1-based)
    # becomes the row's fold.
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    for i, (_, test_idx) in enumerate(skf.split(X=data.index, y=data["a"])):
        data.loc[test_idx, "fold"] = i + 1
    data["fold"] = data["fold"].astype(int)

    return data[["path", "y", "a", "fold"]]


def generate_metadata_utkface(data_dir):
    """Build a gender-balanced, 10-fold metadata table for UTKFace.

    Scans ``<data_dir>/UTKFace`` for ``*.jpg`` files whose names consist of
    four underscore-separated parts, parsed as ``age_gender_ethnicity_rest``.
    Files with a different number of parts are skipped. Only ages strictly
    between 10 and 100 are kept, each gender group is down-sampled to the
    size of the smallest one (seed 42) and every remaining row gets a fold
    number in 1..10, stratified on gender.

    Args:
        data_dir: Root directory that contains the ``UTKFace`` folder.

    Returns:
        pandas.DataFrame with the columns ``path`` (relative image path),
        ``y`` (age as int), ``a`` (the gender code taken from the file name;
        presumably 0 = male, 1 = female - confirm against the dataset
        description) and ``fold`` (int, 1..10), sorted by ``path``.

    Raises:
        AssertionError: If the ``UTKFace`` folder does not exist.
    """
    utkface_dir = Path(os.path.join(data_dir, "UTKFace"))
    # Fail early with a clear error, like the other dataset generators do.
    assert utkface_dir.is_dir()

    ages = []
    genders = []
    ethnicities = []
    paths = []

    # Directory listing order depends on the file system, and seeded sampling
    # selects rows by position. Sorting the names first makes the selected
    # subset identical on every machine.
    for image_path in sorted(utkface_dir.glob("*.jpg"), key=lambda p: p.name):
        path = image_path.name
        parts = path.split("_")
        if len(parts) != 4:
            # Names that do not have exactly four parts cannot be parsed.
            continue
        age, gender, ethnicity, _ = parts
        ages.append(int(age))
        genders.append(int(gender))
        ethnicities.append(int(ethnicity))
        paths.append(os.path.join("UTKFace", path))

    data = pd.DataFrame(
        {
            "path": paths,
            "Age": ages,
            "Gender": genders,
            "Ethnicity": ethnicities,
        }
    )
    data = data[(data["Age"].gt(10)) & (data["Age"].lt(100))].reset_index(drop=True)

    # Down-sample every gender group to the smallest group size. Sampling each
    # group explicitly (instead of groupby.apply) guarantees that the "Gender"
    # column survives on every pandas version.
    group_size = int(data.groupby("Gender").size().min())
    data = pd.concat(
        [group.sample(group_size, random_state=42) for _, group in data.groupby("Gender")]
    ).reset_index(drop=True)

    data = data.sort_values(by=["path"]).reset_index(drop=True)
    data["a"] = data["Gender"]
    data["y"] = data["Age"].astype(int)

    # Each row lands in exactly one test split; that split's number (1-based)
    # becomes the row's fold.
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    for i, (_, test_idx) in enumerate(skf.split(X=data.index, y=data["a"])):
        data.loc[test_idx, "fold"] = i + 1
    data["fold"] = data["fold"].astype(int)

    return data[["path", "y", "a", "fold"]]


# Registry that maps a dataset name to the function generating its metadata
# table. Every generator takes the datasets root directory and returns a
# DataFrame with the columns "path", "y", "a" and "fold".
DATASETS_GENERATORS = {
    "ChestX-ray14": generate_metadata_chestxray14,
    "UTKFace": generate_metadata_utkface,
    "CelebA": generate_metadata_celeba,
}

In [3]:
dataset = "ChestX-ray14"  # datasets: ChestX-ray14, UTKFace, CelebA

if dataset not in DATASETS_GENERATORS:
    raise ValueError(f"Dataset {dataset} not found!")

print("Generating data...")

# Root folder holding the raw dataset directories. It has its own name so the
# input location is not confused with the output folder defined below.
datasets_root = "../../../datasets/"
data = DATASETS_GENERATORS[dataset](datasets_root)

# Folder that receives the generated metadata CSV (created when missing).
data_dir = Path("../../metadatas")
data_dir.mkdir(parents=True, exist_ok=True)
data.to_csv(data_dir / f"{dataset}.csv", index=False)

print("Dataset size:", len(data))
# value_counts() orders its result by frequency, so the shares are re-ordered
# by the value of "a" (0 first, then 1) to match the "Male-Female" label; the
# CelebA and ChestX-ray14 generators encode Male as 0 and Female as 1.
print("Male-Female ratio:", data["a"].value_counts(normalize=True).sort_index().tolist())

print("Done!\n")

Generating data...


Dataset size: 28350
Male-Female ratio: [0.5, 0.5]
Done!

