In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import os

In [2]:
# Root of the local CelebA copy; both annotation files are read from here.
celeba_dir = Path("../../../datasets/CelebA")

# Partition list: every line of the file is kept.
partition_file_path = celeba_dir / "list_eval_partition.txt"
with partition_file_path.open("r") as partition_file:
    partitions = list(partition_file)

# Attribute list: the first two lines are discarded before use.
# NOTE(review): presumably those are the row count and the column-name header — confirm.
attr_file_path = celeba_dir / "list_attr_celeba.txt"
with attr_file_path.open("r") as attr_file:
    attr_lines = list(attr_file)
attrs = attr_lines[2:]

In [3]:
# Positions inside the per-image attribute vector (file name already removed).
# NOTE(review): 9 and 20 are presumably Blond_Hair and Male in the standard
# CelebA attribute order — confirm against the header of list_attr_celeba.txt.
LABEL_ATTR_INDEX = 9
GENDER_ATTR_INDEX = 20

# zip() stops at the shorter input, so a length mismatch between the two
# annotation files would silently drop rows. Fail loudly instead.
if len(partitions) != len(attrs):
    raise ValueError(
        f"Partition/attribute row count mismatch: {len(partitions)} vs {len(attrs)}"
    )

paths = []
splits = []
labels = []
genders = []

for partition, attr in zip(partitions, attrs):
    file_name, split = partition.strip().split()
    attr_fields = attr.strip().split()

    # Both files must describe the same image on the same row; otherwise the
    # split code and the attributes would be paired with the wrong picture.
    if attr_fields[0] != file_name:
        raise ValueError(
            f"Row mismatch: partition lists {file_name!r}, attributes list {attr_fields[0]!r}"
        )
    attributes = attr_fields[1:]

    # Attribute values are compared as strings: "1" is positive, anything else is not.
    label = 1 if attributes[LABEL_ATTR_INDEX] == "1" else 0
    gender = "Male" if attributes[GENDER_ATTR_INDEX] == "1" else "Female"

    # Stored path is relative to the datasets root, not to this notebook.
    image_path = os.path.join("CelebA", "img_align_celeba", file_name)
    paths.append(image_path)
    splits.append(int(split))
    labels.append(label)
    genders.append(gender)

In [4]:
# Lookup from "<y>_<gender>" to the combined group code stored in column "g".
attr_mapping = {"0_Male": 0, "0_Female": 1, "1_Male": 2, "1_Female": 3}

# One row per image; "id" is a 1-based running number.
data = pd.DataFrame(
    {
        "id": list(range(1, len(paths) + 1)),
        "path": paths,
        "split": splits,
        "y": labels,
        "gender": genders,
    }
)

# Build the "<y>_<gender>" key per row, then translate it to the group code.
group_keys = data["y"].astype(str).str.cat(data["gender"], sep="_")
data["g"] = group_keys.map(attr_mapping)

# Numeric gender attribute: Male -> 0, Female -> 1.
data["a"] = data["gender"].map({"Male": 0, "Female": 1}).astype(int)

In [5]:
# Row counts overall and for each original split code (0 / 1 / 2).
print("Total:", len(data))
for split_name, split_code in (("Train", 0), ("Val", 1), ("Test", 2)):
    print(f"{split_name}:", len(data[data["split"] == split_code]))

# These counts cover the whole frame, not only the test split, so the label
# says so. value_counts() orders by descending frequency and tolist() drops
# the gender names.
print("Gender counts (all splits):", data["gender"].value_counts().tolist())

Total: 202599
Train: 162770
Val: 19867
Test: 19962
Gender ratio (test): [118165, 84434]


In [6]:
# Preview the first 20 rows of the assembled metadata frame.
data.iloc[:20]

Unnamed: 0,id,path,split,y,gender,g,a
0,1,CelebA/img_align_celeba/000001.jpg,0,0,Female,1,1
1,2,CelebA/img_align_celeba/000002.jpg,0,0,Female,1,1
2,3,CelebA/img_align_celeba/000003.jpg,0,0,Male,0,0
3,4,CelebA/img_align_celeba/000004.jpg,0,0,Female,1,1
4,5,CelebA/img_align_celeba/000005.jpg,0,0,Female,1,1
5,6,CelebA/img_align_celeba/000006.jpg,0,0,Female,1,1
6,7,CelebA/img_align_celeba/000007.jpg,0,0,Male,0,0
7,8,CelebA/img_align_celeba/000008.jpg,0,0,Male,0,0
8,9,CelebA/img_align_celeba/000009.jpg,0,0,Female,1,1
9,10,CelebA/img_align_celeba/000010.jpg,0,0,Female,1,1


In [11]:
# Load the sample image in this cell so `im` exists when the notebook is run
# top to bottom on a fresh kernel; otherwise it is only assigned further down.
im = plt.imread(os.path.join("../../../datasets", data["path"][2]))
im.shape

(218, 178, 3)

In [None]:
# Resolve the stored relative path of row 2 against the datasets root, then
# read and display that image.
sample_image_path = os.path.join("../../../datasets", data.loc[2, "path"])
im = plt.imread(sample_image_path)
plt.imshow(im)

In [21]:
# Share of each combined group code over all rows
# (per attr_mapping: 0 = y0/Male, 1 = y0/Female, 2 = y1/Male, 3 = y1/Female).
data.loc[:, "g"].value_counts(normalize=True)

g
1    0.443887
0    0.408121
3    0.139359
2    0.008633
Name: proportion, dtype: float64

In [22]:
# Collapse the three original split codes into two: 0 and 1 -> 0, 2 -> 1.
# The original codes are kept in "split_raw" so that re-running this cell
# always maps from the untouched values; remapping the already-collapsed
# column a second time would turn every row into split 0.
if "split_raw" not in data.columns:
    data["split_raw"] = data["split"]
data["split"] = data["split_raw"].map({0: 0, 1: 0, 2: 1}).astype(int)
data["split"].value_counts()

split
0    182637
1     19962
Name: count, dtype: int64

In [23]:
# Separate frames per collapsed split code, each with a fresh 0-based index.
split_codes = data["split"]
train_data = data.loc[split_codes.eq(0)].reset_index(drop=True)
test_data = data.loc[split_codes.eq(1)].reset_index(drop=True)

# Group shares within the training frame.
train_group_share = train_data["g"].value_counts(normalize=True)
print(train_group_share)

g
1    0.438925
0    0.411472
3    0.141012
2    0.008591
Name: proportion, dtype: float64


In [24]:
# Rows with a == 0 (Male): group 0 is the negative label, group 2 the positive.
male_rows = train_data[train_data["a"] == 0]
n_neg_m = len(male_rows[male_rows["g"] == 0])
n_pos_m = len(male_rows[male_rows["g"] == 2])
n_m = n_neg_m + n_pos_m

# The positive share is defined as the complement of the negative share.
n_neg_m_prop = n_neg_m / n_m
n_pos_m_prop = 1 - n_neg_m_prop

print("n_m:", n_m)
print("n_neg_m prop:", f"{n_neg_m_prop:.2f}")
print("n_pos_m prop:", f"{n_pos_m_prop:.2f}")

# Rows with a == 1 (Female): group 1 is the negative label, group 3 the positive.
female_rows = train_data[train_data["a"] == 1]
n_neg_f = len(female_rows[female_rows["g"] == 1])
n_pos_f = len(female_rows[female_rows["g"] == 3])
n_f = n_neg_f + n_pos_f

n_neg_f_prop = n_neg_f / n_f
n_pos_f_prop = 1 - n_neg_f_prop

print()
print("n_f:", n_f)
print("n_neg_f prop:", f"{n_neg_f_prop:.2f}")
print("n_pos_f prop:", f"{n_pos_f_prop:.2f}")

n_m: 76719
n_neg_m prop: 0.98
n_pos_m prop: 0.02

n_f: 105918
n_neg_f prop: 0.76
n_pos_f prop: 0.24


In [25]:
# Per-gender sample size: twice the row count of the smallest group code.
n_m_sample = train_data["g"].value_counts().min() * 2

# NOTE(review): `train_data_sc` built here is reassigned by the following cell
# before anything reads it, so this cell's result is never exported — confirm
# whether this cell is still needed.
is_male = train_data["a"] == 0
is_female = train_data["a"] == 1

# Draw the same number of rows from each gender, without replacement, seeded.
train_data_m = train_data[is_male].sample(n=n_m_sample, replace=False, random_state=42)
train_data_f = train_data[is_female].sample(n=n_m_sample, replace=False, random_state=42)

train_data_sc = pd.concat([train_data_m, train_data_f], ignore_index=True)

In [26]:
def _draw_stratum(group_code, gender_code, n_rows):
    """Sample `n_rows` training rows with the given g and a codes (seeded, no replacement)."""
    in_stratum = (train_data["g"] == group_code) & (train_data["a"] == gender_code)
    return train_data[in_stratum].sample(n_rows, replace=False, random_state=42)


# Per-gender budget: twice the row count of the smallest group code.
n_m_sample = train_data["g"].value_counts().min() * 2
n_f_sample = n_m_sample

# Split each gender's budget by that gender's positive share in the training
# frame; the positive count is truncated and the negative count takes the rest.
n_m_sample_pos = int(n_m_sample * n_pos_m_prop)
n_m_sample_neg = n_m_sample - n_m_sample_pos
n_f_sample_pos = int(n_f_sample * n_pos_f_prop)
n_f_sample_neg = n_f_sample - n_f_sample_pos

train_data_m_neg = _draw_stratum(0, 0, n_m_sample_neg)
train_data_m_pos = _draw_stratum(2, 0, n_m_sample_pos)
train_data_f_neg = _draw_stratum(1, 1, n_f_sample_neg)
train_data_f_pos = _draw_stratum(3, 1, n_f_sample_pos)

# Stack in the order male-negative, male-positive, female-negative, female-positive.
strata = [train_data_m_neg, train_data_m_pos, train_data_f_neg, train_data_f_pos]
train_data_sc = pd.concat(strata).reset_index(drop=True)

In [27]:
# Destination for the generated metadata CSVs; created on demand.
output_dir = Path("../../metadata/shifted_celeba")
output_dir.mkdir(exist_ok=True, parents=True)

# Size and group composition of the sampled training set.
print(len(train_data_sc))
print(train_data_sc["g"].value_counts(normalize=True))

# Select the exported columns into a separate frame instead of overwriting
# `train_data_sc`: dropping "g" from it made the print above raise KeyError
# when this cell was run a second time.
train_sc_export = train_data_sc[["path", "y", "a", "split"]]
train_sc_export.to_csv(output_dir / "train_sc.csv", index=False)

6276
g
0    0.489802
1    0.378426
3    0.121574
2    0.010198
Name: proportion, dtype: float64


In [28]:
# Every group is cut down to the size of the smallest group in its own split.
n_per_group_train = int(train_data["g"].value_counts().min())
n_per_group_test = int(test_data["g"].value_counts().min())

# Seeded GroupBy.sample keeps the export reproducible (same seed as the other
# samples in this notebook). It also keeps "g" as a regular column and avoids
# the MultiIndex that groupby(...).apply(...) would produce.
train_data_bal = (
    train_data.groupby("g")
    .sample(n=n_per_group_train, random_state=42)
    .reset_index(drop=True)
)
test_data_bal = (
    test_data.groupby("g")
    .sample(n=n_per_group_test, random_state=42)
    .reset_index(drop=True)
)

print(len(train_data_bal))
print(train_data_bal["g"].value_counts(normalize=True))

# Only these four columns are written out.
train_data_bal = train_data_bal[["path", "y", "a", "split"]]
test_data_bal = test_data_bal[["path", "y", "a", "split"]]

train_data_bal.to_csv(output_dir / "train_bal.csv", index=False)
test_data_bal.to_csv(output_dir / "test_bal.csv", index=False)

6276
g
0    0.25
1    0.25
2    0.25
3    0.25
Name: proportion, dtype: float64
