# Data Preparation

This notebook prepares datasets for training diffusion models. Because training runs on a local machine, we focus on small image resolutions (32x32) and manageable dataset sizes.

In [5]:
import torch
from torchvision.datasets import CelebA, CIFAR10
from pathlib import Path

In [6]:
# All datasets are stored in a shared `datasets` directory located in the
# parent of the current working directory; create it if it is not there yet.
dataset_root = Path.cwd().parent.joinpath('datasets')
dataset_root.mkdir(exist_ok=True, parents=True)

## 1. CIFAR-10 (32x32, RGB)
CIFAR-10 contains 60,000 images across 10 classes. We download both the training split (50,000 images) and the test split (10,000 images).

In [7]:
# Build the training split first, then the test split. With download=True,
# torchvision fetches the data into dataset_root when it is not already there.
cifar_train, cifar_test = [
    CIFAR10(root=dataset_root, train=is_train, download=True)
    for is_train in (True, False)
]

# Report per-split sample counts and their sum.
cifar_train_count = len(cifar_train)
cifar_test_count = len(cifar_test)
print(f"CIFAR-10 (Train) size: {cifar_train_count}")
print(f"CIFAR-10 (Test) size: {cifar_test_count}")
print(f"CIFAR-10 Total size: {cifar_train_count + cifar_test_count}")

CIFAR-10 (Train) size: 50000
CIFAR-10 (Test) size: 10000
CIFAR-10 Total size: 60000


## 2. CelebA
CelebA is a large-scale face attributes dataset. We download it and load each split (train, valid, test) individually to report their sizes.

In [8]:
# Build the train, valid and test splits in that order. With download=True,
# torchvision fetches the data into dataset_root when it is not already there.
celeba_train, celeba_valid, celeba_test = [
    CelebA(root=dataset_root, split=split_name, download=True)
    for split_name in ("train", "valid", "test")
]

# Report per-split sample counts and their sum.
celeba_train_count = len(celeba_train)
celeba_valid_count = len(celeba_valid)
celeba_test_count = len(celeba_test)
print(f"CelebA (Train) size: {celeba_train_count}")
print(f"CelebA (Valid) size: {celeba_valid_count}")
print(f"CelebA (Test) size: {celeba_test_count}")
print(f"CelebA Total size: {celeba_train_count + celeba_valid_count + celeba_test_count}")

CelebA (Train) size: 162770
CelebA (Valid) size: 19867
CelebA (Test) size: 19962
CelebA Total size: 202599
