In [None]:
# Shell escape: clone the `perfect_model` branch of the project repository into the current working directory.
!git clone -b perfect_model https://github.com/lahirumanulanka/ann-visual-emotion

# Synthetic Emotion Dataset Expansion (Original ≈80,190 → Target 100,000)

This notebook generates additional face-emotion images with a Hugging Face diffusion model to reach a balanced, 100k-sample dataset of 224×224 grayscale faces. It:

1. Loads existing dataset structure and counts.
2. Plans required per-class synthetic counts.
3. Generates realistic face images conditioned by emotion prompts.
4. Filters for face presence, quality, and near-duplicate removal.
5. Converts to grayscale 224×224 and stores under `data/processed/FullDataEmoSet_gen/<label>/`.
6. Merges original + synthetic metadata, balances, and performs stratified splits with synthetic fraction caps.
7. Writes `train.csv`, `val.csv`, `test.csv`, `status.json` (extended), and synthetic metadata logs.
8. Produces diagnostics, plots, and reproducibility artifacts.

Safety / Ethics: These are synthetic faces (not tied to real users). Use them only for model training and research, and guard against misuse. Adjust prompts to maintain demographic diversity and neutrality.


In [None]:
# 1. Config & Dependencies
import sys, os, math, json, time, hashlib, platform, shutil, random
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, List, Tuple

# --- Generation target and output image format ---
TARGET_TOTAL = 100_000
IMG_SIZE = (224, 224)  # (width, height)
GRAYSCALE_MODE = 'L'
# When True, a later pass re-checks every synthetic file for mode/size.
STRICT_ENFORCE_OUTPUT = True

# --- Filesystem roots (resolved relative to the notebook's working directory) ---
RAW_ROOT = Path('../data/raw/FullDataEmoSet').resolve()
ORIG_SPLITS_DIR = Path('../data/processed/EmoSet_splits').resolve()    # existing splits (if needed)
OUT_GEN_ROOT = Path('../data/processed/FullDataEmoSet_gen').resolve()  # synthetic images per class
CSV_DIR = Path('../data/processed/EmoSet_splits_gen').resolve()        # new combined splits output
META_DIR = CSV_DIR / 'meta'
DEST_IMG_ROOT = CSV_DIR / 'raw_all'  # curated (original + synthetic) copy referenced by the CSVs

# --- Run controls ---
RUN_WRITE = True                     # master switch for any filesystem writes
MAX_PER_CLASS_SYNTH_FRACTION = 0.60  # cap on the synthetic fraction per class in TRAIN
MAX_WORKERS = 1                      # placeholder if future parallelization is added
BATCH_GENERATION = 1                 # diffusion batch size (1 keeps memory manageable)
MODEL_ID = 'runwayml/stable-diffusion-v1-5'
INFERENCE_STEPS = 30
GUIDANCE_SCALE = 7.5
BASE_SEED = 20250924
# FORCE_CPU=1 in the environment pins the run to CPU without importing torch here;
# otherwise CUDA is used whenever torch reports it as available.
if os.environ.get('FORCE_CPU', '0') == '1':
    DEVICE = 'cpu'
else:
    DEVICE = 'cuda' if __import__('torch').cuda.is_available() else 'cpu'
ALLOW_HARDLINK = True  # attempt os.link instead of copy to save space (POSIX)

# --- Quality thresholds ---
FACE_DETECT_SINGLE_ONLY = True
BLUR_VAR_MIN = 60.0            # Laplacian variance threshold
HASH_DIST_MAX = 4              # max Hamming distance treated as a near-duplicate (pHash)
PHASH_SIZE = 16                # phash size (affects sensitivity)
PHASH_SAMPLE_RESIZE = (128,128)
RETRY_DEFICIT_MAX_PASSES = 2

# --- Split ratios (must sum to 1) ---
TRAIN_PCT = 0.7
VAL_PCT = 0.15
TEST_PCT = 0.15
assert abs(TRAIN_PCT + VAL_PCT + TEST_PCT - 1.0) < 1e-6

# Output directories are only created when writes are enabled.
if RUN_WRITE:
    for _dir in (OUT_GEN_ROOT, CSV_DIR, META_DIR, DEST_IMG_ROOT):
        _dir.mkdir(parents=True, exist_ok=True)

print(f"Device: {DEVICE}\nRAW_ROOT={RAW_ROOT}\nOUT_GEN_ROOT={OUT_GEN_ROOT}\nCSV_DIR={CSV_DIR}")

# Install-time note (libraries expected via pyproject; add here if missing)
# The imports sit in a try block so a missing package prints an actionable hint
# before the original exception propagates.
try:
    import torch, numpy as np, pandas as pd, cv2, matplotlib.pyplot as plt
    from PIL import Image
    import imagehash
    from tqdm import tqdm
    from diffusers import StableDiffusionPipeline
except Exception:
    print('If import fails, ensure diffusers, accelerate, safetensors, opencv-python, imagehash installed.')
    raise

# Seed every RNG used by the notebook. torch is seeded unconditionally: the
# previous CUDA-only guard left the torch RNG unseeded on CPU-only runs.
random.seed(BASE_SEED)
np.random.seed(BASE_SEED)
torch.manual_seed(BASE_SEED)


In [None]:
# 2. Load Existing Dataset Metadata
IMG_EXTS = {'.jpg', '.jpeg', '.png', '.bmp', '.gif', '.webp'}
assert RAW_ROOT.exists(), f"RAW_ROOT missing: {RAW_ROOT}"

# Every immediate sub-directory of RAW_ROOT is treated as one class label.
class_dirs = [p for p in RAW_ROOT.iterdir() if p.is_dir()]
labels = sorted([p.name for p in class_dirs])
print('Labels:', labels)

records = []
for lab in labels:
    # rglob yields entries in filesystem order, which varies between machines.
    # Sorting fixes the row order so the seeded splits later in the notebook
    # select the same files on every run.
    for p in sorted((RAW_ROOT / lab).rglob('*')):
        if p.is_file() and p.suffix.lower() in IMG_EXTS:
            records.append({'path': str(p.resolve()), 'label': lab, 'origin': 'original'})

# Explicit columns keep the frame well-formed even when no image was found.
df_original = pd.DataFrame(records, columns=['path', 'label', 'origin'])
assert not df_original.empty, f"No images with extensions {sorted(IMG_EXTS)} found under {RAW_ROOT}"

class_counts_original = df_original['label'].value_counts().sort_index()
total_original = int(class_counts_original.sum())
print('Original per-class counts:\n', class_counts_original)
print('Total original:', total_original)

# Optional assertion (can relax if dataset changed)
if total_original != 80190:
    print(f"[WARN] Expected 80190 originals, found {total_original}. Proceeding anyway.")


In [None]:
# 3. Compute Generation Plan (Target 100,000)
remaining_needed = max(0, TARGET_TOTAL - len(df_original))
print(f"Synthetic images needed to reach target: {remaining_needed}")

num_classes = len(labels)
# Fail with a clear message instead of a ZeroDivisionError below.
assert num_classes > 0, 'No class labels found; cannot build a generation plan.'
base_target_per_class = math.ceil(TARGET_TOTAL / num_classes)

# Each class is topped up to the per-class target; classes already at or above
# it are left untouched (needed == 0), never down-sampled.
plan_rows = []
needed_per_class: Dict[str,int] = {}
for lab in labels:
    current = int(class_counts_original.get(lab, 0))
    target = max(current, base_target_per_class)
    need = max(0, target - current)
    needed_per_class[lab] = need
    plan_rows.append({
        'label': lab,
        'current': current,
        'per_class_target': target,
        'needed': need,
    })

plan_df = pd.DataFrame(plan_rows).sort_values('label')
print(plan_df)
planned_total = int(plan_df['needed'].sum())
print('Planned synthetic total (sum needed):', planned_total)

# Per-class balancing (with ceil rounding and over-represented classes) can plan
# a different number of images than the simple TARGET_TOTAL - originals gap
# printed above; surface the difference instead of leaving it implicit.
if planned_total != remaining_needed:
    print(
        f"[INFO] Per-class balancing plans {planned_total} synthetic images, not {remaining_needed}; "
        f"combined total would be {len(df_original) + planned_total} (target {TARGET_TOTAL})."
    )


In [None]:
# 4. Prompt Templates & Emotion-to-Prompt Mapping
# One base prompt per known emotion label; unknown labels fall back to a
# generic template inside build_prompt.
emotion_prompts = dict(
    happy="portrait photo of a person smiling, expressive happy face, natural skin texture, neutral background, well lit",
    sad="portrait photo of a person with a sad expression, subtle frown, gentle lighting, neutral background",
    angry="portrait photo of a person showing anger, tense eyebrows, intense gaze, dramatic lighting, neutral background",
    fear="portrait photo of a person showing fear, widened eyes, slightly open mouth, cinematic soft lighting, neutral background",
    surprise="portrait photo of a person surprised, raised eyebrows, open mouth, sharp focus, neutral background",
    disgust="portrait photo of a person showing disgust, wrinkled nose, expressive face, soft studio light, neutral background",
    neutral="portrait photo of a person with a neutral calm expression, even soft lighting, neutral background",
)

# Stylistic suffixes cycled through by variant index to vary the prompts.
diversity_suffixes = [
    "ultra detailed, photorealistic",
    "soft diffused light",
    "high detail skin texture",
    "professional portrait",
    "dslr, crisp details",
]

def build_prompt(label: str, variant_idx: int) -> str:
    """Return the generation prompt for ``label``.

    The base text comes from ``emotion_prompts`` (or a generic template when the
    label is not listed), followed by the suffix selected by ``variant_idx``
    modulo the number of available suffixes.
    """
    if label in emotion_prompts:
        base = emotion_prompts[label]
    else:
        base = f"portrait photo of a person showing {label} expression, neutral background"
    suffix = diversity_suffixes[variant_idx % len(diversity_suffixes)]
    return f"{base}, {suffix}"

# Short fingerprint of the base prompts for reproducibility records.
# NOTE(review): diversity_suffixes are not part of this hash, so changing them
# leaves the fingerprint unchanged — confirm that is intended.
_prompt_blob = json.dumps(emotion_prompts, sort_keys=True).encode()
prompt_dict_hash = hashlib.sha256(_prompt_blob).hexdigest()[:16]
print('Prompt dict hash:', prompt_dict_hash)


In [None]:
# 5. Initialize Diffusion Pipeline (HuggingFace)
_on_gpu = (DEVICE == 'cuda')

# Half precision on GPU, full precision otherwise.
pipe = StableDiffusionPipeline.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16 if _on_gpu else torch.float32,
    safety_checker=None,  # optionally keep None; add safety checker if required
)
pipe = pipe.to('cuda' if _on_gpu else 'cpu')
if _on_gpu:
    pipe.enable_attention_slicing()

print(f"Loaded model {MODEL_ID} on {DEVICE}")


In [None]:
# 6-9. Generation Loop + Grayscale + Face Filter + Dedup

# Prepare face detector (Haar cascade)
import cv2
haar_path = cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
face_cascade = cv2.CascadeClassifier(haar_path)
assert not face_cascade.empty(), 'Failed to load Haar cascade.'

from collections import defaultdict, deque

# Attempt budget per class. Without it, a class whose candidates are always
# rejected (or whose generation always fails) would keep the loop running
# forever. Classes that exhaust the budget are left short and show up as
# deficits in the per-class summary computed after generation.
MAX_ATTEMPTS_PER_NEEDED = 25
MIN_ATTEMPTS_PER_CLASS = 100

# Perceptual hashes of accepted images, per label, for near-duplicate rejection.
phash_sets: Dict[str, List[imagehash.ImageHash]] = defaultdict(list)

synthetic_rows = []
start_time = time.time()

# Counts every attempt across all classes; attempt k uses seed BASE_SEED + k.
global_attempt = 0
for lab in labels:
    need = needed_per_class.get(lab, 0)
    if need == 0:
        continue
    out_dir = OUT_GEN_ROOT / lab
    if RUN_WRITE:
        out_dir.mkdir(parents=True, exist_ok=True)
    created = 0
    variant_idx = 0
    class_attempts = 0
    attempt_budget = max(need * MAX_ATTEMPTS_PER_NEEDED, MIN_ATTEMPTS_PER_CLASS)
    # Context manager closes the progress bar even if the loop is interrupted.
    with tqdm(total=need, desc=f'Gen {lab}') as pbar:
        while created < need:
            if class_attempts >= attempt_budget:
                print(f"[WARN] {lab}: stopped after {class_attempts} attempts with {created}/{need} images accepted.")
                break

            # Capture this attempt's inputs, then advance the counters once so
            # every exit path below (reject, error, accept) stays consistent.
            attempt = global_attempt
            prompt = build_prompt(lab, variant_idx)
            seed = BASE_SEED + attempt
            global_attempt += 1
            variant_idx += 1
            class_attempts += 1

            generator = torch.Generator(device=DEVICE).manual_seed(seed)
            try:
                with torch.autocast('cuda', enabled=(DEVICE=='cuda')):
                    result = pipe(prompt, num_inference_steps=INFERENCE_STEPS, guidance_scale=GUIDANCE_SCALE, generator=generator)
                img: Image.Image = result.images[0]
            except Exception as e:
                print(f"[ERR] Generation failed {lab}: {e}")
                continue

            # Convert to grayscale & resize
            img = img.convert('L')
            if img.size != IMG_SIZE:
                img = img.resize(IMG_SIZE, Image.BILINEAR)

            # Face detection: always require at least one face; with
            # FACE_DETECT_SINGLE_ONLY require exactly one. (Previously a
            # zero-face image was accepted when the flag was False.)
            cv_img = np.array(img)
            faces = face_cascade.detectMultiScale(cv_img, scaleFactor=1.1, minNeighbors=4, minSize=(40,40))
            face_detected = len(faces) > 0
            if not face_detected:
                continue
            if FACE_DETECT_SINGLE_ONLY and len(faces) != 1:
                continue

            # Blur metric (Laplacian variance)
            lap_var = float(cv2.Laplacian(cv_img, cv2.CV_64F).var())
            if lap_var < BLUR_VAR_MIN:
                continue

            # Dedup via pHash against images already accepted for this label
            ph = imagehash.phash(img, hash_size=PHASH_SIZE)
            if any((ph - existing) <= HASH_DIST_MAX for existing in phash_sets[lab]):
                continue

            # Accept image
            filename = f"{lab}_syn_{created:06d}_seed{seed}.png"
            save_path = out_dir / filename
            if RUN_WRITE:
                try:
                    img.save(save_path)
                except Exception as e:
                    print('Save failed:', e)
                    continue

            phash_sets[lab].append(ph)
            synthetic_rows.append({
                'path': str(save_path.resolve()),
                'label': lab,
                'origin': 'synthetic',
                'prompt': prompt,
                'seed': seed,
                'lap_var': lap_var,
                'phash': str(ph),
                'attempt': attempt,
                'face_detected': face_detected,
            })
            created += 1
            pbar.update(1)

elapsed = time.time() - start_time
print(f"Generation loop elapsed {elapsed/60:.2f} min")

synthetic_meta_df = pd.DataFrame(synthetic_rows)
# An empty frame has no 'label' column, so guard the per-class report.
if synthetic_meta_df.empty:
    print('Synthetic created per class: none (no images were needed or accepted).')
else:
    print('Synthetic created per class:\n', synthetic_meta_df['label'].value_counts().sort_index())


In [None]:
# 10-12. Metadata Logging + Combine + Adjust
# Persist the synthetic metadata as CSV and as JSON Lines.
if RUN_WRITE and not synthetic_meta_df.empty:
    synthetic_meta_df.to_csv(META_DIR / 'synthetic_meta.csv', index=False)
    with open(META_DIR / 'synthetic_meta.jsonl', 'w') as f:
        f.writelines(json.dumps(rec) + '\n' for rec in synthetic_rows)

# Combine: reduce synthetic metadata to the columns shared with the originals.
core_cols = ['path','label','origin']
df_synth = synthetic_meta_df[core_cols] if len(synthetic_meta_df) else pd.DataFrame(columns=core_cols)

df_all = pd.concat([df_original, df_synth], ignore_index=True)

# Per-class summary of original / synthetic / total counts.
orig_counts = df_original['label'].value_counts()
syn_counts = df_synth['label'].value_counts()
summary_rows = []
for lab in labels:
    orig_n = int(orig_counts.get(lab, 0))
    syn_n = int(syn_counts.get(lab, 0))
    tot = orig_n + syn_n
    frac_syn = syn_n / tot if tot>0 else 0.0
    summary_rows.append({'label': lab, 'original': orig_n, 'synthetic': syn_n, 'total': tot, 'synthetic_fraction': round(frac_syn,4)})
summary_df = pd.DataFrame(summary_rows)
print(summary_df)

# Classes whose combined total is still below the even per-class share.
expected_per_class = math.ceil(TARGET_TOTAL / len(labels))
deficit_labels = [row['label'] for row in summary_rows if row['total'] < expected_per_class]
if not deficit_labels:
    print('No deficits detected after first pass.')
else:
    print('[INFO] Deficit labels pending second pass:', deficit_labels)


In [None]:
# Enforce grayscale + 224x224 on all synthetic images (safety pass)
if STRICT_ENFORCE_OUTPUT and RUN_WRITE:
    repaired = 0
    failed = 0
    for rec in synthetic_rows:
        p = Path(rec['path'])
        if not p.exists():
            continue
        try:
            # Open inside a context manager so the file handle is released
            # before the file is overwritten; convert()/resize() return new
            # in-memory images that outlive the closed source.
            with Image.open(p) as src_img:
                fixed = src_img
                changed = False
                if fixed.mode != GRAYSCALE_MODE:
                    fixed = fixed.convert(GRAYSCALE_MODE)
                    changed = True
                if fixed.size != IMG_SIZE:
                    fixed = fixed.resize(IMG_SIZE, Image.BILINEAR)
                    changed = True
            if changed:
                fixed.save(p)
                repaired += 1
        except Exception as e:
            # Stay best-effort, but report the failure instead of hiding it.
            failed += 1
            print(f"[WARN] Could not verify/repair {p}: {e}")
    print(f"STRICT_ENFORCE_OUTPUT applied. Repaired {repaired} synthetic files to {GRAYSCALE_MODE} {IMG_SIZE}.")
    if failed:
        print(f"[WARN] {failed} synthetic files could not be checked or repaired.")

In [None]:
# 13. Stratified Split with Synthetic Ratio Constraints
from sklearn.model_selection import train_test_split

# Basic stratified split first on full df_all: train vs. (val + test), then
# the remainder is divided between val and test.
train_df, temp_df = train_test_split(df_all, test_size=(1-TRAIN_PCT), stratify=df_all['label'], random_state=BASE_SEED)
val_rel = VAL_PCT / (VAL_PCT + TEST_PCT)
val_df, test_df = train_test_split(temp_df, test_size=(1-val_rel), stratify=temp_df['label'], random_state=BASE_SEED)

# Enforce synthetic fraction cap in TRAIN by per-class adjustment.
# Originals are always kept, so the cap must hold AFTER trimming:
#   s / (o + s) <= f   <=>   s <= o * f / (1 - f)
# The previous bound int(f * (o + s)) used the pre-trim total, which could
# leave the class above the cap once synthetics were dropped.
adjusted_rows = []
for lab in labels:
    subset = train_df[train_df.label==lab]
    if len(subset) == 0:
        continue
    orig_sub = subset[subset.origin=='original']
    synth_sub = subset[subset.origin=='synthetic']
    if MAX_PER_CLASS_SYNTH_FRACTION >= 1.0:
        max_syn = len(synth_sub)  # cap disabled
    else:
        cap = max(0.0, MAX_PER_CLASS_SYNTH_FRACTION)
        # Small epsilon guards against float error at exact boundaries
        # (e.g. 10 * 0.6 / 0.4 evaluating to 14.999...).
        max_syn = int(len(orig_sub) * cap / (1.0 - cap) + 1e-9)
    if len(synth_sub) > max_syn:
        # keep all originals, sample allowed number of synthetics
        keep_synth = synth_sub.sample(n=max_syn, random_state=BASE_SEED)
        adjusted = pd.concat([orig_sub, keep_synth])
    else:
        adjusted = subset
    adjusted_rows.append(adjusted)
train_df_adjusted = pd.concat(adjusted_rows) if adjusted_rows else train_df

print({'train': len(train_df_adjusted), 'val': len(val_df), 'test': len(test_df)})
print('Train synthetic fraction overall:', round((train_df_adjusted.origin=='synthetic').mean(),4))


In [None]:
# 14-15. Write Balanced Images & CSVs + Extended status.json

def to_container_path(abs_path: Path) -> str:
    """Map a path under DEST_IMG_ROOT to its container-style string.

    The result is ``/data/processed/EmoSet_splits_gen/raw_all/`` followed by
    the POSIX-style path of ``abs_path`` relative to DEST_IMG_ROOT.
    ``Path.relative_to`` raises ValueError when ``abs_path`` is not located
    under DEST_IMG_ROOT.
    """
    inside_root = abs_path.relative_to(DEST_IMG_ROOT).as_posix()
    return f"/data/processed/EmoSet_splits_gen/raw_all/{inside_root}"

if RUN_WRITE:
    for lab in labels:
        (DEST_IMG_ROOT / lab).mkdir(parents=True, exist_ok=True)

    def copy_into_root(df_):
        """Link or copy every file listed in ``df_`` into DEST_IMG_ROOT/<label>/.

        Existing destinations are skipped. With ALLOW_HARDLINK a hard link is
        tried first and a regular copy is used if linking fails (e.g. source
        and destination on different filesystems). Returns the number of files
        newly placed; failures are counted and reported, not raised.
        """
        copied = 0
        failures = []
        for rec in df_.itertuples():
            src = Path(rec.path)
            # NOTE(review): two sources sharing a file name within one label map
            # to the same destination; the later one is skipped — confirm that
            # file names are unique per label.
            dst = DEST_IMG_ROOT / rec.label / src.name
            if dst.exists():
                continue
            try:
                if ALLOW_HARDLINK:
                    try:
                        os.link(src, dst)
                    except OSError:
                        # Hard link unavailable: fall back to a real copy so the
                        # CSVs never reference a file that was silently skipped.
                        shutil.copy2(src, dst)
                else:
                    shutil.copy2(src, dst)
                copied += 1
            except Exception as e:
                failures.append((src, e))
        if failures:
            first_src, first_err = failures[0]
            print(f"[WARN] {len(failures)} file(s) could not be copied/linked; first failure: {first_src}: {first_err}")
        return copied

    copied_n = copy_into_root(train_df_adjusted) + copy_into_root(val_df) + copy_into_root(test_df)
    print('Copied/linked files:', copied_n)

    def remap(df_):
        """Return a copy of ``df_`` whose 'path' column holds container paths.

        The destination directory is taken from the row's label, matching where
        copy_into_root places the file. (The parent directory name differs from
        the label for originals found in nested sub-folders.)
        """
        df_ = df_.copy()
        df_['path'] = [
            to_container_path(DEST_IMG_ROOT / row_label / Path(row_path).name)
            for row_path, row_label in zip(df_['path'], df_['label'])
        ]
        return df_

    train_out = remap(train_df_adjusted)
    val_out = remap(val_df)
    test_out = remap(test_df)

    train_out.to_csv(CSV_DIR / 'train.csv', index=False)
    val_out.to_csv(CSV_DIR / 'val.csv', index=False)
    test_out.to_csv(CSV_DIR / 'test.csv', index=False)
    print('Wrote split CSVs')

    # Extended status.json
    # Counts come from summary_df, i.e. before the TRAIN synthetic cap is applied.
    per_class_extended = {}
    for r in summary_df.itertuples():
        per_class_extended[r.label] = {
            'original': int(r.original),
            'synthetic': int(r.synthetic),
            'total': int(r.total),
            'synthetic_fraction': float(r.synthetic_fraction)
        }

    total_images = int(summary_df['total'].sum())
    total_synth = int(summary_df['synthetic'].sum())
    status_json = {
        'total_images': total_images,
        'total_original': int(summary_df['original'].sum()),
        'total_synthetic': total_synth,
        'synthetic_fraction': float(total_synth/total_images if total_images else 0.0),
        'per_class': per_class_extended,
        'splits_fraction': {'train': TRAIN_PCT, 'val': VAL_PCT, 'test': TEST_PCT},
        'image_size': {'width': IMG_SIZE[0], 'height': IMG_SIZE[1], 'mode': GRAYSCALE_MODE},
        'seed': BASE_SEED,
        'generation_model_id': MODEL_ID,
        'prompts_version_hash': prompt_dict_hash,
        'quality_thresholds': {
            'blur_var_min': BLUR_VAR_MIN,
            'hash_dist_max': HASH_DIST_MAX,
            'phash_size': PHASH_SIZE,
        },
        'created_timestamp': datetime.now(timezone.utc).isoformat().replace('+00:00','Z'),
    }
    with open(CSV_DIR / 'status.json','w', encoding='utf-8') as f:
        json.dump(status_json, f, indent=2)
    print('Wrote status.json')


In [None]:
# 16-17. Diagnostics & Balance Report + Plots
import matplotlib.pyplot as plt
import seaborn as sns

# Maximum allowed per-class deviation from the mean count, in percent.
TOL_PCT = 2.0

def diag(counts: pd.Series) -> dict:
    """Summarise how evenly ``counts`` (per-class sizes) are distributed.

    Returns a dict with the label-sorted counts, min/max/mean, the population
    standard deviation, the largest absolute deviation from the mean in
    percent, and whether that deviation is within TOL_PCT.
    """
    ordered = counts.sort_index()
    mean_count = ordered.mean()
    pct_dev = (ordered - mean_count) / mean_count * 100
    worst_dev = pct_dev.abs().max()

    report = {'counts': ordered.to_dict()}
    report['min'] = int(ordered.min())
    report['max'] = int(ordered.max())
    report['mean'] = float(mean_count)
    report['std'] = float(ordered.std(ddof=0))
    report['max_dev_pct'] = float(worst_dev)
    report['balanced_within_tol_pct'] = bool(worst_dev <= TOL_PCT)
    report['tolerance_pct'] = TOL_PCT
    return report

# Balance diagnostics for the combined data, the originals, and each split.
print('Overall total diag:')
print(diag(df_all['label'].value_counts()))
print('\nOriginal-only diag:')
print(diag(df_original['label'].value_counts()))

splits_named = dict(train=train_df_adjusted, val=val_df, test=test_df)
for name, sdf in splits_named.items():
    print(f"\nSplit {name} diag:")
    print(diag(sdf['label'].value_counts()))

# Stacked bars: original vs synthetic counts per class.
stacked_counts = summary_df.set_index('label')[['original','synthetic']]
fig, ax = plt.subplots(figsize=(10,5))
stacked_counts.plot(kind='bar', stacked=True, ax=ax, color=['#4B8BBE','#FFD43B'])
ax.set_title('Original vs Synthetic Counts per Class')
ax.set_ylabel('Count')
plt.tight_layout()
plt.show()

# Bars: synthetic fraction per class over the whole dataset.
fig, ax = plt.subplots(figsize=(10,4))
summary_df.plot(x='label', y='synthetic_fraction', kind='bar', ax=ax, color='#FF8C42')
ax.set_title('Synthetic Fraction per Class (Total Dataset)')
ax.set_ylabel('Fraction')
plt.tight_layout()
plt.show()

# Heatmap: synthetic fraction per class (rows) and split (columns).
heat_data = []
for lab in labels:
    fractions = []
    for sdf in splits_named.values():
        is_synth = sdf.loc[sdf.label == lab, 'origin'] == 'synthetic'
        # A class absent from a split counts as fraction 0.0.
        fractions.append(is_synth.mean() if len(is_synth) else 0.0)
    heat_data.append(fractions)
heat_df = pd.DataFrame(heat_data, index=labels, columns=['train','val','test'])
plt.figure(figsize=(6, max(4, len(labels)*0.4)))
sns.heatmap(heat_df, annot=True, fmt='.2f', cmap='Blues')
plt.title('Synthetic Fraction per Class per Split')
plt.tight_layout()
plt.show()


In [None]:
# 18. Sample Visualization (Original vs Synthetic)
import random

N_SHOW = 5  # per class
random.seed(BASE_SEED)

def _draw_sample(ax, paths, idx):
    """Draw ``paths[idx]`` as a grayscale image on ``ax`` and hide the ticks.

    The axis stays blank when ``idx`` is past the end of ``paths``; an image
    that cannot be opened is replaced by the text 'err'.
    """
    if idx < len(paths):
        try:
            # Context manager releases the file handle once the pixels are read.
            with Image.open(paths[idx]) as im:
                ax.imshow(im.convert('L'), cmap='gray')
        except Exception:
            ax.text(0.5,0.5,'err',ha='center')
    ax.set_xticks([]); ax.set_yticks([])

# Pick up to N_SHOW random originals and synthetics per class.
fig_rows = []
for lab in labels:
    orig_paths = df_original[df_original.label==lab]['path'].tolist()
    synth_paths = df_synth[df_synth.label==lab]['path'].tolist() if 'df_synth' in globals() else []
    random.shuffle(orig_paths)
    random.shuffle(synth_paths)
    fig_rows.append((lab, orig_paths[:N_SHOW], synth_paths[:N_SHOW]))

n_classes = len(fig_rows)
cols = N_SHOW
if n_classes == 0 or cols == 0:
    # plt.subplots cannot build a grid with zero rows or columns.
    print('No samples to display.')
else:
    # squeeze=False keeps `axes` two-dimensional even when cols == 1, so the
    # [row][col] indexing below works for every N_SHOW.
    fig, axes = plt.subplots(n_classes*2, cols, figsize=(cols*2.2, n_classes*2.2*2), squeeze=False)
    for r, (lab, orig_list, syn_list) in enumerate(fig_rows):
        for c in range(cols):
            # Two grid rows per class: originals on top, synthetics below.
            ax_o = axes[r*2][c]
            _draw_sample(ax_o, orig_list, c)
            if c==0:
                ax_o.set_ylabel(f'{lab}\n(orig)', rotation=0, labelpad=40, va='center')

            ax_s = axes[r*2+1][c]
            _draw_sample(ax_s, syn_list, c)
            if c==0:
                ax_s.set_ylabel('syn', rotation=0, labelpad=20, va='center')
    plt.tight_layout(); plt.show()


In [None]:
# 19. Grayscale Integrity & Shape Assertions
# Spot check only: 50 rows are sampled from df_all and the first 20 are opened.
sample_check_paths = df_all.sample(min(50, len(df_all)), random_state=BASE_SEED)['path'].tolist()
checked_paths = sample_check_paths[:20]
integrity_issues = 0
for p in checked_paths:
    try:
        # Context manager closes each file after its header is inspected.
        with Image.open(p) as img:
            assert img.mode == 'L', f'Not grayscale: {p} mode={img.mode}'
            assert img.size == IMG_SIZE, f'Bad size: {p} size={img.size}'
    except Exception as e:
        # Failures are warnings, not errors: report and count them.
        integrity_issues += 1
        print('[WARN] Integrity check issue:', e)
print('Basic grayscale + size assertions done.')
print(f'Integrity issues: {integrity_issues} of {len(checked_paths)} checked images.')

# Mean/std comparison: draw up to 500 image paths from each origin.
orig_stats_imgs = df_original.sample(min(500, len(df_original)), random_state=BASE_SEED)['path'].tolist()
if len(df_synth):
    synth_stats_imgs = df_synth.sample(min(500, len(df_synth)), random_state=BASE_SEED)['path'].tolist()
else:
    synth_stats_imgs = []

def gather_stats(paths):
    """Return the pixel mean and std over all readable images in ``paths``.

    Each image is converted to grayscale and scaled to [0, 1]. Pixels are
    pooled across images, so images of different sizes are supported (stacking
    them into one array would raise on mismatched shapes). Unreadable files
    are skipped and counted. Returns ``{'mean': None, 'std': None}`` when no
    image could be read.
    """
    pixel_chunks = []
    skipped = 0
    for p in paths:
        try:
            # Context manager releases the file handle after the pixels are read.
            with Image.open(p) as im:
                arr = np.asarray(im.convert('L'), dtype=np.float32) / 255.0
        except Exception:
            skipped += 1
            continue
        pixel_chunks.append(arr.ravel())
    if skipped:
        print(f'[WARN] gather_stats skipped {skipped} unreadable image(s).')
    if not pixel_chunks:
        return {'mean': None, 'std': None}
    pixels = np.concatenate(pixel_chunks)
    return {'mean': float(pixels.mean()), 'std': float(pixels.std())}

# Compute and print grayscale statistics for both sampled path lists.
orig_stats, synth_stats = (gather_stats(path_list) for path_list in (orig_stats_imgs, synth_stats_imgs))
for caption, stats in (
    ('Original grayscale mean/std:', orig_stats),
    ('Synthetic grayscale mean/std:', synth_stats),
):
    print(caption, stats)

# 20. Regeneration Retry for Deficit Classes (simplified placeholder)
if deficit_labels:
    # No second generation pass is implemented. Say so explicitly rather than
    # announcing an attempt that never runs, and list the classes still short.
    # A real retry could reuse the main generation loop with modified prompts.
    print('[Retry] Second-pass generation is not implemented; classes still below target:', deficit_labels)

# 21. Reproducibility & Environment Capture
# Record interpreter, platform, library versions and generation settings.
generation_params = {
    'steps': INFERENCE_STEPS,
    'guidance_scale': GUIDANCE_SCALE,
}
env_report = dict(
    python_version=sys.version,
    platform=platform.platform(),
    torch_version=torch.__version__,
    diffusers_version=__import__('diffusers').__version__,
    numpy_version=np.__version__,
    model_id=MODEL_ID,
    base_seed=BASE_SEED,
    prompt_dict_hash=prompt_dict_hash,
    generation_params=generation_params,
)
if RUN_WRITE:
    with open(CSV_DIR / 'env_report.json','w') as f:
        f.write(json.dumps(env_report, indent=2))

# 22. Disk Usage & Summary Stats
import humanize

def dir_size(path: Path) -> int:
    """Return the summed size in bytes of all files found by walking ``path``.

    Files whose size cannot be read are ignored; a path that cannot be walked
    contributes nothing, giving 0.
    """
    byte_count = 0
    for dirpath, _subdirs, filenames in os.walk(path):
        for filename in filenames:
            try:
                byte_count += os.path.getsize(os.path.join(dirpath, filename))
            except Exception:
                continue
    return byte_count

# Measure each image root; generated roots may not exist if writes were disabled.
orig_size = dir_size(RAW_ROOT)
synthetic_size = dir_size(OUT_GEN_ROOT) if OUT_GEN_ROOT.exists() else 0
final_size = dir_size(DEST_IMG_ROOT) if DEST_IMG_ROOT.exists() else 0

print('\nDisk Usage:')
for caption, n_bytes in (
    ('Original raw root:', orig_size),
    ('Synthetic root:', synthetic_size),
    ('Final curated root:', final_size),
):
    print(caption, humanize.naturalsize(n_bytes))

print('\nSUMMARY:')
n_original, n_synthetic, n_combined = len(df_original), len(df_synth), len(df_all)
print(f"Total images original: {n_original}")
print(f"Total images synthetic: {n_synthetic}")
print(f"Total combined: {n_combined}")
print('Train/Val/Test sizes:', len(train_df_adjusted), len(val_df), len(test_df))
