# DataLoader Comparison

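Compare how series IDs (`series_id1` / `series_id2`) are split between the train and validation pair sets produced by PhotoTriageData (exp1) and the external MyDataset (exp2), and check that no series appears in both splits.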

In [None]:
import pandas as pd
from pathlib import Path

## Load Data

In [None]:
# Directory containing the exported pair CSVs for both experiments.
data_dir = Path('../outputs/dataloader_comparison')

# Read the four split files in a fixed order; later cells refer to these names.
exp1_train, exp1_val, exp2_train, exp2_val = (
    pd.read_csv(data_dir / csv_name)
    for csv_name in ('exp1_train.csv', 'exp1_val.csv', 'exp2_train.csv', 'exp2_val.csv')
)

# Row count (one row per pair) of each split, one line per split.
print(
    f"Exp1 Train: {len(exp1_train)} pairs",
    f"Exp1 Val:   {len(exp1_val)} pairs",
    f"Exp2 Train: {len(exp2_train)} pairs",
    f"Exp2 Val:   {len(exp2_val)} pairs",
    sep="\n",
)

Exp1 Train: 9760 pairs
Exp1 Val:   1114 pairs
Exp2 Train: 12075 pairs
Exp2 Val:   483 pairs


## Experiment 1 (PhotoTriageData) - Series Overlap Check

In [None]:
# Every series id that appears on either side of a pair, per split.
exp1_train_series = set(exp1_train['series_id1']) | set(exp1_train['series_id2'])
exp1_val_series = set(exp1_val['series_id1']) | set(exp1_val['series_id2'])
# Series ids present in both the train and the val split.
exp1_overlap = exp1_train_series & exp1_val_series

print(f"Train series: {len(exp1_train_series)}")
print(f"Val series:   {len(exp1_val_series)}")
print(f"Overlap:      {len(exp1_overlap)}")

if exp1_overlap:
    print(f"\n⚠️  FAIL: {len(exp1_overlap)} series in both splits")
    # List up to 10 offending ids; the ellipsis appears only when the list is truncated.
    print(f"\nOverlapping series: {sorted(exp1_overlap)[:10]}{'...' if len(exp1_overlap) > 10 else ''}")
else:
    print("\n✅ PASS: No overlap")

Train series: 3466
Val series:   433
Overlap:      0

✅ PASS: No overlap


## Experiment 2 (External MyDataset) - Series Overlap Check

In [None]:
# Every series id that appears on either side of a pair, per split.
exp2_train_series = set(exp2_train['series_id1']) | set(exp2_train['series_id2'])
exp2_val_series = set(exp2_val['series_id1']) | set(exp2_val['series_id2'])
# Series ids present in both the train and the val split.
exp2_overlap = exp2_train_series & exp2_val_series

print(f"Train series: {len(exp2_train_series)}")
print(f"Val series:   {len(exp2_val_series)}")
print(f"Overlap:      {len(exp2_overlap)}")

if exp2_overlap:
    print(f"\n⚠️  FAIL: {len(exp2_overlap)} series in both splits")
    # List up to 10 offending ids; the ellipsis appears only when the list is truncated,
    # so a short list is not mistaken for a partial one.
    print(f"\nOverlapping series: {sorted(exp2_overlap)[:10]}{'...' if len(exp2_overlap) > 10 else ''}")
else:
    print("\n✅ PASS: No overlap")

Train series: 4560
Val series:   195
Overlap:      0

✅ PASS: No overlap


## Summary

In [None]:
# One row per experiment: pair counts, unique-series counts, overlap size and verdict.
# Status is PASS only when the overlap set for that experiment is empty.
summary = pd.DataFrame([
    {
        'Experiment': experiment,
        'Train Pairs': len(train_pairs),
        'Val Pairs': len(val_pairs),
        'Train Series': len(train_series),
        'Val Series': len(val_series),
        'Overlap': len(overlap),
        'Status': '⚠️  FAIL' if len(overlap) != 0 else '✅ PASS',
    }
    for experiment, train_pairs, val_pairs, train_series, val_series, overlap in (
        ('Exp1 (PhotoTriage)', exp1_train, exp1_val,
         exp1_train_series, exp1_val_series, exp1_overlap),
        ('Exp2 (External)', exp2_train, exp2_val,
         exp2_train_series, exp2_val_series, exp2_overlap),
    )
])

summary

Unnamed: 0,Experiment,Train Pairs,Val Pairs,Train Series,Val Series,Overlap,Status
0,Exp1 (PhotoTriage),9760,1114,3466,433,0,✅ PASS
1,Exp2 (External),12075,483,4560,195,0,✅ PASS
