# Visual Verification of Loader Outputs

This notebook visually inspects the outputs of `SlipstreamLoader` across all pipeline modes:

1. **RandomResizedCrop** — training pipeline
2. **CenterCrop** — validation pipeline
3. **Multi-crop (2x RRC)** — SSL multi-view pipeline
4. **Seed reproducibility** — same seed → same crops

In [None]:
# Remote location of the validation data; handed to SlipstreamDataset as `remote_dir`.
LITDATA_VAL_PATH = (
    "s3://visionlab-datasets/imagenet1k/pre-processed/"
    "s256-l512-jpgbytes-q100-streaming/val/"
)

In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from slipstream import SlipstreamDataset

# Open the dataset at LITDATA_VAL_PATH and report how many samples it holds.
# NOTE(review): decode_images=False presumably leaves JPEG decoding to the
# loader pipelines used later in this notebook — confirm against slipstream docs.
dataset = SlipstreamDataset(remote_dir=LITDATA_VAL_PATH, decode_images=False)
print(f"Dataset: {len(dataset):,} samples")

In [None]:
def show_batch(images, title="", nrow=8, figsize=None):
    """Display up to two rows of images from a batch as a grid.

    Args:
        images: Batch of images indexed along the first dimension, where each
            item is a [C, H, W] tensor (e.g. a [B, C, H, W] uint8 tensor).
            A tensor batch is moved to the CPU before plotting.
        title: Optional figure title; no title is drawn when empty.
        nrow: Maximum number of images per row. At most ``2 * nrow`` images
            are shown.
        figsize: Optional ``(width, height)`` in inches. Defaults to 1.8 inches
            per grid cell.

    Raises:
        ValueError: If ``nrow`` is less than 1 or ``images`` is empty.
    """
    if nrow < 1:
        raise ValueError(f"nrow must be >= 1, got {nrow}")
    if isinstance(images, torch.Tensor):
        images = images.cpu()
    n = min(len(images), nrow * 2)  # show up to 2 rows
    if n == 0:
        raise ValueError("show_batch() needs at least one image to display")
    ncols = min(n, nrow)
    nrows = (n + ncols - 1) // ncols  # ceil(n / ncols)
    if figsize is None:
        figsize = (ncols * 1.8, nrows * 1.8)
    # squeeze=False always returns a 2-D array of Axes, so every grid shape
    # (1x1, 1xN, Nx1, MxN) is handled the same way.
    fig, axes = plt.subplots(nrows, ncols, figsize=figsize, squeeze=False)
    # `axes.flat` walks the grid row by row, matching idx = row * ncols + col.
    for idx, ax in enumerate(axes.flat):
        if idx < n:
            img = images[idx].permute(1, 2, 0).numpy()  # CHW → HWC
            ax.imshow(img)
        # Cells past the last image are left blank, with the axis hidden too.
        ax.axis('off')
    if title:
        fig.suptitle(title, fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()

## 1. RandomResizedCrop (Training)

In [None]:
from slipstream import SlipstreamLoader, RandomResizedCrop

# Training-style loader: shuffled with a fixed seed, RandomResizedCrop(224) on
# the 'image' field, and the 'path' field excluded from the batches.
loader_rrc = SlipstreamLoader(
    dataset,
    pipelines={'image': [RandomResizedCrop(224)]},
    exclude_fields=['path'],
    batch_size=16,
    shuffle=True,
    seed=42,
)

# Only the first batch is inspected.
batch = next(iter(loader_rrc))
print(f"Image shape: {batch['image'].shape}, dtype: {batch['image'].dtype}")
show_batch(
    batch['image'],
    title=f"RandomResizedCrop(224) — labels: {batch['label'][:8].tolist()}",
)
loader_rrc.shutdown()

## 2. CenterCrop (Validation)

In [None]:
from slipstream import CenterCrop

# Validation-style loader: no shuffling, CenterCrop(224) on the 'image' field,
# and the 'path' field excluded from the batches.
loader_cc = SlipstreamLoader(
    dataset,
    pipelines={'image': [CenterCrop(224)]},
    exclude_fields=['path'],
    batch_size=16,
    shuffle=False,
)

# Only the first batch is inspected.
batch = next(iter(loader_cc))
print(f"Image shape: {batch['image'].shape}, dtype: {batch['image'].dtype}")
show_batch(
    batch['image'],
    title=f"CenterCrop(224) — labels: {batch['label'][:8].tolist()}",
)
loader_cc.shutdown()

## 3. Multi-Crop SSL (2x RandomResizedCrop)

The multi-crop pipeline decodes each JPEG once and produces N different random crops.
The batch's `image` entry is then a list of tensors — one per crop view.

In [None]:
from slipstream import MultiCropRandomResizedCrop

# Multi-view loader: shuffled with a fixed seed, two random resized crops
# requested per image via MultiCropRandomResizedCrop.
loader_mc = SlipstreamLoader(
    dataset,
    pipelines={'image': [MultiCropRandomResizedCrop(num_crops=2, size=224)]},
    exclude_fields=['path'],
    batch_size=8,
    shuffle=True,
    seed=42,
)

batch = next(iter(loader_mc))
views = batch['image']  # one tensor per crop view (2 here)
print(f"Number of views: {len(views)}")
print(f"View shape: {views[0].shape}")

# Plot each view as its own grid.
show_batch(
    views[0],
    title="Multi-crop: View 1",
)
show_batch(
    views[1],
    title="Multi-crop: View 2 (same images, different crops)",
)
loader_mc.shutdown()

### Side-by-side comparison

Each column shows the same source image with two different random crops.

In [None]:
# One column per source image (at most 6), one row per crop view.
n = min(6, len(views[0]))
# squeeze=False keeps `axes` two-dimensional even when only one column is drawn.
fig, axes = plt.subplots(2, n, figsize=(n * 2, 4.2), squeeze=False)
for i in range(n):
    for row, (view, label) in enumerate(zip(views, ['View 1', 'View 2'])):
        ax = axes[row][i]
        ax.imshow(view[i].cpu().permute(1, 2, 0).numpy())  # CHW → HWC
        # axis('off') would also suppress the y-label used as the row caption,
        # so only the ticks and the frame are hidden instead.
        ax.set_xticks([])
        ax.set_yticks([])
        for spine in ax.spines.values():
            spine.set_visible(False)
        if i == 0:
            ax.set_ylabel(label, fontsize=11)
fig.suptitle("Multi-crop: two views per image", fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

## 4. Seed Reproducibility

Using the same `seed=` should produce identical batches (same shuffle order, same crops).

In [None]:
def get_first_batch(seed):
    """Return the first batch from a freshly built, shuffled RandomResizedCrop loader.

    Args:
        seed: Seed forwarded to ``SlipstreamLoader``.

    Returns:
        The first batch yielded by the loader.
    """
    loader = SlipstreamLoader(
        dataset,
        batch_size=8,
        shuffle=True,
        seed=seed,
        pipelines={'image': [RandomResizedCrop(224)]},
        exclude_fields=['path'],
        verbose=False,
    )
    try:
        return next(iter(loader))
    finally:
        # Shut the loader down even when fetching the batch raises, so a
        # failed run does not leave a live loader behind in the kernel.
        loader.shutdown()

# Two runs with the same seed, plus one run with a different seed.
batch_a, batch_b, batch_c = (get_first_batch(seed=s) for s in (123, 123, 456))

# Same seed → the sample indices and the cropped pixels should both match.
same_indices = torch.equal(batch_a['_indices'], batch_b['_indices'])
same_pixels = torch.equal(batch_a['image'], batch_b['image'])
print(f"seed=123 vs seed=123: same indices={same_indices}, same pixels={same_pixels}")

# Different seed → the sample indices should differ.
diff_indices = not torch.equal(batch_a['_indices'], batch_c['_indices'])
print(f"seed=123 vs seed=456: different indices={diff_indices}")

In [None]:
# One row per run, showing the first 8 images of each batch.
fig, axes = plt.subplots(3, 8, figsize=(14.5, 5.5))
for row, (batch, label) in enumerate([
    (batch_a, 'seed=123 (run 1)'),
    (batch_b, 'seed=123 (run 2)'),
    (batch_c, 'seed=456'),
]):
    for i in range(8):
        ax = axes[row][i]
        ax.imshow(batch['image'][i].cpu().permute(1, 2, 0).numpy())  # CHW → HWC
        # axis('off') would also suppress the y-label used as the row caption,
        # so only the ticks and the frame are hidden instead.
        ax.set_xticks([])
        ax.set_yticks([])
        for spine in ax.spines.values():
            spine.set_visible(False)
    axes[row][0].set_ylabel(label, fontsize=10)
fig.suptitle('Seed reproducibility: same seed → identical output', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()