# Remote Cache Storage

This notebook demonstrates the **remote cache** feature that enables automatic discovery,
download, and upload of SlipCache to/from S3.

**Key features:**
- Hash-based auto-discovery: caches are stored at `{remote_cache}/slipcache-{dataset_hash}/`
- **Real-time progress display**: PTY-based integration shows byte-level upload/download progress
- Bidirectional sync: indexes, stats, and other derived files are shared across machines
- Graceful fallback: S3 errors don't block local operation

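**Test scenarios:**
1. Remote exists, local missing → download from S3 with progress (if the remote is missing, it is first built and uploaded with progress)
2. Both exist → sync derived files (indexes, stats)
3. Manual sync after adding derived files (`loader.sync_remote_cache()`)
4. Local derived files missing → download them from remote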

In [None]:
import shutil
from pathlib import Path

# Configuration: S3 prefix for shared caches and the source dataset location.
REMOTE_CACHE = "s3://visionlab-datasets/slipstream-cache/"
LITDATA_VAL_PATH = "s3://visionlab-datasets/imagenet1k/pre-processed/s256-l512-jpgbytes-q100-streaming/val/"

# Report whether the s5cmd executable can be found on PATH.
if shutil.which("s5cmd") is None:
    print("ERROR: s5cmd not found")
    print("Install with: brew install peak/tap/s5cmd")
else:
    print("s5cmd found ✓")

## Setup: Create Dataset and Check Current State

In [None]:
from slipstream import SlipstreamDataset, OptimizedCache, SlipstreamLoader

# Open the dataset at LITDATA_VAL_PATH with image decoding switched off.
dataset = SlipstreamDataset(remote_dir=LITDATA_VAL_PATH, decode_images=False)

# Summarise the dataset and whether a local cache is already present.
print(
    f"Dataset: {len(dataset):,} samples",
    f"Dataset hash: {dataset.dataset_hash}",
    f"Local cache path: {dataset.cache_path}",
    f"Local cache exists: {OptimizedCache.exists(dataset.cache_path)}",
    sep="\n",
)

In [None]:
from slipstream.s3_sync import s3_path_exists

# The remote cache for this dataset lives at
# "<REMOTE_CACHE>/slipcache-<dataset_hash>"; its manifest under ".slipstream/"
# is the object probed to decide whether the remote cache exists.
remote_cache_full = REMOTE_CACHE.rstrip("/") + f"/slipcache-{dataset.dataset_hash}"
remote_manifest = remote_cache_full + "/.slipstream/manifest.json"

print(f"Remote cache path: {remote_cache_full}")
print(f"Remote cache exists: {s3_path_exists(remote_manifest)}")

## Scenario 1: Remote Exists, Local Missing → Download

Delete local cache and verify it downloads from S3.

In [None]:
# First, make sure remote cache exists (may need to build and upload first)
from slipstream import SlipstreamLoader, DecodeCenterCrop

# The download scenario needs a remote cache to pull from. If the manifest is
# already on S3 there is nothing to do; otherwise a loader is constructed with
# remote_cache set, which (per the message below) builds and uploads the cache.
if s3_path_exists(remote_manifest):
    print(f"Remote cache already exists at {remote_cache_full}")
else:
    print("Remote cache doesn't exist yet. Building and uploading...")
    loader_init = SlipstreamLoader(
        dataset,
        batch_size=256,
        remote_cache=REMOTE_CACHE,
        pipelines={"image": [DecodeCenterCrop(224)]},
        exclude_fields=["path"],
        verbose=True,
    )
    # This loader is only needed for the side effects of constructing it.
    loader_init.shutdown()
    print(f"\nRemote cache now exists: {s3_path_exists(remote_manifest)}")

In [None]:
# Remove the local ".slipstream" directory (if present) so the download path
# can be exercised by the next loader.
local_slipstream = dataset.cache_path / ".slipstream"
if local_slipstream.exists():
    print(f"Deleting local cache: {local_slipstream}")
    shutil.rmtree(local_slipstream)
    print("Local cache deleted.")

# Report the local cache state after the (possible) deletion.
print(f"Local cache exists: {OptimizedCache.exists(dataset.cache_path)}")

In [None]:
# Now create loader - should download from remote
# The local ".slipstream" directory was removed in the previous cell, so with
# remote_cache set the loader is expected to fetch the cache from S3.
print("Creating loader (should download from remote)...\n")

loader1 = SlipstreamLoader(
    dataset,
    batch_size=256,
    remote_cache=REMOTE_CACHE,
    pipelines={'image': [DecodeCenterCrop(224)]},
    exclude_fields=['path'],
    verbose=True,
)

# Confirm the cache was loaded and is present on local disk again.
# loader1 is intentionally left open here; it is shut down in the next cell.
print(f"\nCache loaded: {loader1.cache.num_samples:,} samples")
print(f"Local cache exists: {OptimizedCache.exists(dataset.cache_path)}")

In [None]:
# Verify we can load batches
# Pull a single batch as a smoke test. The finally clause guarantees the
# loader is shut down even if iteration or the prints below raise.
try:
    batch = next(iter(loader1))
    print(f"Batch image shape: {batch['image'].shape}")
    print(f"Batch labels: {batch['label'][:10]}")
finally:
    loader1.shutdown()

## Scenario 2: Local Exists, Remote Exists → Sync Derived Files

Both caches exist. Add an index locally and verify it syncs to remote.

In [None]:
from slipstream import write_index

# Load cache and check if index exists
cache = OptimizedCache.load(dataset.cache_path, verbose=False)

# Check current indexes
# NOTE(review): assumes write_index(fields=['label']) stores its output as
# "label_index.npy" directly in cache.cache_dir — confirm against write_index.
index_file = cache.cache_dir / "label_index.npy"
print(f"Index file exists locally: {index_file.exists()}")

# Build index if it doesn't exist
if not index_file.exists():
    print("\nBuilding label index...")
    write_index(cache, fields=['label'])
    # Re-check the path to confirm the build produced the expected file.
    print(f"Index file created: {index_file.exists()}")

In [None]:
# Location the label index would have inside the remote cache, and whether an
# object is currently present there.
remote_index = remote_cache_full + "/.slipstream/label_index.npy"
print(f"Remote index path: {remote_index}")
print(f"Remote index exists: {s3_path_exists(remote_index)}")

In [None]:
# Create loader with remote_cache - this should sync the index to remote
print("Creating loader (should sync index to remote)...\n")

loader2 = SlipstreamLoader(
    dataset,
    batch_size=256,
    remote_cache=REMOTE_CACHE,
    pipelines={'image': [DecodeCenterCrop(224)]},
    exclude_fields=['path'],
    verbose=True,
)

# The S3 existence check is a network call that can raise; the finally clause
# makes sure the loader is shut down regardless.
try:
    print(f"\nRemote index now exists: {s3_path_exists(remote_index)}")
finally:
    loader2.shutdown()

In [None]:
# Inspect the cache held by this scenario's loader (loader2) rather than
# loader1 from Scenario 1, which was created before the label index was built.
loader2.cache

## Scenario 3: Manual Sync After Adding Derived Files

Use `loader.sync_remote_cache()` to manually trigger sync after adding new files.

In [None]:
from slipstream import compute_normalization_stats
import json

# Load the local cache and compute normalization stats from it
# (image_format="jpeg" is passed through to the helper).
cache = OptimizedCache.load(dataset.cache_path, verbose=False)
stats = compute_normalization_stats(cache, image_format="jpeg")
print(f"Computed stats: {stats}")

# Write the stats as indented JSON next to the other files in the cache
# directory, so they can be picked up as a derived file by the sync below.
stats_file = cache.cache_dir / "normalization_stats.json"
with open(stats_file, "w") as stats_fh:
    json.dump(stats, stats_fh, indent=2)
print(f"\nSaved stats to: {stats_file}")

In [None]:
# Create loader and manually sync
loader3 = SlipstreamLoader(
    dataset,
    batch_size=256,
    remote_cache=REMOTE_CACHE,
    pipelines={'image': [DecodeCenterCrop(224)]},
    exclude_fields=['path'],
    verbose=True,
)

# Everything after construction talks to S3 and may raise; the finally clause
# guarantees the loader is shut down either way.
try:
    print("\nManually triggering sync...")
    # sync_remote_cache() returns a (downloaded, uploaded) pair, reported below.
    downloaded, uploaded = loader3.sync_remote_cache()
    print(f"Sync complete: {downloaded} downloaded, {uploaded} uploaded")

    # Verify stats file exists on remote
    remote_stats = f"{remote_cache_full}/.slipstream/normalization_stats.json"
    print(f"\nRemote stats exists: {s3_path_exists(remote_stats)}")
finally:
    loader3.shutdown()

## Scenario 4: Download Derived Files from Remote

Delete local derived files and verify they download from remote.

In [None]:
# Remove the locally stored derived files (label index and normalization
# stats) while leaving the rest of the cache in place.
cache = OptimizedCache.load(dataset.cache_path, verbose=False)

index_file = cache.cache_dir / "label_index.npy"
stats_file = cache.cache_dir / "normalization_stats.json"

# Same delete-and-report step for each derived file; the blank line matches
# the spacing of the messages when both files are removed.
for kind, derived_path in (("index", index_file), ("stats", stats_file)):
    if derived_path.exists():
        derived_path.unlink()
        print(f"Deleted local {kind}: {derived_path}")

print(f"\nLocal index exists: {index_file.exists()}")
print(f"Local stats exists: {stats_file.exists()}")

In [None]:
from slipstream import SlipstreamLoader, DecodeCenterCrop

# Create loader - should download derived files from remote
print("Creating loader (should download index and stats from remote)...\n")

loader4 = SlipstreamLoader(
    dataset,
    batch_size=256,
    remote_cache=REMOTE_CACHE,
    pipelines={'image': [DecodeCenterCrop(224)]},
    exclude_fields=['path'],
    verbose=True,
)

# The verification below can raise (e.g. if the index cannot be loaded); the
# finally clause guarantees the loader is shut down either way.
try:
    print(f"\nLocal index now exists: {index_file.exists()}")
    print(f"Local stats now exists: {stats_file.exists()}")

    # Verify index works
    if index_file.exists():
        label_index = loader4.cache.get_index('label')
        print(f"Index loaded: {len(label_index)} unique labels")
finally:
    loader4.shutdown()

## Summary

The remote cache feature provides:

1. **Auto-discovery**: Uses `slipcache-{dataset_hash}` paths for alignment
2. **Real-time progress**: PTY-based integration shows byte-level transfer progress
3. **Download on init**: If remote exists and local missing, downloads with progress
4. **Upload after build**: If local built and remote missing, uploads with progress
5. **Bidirectional sync**: Syncs derived files (indexes, stats) in both directions
6. **Manual sync**: `loader.sync_remote_cache()` for on-demand sync

```python
# Basic usage
loader = SlipstreamLoader(
    dataset,
    batch_size=256,
    remote_cache="s3://my-bucket/slipstream-caches/",
    pipelines=supervised_train(224),
)

# After adding indexes or stats, sync manually
write_index(loader.cache, fields=['label'])
loader.sync_remote_cache()  # Uploads new files to S3
```

In [None]:
# List all files in remote cache
import subprocess

# The wildcard is passed as a literal argument (no shell involved), so it is
# s5cmd itself that receives and interprets the pattern.
try:
    result = subprocess.run(
        ["s5cmd", "ls", f"{remote_cache_full}/.slipstream/*"],
        capture_output=True, text=True
    )
except FileNotFoundError:
    # subprocess.run raises this when the executable is not on PATH.
    print("ERROR: s5cmd not found")
else:
    print(f"Files in remote cache ({remote_cache_full}/.slipstream/):")
    print(result.stdout)
    # Surface failures instead of silently showing an empty listing.
    if result.returncode != 0:
        print(f"s5cmd ls failed (exit code {result.returncode}):")
        print(result.stderr)