In [1]:
# Record which EC2 instance type this benchmark runs on. The instance metadata
# service at 169.254.169.254 is queried in two steps: a PUT that requests a
# session token with a 60-second TTL, then a GET of `instance-type` that sends
# the token back in the `X-aws-ec2-metadata-token` header.
!TOKEN=$(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 60") && \
curl -s -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169.254/latest/meta-data/instance-type

t4g.2xlarge

### mount our dataset on pond
opts: --chunk-size 8MiB --readahead-size 32MiB --max-cache-size 19GiB

In [2]:
%%script bash
# Mount the COCO train2017 volume from S3 at /mnt/pond/train2017 and return
# immediately (--background) so the rest of the notebook can run.
~/pond/target/release/pond mount \
    s3://junctionlabs/pond/volume/coco-train2017/ \
    /mnt/pond/train2017 \
    --chunk-size 8MiB \
    --readahead-size 32MiB \
    --max-cache-size 19GiB \
    --background

s3://junctionlabs/pond/volume/coco-train2017/ is mounted at /mnt/pond/train2017


In [3]:
# Directory used both as the ImageFolder root and as the location of Pond's
# `.clearcache` control file.
# Presumably the upstream source of the mounted images (TODO confirm):
# http://images.cocodataset.org/zips/train2017.zip
mount = "/mnt/pond"

In [4]:
def clear_cache():
    """
    Drop Pond's in-memory cache.

    Pond exposes a special control file named `.clearcache` under its mount
    root; this writes "1" to that file, resolved relative to the module-level
    `mount` path.
    """
    # NOTE(review): the mount cell mounts Pond at /mnt/pond/train2017 while
    # `mount` is /mnt/pond -- confirm `.clearcache` is reachable at this path.
    trigger_path = f"{mount}/.clearcache"
    with open(trigger_path, 'w') as trigger:
        trigger.write("1")

### make this benchmark more deterministic

In [5]:
import torch
import numpy as np
import random

# Seed every RNG in play (PyTorch CPU and CUDA, NumPy, Python's `random`) with
# one fixed value, because this is a benchmark and runs should be repeatable.
seed = 42
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
# Request deterministic cuDNN kernels and switch the cuDNN autotuner off.
# With `benchmark = True` cuDNN times several algorithms and may select a
# different one from run to run, which works against the determinism requested
# on the line above.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

### data loading and normalization setup

In [6]:
import time
import torchvision

from torch.utils.data import DataLoader
from torchvision import transforms
from tqdm.notebook import tqdm


def load_data(num_epochs: int, shuffle: bool, batch_size: int = 64, num_workers: int = 4):
    """
    Read every image under `mount` through a torch DataLoader and report how
    long each epoch takes.

    Pond's cache is cleared once at the start of the call (not between
    epochs), so the first epoch starts from a fresh cache and later epochs
    see whatever the earlier ones left behind.

    Args:
        num_epochs: how many times to loop over the whole dataset
        shuffle: whether the DataLoader shuffles the dataset, effectively
                 making the access pattern random instead of serial
        batch_size: number of images per batch handed to the DataLoader
        num_workers: number of DataLoader worker processes
    """
    # each run starts with a fresh cache.
    clear_cache()

    transform = transforms.Compose(
        [transforms.Resize((224, 224)), transforms.ToTensor()]
    )
    dataset = torchvision.datasets.ImageFolder(root=mount, transform=transform)

    # `shuffle` determines if we'll read the data serially or in random order
    loader = DataLoader(
        dataset, shuffle=shuffle, batch_size=batch_size, num_workers=num_workers
    )

    print(f"dataset size: {len(dataset)} images")

    # Open the sink once for the whole run: re-opening /dev/null for every
    # batch is loop-invariant work that would otherwise be counted in the
    # timings below.
    with open('/dev/null', 'wb') as sink:
        # each epoch loads all images using the DataLoader
        for epoch in range(num_epochs):
            # perf_counter is monotonic, so the measured interval cannot be
            # skewed by wall-clock adjustments during a multi-minute epoch.
            start = time.perf_counter()

            files = 0
            for imgs, _labels in tqdm(loader):
                # here's where you'd usually have your training loop, but we're
                # just interested in data loading performance; serialising the
                # batch and writing it out forces the tensor to be materialised.
                sink.write(imgs.numpy().tobytes())
                files += imgs.size(0)

            elapsed = time.perf_counter() - start
            print(f"epoch {epoch + 1} took {elapsed:.1f}s to read {files} files")

    

### random access

In [7]:
# Two epochs with shuffling on, i.e. images are read in random order.
# load_data clears Pond's cache once before the first epoch.
load_data(num_epochs=2, shuffle=True)

dataset size: 118287 images


  0%|          | 0/1849 [00:00<?, ?it/s]

epoch 1 took 309.9s to read 118287 files


  0%|          | 0/1849 [00:00<?, ?it/s]

epoch 2 took 231.2s to read 118287 files


### serial access

In [8]:
# Two epochs with shuffling off, i.e. images are read in dataset order.
# load_data clears Pond's cache once before the first epoch.
load_data(num_epochs=2, shuffle=False)

dataset size: 118287 images


  0%|          | 0/1849 [00:00<?, ?it/s]

epoch 1 took 226.3s to read 118287 files


  0%|          | 0/1849 [00:00<?, ?it/s]

epoch 2 took 227.3s to read 118287 files
