In [None]:
#|default_exp data_utils 

# Datasets and General Data Utilities

For many tasks in machine learning, the actual model training is the easy bit! Many data scientists spend most of their time sourcing and exploring data, and getting it into the right format ready for modelling. Lucky for us, most of the data we'll use for demos during this course has already been collected and organised for us, and to make things even more convenient during the lessons themselves we're going to lay some additional groundwork here in this notebook. 

Planned topics for this notebook (TODO):

- Motivate dataloaders
- Batching
- The advantage of pre-fetching the next batch
- Monitoring GPU usage and watching for CPU bottlenecks in the dataloaders
- A dive into PyTorch dataloaders
- The Hugging Face Hub and the `datasets` library

https://huggingface.co/docs/datasets/quickstart


https://huggingface.co/docs/datasets/stream

DATA UTILS 

In [None]:
#|export
import torch
import datasets
from tglcourse.utils import *
from datasets import load_dataset
from torchvision import transforms as T
from torch.utils.data import Dataset, DataLoader

### TODO redo this and integrate into notebooks

In [None]:
#|export

def mnist_transform(example):
    """Convert the images in a batch to tensors.

    Written to be passed to `Dataset.with_transform`, as `get_mnist_dl` does.

    Args:
        example: batch dict whose "image" entry is an iterable of images
            accepted by `T.ToTensor` (presumably PIL images — confirm
            against the dataset's features).

    Returns:
        The same dict object, with "image" replaced in place by a list of
        tensors. All other keys are left untouched.
    """
    # Build the converter once per batch instead of once per image.
    to_tensor = T.ToTensor()
    example["image"] = [to_tensor(image) for image in example["image"]]
    return example

def get_mnist_dl(batch_size=32, split='train'):
    """Return a DataLoader over the Hugging Face `mnist` dataset.

    The dataset is loaded with `load_dataset` (no streaming argument is
    passed) and `mnist_transform` is attached with `with_transform`, so
    images are converted to tensors when examples are fetched. Apart from
    `batch_size`, the DataLoader is left at its defaults.

    Args:
        batch_size: number of examples per batch.
        split: dataset split forwarded to `load_dataset`.

    Returns:
        A `torch.utils.data.DataLoader` over the transformed dataset.
    """
    tensor_ds = load_dataset('mnist', split=split).with_transform(mnist_transform)
    return DataLoader(tensor_ds, batch_size=batch_size)

In [None]:
#|export
# Imagewoof preprocessing: to tensor, resize with size 320, centre-crop to 320.
# Bound to its own name rather than the generic `tfm`, because `tfm` is
# reassigned further down this notebook. `imagewoof_transform` resolves the
# name at call time, so sharing it made Imagewoof silently use whichever
# pipeline was assigned last.
imagewoof_tfm = T.Compose([T.ToTensor(), T.Resize(320), T.CenterCrop(320)])
def imagewoof_transform(example):
    """Apply the Imagewoof preprocessing pipeline to a batch of images.

    Written to be passed to `Dataset.with_transform`.

    Args:
        example: batch dict whose "image" entry is an iterable of objects
            with a `.convert('RGB')` method (presumably PIL images).

    Returns:
        The same dict object, with "image" replaced in place by a list of
        tensors produced by `imagewoof_tfm`. Other keys are untouched.
    """
    # Each image is converted to RGB before the tensor pipeline runs.
    example["image"] = [imagewoof_tfm(image.convert('RGB')) for image in example["image"]]
    return example
def get_imagewoof_dl(batch_size=32, split='train'):
    """Return a DataLoader over the `johnowhitaker/imagewoof2-320` dataset.

    The requested split is shuffled once with a fixed seed (42), then
    `imagewoof_transform` is attached with `with_transform`. Apart from
    `batch_size`, the DataLoader is left at its defaults.

    Args:
        batch_size: number of examples per batch.
        split: dataset split forwarded to `load_dataset`. Defaults to
            'train', which was previously hard-coded. Which other splits
            exist depends on the Hub dataset — verify before relying on one.

    Returns:
        A `torch.utils.data.DataLoader` over the shuffled, transformed dataset.
    """
    dataset = load_dataset('johnowhitaker/imagewoof2-320', split=split).shuffle(seed=42)
    dataset = dataset.with_transform(imagewoof_transform)
    return DataLoader(dataset, batch_size=batch_size)

In [None]:
#|export
# CIFAR-10 preprocessing: to tensor, resize with size 32, centre-crop to 32.
# Bound to a dataset-specific name so that `cifar10_transform` does not
# depend on the generic global `tfm`, which any other cell can rebind.
cifar10_tfm = T.Compose([T.ToTensor(), T.Resize(32), T.CenterCrop(32)])
# Alias kept for backward compatibility: this cell has always left `tfm`
# bound to the CIFAR-10 pipeline. New code should use `cifar10_tfm`.
tfm = cifar10_tfm
def cifar10_transform(example):
    """Apply the CIFAR-10 preprocessing pipeline to a batch of images.

    Written to be passed to `Dataset.with_transform`.

    Args:
        example: batch dict whose "image" entry is an iterable of objects
            with a `.convert('RGB')` method (presumably PIL images).

    Returns:
        The same dict object, with "image" replaced in place by a list of
        tensors produced by `cifar10_tfm`. Other keys are untouched.
    """
    # Each image is converted to RGB before the tensor pipeline runs.
    example["image"] = [cifar10_tfm(image.convert('RGB')) for image in example["image"]]
    return example
def get_cifar10_dl(batch_size=32, split='train'):
    """Return a DataLoader over the Hugging Face `cifar10` dataset.

    The split is shuffled once with a fixed seed (42), its "img" column is
    renamed to "image", and `cifar10_transform` is attached with
    `with_transform`. Apart from `batch_size`, the DataLoader is left at
    its defaults.

    Args:
        batch_size: number of examples per batch.
        split: dataset split forwarded to `load_dataset`.

    Returns:
        A `torch.utils.data.DataLoader` over the shuffled, transformed dataset.
    """
    shuffled = load_dataset('cifar10', split=split).shuffle(seed=42)
    # `cifar10_transform` reads the "image" key, so rename the column first.
    renamed = shuffled.rename_column("img", "image")
    return DataLoader(renamed.with_transform(cifar10_transform), batch_size=batch_size)

In [None]:
# Smoke test: build the CIFAR-10 training loader and pull a single batch.
dataloader = get_cifar10_dl(batch_size=128, split='train')
batch = next(iter(dataloader))
# Cell output: the shape of the image batch and the label tensor.
batch['image'].shape, batch['label']

Reusing dataset cifar10 (/root/.cache/huggingface/datasets/cifar10/plain_text/1.0.0/447d6ec4733dddd1ce3bb577c7166b986eaa4c538dcd9e805ba61f35674a9de4)
Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/cifar10/plain_text/1.0.0/447d6ec4733dddd1ce3bb577c7166b986eaa4c538dcd9e805ba61f35674a9de4/cache-16b9e105e7ead8c5.arrow


(torch.Size([128, 3, 32, 32]),
 tensor([1, 2, 6, 7, 9, 4, 7, 6, 4, 2, 2, 0, 4, 8, 4, 2, 5, 7, 2, 9, 9, 8, 8, 1,
         4, 3, 7, 3, 5, 6, 9, 3, 6, 4, 3, 4, 7, 9, 3, 3, 0, 6, 4, 3, 5, 1, 9, 6,
         2, 2, 1, 0, 6, 7, 4, 3, 1, 4, 4, 2, 2, 5, 4, 5, 7, 0, 3, 0, 8, 4, 5, 7,
         9, 0, 9, 9, 9, 4, 8, 3, 3, 6, 5, 5, 3, 2, 8, 1, 4, 3, 4, 2, 7, 8, 2, 0,
         9, 6, 8, 7, 4, 3, 2, 0, 2, 0, 3, 2, 4, 9, 2, 5, 9, 6, 0, 6, 0, 7, 2, 2,
         1, 7, 5, 9, 6, 8, 6, 4]))