## Measuring Performance

- [A100 Specifications](https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a100/pdf/nvidia-a100-datasheet-us-nvidia-1758950-r4-web.pdf)
- [PyTorch Profiler](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html)
- [Using CUDA events to measure time](https://discuss.pytorch.org/t/how-to-measure-time-in-pytorch/26964/2)

## Baseline

`BASELINE_TFLOPS_PER_SEC = 2.096`

In [1]:
from dataclasses import asdict

import torch
import torch.nn.functional as F
import torchvision.transforms as T
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR10

from ddpm import DDPM, ModelConfig, TrainerConfig
from model import UNet

cfg_m = ModelConfig()
cfg_t = TrainerConfig(bs=128, nw=16)

img2tensor = T.Compose([
    T.RandomHorizontalFlip(),
    T.ToTensor(),
    T.Normalize(mean=0.5, std=0.5)  # [0, 1] -> [-1, 1]
])
ds = CIFAR10('./cifar10', train=True, transform=img2tensor, download=True)
dataloader = DataLoader(ds, batch_size=cfg_t.bs, num_workers=cfg_t.nw, drop_last=True)

ddpm = DDPM(**asdict(cfg_m)).to('cuda')
optimizer = torch.optim.AdamW(ddpm.parameters(), lr=cfg_t.lr)

x0, _ = next(iter(dataloader))
x0 = x0.to(cfg_t.device)
eps = torch.randn_like(x0)
t = torch.randint(0, cfg_m.nT, [cfg_t.bs], device=x0.device)



Files already downloaded and verified


In [2]:
from torchinfo import summary
summary(ddpm, input_data=[x0, eps, t])

Layer (type:depth-idx)                        Output Shape              Param #
DDPM                                          [128, 3, 32, 32]          --
├─UNet: 1-1                                   [128, 3, 32, 32]          --
│    └─Sequential: 2-1                        [128, 128]                --
│    │    └─Linear: 3-1                       [128, 128]                4,224
│    │    └─GELU: 3-2                         [128, 128]                --
│    │    └─Linear: 3-3                       [128, 128]                16,512
│    └─Conv2d: 2-2                            [128, 32, 32, 32]         128
│    └─UNetDownsample: 2-3                    [128, 32, 16, 16]         --
│    │    └─TimeResNetBlock: 3-4              [128, 32, 32, 32]         26,880
│    │    └─TimeResNetBlock: 3-5              [128, 32, 32, 32]         26,880
│    │    └─GroupNorm: 3-6                    [128, 32, 32, 32]         64
│    │    └─Attention: 3-7                    [128, 32, 32, 32]         16,416


In [8]:
BASELINE_TFLOPS = (61.22 * 1e-3) * 2 * 3
BASELINE_TFLOPS

0.36732000000000004

In [5]:
import numpy as np

n_steps = 16
starts = [torch.cuda.Event(enable_timing=True) for _ in range(n_steps)]
ends = [torch.cuda.Event(enable_timing=True) for _ in range(n_steps)]

ddpm.train()

for i, (x0, _) in zip(range(n_steps), dataloader):
    starts[i].record()

    optimizer.zero_grad()
    
    x0 = x0.to('cuda')
    eps = torch.randn_like(x0)
    t = torch.randint(0, cfg_m.nT, [cfg_t.bs], device=x0.device)
    
    eps_pred = ddpm(x0, eps, t)
    loss = F.smooth_l1_loss(eps, eps_pred)
    loss.backward()
    optimizer.step()

    ends[i].record()

torch.cuda.synchronize()
t = np.mean([s.elapsed_time(e) for s, e in zip(starts, ends)]) / 1000.0

In [9]:
BASELINE_TFLOPS_PER_SEC = BASELINE_TFLOPS / t
BASELINE_TFLOPS_PER_SEC

2.096144992132496

## Optimization 1 - Scale Up Batch Size

`OPT1_TFLOPS_PER_SEC = 2.404`  
1.15x speedup

In [1]:
BASELINE_TFLOPS_PER_SEC = 2.096

In [2]:
from dataclasses import asdict

import torch
import torch.nn.functional as F
import torchvision.transforms as T
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR10

from ddpm import DDPM, ModelConfig, TrainerConfig
from model import UNet

cfg_m = ModelConfig()
cfg_t = TrainerConfig(bs=320, nw=16)

img2tensor = T.Compose([
    T.RandomHorizontalFlip(),
    T.ToTensor(),
    T.Normalize(mean=0.5, std=0.5)  # [0, 1] -> [-1, 1]
])
ds = CIFAR10('./cifar10', train=True, transform=img2tensor, download=True)
dataloader = DataLoader(ds, batch_size=cfg_t.bs, num_workers=cfg_t.nw, drop_last=True)

ddpm = DDPM(**asdict(cfg_m)).to(cfg_t.device)
optimizer = torch.optim.AdamW(ddpm.parameters(), lr=cfg_t.lr)



Files already downloaded and verified


In [3]:
from torchinfo import summary

x0, _ = next(iter(dataloader))
x0 = x0.to(cfg_t.device)
eps = torch.randn_like(x0)
t = torch.randint(0, cfg_m.nT, [cfg_t.bs], device=x0.device)
summary(ddpm, input_data=[x0, eps, t])

Layer (type:depth-idx)                        Output Shape              Param #
DDPM                                          [320, 3, 32, 32]          --
├─UNet: 1-1                                   [320, 3, 32, 32]          --
│    └─Sequential: 2-1                        [320, 128]                --
│    │    └─Linear: 3-1                       [320, 128]                4,224
│    │    └─GELU: 3-2                         [320, 128]                --
│    │    └─Linear: 3-3                       [320, 128]                16,512
│    └─Conv2d: 2-2                            [320, 32, 32, 32]         128
│    └─UNetDownsample: 2-3                    [320, 32, 16, 16]         --
│    │    └─TimeResNetBlock: 3-4              [320, 32, 32, 32]         26,880
│    │    └─TimeResNetBlock: 3-5              [320, 32, 32, 32]         26,880
│    │    └─GroupNorm: 3-6                    [320, 32, 32, 32]         64
│    │    └─Attention: 3-7                    [320, 32, 32, 32]         16,416


In [4]:
OPT1_TFLOPS = (153.04 * 1e-3) * 2 * 3
OPT1_TFLOPS

0.91824

In [5]:
import numpy as np

n_steps = 16
starts = [torch.cuda.Event(enable_timing=True) for _ in range(n_steps)]
ends = [torch.cuda.Event(enable_timing=True) for _ in range(n_steps)]

ddpm.train()

for i, (x0, _) in zip(range(n_steps), dataloader):
    starts[i].record()

    optimizer.zero_grad()
    
    x0 = x0.to(cfg_t.device)
    eps = torch.randn_like(x0)
    t = torch.randint(0, cfg_m.nT, [cfg_t.bs], device=x0.device)
    
    eps_pred = ddpm(x0, eps, t)
    loss = F.smooth_l1_loss(eps, eps_pred)
    loss.backward()
    optimizer.step()

    ends[i].record()

torch.cuda.synchronize()
t = np.mean([s.elapsed_time(e) for s, e in zip(starts, ends)]) / 1000.0

In [6]:
OPT1_TFLOPS_PER_SEC = OPT1_TFLOPS / t
OPT1_TFLOPS_PER_SEC, OPT1_TFLOPS_PER_SEC / BASELINE_TFLOPS_PER_SEC

(2.4038481782367884, 1.1468741308381625)

## Optimization 2 - `torch.compile`

`OPT2_TFLOPS_PER_SEC = 3.24`  
1.54x speedup

In [1]:
BASELINE_TFLOPS_PER_SEC = 2.096

In [2]:
from dataclasses import asdict

import torch
import torch.nn.functional as F
import torchvision.transforms as T
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR10

from ddpm import DDPM, ModelConfig, TrainerConfig
from model import UNet

cfg_m = ModelConfig()
cfg_t = TrainerConfig(bs=320, nw=16)

img2tensor = T.Compose([
    T.RandomHorizontalFlip(),
    T.ToTensor(),
    T.Normalize(mean=0.5, std=0.5)  # [0, 1] -> [-1, 1]
])
ds = CIFAR10('./cifar10', train=True, transform=img2tensor, download=True)
dataloader = DataLoader(ds, batch_size=cfg_t.bs, num_workers=cfg_t.nw, drop_last=True)

ddpm = torch.compile(DDPM(**asdict(cfg_m)).to(cfg_t.device))
optimizer = torch.optim.AdamW(ddpm.parameters(), lr=cfg_t.lr)



Files already downloaded and verified


In [4]:
OPT2_TFLOPS = (153.04 * 1e-3) * 2 * 3
OPT2_TFLOPS

0.91824

In [7]:
# Run the cell twice for torch.compile

import numpy as np

n_steps = 16
starts = [torch.cuda.Event(enable_timing=True) for _ in range(n_steps)]
ends = [torch.cuda.Event(enable_timing=True) for _ in range(n_steps)]

ddpm.train()

for i, (x0, _) in zip(range(n_steps), dataloader):
    starts[i].record()

    optimizer.zero_grad()
    
    x0 = x0.to(cfg_t.device)
    eps = torch.randn_like(x0)
    t = torch.randint(0, cfg_m.nT, [cfg_t.bs], device=x0.device)
    
    eps_pred = ddpm(x0, eps, t)
    loss = F.smooth_l1_loss(eps, eps_pred)
    loss.backward()
    optimizer.step()

    ends[i].record()

torch.cuda.synchronize()
t = np.mean([s.elapsed_time(e) for s, e in zip(starts, ends)]) / 1000.0

In [8]:
OPT2_TFLOPS_PER_SEC = OPT2_TFLOPS / t
OPT2_TFLOPS_PER_SEC, OPT2_TFLOPS_PER_SEC / BASELINE_TFLOPS_PER_SEC

(3.2378172649358796, 1.5447601454846753)

## Optimization 3 - Use PyTorch SDPA

Using Flash Attention, we significantly lower our memory usage, which allow us to scale the batch size to 2048.

- [PyTorch SDPA doc](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)
- [NanoGPT Usage](https://github.com/karpathy/nanoGPT/blob/master/model.py)

`OPT3_TFLOPS_PER_SEC = 6.06`  
2.89x speedup

In [1]:
BASELINE_TFLOPS_PER_SEC = 2.096

In [2]:
from dataclasses import asdict

import torch
import torch.nn.functional as F
import torchvision.transforms as T
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR10

from ddpm import DDPM, ModelConfig, TrainerConfig
from model import UNet

cfg_m = ModelConfig()
cfg_t = TrainerConfig(bs=2048, nw=16)

img2tensor = T.Compose([
    T.RandomHorizontalFlip(),
    T.ToTensor(),
    T.Normalize(mean=0.5, std=0.5)  # [0, 1] -> [-1, 1]
])
ds = CIFAR10('./cifar10', train=True, transform=img2tensor, download=True)
dataloader = DataLoader(ds, batch_size=cfg_t.bs, num_workers=cfg_t.nw, drop_last=True)

ddpm = torch.compile(DDPM(**asdict(cfg_m)).to(cfg_t.device))
optimizer = torch.optim.AdamW(ddpm.parameters(), lr=cfg_t.lr)



Files already downloaded and verified


In [3]:
from torchinfo import summary

x0, _ = next(iter(dataloader))
x0 = x0.to(cfg_t.device)
eps = torch.randn_like(x0)
t = torch.randint(0, cfg_m.nT, [cfg_t.bs], device=x0.device)
summary(DDPM(**asdict(cfg_m)).to(cfg_t.device), input_data=[x0, eps, t])

Layer (type:depth-idx)                        Output Shape              Param #
DDPM                                          [2048, 3, 32, 32]         --
├─UNet: 1-1                                   [2048, 3, 32, 32]         --
│    └─Sequential: 2-1                        [2048, 128]               --
│    │    └─Linear: 3-1                       [2048, 128]               4,224
│    │    └─GELU: 3-2                         [2048, 128]               --
│    │    └─Linear: 3-3                       [2048, 128]               16,512
│    └─Conv2d: 2-2                            [2048, 32, 32, 32]        128
│    └─UNetDownsample: 2-3                    [2048, 32, 16, 16]        --
│    │    └─TimeResNetBlock: 3-4              [2048, 32, 32, 32]        26,880
│    │    └─TimeResNetBlock: 3-5              [2048, 32, 32, 32]        26,880
│    │    └─GroupNorm: 3-6                    [2048, 32, 32, 32]        64
│    │    └─Attention: 3-7                    [2048, 32, 32, 32]        16,416


In [3]:
OPT3_TFLOPS = (979.46 * 1e-3) * 2 * 3
OPT3_TFLOPS

5.876760000000001

In [6]:
# Run the cell twice for torch.compile
import numpy as np

n_steps = 16
starts = [torch.cuda.Event(enable_timing=True) for _ in range(n_steps)]
ends = [torch.cuda.Event(enable_timing=True) for _ in range(n_steps)]

ddpm.train()

for i, (x0, _) in zip(range(n_steps), dataloader):
    starts[i].record()

    optimizer.zero_grad()
    
    x0 = x0.to(cfg_t.device)
    eps = torch.randn_like(x0)
    t = torch.randint(0, cfg_m.nT, [cfg_t.bs], device=x0.device)
    
    eps_pred = ddpm(x0, eps, t)
    loss = F.smooth_l1_loss(eps, eps_pred)
    loss.backward()
    optimizer.step()

    ends[i].record()

torch.cuda.synchronize()
t = np.mean([s.elapsed_time(e) for s, e in zip(starts, ends)]) / 1000.0

In [7]:
OPT3_TFLOPS_PER_SEC = OPT3_TFLOPS / t
OPT3_TFLOPS_PER_SEC, OPT3_TFLOPS_PER_SEC / BASELINE_TFLOPS_PER_SEC

(6.0628220806029995, 2.8925677865472323)

## Optimization 4 - Mixed-Precision Training

Lowering to 16-bit precision lowers the memory requirements, so we further increase the batch size to 3072.

- [Tutorial 1](https://pytorch.org/blog/what-every-user-should-know-about-mixed-precision-training-in-pytorch/)
- [Tutorial 2](https://pytorch.org/blog/accelerating-training-on-nvidia-gpus-with-pytorch-automatic-mixed-precision/)

`OPT4_TFLOPS_PER_SEC = 16.47`  
7.86x speedup

In [1]:
BASELINE_TFLOPS_PER_SEC = 2.096

In [2]:
from dataclasses import asdict

import torch
import torch.nn.functional as F
import torchvision.transforms as T
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR10

from ddpm import DDPM, ModelConfig, TrainerConfig
from model import UNet

cfg_m = ModelConfig()
cfg_t = TrainerConfig(bs=3072, nw=16)

img2tensor = T.Compose([
    T.RandomHorizontalFlip(),
    T.ToTensor(),
    T.Normalize(mean=0.5, std=0.5)  # [0, 1] -> [-1, 1]
])
ds = CIFAR10('./cifar10', train=True, transform=img2tensor, download=True)
dataloader = DataLoader(ds, batch_size=cfg_t.bs, num_workers=cfg_t.nw, drop_last=True)

ddpm = torch.compile(DDPM(**asdict(cfg_m)).to(cfg_t.device))
optimizer = torch.optim.AdamW(ddpm.parameters(), lr=cfg_t.lr)
scaler = torch.cuda.amp.GradScaler()



Files already downloaded and verified


In [3]:
from torchinfo import summary

x0, _ = next(iter(dataloader))
x0 = x0.to(cfg_t.device)
eps = torch.randn_like(x0)
t = torch.randint(0, cfg_m.nT, [cfg_t.bs], device=x0.device)
summary(DDPM(**asdict(cfg_m)).to(cfg_t.device), input_data=[x0, eps, t])

Layer (type:depth-idx)                        Output Shape              Param #
DDPM                                          [3072, 3, 32, 32]         --
├─UNet: 1-1                                   [3072, 3, 32, 32]         --
│    └─Sequential: 2-1                        [3072, 128]               --
│    │    └─Linear: 3-1                       [3072, 128]               4,224
│    │    └─GELU: 3-2                         [3072, 128]               --
│    │    └─Linear: 3-3                       [3072, 128]               16,512
│    └─Conv2d: 2-2                            [3072, 32, 32, 32]        128
│    └─UNetDownsample: 2-3                    [3072, 32, 16, 16]        --
│    │    └─TimeResNetBlock: 3-4              [3072, 32, 32, 32]        26,880
│    │    └─TimeResNetBlock: 3-5              [3072, 32, 32, 32]        26,880
│    │    └─GroupNorm: 3-6                    [3072, 32, 32, 32]        64
│    │    └─Attention: 3-7                    [3072, 32, 32, 32]        16,416


In [3]:
OPT4_TFLOPS = 1.47 * 2 * 3
OPT4_TFLOPS

8.82

In [5]:
# Run the cell twice for torch.compile
import numpy as np

n_steps = 16
starts = [torch.cuda.Event(enable_timing=True) for _ in range(n_steps)]
ends = [torch.cuda.Event(enable_timing=True) for _ in range(n_steps)]

ddpm.train()

for i, (x0, _) in zip(range(n_steps), dataloader):
    starts[i].record()

    optimizer.zero_grad()

    x0 = x0.to(cfg_t.device)
    eps = torch.randn_like(x0)
    t = torch.randint(0, cfg_m.nT, [cfg_t.bs], device=x0.device)

    with torch.cuda.amp.autocast():
        eps_pred = ddpm(x0, eps, t)
        loss = F.smooth_l1_loss(eps, eps_pred)

    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()

    ends[i].record()

torch.cuda.synchronize()
t = np.mean([s.elapsed_time(e) for s, e in zip(starts, ends)]) / 1000.0

In [6]:
OPT4_TFLOPS_PER_SEC = OPT4_TFLOPS / t
OPT4_TFLOPS_PER_SEC, OPT4_TFLOPS_PER_SEC / BASELINE_TFLOPS_PER_SEC

(16.46762723679398, 7.856692383966593)

## Optimization 5 - Use FFCV for Data Loading

- [FFCV GitHub Repo](https://github.com/libffcv/ffcv)
- [MosaicML Example Notebook](https://colab.research.google.com/github/mosaicml/composer/blob/75dabff3f5715f02bfc32cc23c557ba4042c462d/examples/ffcv_dataloaders.ipynb)
- [FFCV API Reference](https://docs.ffcv.io/api_reference.html)

`OPT5_TFLOPS_PER_SEC = 16.70`  
7.97x speedup

In [1]:
BASELINE_TFLOPS_PER_SEC = 2.096

In [2]:
from dataclasses import asdict

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR10

from ddpm import DDPM, ModelConfig, TrainerConfig
from model import UNet

cfg_m = ModelConfig()
cfg_t = TrainerConfig(bs=3072, nw=16)

ddpm = torch.compile(DDPM(**asdict(cfg_m)).to(cfg_t.device))
optimizer = torch.optim.AdamW(ddpm.parameters(), lr=cfg_t.lr)
scaler = torch.cuda.amp.GradScaler()



In [3]:
from pathlib import Path

import numpy as np
import ffcv.transforms as T
from ffcv.fields import RGBImageField
from ffcv.fields.decoders import SimpleRGBImageDecoder
from ffcv.loader import Loader, OrderOption
from ffcv.writer import DatasetWriter

if not Path('./cifar10.beton').exists():
    ds = CIFAR10('./cifar10', train=True, download=True)
    writer = DatasetWriter('./cifar10.beton', {'image': RGBImageField(max_resolution=32)})
    writer.from_indexed_dataset(ds)

img_tsfms = [
    SimpleRGBImageDecoder(),
    T.RandomHorizontalFlip(),
    T.ToTensor(),
    T.ToDevice(torch.device(cfg_t.device)),
    T.ToTorchImage(),
    T.NormalizeImage(  # [0, 255] -> [-1, 1]
        mean=np.array([127.5, 127.5, 127.5]),
        std=np.array([127.5, 127.5, 127.5]),
        type=np.float32
    )
]
dataloader = Loader(
    './cifar10.beton', batch_size=cfg_t.bs, num_workers=cfg_t.nw, drop_last=True, os_cache=True,
    order=OrderOption.RANDOM, pipelines={'image': img_tsfms}
)

In [4]:
OPT5_TFLOPS = 1.47 * 2 * 3
OPT5_TFLOPS

8.82

In [7]:
# Run the cell twice for torch.compile
import numpy as np

n_steps = 16
starts = [torch.cuda.Event(enable_timing=True) for _ in range(n_steps)]
ends = [torch.cuda.Event(enable_timing=True) for _ in range(n_steps)]

ddpm.train()

for i, (x0, ) in zip(range(n_steps), dataloader):
    starts[i].record()

    optimizer.zero_grad()

    x0 = x0.to(cfg_t.device)
    eps = torch.randn_like(x0)
    t = torch.randint(0, cfg_m.nT, [cfg_t.bs], device=x0.device)

    with torch.cuda.amp.autocast():
        eps_pred = ddpm(x0, eps, t)
        loss = F.smooth_l1_loss(eps, eps_pred)

    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()

    ends[i].record()

torch.cuda.synchronize()
t = np.mean([s.elapsed_time(e) for s, e in zip(starts, ends)]) / 1000.0

In [8]:
OPT5_TFLOPS_PER_SEC = OPT5_TFLOPS / t
OPT5_TFLOPS_PER_SEC, OPT5_TFLOPS_PER_SEC / BASELINE_TFLOPS_PER_SEC

(16.70376585232557, 7.96935393717823)