In [1]:
# Import PyTorch and show the installed version as the cell output, so readers
# can see which build the notebook is running against.
import torch
torch.__version__

'1.13.0.dev20220706+cu113'

##### Usual advice to maximize throughput is to crank your batch size up until you hit OOM, then back off just a touch... and now you have maximized throughput.

##### This is not optimal - you need to back your batch size down not only below the OOM point, but also below the point where you hit any cudaMalloc retries!

In [None]:
# let's take a 2B ViT model - running FSDP Zero2 sharding, and batch size = 4 to start:

# 5.55 seconds, 4 images 
# 69% of GPU memory is being used

![Throughput starting point](images/tutorial_base_2B_bs4.png)

In [None]:
# let's go until we hit OOM:
# BS= 24

![OOM](images/cuda_oom.png)

In [5]:
# let's back off until first point we don't get OOM - this would be considered 'optimal' by conventional measures:
# bs=17


![Batch size 17](images/tutorial_bs_17_retries9.png)

In [6]:
# Throughput is measured in images per second: batch size / elapsed seconds.
# Starting point: 4 images in 5.5594 seconds.
original_img_per_second = 4 / 5.5594
# Batch size 17 (just below OOM): 17 images in 7.2645 seconds.
conventional_best_ips = 17 / 7.2645

# Relative speed-up of the 'just below OOM' setting over the starting point.
speedup = round(conventional_best_ips / original_img_per_second, 4)
print(f"conventional optimization improvement = {speedup}x")

conventional optimization improvement = 3.2525x


In [7]:
# can we do better?  Yes - we need to optimize by ensuring no cudaMalloc retries!

![no retries with batch size = 16](images/tutorial_bs%3D16.png)

Let's compare throughput in images per second:

![Throughput comparison (+25%)](images/tutorial_max_throughput_25%25.png)

By tuning the batch size to avoid cudaMalloc retries, rather than simply sitting 'just below OOM', we've improved throughput by 25%!

In [None]:
# 'by hand' way to monitor gpu memory to avoid cudaMalloc retries...
# memory_summary() returns a multi-line string; print it so the table renders
# with real line breaks instead of an escaped one-line repr.
print(torch.cuda.memory_summary())
            

![cuda report](images/cuda_report_9retries.png)

In [None]:
# Programmatic route: read the allocator counters through the PyTorch CUDA API.
allocator_stats = torch.cuda.memory_stats()

# Each counter falls back to 0 when its key is absent from the stats mapping.
num_retries, cuda_ooms = (
    allocator_stats.get(key, 0) for key in ("num_alloc_retries", "num_ooms")
)

# One print call, two lines; the trailing "\n" keeps the blank line after the report.
print(f"cudaMalloc retries = {num_retries}", f"cuda OOM = {cuda_ooms}\n", sep="\n")

In [None]:
# with utility class Memory_Maximizer:
# NOTE: this cell sketches where the calls go inside a distributed training
# script; `local_rank`, `loss` and `optimizer` are expected to come from that script.
from gpu_memory import Memory_Maximizer

# setup memory tracking for perf
# Guard on local_rank -- the same condition every later memmax call uses -- so the
# tracker always exists in any process that goes on to call it.
if local_rank == 0:
    memmax = Memory_Maximizer()

# memory and timing tracking
if local_rank == 0:
    memmax.start()  # start will reset all tracking points

# in training loop - at minibatch or epoch end point:
loss.backward()
optimizer.step()

# update durations and memory tracking
if local_rank == 0:
    memmax.update()

# at end of training - stop and print stats
# memory summary
if local_rank == 0:
    memmax.stop()  # stop and display info

![no retries with batch size = 16](images/tutorial_bs%3D16.png)

In [8]:
# Summary - throughput for FSDP training is optimized by tuning batch size 
# to maximize GPU memory but *without* hitting cudaMalloc retries!