In [None]:
# 2025/7/22
# zhangzhong
# https://docs.pytorch.org/tutorials/recipes/recipes/amp_recipe.html
# This recipe measures the performance of a simple network in default precision, 
# then walks through adding autocast and GradScaler to run the same network in mixed precision with improved performance.

In [None]:
# Some ops, like linear layers and convolutions, are much faster in float16 or bfloat16. 
# Other ops, like reductions, often require the dynamic range of float32
# Mixed precision tries to match each op to its appropriate datatype, which can reduce your network’s runtime and memory footprint.

In [2]:
import torch
import time
import gc

# Timing utilities
# Wall-clock timestamp written by start_timer(); stays None until the timer is first started.
start_time = None

def start_timer():
    """Reset peak-memory statistics and record the wall-clock start time.

    Garbage is collected and the CUDA caching allocator is emptied first so
    that the peak-memory figure reported later covers only the work done
    after this call. CUDA-specific steps are skipped when no GPU is
    available, so the timer also works with the CPU fallback.
    """
    global start_time
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        # reset_max_memory_allocated() is deprecated; this is its documented replacement.
        torch.cuda.reset_peak_memory_stats()
        # Drain already-queued kernels so they are not billed to the timed region.
        torch.cuda.synchronize()
    start_time = time.time()

def end_timer_and_print(local_msg):
    """Stop the timer started by ``start_timer()`` and print a short report.

    Args:
        local_msg: Label printed on its own line above the measurements.

    Raises:
        RuntimeError: If ``start_timer()`` has not been called yet.
    """
    if start_time is None:
        # Fail with a clear message instead of a TypeError on ``float - None``.
        raise RuntimeError("start_timer() must be called before end_timer_and_print()")
    cuda_available = torch.cuda.is_available()
    if cuda_available:
        # Wait for queued GPU work so it is included in the measured interval.
        torch.cuda.synchronize()
    end_time = time.time()
    # Without a GPU there is no CUDA allocator to query; report zero bytes.
    peak_bytes = torch.cuda.max_memory_allocated() if cuda_available else 0
    print("\n" + local_msg)
    print("Total execution time = {:.3f} sec".format(end_time - start_time))
    print("Max memory used by tensors = {} bytes".format(peak_bytes))

In [4]:
# A simple network
# batch_size, in_size, out_size, and num_layers are chosen to be large enough to saturate the GPU with work.
def make_model(in_size, out_size, num_layers, device="cuda"):
    """Build a stack of ``Linear`` layers separated by ``ReLU`` activations.

    Args:
        in_size: Width of the input and of every hidden layer.
        out_size: Width of the final layer's output.
        num_layers: Number of ``Linear`` layers. Values below 1 still produce
            the single output layer.
        device: Device the finished model is moved to. Defaults to ``"cuda"``,
            which matches the previously hard-coded ``.cuda()`` call.

    Returns:
        A ``torch.nn.Sequential`` placed on ``device``.
    """
    layers = []
    for _ in range(num_layers - 1):
        layers.append(torch.nn.Linear(in_size, in_size))
        layers.append(torch.nn.ReLU())
    layers.append(torch.nn.Linear(in_size, out_size))
    return torch.nn.Sequential(*layers).to(device)

In [8]:
batch_size = 4096 # Try, for example, 128, 256, 513.
in_size = 4096
out_size = 4096
num_layers = 3
num_batches = 50
epochs = 3

device = 'cuda' if torch.cuda.is_available() else 'cpu'
torch.set_default_device(device)

# Fixed seed so the synthetic data is the same after every Restart & Run All.
torch.manual_seed(0)

# Creates data in default precision.
# The same data is used for both default and mixed precision trials below.
# You don't need to manually change inputs' ``dtype`` when enabling mixed precision.
data = [torch.randn(batch_size, in_size) for _ in range(num_batches)]
targets = [torch.randn(batch_size, out_size) for _ in range(num_batches)]

# Follow ``device`` instead of hard-coding CUDA, consistent with the fallback chosen above.
loss_fn = torch.nn.MSELoss().to(device)

In [12]:
# Default precision: float32
# Baseline run: trains a fresh network without autocast or gradient scaling and reports time and peak memory.
net = make_model(in_size, out_size, num_layers)
opt = torch.optim.SGD(net.parameters(), lr=0.001)

start_timer()
for epoch in range(epochs):
    # Named ``inputs`` rather than ``input`` so the builtin ``input()`` is not shadowed in the notebook namespace.
    for inputs, target in zip(data, targets):
        output = net(inputs)
        loss = loss_fn(output, target)
        loss.backward()
        opt.step()
        # set_to_none=True releases the gradient tensors instead of zero-filling them, which can modestly improve performance.
        opt.zero_grad(set_to_none=True)
end_timer_and_print("Default precision:")


Default precision:
Total execution time = 7.738 sec
Max memory used by tensors = 7533332992 bytes


In [10]:
# torch.autocast
# context managers that allow regions of your script to run in mixed precision
# autocast automatically chooses the dtype for each operation.
# https://docs.pytorch.org/docs/stable/amp.html#autocast-op-reference lists which ops run in float32 and which in float16.
# Matrix multiplications, convolutions and similar ops generally run in float16.


for epoch in range(0): # 0 epochs, this section is for illustration only
    # Named ``inputs`` rather than ``input`` so the builtin ``input()`` is not shadowed.
    for inputs, target in zip(data, targets):
        # Runs the forward pass under ``autocast``.
        with torch.autocast(device_type=device, dtype=torch.float16):
            output = net(inputs)
            # output is float16 because linear layers ``autocast`` to float16.
            assert output.dtype is torch.float16

            loss = loss_fn(output, target)
            # loss is float32 because ``mse_loss`` layers ``autocast`` to float32.
            assert loss.dtype is torch.float32

        # Exits ``autocast`` before backward().
        # Backward passes under ``autocast`` are not recommended.
        # Backward ops run in the same ``dtype`` ``autocast`` chose for corresponding forward ops.
        loss.backward()
        opt.step()
        # set_to_none=True releases the gradient tensors instead of zero-filling them, which can modestly improve performance.
        opt.zero_grad(set_to_none=True)

In [None]:
# Adding GradScaler
# Gradient scaling helps prevent gradients with small magnitudes from flushing to zero (“underflowing”) when training with mixed precision.

# Constructs a ``scaler`` once, at the beginning of the convergence run, using default arguments.
# If your network fails to converge with default ``GradScaler`` arguments, please file an issue.
# The same ``GradScaler`` instance should be used for the entire convergence run.
# If you perform multiple convergence runs in the same script, each run should use
# a dedicated fresh ``GradScaler`` instance. ``GradScaler`` instances are lightweight.
# The scaler keeps state because the scale factor is adjusted dynamically; see
# https://docs.nvidia.com/deeplearning/performance/mixed-precision-training/index.html#faq-general__section_on4_qgf_djb
scaler = torch.amp.GradScaler("cuda")

for epoch in range(0): # 0 epochs, this section is for illustration only
    # Named ``inputs`` rather than ``input`` so the builtin ``input()`` is not shadowed.
    for inputs, target in zip(data, targets):
        with torch.autocast(device_type=device, dtype=torch.float16):
            output = net(inputs)
            loss = loss_fn(output, target)

        # Scales loss. Calls ``backward()`` on scaled loss to create scaled gradients.
        scaler.scale(loss).backward()

        # ``scaler.step()`` first unscales the gradients of the optimizer's assigned parameters.
        # If these gradients do not contain ``inf``s or ``NaN``s, optimizer.step() is then called,
        # otherwise, optimizer.step() is skipped.
        scaler.step(opt)

        # Updates the scale for next iteration.
        # The scaler has to refresh its scale factor every step because the factor is adjusted dynamically.
        scaler.update()

        # set_to_none=True releases the gradient tensors instead of zero-filling them, which can modestly improve performance.
        opt.zero_grad(set_to_none=True)

In [None]:
# All together: AMP
# Trains a fresh network with autocast plus gradient scaling and reports time and peak memory.
# Setting use_amp = False disables both autocast and the scaler, so the same loop runs in default precision.
use_amp = True

net = make_model(in_size, out_size, num_layers)
opt = torch.optim.SGD(net.parameters(), lr=0.001)
scaler = torch.amp.GradScaler("cuda", enabled=use_amp)

start_timer()
for epoch in range(epochs):
    # Named ``inputs`` rather than ``input`` so the builtin ``input()`` is not shadowed.
    for inputs, target in zip(data, targets):
        with torch.autocast(device_type=device, dtype=torch.float16, enabled=use_amp):
            output = net(inputs)
            loss = loss_fn(output, target)
        scaler.scale(loss).backward()

        # # Unscales the gradients of optimizer's assigned parameters in-place
        # scaler.unscale_(opt)

        # # Since the gradients of optimizer's assigned parameters are now unscaled, clips as usual.
        # # You may use the same value for max_norm here as you would without gradient scaling.
        # torch.nn.utils.clip_grad_norm_(net.parameters(), max_norm=0.1)

        scaler.step(opt) # optimizer.step() is called inside this
        scaler.update()
        # set_to_none=True releases the gradient tensors instead of zero-filling them, which can modestly improve performance.
        opt.zero_grad(set_to_none=True)
end_timer_and_print("Mixed precision:")




Mixed precision:
Total execution time = 3.325 sec
Max memory used by tensors = 7634019840 bytes


In [None]:
# Saving/Resuming

# checkpoint = {"model": net.state_dict(),
#               "optimizer": opt.state_dict(),
#               "scaler": scaler.state_dict()}
# # Write checkpoint as desired, e.g.,
# # torch.save(checkpoint, "filename")

# # read
# dev = torch.cuda.current_device()
# # Load directly onto the GPU
# checkpoint = torch.load("filename",
#                         map_location = lambda storage, loc: storage.cuda(dev))
# net.load_state_dict(checkpoint["model"])
# opt.load_state_dict(checkpoint["optimizer"])
# scaler.load_state_dict(checkpoint["scaler"])

In [None]:
# Inference/Evaluation
# autocast may be used by itself to wrap inference or evaluation forward passes.
# GradScaler is not necessary.

In [None]:
# combine AMP with DDP
# https://docs.pytorch.org/docs/stable/notes/amp_examples.html