In [1]:
import torch
import torch.nn as nn

# Toy workload for the profiling cells below: a single 1000 -> 1000 linear layer
# followed by ReLU, constructed first and then moved to the GPU.
model = nn.Sequential(
    nn.Linear(in_features=1000, out_features=1000),
    nn.ReLU(),
)
model = model.cuda()

# One batch of 128 random rows with 1000 features each, sampled first and then
# moved to the GPU.
x = torch.randn(128, 1000)
x = x.cuda()

In [20]:
from torch.profiler import profile, record_function, ProfilerActivity

# Drop gradients left over from a previous run of this cell. Without this,
# re-running the cell makes backward() accumulate into the existing .grad
# tensors instead of allocating them, so the recorded memory profile would
# differ from a fresh-kernel run.
model.zero_grad(set_to_none=True)

# Profile one forward + backward pass on both CPU and CUDA.
with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    record_shapes=True,           # record operator input shapes
    profile_memory=True,          # record CPU and GPU memory (original note: needs PyTorch >= 1.10)
    with_stack=True,              # record call stacks to help locate the originating code
) as prof:
    # Everything inside this block is grouped under the "model_inference" label.
    with record_function("model_inference"):
        output = model(x)
        loss = output.sum()
        loss.backward()

# Print the aggregated per-operator table (top 10 rows, sorted by total CUDA time).
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))

# Export the raw trace as Chrome trace JSON (viewable in chrome://tracing or Perfetto).
prof.export_chrome_trace("new_trace.json")

# NOTE: SummaryWriter.add_scalar does not export profiler traces. To inspect the
# profile in TensorBoard, see the torch.profiler docs for
# `on_trace_ready=torch.profiler.tensorboard_trace_handler(<log_dir>)`.

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                        model_inference        37.71%       1.042ms        82.82%       2.288ms       2.288ms           0 B           0 B           0 B   -1001.00 KB             1  
                                           aten::linear         0.38%      10.384us        39.55%       1.092ms       1.092ms           0 B           0 B     500.00 KB           0 B             1  
         

In [None]:
# Warm-up: run the forward pass a few times BEFORE the profiler starts, so that
# one-time start-up costs of the first calls are not recorded in the profile.
# (Previously this loop sat inside the profiler context and was itself profiled.)
for _warmup_step in range(3):
    model(x)
# CUDA kernel launches are asynchronous; wait for the warm-up work to finish so
# none of it spills into the profiled region.
torch.cuda.synchronize()

# Profile a single forward pass on both CPU and CUDA.
with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    record_shapes=True,           # record operator input shapes
    profile_memory=True,          # record CPU and GPU memory (original note: needs PyTorch >= 1.10)
    with_stack=True,              # record call stacks to help locate the originating code
) as prof:
    # The measured run, grouped under the "main_run" label.
    with record_function("main_run"):
        output = model(x)

# Print the aggregated per-operator table (top 10 rows, sorted by total CUDA time).
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))