In [1]:
import os
import gc
import torch
import torch.nn as nn
import functools
import contextlib

工具函数

In [2]:
def cal_model_params(model: nn.Module):
    """Count the elements of every parameter in ``model``, print the total, and return it.

    Args:
        model: Module whose ``parameters()`` are iterated.

    Returns:
        The summed ``numel()`` over all parameters (0 for a module with none).
    """
    total = 0
    for tensor in model.parameters():
        total += tensor.numel()
    print(f"model parameters: {total:,}")

    return total

In [3]:
# Report CUDA memory usage
def cuda_memory_decorator(f):
    """Wrap ``f`` so that every call prints the CUDA memory allocated afterwards.

    ``torch.cuda.memory_allocated()`` is sampled right before and right after the
    wrapped call. Both readings are divided by 2**20, so the printed "M" values
    are mebibytes. The wrapped function's return value is passed through untouched.

    Args:
        f: Callable to instrument.

    Returns:
        A wrapper with the same metadata as ``f`` (via ``functools.wraps``).
    """
    bytes_per_mib = 2**20

    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        baseline = torch.cuda.memory_allocated() / bytes_per_mib
        outcome = f(*args, **kwargs)
        after = torch.cuda.memory_allocated() / bytes_per_mib
        # Change in allocated memory attributable to this call.
        delta = after - baseline
        print(f"cuda memory_allocated: {after:.2f}M, delta: {delta:.2f}M")

        return outcome

    return wrapper

In [4]:
# Single forward pass
@cuda_memory_decorator
def forward(model: nn.Module, x: "torch.Tensor | list | tuple | dict", no_grad: bool = False):
    """Run ``model`` once on ``x`` and return its output.

    Args:
        model: Module to call.
        x: Model input. A tensor is passed as the single positional argument,
            a list/tuple is unpacked into positional arguments, and a dict is
            unpacked into keyword arguments.
        no_grad: When True the call runs inside ``torch.no_grad()``; otherwise
            the ambient autograd mode is left unchanged.

    Returns:
        Whatever ``model`` returns for the given input.

    Raises:
        TypeError: If ``x`` is not a tensor, list, tuple or dict. Previously
            such inputs fell through every branch and silently returned None.
    """
    grad_context = torch.no_grad() if no_grad else contextlib.nullcontext()

    with grad_context:
        if isinstance(x, torch.Tensor):
            return model(x)
        if isinstance(x, (list, tuple)):
            return model(*x)
        if isinstance(x, dict):
            return model(**x)

    # Reaching this point means no branch matched; fail loudly instead of
    # returning None and reporting a meaningless memory delta.
    raise TypeError(
        f"x must be a torch.Tensor, list, tuple or dict, got {type(x).__name__}"
    )

In [5]:
@cuda_memory_decorator
def forward_and_backward(model: nn.Module, x: torch.Tensor):
    """Run one forward pass, sum the output into a scalar loss, and backpropagate it.

    Args:
        model: Module to call on ``x``.
        x: Input passed to ``model`` as its single positional argument.

    Returns:
        The scalar loss tensor on which ``backward()`` was called.
    """
    prediction = model(x)
    loss = prediction.sum()
    loss.backward()
    return loss


In [6]:
# Model and data
@cuda_memory_decorator
def get_model_and_data():
    """Create the demo MLP and a random input, both on the current CUDA device.

    Returns:
        A ``(model, x)`` tuple: ``model`` is an ``nn.Sequential`` of three
        bias-free 1024->1024 ``Linear`` layers with ``GELU`` between them, and
        ``x`` is a ``(1024, 1024)`` tensor drawn from ``torch.randn``.
    """
    x = torch.randn(1024, 1024, device=torch.cuda.current_device())

    layers = [
        nn.Linear(1024, 1024, bias=False),
        nn.GELU(),
        nn.Linear(1024, 1024, bias=False),
        nn.GELU(),
        nn.Linear(1024, 1024, bias=False),
    ]
    # .cuda() moves the parameters and returns the same module.
    model = nn.Sequential(*layers).cuda()

    return model, x

In [7]:
# Build the MLP and its input; the decorator on get_model_and_data prints the CUDA memory they occupy.
model, x = get_model_and_data()

cuda memory_allocated: 16.00M, delta: 16.00M



多次前向传播

In [8]:
# Three forward passes with no_grad=False; the return values are not bound to any name.
forward(model, x, no_grad=False)
forward(model, x, no_grad=False)
forward(model, x, no_grad=False)

# One forward pass with no_grad=True; the output is bound to `_`.
_ = forward(model, x, no_grad=True)

cuda memory_allocated: 44.12M, delta: 28.12M
cuda memory_allocated: 44.12M, delta: 20.00M
cuda memory_allocated: 44.12M, delta: 20.00M
cuda memory_allocated: 28.12M, delta: 4.00M


前向传播显存占用分析，增加的显存占用来自中间变量和输出变量：
- `no_grad=False`的前向传播，增加的显存占用来自于中间变量$4*4*1024*1024=16M$和输出变量4M，共20M。
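- **`no_grad=False`第一次前向传播比后续的前向传播多增加了约8.12M显存（28.12M对比20.00M）。本文未验证其来源；一个可能的解释是PyTorch在首次执行矩阵乘法时通过缓存分配器申请了cuBLAS workspace（默认配置`:4096:2:16:8`，约8.125M），该workspace之后常驻显存。**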
- `no_grad=True`的前向传播，增加的显存占用来自于输出变量4M。


前向传播和反向传播

In [9]:
# One forward + backward pass; the returned scalar loss is bound to `_`.
_ = forward_and_backward(model, x,)

cuda memory_allocated: 48.25M, delta: 20.13M


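CUDA显存占用分析（实测增量20.13M），增加的显存占用来自梯度和输出：
- 梯度，0.weight、2.weight和4.weight对应梯度大小分别为4M，共12M。
- 输出，输出Tensor的大小为4M；但`forward_and_backward`只返回标量loss，输出Tensor在函数返回后不再被引用，按理不会计入上面的增量。
- 注意12M+4M=16M与实测的20.13M并不吻合。除梯度外其余约8.13M的来源本文未验证，推测是反向传播首次执行矩阵乘法时新申请的cuBLAS workspace（约8.125M）。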

查看模型梯度信息

In [10]:
# Print the shape of the gradient stored on each parameter of `model`.
for name, p in model.named_parameters():
    print(name, p.grad.shape)

0.weight torch.Size([1024, 1024])
2.weight torch.Size([1024, 1024])
4.weight torch.Size([1024, 1024])


TransformerEncoder显存分析

TransformerEncoder包括多头自注意力MultiHeadAttention和多层感知机MLP两个子网络。

MultiHeadAttention显存分析

In [11]:
@cuda_memory_decorator
def get_mha_model_data():
    """Create a single-head, bias-free ``nn.MultiheadAttention`` and a random input on CUDA.

    NOTE(review): ``batch_first`` is not passed, so the module uses its default
    input layout. Per the PyTorch docs that default is (seq, batch, feature),
    which would make the ``(2, 1024, 1024)`` input seq_len=2, batch=1024.
    Confirm this is the intended layout.

    Returns:
        A ``(mha, x)`` tuple: the attention module (embed_dim=1024, num_heads=1)
        moved to CUDA, and a ``(2, 1024, 1024)`` tensor from ``torch.randn``.
    """
    mha = nn.MultiheadAttention(embed_dim=1024, num_heads=1, bias=False).cuda()
    x = torch.randn((2, 1024, 1024), device=torch.cuda.current_device())

    return mha, x

In [12]:
# Build the attention module and its input; the decorator on get_mha_model_data prints the CUDA memory they occupy.
mha_model, mha_x = get_mha_model_data()

cuda memory_allocated: 68.25M, delta: 24.00M


显存增加了24M，模型和数据分别占用16M和8M显存。

MultiHeadAttention模型参数信息：

In [13]:
# Print the shape of each parameter of `mha_model`.
for name, p in mha_model.named_parameters():
    print(name, p.shape)

in_proj_weight torch.Size([3072, 1024])
out_proj.weight torch.Size([1024, 1024])


MHA前向传播

In [14]:
# Two forward passes with the default no_grad=False, then one with no_grad=True whose result is bound to `_`.
# The tuple is unpacked by forward() into three positional arguments, all of them mha_x.
forward(mha_model, (mha_x, mha_x, mha_x))
forward(mha_model, (mha_x, mha_x, mha_x))
_ = forward(mha_model, (mha_x, mha_x, mha_x), no_grad=True)

cuda memory_allocated: 116.28M, delta: 48.03M
cuda memory_allocated: 116.28M, delta: 48.03M
cuda memory_allocated: 76.27M, delta: 8.02M


MHA前向传播显存占用分析， MHA计算公式：$\mathrm{softmax}(\frac{QK^T}{\sqrt{d_k}})V$
- `no_grad=False`，增加的显存来自中间变量和输出变量，根据MHA计算公式，中间变量占用显存大小$3*8M+8M+8M=40M$，输出变量占用8M，合计48M，与实测的48.03M基本一致。
- `no_grad=True`，增加的显存来自输出变量8M。

In [15]:
@cuda_memory_decorator
def get_encoder_model_and_data():
    """Create an ``nn.TransformerEncoderLayer`` and a random input on CUDA.

    NOTE(review): ``batch_first`` is not passed, so the layer uses its default
    input layout; confirm how the ``(2, 1024, 1024)`` input is meant to be read.

    Returns:
        An ``(encoder, data)`` tuple: the encoder layer (d_model=1024, nhead=1,
        dim_feedforward=4096, dropout disabled) moved to CUDA, and a
        ``(2, 1024, 1024)`` tensor from ``torch.randn``.
    """
    encoder = nn.TransformerEncoderLayer(
        d_model=1024,
        nhead=1,
        dim_feedforward=4096,
        dropout=0.0,
    ).cuda()
    data = torch.randn((2, 1024, 1024), device=torch.cuda.current_device())

    return encoder, data

In [16]:
# Build the encoder layer and its input; the decorator on get_encoder_model_and_data prints the CUDA memory they occupy.
encoder_model, encoder_x = get_encoder_model_and_data()

cuda memory_allocated: 132.32M, delta: 56.05M


In [17]:
# Show the encoder layer's submodule structure.
print(encoder_model)

TransformerEncoderLayer(
  (self_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
  )
  (linear1): Linear(in_features=1024, out_features=4096, bias=True)
  (dropout): Dropout(p=0.0, inplace=False)
  (linear2): Linear(in_features=4096, out_features=1024, bias=True)
  (norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  (norm2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  (dropout1): Dropout(p=0.0, inplace=False)
  (dropout2): Dropout(p=0.0, inplace=False)
)


Encoder模型和数据占用显存：
- 模型结构显存48M，其中MHA占用16M，MLP占用32M。
- 数据占用显存8M。

In [18]:
# Two forward passes with the default no_grad=False, then one with no_grad=True whose output is bound to `_`.
forward(encoder_model, encoder_x)
forward(encoder_model, encoder_x)
_ = forward(encoder_model, encoder_x, no_grad=True)

cuda memory_allocated: 236.47M, delta: 104.16M
cuda memory_allocated: 236.47M, delta: 104.16M
cuda memory_allocated: 140.32M, delta: 8.00M


Encoder前向传播占用显存分析：

- `no_grad=False`，前向传播占用显存104M，主要是中间变量和输出变量。
    - MHA+Norm, 48M + 8M，共56M
    - MLP+Norm, 40M + 8M，共48M
- `no_grad=True`，前向传播占用显存是输出变量8M。