In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.profiler import profile, record_function, ProfilerActivity

In [2]:
class SimpleModel(nn.Module):
    """Small fully connected classifier: 512 -> 256 -> 128 -> 64 -> 10.

    Each hidden layer is followed by a ReLU. The output layer returns raw,
    un-normalised scores (logits) for 10 classes.
    """

    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(512, 256)
        self.layer2 = nn.Linear(256, 128)
        self.layer3 = nn.Linear(128, 64)
        self.layer4 = nn.Linear(64, 10)  # Output layer for 10 classes

    def forward(self, x):
        """Run the network on ``x``.

        Args:
            x: Float tensor whose last dimension is 512.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of 10 (one logit per class).
        """
        # layer1 needs its own non-linearity: two stacked Linear layers with
        # nothing in between are equivalent to a single Linear layer.
        x = torch.relu(self.layer1(x))
        x = torch.relu(self.layer2(x))
        x = torch.relu(self.layer3(x))
        # No activation on the output layer; callers receive logits.
        return self.layer4(x)

In [3]:
# Seed the random number generators before any weights or data are sampled so
# that a fresh "Restart & Run All" reproduces the same model and batch.
SEED = 0
torch.manual_seed(SEED)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SimpleModel().to(device)

# Create the synthetic batch directly on the target device instead of building
# it on the CPU and copying it over afterwards.
input_tensor = torch.randn(64, 512, device=device)  # Batch size 64, input size 512
labels = torch.randint(0, 10, (64,), device=device)  # Random class labels in [0, 10)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

In [4]:
def _attach_layer_scopes(module):
    """Label each direct child of ``module`` with a ``Forward_<name>`` profiler range.

    A forward pre-hook opens a ``record_function`` range right before the child
    runs and a forward hook closes it right after. This keeps per-layer labels
    in the trace while still executing the module's own ``forward`` (including
    anything it does between layers, such as activations).

    Args:
        module: The ``nn.Module`` whose direct children should be labelled.

    Returns:
        A ``(handles, open_scopes)`` tuple. ``handles`` are the hook handles
        the caller must ``remove()`` when done. ``open_scopes`` maps child
        name to any range that is currently open, so the caller can close
        leftovers if the forward pass raised part-way through.
    """
    open_scopes = {}
    handles = []
    for child_name, child in module.named_children():
        # Bind the name through a default argument so each hook keeps its own
        # label rather than the loop's final value.
        def _open(_module, _args, _name=child_name):
            scope = record_function(f"Forward_{_name}")
            scope.__enter__()
            open_scopes[_name] = scope

        def _close(_module, _args, _output, _name=child_name):
            open_scopes.pop(_name).__exit__(None, None, None)

        handles.append(child.register_forward_pre_hook(_open))
        handles.append(child.register_forward_hook(_close))
    return handles, open_scopes


# Only request CUDA activity when the step actually runs on a GPU; `device`
# may have fallen back to the CPU.
activities = [ProfilerActivity.CPU]
if device.type == "cuda":
    activities.append(ProfilerActivity.CUDA)

with profile(
    activities=activities,
    profile_memory=True,
    record_shapes=True
) as prof:
    model.train()
    optimizer.zero_grad()

    # Fresh synthetic batch, created directly on the target device.
    input_tensor = torch.randn(64, 512, device=device)
    labels = torch.randint(0, 10, (64,), device=device)

    # Profile the model's real forward pass. Calling the children one by one
    # would skip whatever `forward` does between layers and would overwrite
    # `input_tensor` with intermediate results.
    layer_hooks, open_scopes = _attach_layer_scopes(model)
    try:
        logits = model(input_tensor)
    finally:
        for hook in layer_hooks:
            hook.remove()
        # Close any range left open by an exception inside a layer.
        for scope in list(open_scopes.values()):
            scope.__exit__(None, None, None)
        open_scopes.clear()

    loss = criterion(logits, labels)

    with record_function("Backward"):
        loss.backward()
    optimizer.step()

In [7]:
# Rank operators by memory on the device that ran the step. Without a GPU the
# CUDA memory column carries no information, so sort by CPU memory instead.
sort_key = "cuda_memory_usage" if torch.cuda.is_available() else "cpu_memory_usage"
print(prof.key_averages().table(sort_by=sort_key, row_limit=20))

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                         AddmmBackward0         0.99%       4.423ms         3.10%      13.806ms       3.452ms       0.000us         0.00%      55.616us      13.904us           0 b           0 b       8.89 Mb           0 

In [6]:
# Report PyTorch's current CUDA memory figures, converted from bytes by
# dividing by 1024**2: first the allocated amount, then the reserved amount.
bytes_per_mb = 1024**2

allocated_mb = torch.cuda.memory_allocated() / bytes_per_mb
print(f"GPU 메모리 사용량: {allocated_mb:.2f} MB")

reserved_mb = torch.cuda.memory_reserved() / bytes_per_mb
print(f"GPU 캐시 메모리 사용량: {reserved_mb:.2f} MB")

GPU 메모리 사용량: 17.58 MB
GPU 캐시 메모리 사용량: 22.00 MB
