In [1]:
import torch
# Report the runtime environment and remember which device type is usable.
# `device_str` is assigned on BOTH paths so that it is always defined after
# this cell runs (previously it only existed when CUDA was missing).
if torch.cuda.is_available():
    device_str = 'cuda'
    print(f"PyTorch: \t{torch.__version__}")
    print(f"CUDA : \t\t{torch.version.cuda}")
    print(f"Num of GPUs: \t{torch.cuda.device_count()}")
else:
    device_str = 'cpu'
    print("CUDA not available. Using device: cpu")

PyTorch: 	2.7.1+cu118
CUDA : 		11.8
Num of GPUs: 	2


In [2]:
from torch import nn

In [3]:
# Part 1: benchmark configuration.
# These constants size both the model and the synthetic input batch.
BATCH_SIZE = 64                    # sequences per batch
SEQ_LENGTH = 100                   # tokens per sequence
D_MODEL = 512                      # embedding / model width
N_LAYERS = 6                       # number of stacked encoder layers
N_HEADS = 8                        # attention heads per layer
DIM_FEEDFORWARD = 4 * D_MODEL      # hidden width of the feed-forward sub-layer
VOCAB_SIZE = 1000                  # number of distinct token ids

In [4]:
class TransEncoder(nn.Module):
    """Token embedding followed by a stack of Transformer encoder layers.

    Every constructor argument is optional. An argument left as ``None``
    falls back to the notebook-level constant with the same meaning
    (``VOCAB_SIZE``, ``D_MODEL``, ``N_HEADS``, ``DIM_FEEDFORWARD``,
    ``N_LAYERS``), so ``TransEncoder()`` builds exactly the same model as
    before, while other sizes can be benchmarked without editing globals.

    Args:
        vocab_size: Number of rows in the embedding table.
        d_model: Embedding size and width of every encoder layer.
        n_heads: Number of attention heads per encoder layer.
        dim_feedforward: Hidden width of the feed-forward sub-layer.
        n_layers: Number of stacked encoder layers.
    """

    def __init__(self, vocab_size=None, d_model=None, n_heads=None,
                 dim_feedforward=None, n_layers=None):
        super().__init__()

        # Resolve defaults lazily: a global is only looked up when the
        # corresponding argument was not supplied.
        vocab_size = VOCAB_SIZE if vocab_size is None else vocab_size
        d_model = D_MODEL if d_model is None else d_model
        n_heads = N_HEADS if n_heads is None else n_heads
        dim_feedforward = (
            DIM_FEEDFORWARD if dim_feedforward is None else dim_feedforward
        )
        n_layers = N_LAYERS if n_layers is None else n_layers

        self.emb = nn.Embedding(vocab_size, d_model)

        # Layer template; batch_first=True means tensors are laid out as
        # (batch, sequence, feature).
        enc_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=dim_feedforward,
            batch_first=True,
        )

        # Stack of `n_layers` layers built from the template above.
        self.transformer_enc = nn.TransformerEncoder(
            enc_layer,
            num_layers=n_layers,
        )

    def forward(self, src: torch.Tensor) -> torch.Tensor:
        """Embed token ids and run them through the encoder stack.

        Args:
            src: Integer token ids of shape ``(batch, seq_len)``.

        Returns:
            Encoded sequence of shape ``(batch, seq_len, d_model)``.
        """
        x = self.emb(src)  # token ids -> vectors
        return self.transformer_enc(x)

In [5]:
# Place the model on the GPU when one is available, otherwise on the CPU.
# Hard-coding 'cuda' made `.to(device)` fail on machines without CUDA even
# though the environment-check cell explicitly handles that case.
# Note: the timing and peak-memory cells further down use torch.cuda APIs
# and therefore still require a GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = TransEncoder().to(device)
model.eval()  # evaluation mode (dropout inactive) for the forward-pass benchmark

TransEncoder(
  (emb): Embedding(1000, 512)
  (transformer_enc): TransformerEncoder(
    (layers): ModuleList(
      (0-5): 6 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
)

In [6]:
# Synthetic input: one batch of random token ids drawn from [0, VOCAB_SIZE),
# created directly on the benchmark device.
input_data = torch.randint(
    low=0,
    high=VOCAB_SIZE,
    size=(BATCH_SIZE, SEQ_LENGTH),
    device=device,
)
input_data.shape

torch.Size([64, 100])

In [7]:
# Warm-up: a few untimed forward passes before the measurement.
with torch.no_grad():  # no autograd bookkeeping, forward pass only
    for _ in range(10):
        _ = model(input_data)
# Block the host until every queued GPU operation has finished.
torch.cuda.synchronize()

In [8]:
# Timing: measure `n_cycles` consecutive forward passes with CUDA events,
# which are recorded on the GPU stream.
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
n_cycles = 100

# Enter no_grad once, outside the measured window, so the context-manager
# enter/exit is not repeated on every iteration inside the timed loop.
with torch.no_grad():
    start_event.record()
    for _ in range(n_cycles):
        _ = model(input_data)
    end_event.record()

# elapsed_time() may only be queried after both events have completed.
torch.cuda.synchronize()

elapsed_time_ms = start_event.elapsed_time(end_event)  # milliseconds
avg_time_ms = elapsed_time_ms / n_cycles
print(f"Среднее время на один forward pass: {avg_time_ms:.3f} мс")

Среднее время на один forward pass: 16.954 мс


In [9]:
# Peak GPU memory during a single forward pass.
# Start a fresh peak-tracking window on the benchmark device first.
# The reading is an absolute peak, so tensors already resident on the
# device (e.g. the model weights) are counted as well.
torch.cuda.reset_peak_memory_stats(device)

with torch.no_grad():
    _ = model(input_data)

peak_memory_bytes = torch.cuda.max_memory_allocated(device)
peak_memory_mb = peak_memory_bytes / 2 ** 20  # bytes -> MiB
print(f"Пиковое использование VRAM: {peak_memory_mb:.2f} МБ")


Пиковое использование VRAM: 198.20 МБ


Итог: измерена стоимость одного прямого прохода (forward pass) через модель для одного батча данных — среднее время выполнения и пиковое использование VRAM.