# Load torchtext and initialize XLM-R model
- 이 노트북은 PyTorch와 torchtext 라이브러리를 사용하여 XLM-R(Cross-lingual Language Model - RoBERTa) 모델을 로드하고 초기화한 다음, CPU 및 GPU에서의 추론 성능을 측정합니다.

In [1]:
import torch
import torch.nn as nn
import torchtext

from torchtext.models import RobertaClassificationHead
from torchtext.functional import to_tensor

# Pretrained XLM-R large encoder bundle, combined with a 2-class
# classification head whose input dimension is 1024.
xlmr_large = torchtext.models.XLMR_LARGE_ENCODER
classifier_head = RobertaClassificationHead(num_classes=2, input_dim=1024)
model = xlmr_large.get_model(head=classifier_head)

# Switch the model to inference mode (reduces runtime even without
# Better Transformer; needed in particular for GPU execution).
model.eval()

# Define the input transform that belongs to this bundle.
transform = xlmr_large.transform()

Downloading: "https://download.pytorch.org/models/text/xlmr.large.encoder.pt" to /root/.cache/torch/hub/checkpoints/xlmr.large.encoder.pt
100%|██████████| 2.08G/2.08G [00:08<00:00, 250MB/s]
100%|██████████| 5.07M/5.07M [00:00<00:00, 73.0MB/s]
Downloading: "https://download.pytorch.org/models/text/xlmr.vocab.pt" to /root/.cache/torch/hub/checkpoints/xlmr.vocab.pt
100%|██████████| 4.85M/4.85M [00:00<00:00, 69.1MB/s]


# System Information

In [2]:
import platform

# Use the GPU when CUDA is usable; otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

cpu = platform.processor()
# torch.cuda.get_device_name() is only meaningful for CUDA devices and raises
# on a runtime without CUDA, so only query it when DEVICE really is a GPU.
gpu = (
    torch.cuda.get_device_name(DEVICE)
    if DEVICE.type == "cuda"
    else "N/A (CUDA not available)"
)

print(f"torch version: {torch.__version__}")
print(f"torch cuda available: {torch.cuda.is_available()}")
print(f"CPU type: {cpu}")
print(f"GPU type: {gpu}")

torch version: 2.1.0+cu118
torch cuda available: True
CPU type: x86_64
GPU type: Tesla T4



## 기본 희소성 지원 설정을 확인하세요.

희소성 지원은 트랜스포머가 입력에서 패딩을 건너뛸 수 있게 합니다.

In [3]:
# Inspect the nested-tensor (sparsity) flag on the encoder's transformer
# layers; as the last expression in the cell, its value is echoed below.
model.encoder.transformer.layers.enable_nested_tensor

True

# Benchmark setup

## Define inputs

In [4]:
# Two short sentences only.
small_input_batch = ["Hello world", "How are you!"]

# Long multi-paragraph passage used as the third entry of the big batch.
_long_passage = """`Well, Prince, so Genoa and Lucca are now just family estates of the
Buonapartes. But I warn you, if you don't tell me that this means war,
if you still try to defend the infamies and horrors perpetrated by
that Antichrist- I really believe he is Antichrist- I will have
nothing more to do with you and you are no longer my friend, no longer
my 'faithful slave,' as you call yourself! But how do you do? I see
I have frightened you- sit down and tell me all the news.`

It was in July, 1805, and the speaker was the well-known Anna
Pavlovna Scherer, maid of honor and favorite of the Empress Marya
Fedorovna. With these words she greeted Prince Vasili Kuragin, a man
of high rank and importance, who was the first to arrive at her
reception. Anna Pavlovna had had a cough for some days. She was, as
she said, suffering from la grippe; grippe being then a new word in
St. Petersburg, used only by the elite."""

# The same two short sentences plus the long passage, giving a batch whose
# entries differ greatly in length.
big_input_batch = ["Hello world", "How are you!", _long_passage]

### 작은 입력 집합 또는 큰 입력 집합을 선택하세요.
아래의 input_batch 할당을 수정하여 small_input_batch 또는 big_input_batch 중 하나를 선택하거나, 직접 준비한 입력으로 대체하세요.

In [5]:
# Pick which batch to benchmark with.
input_batch = big_input_batch

# Apply the input transform, then convert the result to a tensor using 1 as
# the padding value.
token_ids = transform(input_batch)
model_input = to_tensor(token_ids, padding_value=1)

# Run one forward pass; the cell echoes the shape of the output.
output = model(model_input)
output.shape

torch.Size([3, 2])

### 성능 측정을 위한 반복 횟수

In [6]:
# Number of forward passes executed inside each profiled run.
ITERATIONS = 10

## CPU 성능 측정: 느린 경로 및 빠른 경로, BT 희소성 없이 및 희소성과 함께 측정
희소성 지원으로 인해 트랜스포머는 입력에서 패딩을 건너뛸 수 있습니다.

### CPU 성능: BT 희소성 없이 측정

In [7]:
# Turn the nested-tensor (sparsity) flag off so the following CPU runs are
# measured without it.
model.encoder.transformer.layers.enable_nested_tensor = False

In [8]:
# Run labelled "slow path": ITERATIONS forward passes with autograd left in
# its default state, profiled on the CPU only (use_cuda=False).
print("slow path:\n==========")
with torch.autograd.profiler.profile(use_cuda=False) as prof:
    for step in range(ITERATIONS):
        output = model(model_input)
slow_table = prof.key_averages(group_by_stack_n=5).table(
    sort_by="self_cpu_time_total",
    row_limit=5,
)
print(slow_table)

# Run labelled "fast path": the same loop, but executed under torch.no_grad().
print("fast path:\n==========")
with torch.autograd.profiler.profile(use_cuda=False) as prof, torch.no_grad():
    for step in range(ITERATIONS):
        output = model(model_input)
fast_table = prof.key_averages(group_by_stack_n=5).table(
    sort_by="self_cpu_time_total",
    row_limit=5,
)
print(fast_table)



slow path:
--------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                        Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
--------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                 aten::addmm        63.96%       52.404s        65.05%       53.294s      72.019ms           740  
                                    aten::mm        21.81%       17.873s        21.81%       17.873s      74.472ms           240  
                                   aten::bmm         5.33%        4.368s         5.33%        4.368s       9.100ms           480  
                              aten::_softmax         2.39%        1.957s         2.39%        1.957s       8.154ms           240  
                                 aten::copy_         2.13%        1.743s

  output = torch._nested_tensor_from_mask(output, src_key_padding_mask.logical_not(), mask_check=False)


-------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                       Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
-------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                aten::addmm        31.59%       10.998s        31.75%       11.052s      22.104ms           500  
                    aten::_addmm_activation        24.84%        8.647s        26.44%        9.206s      38.359ms           240  
                                   aten::mm        18.88%        6.572s        18.88%        6.572s      27.382ms           240  
                                  aten::bmm         9.37%        3.263s         9.37%        3.263s       6.798ms           480  
          aten::_transform_bias_rescale_qkv         4.45%        1.550s         6.78%     

### CPU 성능: BT 희소성과 함께 측정

In [9]:
# Turn the nested-tensor (sparsity) flag back on so the following CPU runs
# are measured with it.
model.encoder.transformer.layers.enable_nested_tensor = True

In [10]:
# Run labelled "slow path": ITERATIONS forward passes with autograd left in
# its default state, profiled on the CPU only (use_cuda=False).
print("slow path:\n==========")
with torch.autograd.profiler.profile(use_cuda=False) as prof:
    for step in range(ITERATIONS):
        output = model(model_input)
slow_table = prof.key_averages(group_by_stack_n=5).table(
    sort_by="self_cpu_time_total",
    row_limit=5,
)
print(slow_table)

# Run labelled "fast path": the same loop, but executed under torch.no_grad().
print("fast path:\n==========")
with torch.autograd.profiler.profile(use_cuda=False) as prof, torch.no_grad():
    for step in range(ITERATIONS):
        output = model(model_input)
fast_table = prof.key_averages(group_by_stack_n=5).table(
    sort_by="self_cpu_time_total",
    row_limit=5,
)
print(fast_table)



slow path:
--------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                        Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
--------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                 aten::addmm        65.76%       50.190s        66.41%       50.692s      68.502ms           740  
                                    aten::mm        21.92%       16.734s        21.92%       16.734s      69.725ms           240  
                                   aten::bmm         5.08%        3.878s         5.08%        3.878s       8.079ms           480  
                              aten::_softmax         1.98%        1.511s         1.98%        1.511s       6.295ms           240  
                                  aten::gelu         1.71%        1.308s

## 장치(DEVICE) 성능 측정: 느린 경로 및 빠른 경로, 희소성 없이 및 희소성과 함께 측정
Better Transformer의 빠른 경로 실행이 GPU에서 성능 이점을 제공하려면 런타임이 GPU를 활성화해야 합니다. Google Colab 메뉴에서 "Runtime > Change Runtime Type"을 통해 런타임 유형을 확인하고 변경할 수 있습니다.

In [11]:
# Move the model to the selected device and keep it in eval mode; both calls
# return the module itself, so they can be chained.
model = model.to(DEVICE).eval()

# The input tensor has to live on the same device as the model.
model_input = model_input.to(DEVICE)

### 장치(DEVICE) 성능: Better Transformer 희소성 없이 측정

In [14]:
# Turn the nested-tensor (sparsity) flag off for the device runs.
model.encoder.transformer.layers.enable_nested_tensor = False

In [13]:
# DEVICE falls back to the CPU when CUDA is unavailable. Only request CUDA
# profiling, and only sort by CUDA time, when DEVICE really is a GPU;
# otherwise the profiler warns and the table is ordered by an all-zero column.
on_gpu = DEVICE.type == "cuda"
sort_key = "self_cuda_time_total" if on_gpu else "self_cpu_time_total"

# Run labelled "slow path": ITERATIONS forward passes with autograd left in
# its default state.
print("slow path:")
print("==========")
with torch.autograd.profiler.profile(use_cuda=on_gpu) as prof:
    for step in range(ITERATIONS):
        output = model(model_input)
print(prof.key_averages(group_by_stack_n=5).table(sort_by=sort_key, row_limit=5))

# Run labelled "fast path": the same loop, but executed under torch.no_grad().
print("fast path:")
print("==========")
with torch.autograd.profiler.profile(use_cuda=on_gpu) as prof:
    with torch.no_grad():
        for step in range(ITERATIONS):
            output = model(model_input)
print(prof.key_averages(group_by_stack_n=5).table(sort_by=sort_key, row_limit=5))

slow path:
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                               aten::mm         2.68%     112.138ms        74.27%        3.103s      12.929ms        3.354s        73.67%        3.354s      13.974ms           240  
                                            aten::addmm         1.04%      43.329ms         1.63%      68.114ms      92.046us     755.467ms        16.59%     755.467ms       1.021ms           740 

### 장치(DEVICE) 성능: Better Transformer 희소성과 함께 측정

In [15]:
# Turn the nested-tensor (sparsity) flag back on for the following device
# runs.
model.encoder.transformer.layers.enable_nested_tensor = True

In [16]:
# Make sure the model and the input tensor are on the selected device.
model.to(DEVICE)
model_input = model_input.to(DEVICE)

# DEVICE falls back to the CPU when CUDA is unavailable. Only request CUDA
# profiling, and only sort by CUDA time, when DEVICE really is a GPU;
# otherwise the profiler warns and the table is ordered by an all-zero column.
on_gpu = DEVICE.type == "cuda"
sort_key = "self_cuda_time_total" if on_gpu else "self_cpu_time_total"

# Run labelled "slow path": ITERATIONS forward passes with autograd left in
# its default state.
print("slow path:")
print("==========")
with torch.autograd.profiler.profile(use_cuda=on_gpu) as prof:
    for step in range(ITERATIONS):
        output = model(model_input)
print(prof.key_averages(group_by_stack_n=5).table(sort_by=sort_key, row_limit=5))

# Run labelled "fast path": the same loop, but executed under torch.no_grad().
print("fast path:")
print("==========")
with torch.autograd.profiler.profile(use_cuda=on_gpu) as prof:
    with torch.no_grad():
        for step in range(ITERATIONS):
            output = model(model_input)
print(prof.key_averages(group_by_stack_n=5).table(sort_by=sort_key, row_limit=5))

slow path:
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            aten::addmm         4.10%      38.937ms         5.27%      49.998ms      67.565us     770.219ms        55.07%     770.219ms       1.041ms           740  
                                               aten::mm         0.97%       9.166ms         1.29%      12.244ms      51.017us     268.050ms        19.17%     268.050ms       1.117ms           240 