# TensorRT

## Установка зависимостей

In [None]:

!pip list | grep torch

torch                                    2.8.0+cu126
torchao                                  0.10.0
torchaudio                               2.8.0+cu126
torchdata                                0.11.0
torchsummary                             1.5.1
torchtune                                0.6.1
torchvision                              0.23.0+cu126


Collecting tensorrt
  Downloading tensorrt-10.13.3.9.tar.gz (40 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/40.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.5/40.5 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tensorrt_cu13==10.13.3.9 (from tensorrt)
  Downloading tensorrt_cu13-10.13.3.9.tar.gz (18 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tensorrt_cu13_libs==10.13.3.9 (from tensorrt_cu13==10.13.3.9->tensorrt)
  Downloading tensorrt_cu13_libs-10.13.3.9.tar.gz (706 bytes)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tensorrt_cu13_bindings==10.13.3.9 (from tensorrt_cu13==10.13.3.9->tensorrt)
  Downloading tensorrt_cu13_bindings-10.13.3.9-cp312-none-manylinux_2_28_x86_6

Collecting nvidia-pyindex
  Downloading nvidia-pyindex-1.0.9.tar.gz (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: nvidia-pyindex
  Building wheel for nvidia-pyindex (setup.py) ... [?25l[?25hdone
  Created wheel for nvidia-pyindex: filename=nvidia_pyindex-1.0.9-py3-none-any.whl size=8419 sha256=672a6ec9f9a30b6dc67f7a62b530736f953280fa6bd6d73be54b68dddbbe8929
  Stored in directory: /root/.cache/pip/wheels/eb/2d/7f/d86cb060a9c51fb933aa4fe0d2f73ffe8df2bd0b58d3d2bba4
Successfully built nvidia-pyindex
Installing collected packages: nvidia-pyindex
Successfully installed nvidia-pyindex-1.0.9
Collecting nvidia-tensorrt
  Downloading nvidia_tensorrt-99.0.0-py3-none-manylinux_2_17_x86_64.whl.metadata (596 bytes)
Collecting tensorrt (from nvidia-tensorrt)
  Using cached tensorrt-10.13.3.9-py2.py3-none-any.whl
Collecting tensorrt-cu13==10.13.3.9 (from tensorrt->nvidia-tensorrt)
  Using cached tensorrt_cu13-10.13.3.9-py2.py3-none-any.whl
Colle

In [1]:
!pip3 install torch-tensorrt==2.8.0 -f https://github.com/pytorch/TensorRT/releases/expanded_assets/2.8.0
!pip install -U "nvidia-modelopt[all]"

Looking in links: https://github.com/pytorch/TensorRT/releases/expanded_assets/2.8.0
Collecting torch-tensorrt==2.8.0
  Downloading torch_tensorrt-2.8.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_34_x86_64.whl.metadata (11 kB)
Collecting dllist (from torch-tensorrt==2.8.0)
  Downloading dllist-2.0.0-py3-none-any.whl.metadata (2.0 kB)
Collecting tensorrt<10.13.0,>=10.12.0 (from torch-tensorrt==2.8.0)
  Downloading tensorrt-10.12.0.36.tar.gz (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.7/40.7 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tensorrt-cu12-bindings<10.13.0,>=10.12.0 (from torch-tensorrt==2.8.0)
  Downloading tensorrt_cu12_bindings-10.12.0.36-cp312-none-manylinux_2_28_x86_64.whl.metadata (607 bytes)
Collecting tensorrt-cu12-libs<10.13.0,>=10.12.0 (from torch-tensorrt==2.8.0)
  Downloading tensorrt_cu12_libs-10.12.0.36.tar.gz (709 bytes)
  Installing build dependencies ... [?

## Датасет

In [10]:
import torchvision
from torchvision import transforms
import torch
from torch import nn
import torch_tensorrt
import modelopt.torch.quantization as mtq

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

testing_dataset = torchvision.datasets.CIFAR10(
    root="./data",
    train=False,
    download=True,
    transform=transforms.Compose(
        [
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
        ]
    ),
)

testing_dataloader = torch.utils.data.DataLoader(
    testing_dataset, batch_size=1, shuffle=False, num_workers=1
)


### Функция калибровки

In [21]:
def forward_loop(model):
    for img, _ in testing_dataloader:
        img = img.to(device)
        model(img)



### Модель

In [26]:
model = torchvision.models.resnet18().to(device)

### Квантизация при помощи TensorRT Model Optimiser

In [None]:
from functools import partial

config = mtq.INT8_DEFAULT_CFG

quantized_model = mtq.quantize(model, config, forward_loop)

Inserted 107 quantizers


In [None]:
mtq.print_quant_summary(quantized_model)

# Задание на семинар

Нужно квантизировать при помощи TensorRT любую модель из torchvision или timm (hugging-face) до int8 и до float16. Затем нужно проверить скорость работы получившихся вариаций модели (float32 - исходная, float16 и int8) и их размер. Результаты привести в блокноте.

In [None]:
!pip install timm torchao



In [None]:
from typing import Tuple
from torchao.utils import (
    benchmark_model,
    unwrap_tensor_subclass,
)
from torch import nn
import torch

import os

def benchmark_speed(model_orig: nn.Module,
                    model_quant: nn.Module,
                    example_inputs: torch.Tensor,
                    num_runs: int = 100):
    torch._dynamo.reset()
    orig_time = benchmark_model(model_orig, num_runs, example_inputs)
    quant_time = benchmark_model(model_quant, num_runs, example_inputs)

    print("orig mean time: %0.3f ms" % orig_time)
    print("quant mean time: %0.3f ms" % quant_time)
    print("speedup: %0.1fx" % (orig_time / quant_time))
    torch._dynamo.reset()


def benchmark_size(model_orig: nn.Module, model_quant: nn.Module):
    torch.save(model_orig, "/tmp/orig_model.pt")
    torch.save(model_quant, "/tmp/quant_model.pt")
    quant_model_size_mb = os.path.getsize("/tmp/quant_model.pt") / 1024 / 1024
    orig_model_size_mb = os.path.getsize("/tmp/orig_model.pt") / 1024 / 1024

    print("quant model size: %.2f MB" % quant_model_size_mb)

    print("original model size: %.2f MB" % orig_model_size_mb)


In [None]:
import timm

model = timm.create_model('vit_base_patch14_dinov2.lvd142m').cuda()
model




VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 768, kernel_size=(14, 14), stride=(14, 14))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (patch_drop): Identity()
  (norm_pre): Identity()
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=768, out_features=2304, bias=True)
        (q_norm): Identity()
        (k_norm): Identity()
        (attn_drop): Dropout(p=0.0, inplace=False)
        (norm): Identity()
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): LayerScale()
      (drop_path1): Identity()
      (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (act): GELU(approximate='none')
        (drop1): Dropout(p=0.0, inplace=Fal

In [None]:
import torchvision
from torchvision import transforms

testing_dataset = torchvision.datasets.CIFAR10(
    root="./data",
    train=False,
    download=True,
    transform=transforms.Compose(
        [
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
            transforms.Resize((518, 518))
        ]
    ),
)

testing_dataloader = torch.utils.data.DataLoader(
    testing_dataset, batch_size=1, shuffle=False, num_workers=1
)

In [None]:
from torch_tensorrt.ts.ptq import DataLoaderCalibrator, CalibrationAlgo

calibrator = DataLoaderCalibrator(
    testing_dataloader,
    cache_file="./calibration.cache",
    use_cache=False,
    algo_type=CalibrationAlgo.ENTROPY_CALIBRATION_2,
    device=torch.device("cuda:0"),
)


  calibrator = DataLoaderCalibrator(



In [None]:
quantized_model = torch_tensorrt.compile(model, inputs=[torch_tensorrt.Input((1, 3, 518, 518))],
                                    enabled_precisions={torch.float, torch.half, torch.int8},
                                    # calibrator=calibrator,
                                    device='cuda:0')

ERROR:torch_tensorrt [TensorRT Conversion Context]:IBuilder::buildSerializedNetwork: Error Code 4: Internal Error (Calibration failure occurred with no scaling factors detected. This could be due to no int8 calibrator or insufficient custom scales for network layers. Please see int8 sample to setup calibration correctly.)


AssertionError: 

In [None]:
# benchmark_speed(model, quantized_model, torch.rand(1, 3, 518, 518).cuda())
benchmark_size(model, quantized_model)

quant model size: 937.36 MB
original model size: 330.39 MB
