In [None]:
# Print the `pip list` entries whose line contains "torch" (name and installed version).
!pip list | grep torch

torch                            2.0.1+cu118
torchaudio                       2.0.2+cu118
torchdata                        0.6.1
torchsummary                     1.5.1
torchtext                        0.15.2
torchvision                      0.15.2+cu118


In [None]:
# Print the version banner of the CUDA compiler driver (nvcc) found on PATH.
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


In [None]:
# Show the driver version, the visible GPU(s) and their current memory/utilisation.
!nvidia-smi

Sun Oct  1 19:37:48 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P0    27W /  70W |   1393MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!cat /usr/include/cudnn_version.h | grep CUDNN_MAJOR -A 2

#define CUDNN_MAJOR 8
#define CUDNN_MINOR 9
#define CUDNN_PATCHLEVEL 0
--
#define CUDNN_VERSION (CUDNN_MAJOR * 1000 + CUDNN_MINOR * 100 + CUDNN_PATCHLEVEL)

/* cannot use constexpr here since this is a C-only file */


In [None]:
!pip install tensorrt
!pip list | grep tensorrt

Collecting tensorrt
  Downloading tensorrt-8.6.1.post1.tar.gz (18 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: tensorrt
  Building wheel for tensorrt (setup.py) ... [?25l[?25hdone
  Created wheel for tensorrt: filename=tensorrt-8.6.1.post1-py2.py3-none-any.whl size=17281 sha256=98680342106bfb7b07061b763770ed41c76bd08db680ffc05afc385c170705bc
  Stored in directory: /root/.cache/pip/wheels/f4/c8/0e/b79b08e45752491b9acfdbd69e8a609e8b2ed7640dda5a3e59
Successfully built tensorrt
Installing collected packages: tensorrt
Successfully installed tensorrt-8.6.1.post1
tensorrt                         8.6.1.post1
tensorrt-bindings                8.6.1
tensorrt-libs                    8.6.1


Docker: https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tensorrt

# [PyTorch + TensorRT](https://pytorch.org/TensorRT/)

### Installation

In [None]:
!pip3 install nvidia-pyindex
!pip3 install nvidia-tensorrt
!pip3 install torch-tensorrt==1.4.0 -f https://github.com/pytorch/TensorRT/releases/expanded_assets/1.4.0

Collecting nvidia-pyindex
  Downloading nvidia-pyindex-1.0.9.tar.gz (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: nvidia-pyindex
  Building wheel for nvidia-pyindex (setup.py) ... [?25l[?25hdone
  Created wheel for nvidia-pyindex: filename=nvidia_pyindex-1.0.9-py3-none-any.whl size=8417 sha256=2c861d71196d3ac9b184b6214544caea879a442264745e57419b19172d315e17
  Stored in directory: /root/.cache/pip/wheels/2c/af/d0/7a12f82cab69f65d51107f48bcd6179e29b9a69a90546332b3
Successfully built nvidia-pyindex
Installing collected packages: nvidia-pyindex
Successfully installed nvidia-pyindex-1.0.9
Collecting nvidia-tensorrt
  Downloading nvidia_tensorrt-99.0.0-py3-none-manylinux_2_17_x86_64.whl (17 kB)
Installing collected packages: nvidia-tensorrt
Successfully installed nvidia-tensorrt-99.0.0
Looking in links: https://github.com/pytorch/TensorRT/releases/expanded_assets/1.4.0
Collecting torch-tensorrt==1.4.0
  Downloading torch_tensorrt-1.4.0

### Using Torch-TensorRT in Python

The Torch-TensorRT Python API accepts a `torch.nn.Module`, `torch.jit.ScriptModule`, or `torch.fx.GraphModule` as input.

https://pytorch.org/TensorRT/py_api/torch_tensorrt.html

In [None]:
import torch_tensorrt
import torch

In [None]:
import torchvision.models as models

# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# `pretrained=True` is deprecated since torchvision 0.13; the weights enum
# requests the same ImageNet checkpoint explicitly.
# The model is put in eval mode and moved to the selected device.
model = (
    models.mobilenet_v2(weights=models.MobileNet_V2_Weights.IMAGENET1K_V1)
    .eval()
    .to(device)
)

Downloading: "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth" to /root/.cache/torch/hub/checkpoints/mobilenet_v2-b0353104.pth
100%|██████████| 13.6M/13.6M [00:00<00:00, 64.2MB/s]


In [None]:
import time

In [None]:
# Baseline: wall-clock latency of 10 single-image forward passes through the
# eager PyTorch model.
input_ = torch.rand([1, 3, 224, 224]).to(device)

# Inference only: disable autograd so that graph bookkeeping is not part of
# the measured time.
with torch.no_grad():
    for _ in range(10):
        # CUDA work is asynchronous: drain pending work before starting the
        # clock and wait for the forward pass before stopping it. Skipped on
        # the CPU fallback, where there is no CUDA stream to synchronise.
        if device == "cuda":
            torch.cuda.current_stream().synchronize()
        t0 = time.perf_counter()  # monotonic, high-resolution timer
        model(input_)
        if device == "cuda":
            torch.cuda.current_stream().synchronize()
        t1 = time.perf_counter()
        print(t1 - t0)

0.013536214828491211
0.007731199264526367
0.008272647857666016
0.007344722747802734
0.008091211318969727
0.007662057876586914
0.0074388980865478516
0.007418155670166016
0.0073130130767822266
0.0071849822998046875


In [None]:
# Precisions TensorRT is allowed to use for the engine: FP32 and FP16.
enabled_precisions = {torch.float, torch.half}

# One network input. min/opt/max shapes are identical, i.e. the shape is fixed
# at [1, 3, 224, 224]; the input tensor is declared as half precision.
inputs = [
    torch_tensorrt.Input(
        dtype=torch.half,
        min_shape=[1, 3, 224, 224],
        opt_shape=[1, 3, 224, 224],
        max_shape=[1, 3, 224, 224],
    )
]

# Compile the model loaded above with Torch-TensorRT.
trt_ts_module = torch_tensorrt.compile(
    model,
    inputs=inputs,
    enabled_precisions=enabled_precisions,
)


In [None]:
# Wall-clock latency of 10 forward passes through the compiled module, using a
# half-precision input on the GPU (matching the dtype declared at compile time).
input_data = torch.randn((1, 3, 224, 224)).to("cuda").half()
for _ in range(10):
    # Drain pending GPU work (e.g. the input upload) before starting the clock,
    # and wait for the forward pass to finish before stopping it.
    torch.cuda.current_stream().synchronize()
    t0 = time.perf_counter()  # monotonic, high-resolution timer
    result = trt_ts_module(input_data)
    torch.cuda.current_stream().synchronize()
    t1 = time.perf_counter()
    print(t1 - t0)

0.07064247131347656
0.008928537368774414
0.0011997222900390625
0.0011873245239257812
0.001203775405883789
0.001194000244140625
0.0011894702911376953
0.0011870861053466797
0.0011799335479736328
0.0012106895446777344


In [None]:
# Serialise the compiled module to disk so it can be reloaded with torch.jit.load.
torch.jit.save(trt_ts_module, "trt_ts_module.ts")

In [None]:
# Deployment application
# Self-contained: everything this cell needs is imported/created here, so it
# does not depend on variables left in the kernel by earlier cells.
import time

import torch
import torch_tensorrt  # imported for its side effects: needed to load the TensorRT-compiled module

# Reload the module that was saved to disk.
trt_ts_module = torch.jit.load("trt_ts_module.ts")

# Fresh half-precision input on the GPU with the shape the module was compiled for.
input_data = torch.randn((1, 3, 224, 224)).to("cuda").half()

# Time a single forward pass; synchronise around it because CUDA is asynchronous.
torch.cuda.current_stream().synchronize()
t0 = time.perf_counter()
result = trt_ts_module(input_data)
torch.cuda.current_stream().synchronize()
t1 = time.perf_counter()
print(t1 - t0)

0.0020499229431152344


### Using Torch-TensorRT Directly From PyTorch

In [None]:
import torch
import torch_tensorrt # importing this requires a GPU

In [None]:
import torchvision.models as models

# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# `pretrained=True` is deprecated since torchvision 0.13; the weights enum
# requests the same ImageNet checkpoint explicitly.
model = (
    models.mobilenet_v2(weights=models.MobileNet_V2_Weights.IMAGENET1K_V1)
    .eval()
    .to(device)
)

# Convert the eager model to TorchScript; the scripted module is what gets
# handed to the TensorRT backend below.
script_model = torch.jit.script(model)



In [None]:
### https://pytorch.org/TensorRT/py_api/ts.html
# Backend compile spec, keyed by the name of the method to lower ("forward").
spec = {
    "forward": torch_tensorrt.ts.TensorRTCompileSpec(
        # Single input with a fixed shape.
        inputs=[torch_tensorrt.Input([1, 3, 224, 224])],
        # Only half precision is enabled for the engine.
        enabled_precisions={torch.half},
        refit=False,
        debug=False,
        # Target GPU 0.
        device={
            "device_type": torch_tensorrt.DeviceType.GPU,
            "gpu_id": 0,
            "dla_core": 0,
            "allow_gpu_fallback": True,
        },
        capability=torch_tensorrt.EngineCapability.default,
        num_avg_timing_iters=1,
    )
}

In [None]:
# Lower the scripted model to the "tensorrt" backend using the compile spec.
# NOTE(review): `torch._C._jit_to_backend` is a private PyTorch entry point.
trt_model = torch._C._jit_to_backend("tensorrt", script_model, spec)



In [None]:
### https://github.com/pytorch/TensorRT/issues/2113
# Named `example_input` rather than `input` so the Python builtin `input()` is
# not shadowed for the rest of the kernel session.
# The tensor is explicitly float32 on the GPU.
example_input = torch.randn((1, 3, 224, 224)).to("cuda").to(torch.float)

# Run the backend-lowered model and display the shape of its output.
trt_model.forward(example_input).shape

torch.Size([1, 1000])

In [None]:
# Display the Python type of the object returned by the backend lowering.
type(trt_model)

torch.jit._script.RecursiveScriptModule

### Post Training Quantization (PTQ)

In [None]:
import torchvision
from torchvision import transforms

In [None]:
# Calibration data: the CIFAR-10 test split (train=False), downloaded to ./data
# if missing. Images are converted to tensors and normalised per channel.
# NOTE(review): the mean/std constants are presumably the usual CIFAR-10
# channel statistics — confirm.
testing_dataset = torchvision.datasets.CIFAR10(
    root="./data",
    train=False,
    download=True,
    transform=transforms.Compose(
        [
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
        ]
    ),
)

# Feed the calibrator one image at a time, in dataset order.
testing_dataloader = torch.utils.data.DataLoader(
    testing_dataset, batch_size=1, shuffle=False, num_workers=1
)

# INT8 calibrator driven by the dataloader above. `use_cache=False` means an
# existing ./calibration.cache is not reused.
calibrator = torch_tensorrt.ptq.DataLoaderCalibrator(
    testing_dataloader,
    cache_file="./calibration.cache",
    use_cache=False,
    algo_type=torch_tensorrt.ptq.CalibrationAlgo.ENTROPY_CALIBRATION_2,
    device=torch.device("cuda:0"),
)

# Fresh copy of the model to quantise. `pretrained=True` is deprecated since
# torchvision 0.13; the weights enum requests the same ImageNet checkpoint.
model = (
    models.mobilenet_v2(weights=models.MobileNet_V2_Weights.IMAGENET1K_V1)
    .eval()
    .to(device)
)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:03<00:00, 44847432.29it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data




In [None]:
# Compile the model with INT8 enabled (alongside FP32/FP16), using the
# calibrator defined above. The input shape matches the 32x32 RGB
# calibration images, batch size 1.
trt_mod = torch_tensorrt.compile(
    model,
    inputs=[torch_tensorrt.Input((1, 3, 32, 32))],
    enabled_precisions={torch.float, torch.half, torch.int8},
    calibrator=calibrator,
    # Target GPU 0.
    device={
        "device_type": torch_tensorrt.DeviceType.GPU,
        "gpu_id": 0,
        "dla_core": 0,
        "allow_gpu_fallback": False,
        "disable_tf32": False,
    },
)

In [None]:
# Wall-clock latency of 10 forward passes through the quantised module, using
# an input with the shape it was compiled for.
input_data = torch.randn((1, 3, 32, 32)).to("cuda")
for _ in range(10):
    # Drain pending GPU work (e.g. the input upload) before starting the clock,
    # and wait for the forward pass to finish before stopping it.
    torch.cuda.current_stream().synchronize()
    t0 = time.perf_counter()  # monotonic, high-resolution timer
    result = trt_mod(input_data)
    torch.cuda.current_stream().synchronize()
    t1 = time.perf_counter()
    print(t1 - t0)

In [None]:
# Serialise the quantised module to disk.
torch.jit.save(trt_mod, "ptq.ts")

### PyTorch - ONNX - TensorRT

https://github.com/NVIDIA/TensorRT/blob/main/quickstart/IntroNotebooks/4.%20Using%20PyTorch%20through%20ONNX.ipynb

### TensorRT + Nvidia pytorch_quantization

https://github.com/NVIDIA/TensorRT/tree/master/tools/pytorch-quantization

https://github.com/NVIDIA/TensorRT/blob/release/8.6/quickstart/quantization_tutorial/qat-ptq-workflow.ipynb

In [None]:
# Optional workaround, left disabled: forces the preferred encoding to UTF-8.
# NOTE(review): presumably for Colab's "A UTF-8 locale is required" error when
# running `!` shell commands — confirm before enabling.
# import locale
# locale.getpreferredencoding = lambda: "UTF-8"

### Задача на лабораторную

Провести QAT для выбранной модели и датасета и представить результаты сравнения с лабораторной 1. Опционально рассказать о трудностях, которые возникли при работе с tensorrt.