In [None]:
# Python dependencies for the benchmarks; only onnxruntime-gpu is version-pinned here.
# NOTE(review): the notebook imports `transformers` and `optimum`, not `huggingface` —
# confirm the `huggingface` package is actually needed.
!pip install huggingface torch transformers datasets accelerate optimum
!pip install onnxruntime-gpu==1.14.0
# Upgrade the packaging toolchain, then install nvidia-pyindex
# (presumably so NVIDIA-hosted wheels such as tensorrt can be resolved — TODO confirm).
!pip install --upgrade setuptools pip wheel
!pip install nvidia-pyindex

# Register NVIDIA's CUDA apt repository for Ubuntu 20.04: pin file, signing key, source entry.
!wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin 

!sudo mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600
!sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub
!sudo add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /"
!sudo apt-get update

# cuDNN 8.7.0.84 built against CUDA 11.8 (runtime package, then headers/dev package).
# NOTE(review): no `-y` flag is passed, so apt-get may stop to ask for confirmation —
# confirm these lines run unattended in the target environment.
!sudo apt-get install libcudnn8=8.7.0.84-1+cuda11.8
!sudo apt-get install libcudnn8-dev=8.7.0.84-1+cuda11.8

# TensorRT Python package, pinned.
!pip install tensorrt==8.5.3.1

import os

# Each `!export ...` runs in its own short-lived subshell, so the variables never
# reached the kernel or any later command. Set them on the kernel process instead.
os.environ['CUDA_PATH'] = '/usr/local/cuda'

# Append the CUDA and TensorRT library directories. dict.fromkeys keeps the first
# occurrence of every entry in order, so re-running the cell does not grow the
# variable, and empty entries (unset variable) are dropped to avoid a stray ':'.
# Note: subprocesses started from the notebook inherit this value; the dynamic
# loader of the already-running kernel reads LD_LIBRARY_PATH only at process start.
os.environ['LD_LIBRARY_PATH'] = os.pathsep.join(dict.fromkeys(
    entry
    for entry in (
        *os.environ.get('LD_LIBRARY_PATH', '').split(os.pathsep),
        '/usr/local/cuda-11.8/lib64',
        '/usr/local/lib/python3.9/dist-packages/tensorrt',
    )
    if entry
))

# Sanity check: the import raises if the tensorrt package is not importable;
# otherwise print the version that was picked up.
import tensorrt
print(tensorrt.__version__)

In [None]:
# Mount Google Drive at /content/drive (Colab-only API). The benchmark cells
# read their .onnx files from /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [3]:
from transformers import AutoModel
import onnxruntime
from optimum.bettertransformer import BetterTransformer
import torch

In [4]:
def setup_pytorch_cpu(model_name):
    """Load a pretrained model for plain PyTorch inference on the CPU.

    Args:
        model_name: identifier handed to ``AutoModel.from_pretrained``.

    Returns:
        Tuple ``(model, device)``: the model switched to eval mode and moved
        to the CPU, and the CPU ``torch.device``.
    """
    cpu = torch.device('cpu')
    net = AutoModel.from_pretrained(model_name)
    net.eval()  # inference mode for layers such as dropout
    net.to(cpu)
    return net, cpu

def setup_pytorch_cuda(model_name):
    """Load a pretrained model for plain PyTorch inference on the GPU.

    Args:
        model_name: identifier handed to ``AutoModel.from_pretrained``.

    Returns:
        Tuple ``(model, device)``: the model switched to eval mode and moved
        to CUDA, and the CUDA ``torch.device``.
    """
    gpu = torch.device('cuda')
    net = AutoModel.from_pretrained(model_name)
    net.eval()  # inference mode for layers such as dropout
    net.to(gpu)
    return net, gpu

def setup_pytorch_bettertransformers(model_name):
    """Load a pretrained model on the GPU and convert it with BetterTransformer.

    Args:
        model_name: identifier handed to ``AutoModel.from_pretrained``.

    Returns:
        Tuple ``(model, device)``: the result of ``BetterTransformer.transform``
        applied to the eval-mode CUDA model, and the CUDA ``torch.device``.
    """
    gpu = torch.device('cuda')
    base_model = AutoModel.from_pretrained(model_name)
    base_model.eval()
    base_model.to(gpu)
    # Conversion happens after the model is already in eval mode and on the GPU.
    return BetterTransformer.transform(base_model), gpu

def setup_onnx_cpu(model_path):
    """Open an ONNX Runtime session restricted to the CPU execution provider.

    Prints the providers the session actually ended up with.

    Args:
        model_path: path of the ``.onnx`` file to load.

    Returns:
        Tuple ``(session, device)`` where ``device`` is ``torch.device('cpu')``.
    """
    ort_session = onnxruntime.InferenceSession(
        model_path,
        providers=['CPUExecutionProvider'],
    )
    active_providers = ort_session.get_providers()
    print(active_providers)
    return ort_session, torch.device('cpu')

def setup_onnx_cuda(model_path):
    """Open an ONNX Runtime session that requests the CUDA execution provider.

    Prints the providers the session actually ended up with.

    Args:
        model_path: path of the ``.onnx`` file to load.

    Returns:
        Tuple ``(session, device)`` where ``device`` is ``torch.device('cuda')``.
    """
    ort_session = onnxruntime.InferenceSession(
        model_path,
        providers=['CUDAExecutionProvider'],
    )
    active_providers = ort_session.get_providers()
    print(active_providers)
    return ort_session, torch.device('cuda')

def setup_onnx_tensorrt(model_path):
    """Open an ONNX Runtime session that requests the TensorRT execution provider.

    Prints the providers the session actually ended up with.

    Args:
        model_path: path of the ``.onnx`` file to load.

    Returns:
        Tuple ``(session, device)`` where ``device`` is ``torch.device('cuda')``.
    """
    ort_session = onnxruntime.InferenceSession(
        model_path,
        providers=['TensorrtExecutionProvider'],
    )
    active_providers = ort_session.get_providers()
    print(active_providers)
    return ort_session, torch.device('cuda')

def setup_onnx_openvino(model_path):
    """Open an ONNX Runtime session that uses the OpenVINO execution provider.

    The OpenVINO provider comes from the separate ``onnxruntime-openvino``
    package. If the currently imported ``onnxruntime`` does not offer it, the
    package is installed into the kernel's environment and an error asks for a
    runtime restart: a build installed after ``onnxruntime`` was imported
    cannot take effect in this process, so continuing would not benchmark
    OpenVINO.

    Args:
        model_path: path of the ``.onnx`` file to load.

    Returns:
        Tuple ``(session, device)`` where ``device`` is ``torch.device('cpu')``.

    Raises:
        RuntimeError: the provider was missing; the package has been installed
            and the runtime must be restarted before calling this again.
        subprocess.CalledProcessError: the pip installation itself failed.
    """
    provider = 'OpenVINOExecutionProvider'

    if provider not in onnxruntime.get_available_providers():
        import subprocess
        import sys

        # Use the kernel's own interpreter so the package lands in the right
        # environment, and only install when it is actually missing.
        subprocess.check_call(
            [sys.executable, '-m', 'pip', 'install', 'onnxruntime-openvino==1.13.1']
        )
        raise RuntimeError(
            "onnxruntime-openvino==1.13.1 was just installed; restart the runtime "
            "and run this cell again so the OpenVINO build of onnxruntime is loaded."
        )

    session = onnxruntime.InferenceSession(model_path, providers=[provider])
    print(session.get_providers())
    return session, torch.device('cpu')

In [16]:
import numpy
from transformers import PreTrainedModel
import timeit

def measure_inference_time(model, device, input, num_iter):
    """Time ``num_iter`` inference calls of ``model`` on ``input``.

    Three untimed warm-up calls are made before the timed loop.

    Args:
        model: an ``onnxruntime.InferenceSession`` or a
            ``transformers.PreTrainedModel``.
        device: ``torch.device`` the model is meant to run on.
        input: dict mapping model input names to numpy arrays.
        num_iter: number of timed calls.

    Returns:
        Total seconds for all ``num_iter`` calls, as reported by
        ``timeit.timeit`` (not a per-call average).

    Raises:
        TypeError: ``model`` is neither of the supported types (previously
            this silently returned ``None``).
    """
    if isinstance(model, onnxruntime.InferenceSession):
        if device.type != 'cuda':
            # CPU session: feed the numpy arrays directly.
            for _ in range(3):  # warmup
                model.run(None, input)
            return timeit.timeit(lambda: model.run(None, input), number=num_iter)

        # CUDA session: copy the inputs to the device once and bind them, so the
        # timed loop does not include the input transfer.
        device_id = 0 if device.index is None else device.index
        io_binding = model.io_binding()

        # The binding only stores raw pointers; keep the OrtValues referenced
        # until timing is done.
        bound_values = []
        for name, array in input.items():
            ort_value = onnxruntime.OrtValue.ortvalue_from_numpy(array, device.type, device_id)
            bound_values.append(ort_value)
            io_binding.bind_input(
                name=name,
                device_type=ort_value.device_name(),
                device_id=device_id,
                element_type=array.dtype,
                shape=ort_value.shape(),
                buffer_ptr=ort_value.data_ptr(),
            )

        # Bind outputs to the same device id as the inputs (previously always 0).
        for output in model.get_outputs():
            io_binding.bind_output(output.name, device.type, device_id=device_id)

        for _ in range(3):  # warmup
            model.run_with_iobinding(io_binding)
        return timeit.timeit(lambda: model.run_with_iobinding(io_binding), number=num_iter)

    if isinstance(model, PreTrainedModel):
        proc_input = {k: torch.from_numpy(v).to(device) for k, v in input.items()}
        on_cuda = device.type == 'cuda'

        def forward():
            model(**proc_input)
            if on_cuda:
                # CUDA kernels are launched asynchronously; wait for them so the
                # measurement covers the actual computation, not just the launch.
                torch.cuda.synchronize(device)

        # Inference benchmark: do not record an autograd graph.
        with torch.no_grad():
            for _ in range(3):  # warmup
                forward()
            return timeit.timeit(forward, number=num_iter)

    raise TypeError(
        f"unsupported model type for benchmarking: {type(model).__name__}"
    )

In [None]:
from transformers import ViTImageProcessor

# Image processor loaded from the same checkpoint name the ViT benchmark cell uses.
image_processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')

# One random 224x224x3 image with integer values in [0, 255], converted to float32.
data = numpy.random.randint(
    low=0,
    high=256,
    size=(224, 224, 3),
    dtype=numpy.uint8,
).astype(numpy.float32)

# Run the processor and keep only the array it returns under 'pixel_values',
# wrapped in a dict keyed by that input name.
data = {
    'pixel_values': image_processor.preprocess(data, return_tensors="np")['pixel_values'],
}

In [None]:
# ViT benchmark: keep exactly one of the setup lines below active, then run the cell.
# Every setup_* helper returns the (model or session, device) pair that
# measure_inference_time takes.
model, device = setup_pytorch_cpu('google/vit-base-patch16-224')
# model, device = setup_pytorch_cuda('google/vit-base-patch16-224')
# model, device = setup_pytorch_bettertransformers('google/vit-base-patch16-224')
# model, device = setup_onnx_cpu('/content/drive/MyDrive/model.onnx')
# model, device = setup_onnx_cuda('/content/drive/MyDrive/model.onnx')
# model, device = setup_onnx_cpu('/content/drive/MyDrive/model_opt_no_att.onnx')
# model, device = setup_onnx_cpu('/content/drive/MyDrive/model_opt_full.onnx')
# model, device = setup_onnx_cuda('/content/drive/MyDrive/model_opt_full.onnx')
# model, device = setup_onnx_tensorrt('/content/drive/MyDrive/model.onnx')

# OpenVINO needs a separate onnxruntime package, so the runtime must be restarted after installing it.
# !pip install onnxruntime-openvino
# model, device = setup_onnx_openvino('/content/drive/MyDrive/model.onnx')

# Prints the total seconds for 1000 timed calls (a total, not a per-call average).
print(measure_inference_time(model, device, data, 1000))

['CUDAExecutionProvider', 'CPUExecutionProvider']
2.9568021059994862


In [6]:
from transformers import BertConfig

# Shape of the synthetic BERT batch.
batch_size = 1
sequence_length = 32

# Default BertConfig, used only for its vocabulary size.
bert_config = BertConfig()

# Random token ids drawn from [0, vocab_size - 1); the upper bound is exclusive.
input_ids = numpy.random.randint(
    low=0,
    high=bert_config.vocab_size - 1,
    size=(batch_size, sequence_length),
    dtype=numpy.int64,
)

# Model inputs: every position attended to, every token in segment 0.
# The *_like helpers reuse input_ids' shape and int64 dtype.
data = dict(
    input_ids=input_ids,
    attention_mask=numpy.ones_like(input_ids),
    token_type_ids=numpy.zeros_like(input_ids),
)

In [20]:
# BERT benchmark: keep exactly one of the setup lines below active, then run the cell.
# Every setup_* helper returns the (model or session, device) pair that
# measure_inference_time takes.
model, device = setup_pytorch_cpu('bert-base-uncased')
# model, device = setup_pytorch_cuda('bert-base-uncased')
# model, device = setup_pytorch_bettertransformers('bert-base-uncased')
# model, device = setup_onnx_cpu('/content/drive/MyDrive/bert.onnx')
# model, device = setup_onnx_cuda('/content/drive/MyDrive/bert.onnx')
# model, device = setup_onnx_cpu('/content/drive/MyDrive/bert_opt.onnx')
# model, device = setup_onnx_cuda('/content/drive/MyDrive/bert_opt.onnx')
# model, device = setup_onnx_tensorrt('/content/drive/MyDrive/bert.onnx')

# Prints the total seconds for 1000 timed calls (a total, not a per-call average).
print(measure_inference_time(model, device, data, 1000))


['TensorrtExecutionProvider', 'CPUExecutionProvider']
1.1626369360001263
