# Convert onnx models to TensorRT engine

<span style="color:red">RUN THIS NOTEBOOK IN iot-nlu-trt CONTAINER!!!</span>


## fp16 Quantization

In [None]:
import numpy as np
import tensorrt as trt
import pycuda.autoinit
import pycuda.driver as cuda
import unicodedata
import json

In [None]:
# Number of intent classes
# (None=0, LED_ON=1, LED_OFF=2, READ_THERMO=3, OPEN=4, CLOSE=5, SET_TEMP=6)
NUM_INTENT_LABELS = 7

# Number of slot (entity) types
# (COL=1, COLLTDEV=2, LOC=3, ONOFFDEV=4, OPENABLE=5, TEMPDEV=6, TEMPERTURE_NUM=7, THMDEV=8)
NUM_ENTITY_TYPE = 8

# Path of the ONNX model to convert, and of the TensorRT engine file to write.
ONNX_SIM_FILE_NAME = "./model/iot-nlu-sim.onnx"
ENGINE_FILE_NAME = "./model/iot-nlu-sim-fp16.engine"

In [None]:
import torch
print(torch.__version__)
import unicodedata
from ner_tokenizer_bio import NER_tokenizer_BIO

# Location of the saved tokenizer.
TOKENIZER_PATH = './model/iot-nlu-tokenizer'

# Load the tokenizer, telling it how many entity types the model distinguishes.
tokenizer = NER_tokenizer_BIO.from_pretrained(TOKENIZER_PATH, num_entity_type=NUM_ENTITY_TYPE)

In [None]:
# Sample utterance, NFKC-normalized before it is tokenized.
text = unicodedata.normalize('NFKC', '会議室にある黄色い電灯の火を点灯してくださいな')

# Tokenize the untagged text. `spans` is handed back to the tokenizer later,
# when the slot logits are decoded into entities.
encoding, spans = tokenizer.encode_plus_untagged(text, max_length=128, return_tensors='pt')

# Convert each encoded tensor to an int32 NumPy array.
input_ids, attention_mask, token_type_ids = (
    encoding[key].numpy().astype(np.int32)
    for key in ("input_ids", "attention_mask", "token_type_ids")
)

# Zero-filled float32 host arrays that will receive the engine outputs.
# NOTE(review): 128 mirrors max_length above and 17 is presumably
# 2 * NUM_ENTITY_TYPE + 1 (BIO tags) -- confirm against the exported model.
total_loss = np.zeros((1, 1), dtype=np.float32)
logits_intent = np.zeros((1, NUM_INTENT_LABELS), dtype=np.float32)
logits_slot = np.zeros((1, 128, 17), dtype=np.float32)

# Show shape, byte size and content of every input array.
print(f'input_ids:\n\tshape={input_ids.shape}\n\tsize={input_ids.nbytes} bytes\n\tcontent={input_ids}')
print(f'attention_mask:\n\tshape={attention_mask.shape}\n\tsize={attention_mask.nbytes} bytes\n\tcontent={attention_mask}')
print(f'token_type_ids:\n\tshape={token_type_ids.shape}\n\tsize={token_type_ids.nbytes} bytes\n\tcontent={token_type_ids}')

In [None]:
class MyLogger(trt.ILogger):
    """TensorRT logger that prints every message it receives."""

    def __init__(self):
        # Initialize the trt.ILogger base class explicitly.
        trt.ILogger.__init__(self)

    def log(self, severity, msg):
        """Print the severity followed by the message text."""
        print(severity, msg)

In [None]:
def build_engine(model_path, output_path):
    """Build an FP16 TensorRT engine from an ONNX file and save it to disk.

    Args:
        model_path: Path of the ONNX model to parse.
        output_path: Path the serialized engine is written to.

    Raises:
        RuntimeError: If the ONNX model cannot be parsed or the engine
            cannot be built.

    Note:
        The network input shapes are fixed to the shapes of the module-level
        arrays ``input_ids``, ``attention_mask`` and ``token_type_ids``.
    """
    print("building engine")
    logger = MyLogger()
    builder = trt.Builder(logger)

    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    parser = trt.OnnxParser(network, logger)
    success = parser.parse_from_file(model_path)
    for idx in range(parser.num_errors):
        print(parser.get_error(idx))
    if not success:
        # Building from a partially parsed network would only fail later
        # with a less helpful error, so stop here.
        raise RuntimeError(f"failed to parse ONNX model: {model_path}")

    config = builder.create_builder_config()
    config.set_flag(trt.BuilderFlag.FP16)
    # 4 << 33 bytes = 32 GiB workspace limit (the value is unchanged; the
    # previous "1 MiB" comment did not match it -- 1 MiB would be 1 << 20).
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 4 << 33)

    # Fix the shape of each network input to that of the matching sample array.
    # NOTE(review): assumes the ONNX inputs are ordered input_ids,
    # attention_mask, token_type_ids (the same order the inference cell uses
    # for its bindings) -- confirm against the model.
    sample_inputs = (input_ids, attention_mask, token_type_ids)
    for index, sample in enumerate(sample_inputs):
        network.get_input(index).shape = sample.shape

    serialized_engine = builder.build_serialized_network(network, config)
    if serialized_engine is None:
        raise RuntimeError(f"failed to build TensorRT engine from {model_path}")

    # Write to the path the caller asked for, not to a module-level constant.
    with open(output_path, "wb") as f:
        f.write(serialized_engine)
    print(f"{output_path} saved")


build_engine(ONNX_SIM_FILE_NAME, ENGINE_FILE_NAME)

Inference

In [None]:
# Read the serialized engine from disk and deserialize it with a TensorRT runtime.
runtime = trt.Runtime(MyLogger())
with open(ENGINE_FILE_NAME, 'rb') as f:
    engine_bytes = f.read()
engine = runtime.deserialize_cuda_engine(engine_bytes)

# Execution context created from the deserialized engine.
bert_context = engine.create_execution_context()

In [None]:
# Create a TensorRT runtime, read the engine file and deserialize it.
runtime = trt.Runtime(MyLogger())
with open(ENGINE_FILE_NAME, 'rb') as f:
    engine_bytes = f.read()
engine = runtime.deserialize_cuda_engine(engine_bytes)

In [None]:
# Run one inference pass: allocate device buffers, upload the inputs,
# execute the engine on a CUDA stream, and download the outputs.
bert_context = engine.create_execution_context()

# Host array shaped (1, NUM_INTENT_LABELS); not referenced again in this cell.
bert_output = torch.zeros((1, NUM_INTENT_LABELS)).cpu().detach().numpy()

# Byte sizes of every input and output buffer (element count * element size).
nbytes_input_ids      = trt.volume(input_ids.shape)      * trt.int32.itemsize
nbytes_attention_mask = trt.volume(attention_mask.shape) * trt.int32.itemsize
nbytes_token_type_ids = trt.volume(token_type_ids.shape) * trt.int32.itemsize
nbytes_logits_intent  = trt.volume(logits_intent.shape)  * trt.float32.itemsize
nbytes_logits_slot    = trt.volume(logits_slot.shape)    * trt.float32.itemsize
nbytes_total_loss     = trt.volume(total_loss.shape)     * trt.float32.itemsize

# Device memory allocation for inputs
d_input_ids      = cuda.mem_alloc(nbytes_input_ids)
d_attention_mask = cuda.mem_alloc(nbytes_attention_mask)
d_token_type_ids = cuda.mem_alloc(nbytes_token_type_ids)

# Device memory allocation for outputs
d_logits_intent  = cuda.mem_alloc(nbytes_logits_intent)
d_logits_slot    = cuda.mem_alloc(nbytes_logits_slot)
d_total_loss     = cuda.mem_alloc(nbytes_total_loss)

# Transfer input data from the host arrays to the device (GPU); all copies
# and the execution below are issued on the same stream.
stream = cuda.Stream()
cuda.memcpy_htod_async(d_input_ids,      input_ids,      stream)
cuda.memcpy_htod_async(d_attention_mask, attention_mask, stream)
cuda.memcpy_htod_async(d_token_type_ids, token_type_ids, stream)

# Run the model. `bindings` is the list of device buffer addresses.
# NOTE(review): the order used here (three inputs, then logits_intent,
# logits_slot, total_loss) is assumed to match the engine's binding order --
# confirm with the binding listing in the "Tips" section.
bindings = [int(d_input_ids), int(d_attention_mask), int(d_token_type_ids), int(d_logits_intent), int(d_logits_slot), int(d_total_loss)]
bert_context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)

# Copy the outputs from the GPU back into the host arrays
cuda.memcpy_dtoh_async(logits_intent, d_logits_intent, stream)
cuda.memcpy_dtoh_async(logits_slot,   d_logits_slot, stream)
cuda.memcpy_dtoh_async(total_loss,    d_total_loss, stream)

# Wait for all work queued on the stream before the host arrays are read.
stream.synchronize()

In [None]:
# Predicted intent: index of the largest intent logit of the first (only) row.
intent = logits_intent[0].argmax(-1)
# Slot logits of the first (only) row.
scores_slots = logits_slot[0]
# Let the tokenizer turn the slot scores back into entities of the input text.
entities_predicted = tokenizer.convert_bert_output_to_entities(text, scores_slots, spans)

print(f"input text={text}")
print(f"inferred intent={intent}")
print(f"inferred entities={json.dumps(entities_predicted, indent=2, ensure_ascii=False)}")

Tips

In [None]:
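# trtexec: TensorRT command-line tool, used here to load a serialized engine
# and dump its layer information ('sample.engine' is a placeholder path).
# https://github.com/NVIDIA/TensorRT/tree/main/samples/trtexec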
!trtexec --loadEngine='sample.engine' --dumpLayerInfo --fp16

In [None]:
# How to check the I/O of an engine:
# for every binding, print its NumPy dtype, whether it is an input, and its
# name; for input bindings also print the shapes of optimization profile 0.
for i, b in enumerate(engine):
    dtype = np.dtype(trt.nptype(engine.get_binding_dtype(i)))
    print(f'\t{dtype}')
    print(f'\t{engine.binding_is_input(b)}')
    print(f'\t{engine.get_binding_name(i)}')
    if engine.binding_is_input(b):
        print(f'\t{engine.get_profile_shape(0, b)}')


## Int8 Quantization

We use Polygraphy to quantize the ONNX model and convert it to a TensorRT engine.
See the following example for details:
https://github.com/NVIDIA/TensorRT/tree/main/tools/Polygraphy/examples/cli/convert/01_int8_calibration_in_tensorrt

Here is how to do it:
1. Clone TensorRT repository

   Type the following commands in the iot-nlu-trt container.
   
   ```
   cd ~/Github
   git clone https://github.com/NVIDIA/TensorRT.git
       # clone right under $HOME/Github in this example
       # I used rev. 03f27009
   export PATH=$PATH:$HOME/Github/TensorRT/tools/Polygraphy/bin
   cd ~/Github/iot-nlu/src
   
   ```
2. Set path to Polygraphy

    ```
    export PATH=$PATH:$HOME/Github/TensorRT/tools/Polygraphy/bin
    ```
3. Convert onnx models

   ```
    cd ~/Github/iot-nlu/src
    polygraphy convert model/iot-nlu-sim.onnx --int8 --data-loader-script data_loader.py --calibration-cache model/iot-nlu-sim-int8.cache -o model/iot-nlu-sim-int8.engine
    polygraphy convert model/iot-nlu.onnx --int8 --data-loader-script data_loader.py --calibration-cache model/iot-nlu-int8.cache -o model/iot-nlu-int8.engine
   ```
    