In [None]:
import numpy as np
import tensorrt as trt
import pycuda.autoinit
import pycuda.driver as cuda
import unicodedata
import json
from tqdm import tqdm
import datetime

import torch
print(torch.__version__)
from ner_tokenizer_bio import NER_tokenizer_BIO
from data_loader import DataLoaderContext

In [None]:
# Number of intent classes
# (None=0, LED_ON=1, LED_OFF=2, READ_THERMO=3, OPEN=4, CLOSE=5, SET_TEMP=6)
NUM_INTENT_LABELS = 7

# Number of slot (entity) types
# (COL=1, COLLTDEV=2, LOC=3, ONOFFDEV=4, OPENABLE=5, TEMPDEV=6, TEMPERTURE_NUM=7, THMDEV=8)
NUM_ENTITY_TYPE = 8

# Location of the tokenizer files
TOKENIZER_PATH = "./model/iot-nlu-tokenizer"

# Engine file to load. Other engine files that can be swapped in here:
#   "./model/iot-nlu-int8.engine"
#   "model/iot-nlu-sim-fp16.engine"
ENGINE_PATH = "./model/iot-nlu-sim-int8.engine"

In [None]:
# Load the tokenizer from TOKENIZER_PATH, configured with the number of slot types
tokenizer = NER_tokenizer_BIO.from_pretrained(TOKENIZER_PATH, num_entity_type=NUM_ENTITY_TYPE)

In [None]:
# Test split used both for shape inspection here and for the throughput benchmark.
dataset_test = DataLoaderContext(tokenizer).dataset_test

# Host-side input arrays, taken from the first test sample. Their shapes are
# reused later to size the device input buffers.
input_ids      = dataset_test[0]["input_ids"]
attention_mask = dataset_test[0]["attention_mask"]
token_type_ids = dataset_test[0]["token_type_ids"]

# Dimensions of the slot output, named instead of hard-coded.
# MAX_SEQ_LENGTH mirrors the max_length=128 passed to the tokenizer in the single-shot cell.
MAX_SEQ_LENGTH = 128
# NOTE(review): assumes BIO labelling, i.e. one B- and one I- label per entity
# type plus "O" (2 * 8 + 1 = 17) -- confirm against the engine's slot output.
NUM_SLOT_LABELS = 2 * NUM_ENTITY_TYPE + 1

# Host-side output buffers; inference results are copied into these in place.
logits_intent = np.zeros([1, NUM_INTENT_LABELS], np.float32)
logits_slot   = np.zeros([1, MAX_SEQ_LENGTH, NUM_SLOT_LABELS], np.float32)
total_loss    = np.zeros([1, 1], np.float32)

# Show shape, byte size and content of each input so dtype/shape problems are visible early.
for _name, _array in (
    ("input_ids", input_ids),
    ("attention_mask", attention_mask),
    ("token_type_ids", token_type_ids),
):
    print(f'{_name}:\n\tshape={_array.shape}\n\tsize={_array.nbytes} bytes\n\tcontent={_array}')

In [None]:
class MyLogger(trt.ILogger):
    """TensorRT logger that prints every message it receives."""

    def __init__(self):
        # Initialise the base class explicitly on this instance.
        trt.ILogger.__init__(self)

    def log(self, severity, msg):
        """Print the severity and the message text, separated by a space."""
        print(severity, msg)

In [None]:
def load_engine(runtime, engine_path):
    """Read a serialized engine file and deserialize it with ``runtime``.

    Args:
        runtime: Object exposing ``deserialize_cuda_engine(bytes)``
            (a ``trt.Runtime`` in this notebook).
        engine_path: Path of the serialized engine file.

    Returns:
        The engine object returned by ``runtime.deserialize_cuda_engine``.

    Raises:
        OSError: If the file cannot be opened or read.
        RuntimeError: If deserialization returns ``None``.
    """
    # Read the whole file first so the handle is closed before deserialization.
    with open(engine_path, 'rb') as f:
        engine_bytes = f.read()

    engine = runtime.deserialize_cuda_engine(engine_bytes)
    # Fail here with a clear message instead of letting a None engine surface
    # later as an opaque error at the caller's `with ... as engine`.
    if engine is None:
        raise RuntimeError(f"Failed to deserialize TensorRT engine from {engine_path!r}")
    return engine

In [None]:
# Measure throughput: run every test sample through the engine one at a time
# and report the total elapsed wall-clock time in milliseconds.
with trt.Runtime(MyLogger()) as runtime:
    with load_engine(runtime, ENGINE_PATH) as engine:
        with engine.create_execution_context() as context:

            # Buffer sizes in bytes, derived from the shapes of the host arrays.
            # NOTE(review): inputs are sized as int32 from the first sample's shape;
            # this assumes every dataset sample is an int32 array of that same
            # shape -- confirm, otherwise the host-to-device copies below would
            # not fit the device buffers.
            nbytes_input_ids      = trt.volume(input_ids.shape)      * trt.int32.itemsize
            nbytes_attention_mask = trt.volume(attention_mask.shape) * trt.int32.itemsize
            nbytes_token_type_ids = trt.volume(token_type_ids.shape) * trt.int32.itemsize
            nbytes_logits_intent  = trt.volume(logits_intent.shape)  * trt.float32.itemsize
            nbytes_logits_slot    = trt.volume(logits_slot.shape)    * trt.float32.itemsize
            nbytes_total_loss     = trt.volume(total_loss.shape)     * trt.float32.itemsize

            # Device memory for the inputs
            d_input_ids      = cuda.mem_alloc(nbytes_input_ids)
            d_attention_mask = cuda.mem_alloc(nbytes_attention_mask)
            d_token_type_ids = cuda.mem_alloc(nbytes_token_type_ids)

            # Device memory for the outputs
            d_logits_intent  = cuda.mem_alloc(nbytes_logits_intent)
            d_logits_slot    = cuda.mem_alloc(nbytes_logits_slot)
            d_total_loss     = cuda.mem_alloc(nbytes_total_loss)

            # Stream on which the copies and the inference are enqueued
            stream = cuda.Stream()

            # The device buffers are reused for every sample, so the binding list
            # is built once here instead of on every iteration of the timed loop.
            # NOTE(review): the order is assumed to match the engine's binding
            # indices (three inputs, then three outputs) -- confirm.
            bindings = [int(d_input_ids), int(d_attention_mask), int(d_token_type_ids), int(d_logits_intent), int(d_logits_slot), int(d_total_loss)]

            print(f"num of data={len(dataset_test)}")
            start_time = datetime.datetime.now()
            for encoding in tqdm(dataset_test):
                input_ids      = encoding["input_ids"]
                attention_mask = encoding["attention_mask"]
                token_type_ids = encoding["token_type_ids"]

                # Copy this sample's inputs from host to device
                cuda.memcpy_htod_async(d_input_ids,      input_ids,      stream)
                cuda.memcpy_htod_async(d_attention_mask, attention_mask, stream)
                cuda.memcpy_htod_async(d_token_type_ids, token_type_ids, stream)

                # Run the model
                context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)

                # Copy the outputs from device to host, then wait for the stream
                # so each sample is fully processed before the next one starts.
                cuda.memcpy_dtoh_async(logits_intent, d_logits_intent, stream)
                cuda.memcpy_dtoh_async(logits_slot,   d_logits_slot, stream)
                cuda.memcpy_dtoh_async(total_loss,    d_total_loss, stream)
                stream.synchronize()
            end_time = datetime.datetime.now()

            # Total time for the whole dataset, converted from seconds to milliseconds
            time_diff = (end_time - start_time)
            execution_time = time_diff.total_seconds() * 1000
            print(f"{execution_time} ms")


In [None]:
# Run a single-shot inference on one example sentence
text = unicodedata.normalize('NFKC', '会議室にある黄色い電灯の火を点灯してくださいな')
encoding, spans = tokenizer.encode_plus_untagged(text, max_length=128, return_tensors='pt')

# Convert the tokenizer's tensors to int32 NumPy arrays for the host-to-device copies
input_ids      = encoding["input_ids"].numpy().astype(np.int32)
attention_mask = encoding["attention_mask"].numpy().astype(np.int32)
token_type_ids = encoding["token_type_ids"].numpy().astype(np.int32)

with trt.Runtime(MyLogger()) as runtime, \
        load_engine(runtime, ENGINE_PATH) as engine, \
        engine.create_execution_context() as context:

    # Buffer sizes in bytes, derived from the shapes of the host arrays
    nbytes_input_ids      = trt.int32.itemsize   * trt.volume(input_ids.shape)
    nbytes_attention_mask = trt.int32.itemsize   * trt.volume(attention_mask.shape)
    nbytes_token_type_ids = trt.int32.itemsize   * trt.volume(token_type_ids.shape)
    nbytes_logits_intent  = trt.float32.itemsize * trt.volume(logits_intent.shape)
    nbytes_logits_slot    = trt.float32.itemsize * trt.volume(logits_slot.shape)
    nbytes_total_loss     = trt.float32.itemsize * trt.volume(total_loss.shape)

    # Device memory for the inputs
    d_input_ids      = cuda.mem_alloc(nbytes_input_ids)
    d_attention_mask = cuda.mem_alloc(nbytes_attention_mask)
    d_token_type_ids = cuda.mem_alloc(nbytes_token_type_ids)

    # Device memory for the outputs
    d_logits_intent  = cuda.mem_alloc(nbytes_logits_intent)
    d_logits_slot    = cuda.mem_alloc(nbytes_logits_slot)
    d_total_loss     = cuda.mem_alloc(nbytes_total_loss)

    # Stream on which the copies and the inference are enqueued
    stream = cuda.Stream()

    # Host -> device: upload the three inputs
    cuda.memcpy_htod_async(d_input_ids,      input_ids,      stream)
    cuda.memcpy_htod_async(d_attention_mask, attention_mask, stream)
    cuda.memcpy_htod_async(d_token_type_ids, token_type_ids, stream)

    # Run the model; bindings are the device addresses, inputs first and then outputs
    bindings = list(map(int, (
        d_input_ids, d_attention_mask, d_token_type_ids,
        d_logits_intent, d_logits_slot, d_total_loss,
    )))
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)

    # Device -> host: download the outputs, then wait for the stream to finish
    cuda.memcpy_dtoh_async(logits_intent, d_logits_intent, stream)
    cuda.memcpy_dtoh_async(logits_slot,   d_logits_slot, stream)
    cuda.memcpy_dtoh_async(total_loss,    d_total_loss, stream)
    stream.synchronize()

# Post-process: index of the largest intent logit, and entities decoded from the slot scores
intent       = np.argmax(logits_intent, axis=-1)[0]
scores_slots = logits_slot[0]
entities_predicted = tokenizer.convert_bert_output_to_entities(text, scores_slots, spans)

print(f"input text={text}")
print(f"inferred intent={intent}")
print(f"inferred entities={json.dumps(entities_predicted, indent=2, ensure_ascii=False)}")