## convert using docker 
https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/bert

Use with TensorRT-LLM version 0.12.0.
1. docker run -it --rm --network host -v ./:/opt/tritonserver/data --gpus "device=1" heronq/trtllmserver:latest bash
2. cd /data/third_party/TensorRT-LLM/examples/bert
3. code convert

fp32

In [None]:
# float32 build of the query encoder (BertModel); output goes to --output_dir.
python3 build.py \
    --model BertModel \
    --model_dir="/data/mbert-retrieve-qry-base/" \
    --dtype=float32 --log_level=verbose \
    --output_dir "/data/mbert-retrieve-qry-base_float32_tllm_checkpoint"

# float32 build of the context (passage) encoder, same options.
python3 build.py \
    --model BertModel \
    --model_dir="/data/mbert-retrieve-ctx-base/" \
    --dtype=float32 --log_level=verbose \
    --output_dir "/data/mbert-retrieve-ctx-base_float32_tllm_checkpoint"

fp16

In [None]:
# float16 build of the query encoder (BertModel); output goes to --output_dir.
python3 build.py \
    --model BertModel \
    --model_dir="/data/mbert-retrieve-qry-base/" \
    --dtype=float16 --log_level=verbose \
    --output_dir "/data/mbert-retrieve-qry-base_float16_tllm_checkpoint"

# float16 build of the context (passage) encoder, same options.
python3 build.py \
    --model BertModel \
    --model_dir="/data/mbert-retrieve-ctx-base/" \
    --dtype=float16 --log_level=verbose \
    --output_dir "/data/mbert-retrieve-ctx-base_float16_tllm_checkpoint"

fp32 + bert_attention_plugin float32

In [None]:
# float32 build of the query encoder with the BERT attention plugin in float32.
python3 build.py \
    --model BertModel \
    --model_dir="/data/mbert-retrieve-qry-base/" \
    --dtype=float32 --log_level=verbose \
    --output_dir "/data/mbert-retrieve-qry-base_float32_bert_atten_plugin" \
    --use_bert_attention_plugin float32

# float32 build of the context encoder with the BERT attention plugin in float32.
python3 build.py \
    --model BertModel \
    --model_dir="/data/mbert-retrieve-ctx-base/" \
    --dtype=float32 --log_level=verbose \
    --output_dir "/data/mbert-retrieve-ctx-base_float32_bert_atten_plugin" \
    --use_bert_attention_plugin float32

fp16 + bert_attention_plugin float16

In [None]:
# float16 build of the query encoder with the BERT attention plugin in float16.
# --dtype must be float16 here: this is the fp16 variant and the output
# directory is labelled float16; float32 was a copy-paste from the fp32 cell.
python3 build.py \
    --model BertModel \
    --model_dir="/data/mbert-retrieve-qry-base/" \
    --dtype=float16 --log_level=verbose \
    --output_dir "/data/mbert-retrieve-qry-base_float16_bert_atten_plugin" \
    --use_bert_attention_plugin float16

# float16 build of the context encoder with the BERT attention plugin in float16.
python3 build.py \
    --model BertModel \
    --model_dir="/data/mbert-retrieve-ctx-base/" \
    --dtype=float16 --log_level=verbose \
    --output_dir "/data/mbert-retrieve-ctx-base_float16_bert_atten_plugin" \
    --use_bert_attention_plugin float16

## Evaluate

### utils

In [None]:
# benchmark run onnx model
import tensorrt as trt
import numpy as np
import os

import pycuda.driver as cuda
import pycuda.autoinit


from transformers import AutoTokenizer

class HostDeviceMem(object):
    """Pair a host-side buffer with its matching device-side allocation."""

    def __init__(self, host_mem, device_mem):
        self.host, self.device = host_mem, device_mem

    def __repr__(self):
        # Same rendering as str() so both views stay in sync.
        return str(self)

    def __str__(self):
        return "\n".join(("Host:", str(self.host), "Device:", str(self.device)))

class TrtModel:
    """Run a serialized TensorRT engine through PyCUDA.

    The engine is deserialized once in ``__init__``; host and device buffers
    are (re)allocated on every call, sized from the shape of the first input.
    """

    def __init__(self, engine_path, max_batch_size=1, dtype=np.float32):
        """Load the engine and create the CUDA stream and execution context.

        Args:
            engine_path: Path of the serialized engine file.
            max_batch_size: Stored on the instance; not read by any method of
                this class.
            dtype: NumPy dtype every input array is cast to in ``__call__``.
        """
        self.engine_path = engine_path
        self.dtype = dtype
        self.logger = trt.Logger(trt.Logger.WARNING)
        self.runtime = trt.Runtime(self.logger)
        self.engine = self.load_engine(self.runtime, self.engine_path)
        self.max_batch_size = max_batch_size
        self.stream = cuda.Stream()
        self.context = self.engine.create_execution_context()

    @staticmethod
    def load_engine(trt_runtime, engine_path):
        """Read ``engine_path`` and deserialize it with ``trt_runtime``."""
        # Register the TensorRT plugin library before deserializing.
        trt.init_libnvinfer_plugins(None, "")
        with open(engine_path, 'rb') as f:
            engine_data = f.read()
        return trt_runtime.deserialize_cuda_engine(engine_data)

    def allocate_buffers(self, binding_shape):
        """Allocate a pinned host buffer and a device buffer per I/O tensor.

        Every input tensor of the engine is given the shape ``binding_shape``;
        buffer sizes are then taken from the shapes the context reports.

        Args:
            binding_shape: Shape applied to all input tensors.

        Returns:
            ``(inputs, outputs, bindings)``: two lists of ``HostDeviceMem`` and
            the list of device addresses in engine iteration order.
        """
        inputs, outputs, bindings = [], [], []
        for binding in self.engine:
            is_input = self.engine.get_tensor_mode(binding) == trt.TensorIOMode.INPUT
            if is_input:
                self.context.set_input_shape(binding, binding_shape)

            tensor_shape = self.context.get_tensor_shape(binding)
            print("binding: ", binding)
            size = trt.volume(tensor_shape)
            print("size: ", size)
            print("batch_size: ", tensor_shape)
            dtype = trt.nptype(self.engine.get_tensor_dtype(binding))
            print("dtype: ", dtype)

            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            bindings.append(int(device_mem))

            target = inputs if is_input else outputs
            target.append(HostDeviceMem(host_mem, device_mem))

        return inputs, outputs, bindings

    def __call__(self, inputs_id, attention_mask, token_type_ids, batch_size=2):
        """Run one inference and return the outputs as host arrays.

        Args:
            inputs_id: Token ids; its shape is applied to every engine input.
            attention_mask: Copied into the second input buffer.
            token_type_ids: Copied into the third input buffer.
            batch_size: Leading dimension used to reshape each output.

        Returns:
            One array per engine output, reshaped to ``(batch_size, -1)``.

        Raises:
            RuntimeError: If TensorRT reports that the enqueue failed.
        """
        # NOTE(review): assumes the engine lists its inputs in the order
        # input_ids, attention_mask, token_type_ids — confirm for each engine.
        x = np.array(inputs_id).astype(self.dtype)
        y = np.array(attention_mask).astype(self.dtype)
        z = np.array(token_type_ids).astype(self.dtype)

        inputs, outputs, bindings = self.allocate_buffers(x.shape)

        # Fill the pinned host buffers before starting any upload, so the
        # async copies never see stale contents.
        np.copyto(inputs[0].host, x.ravel())
        np.copyto(inputs[1].host, y.ravel())
        np.copyto(inputs[2].host, z.ravel())

        for inp in inputs:
            cuda.memcpy_htod_async(inp.device, inp.host, self.stream)

        # execute_async_v2 is gone in TensorRT 10; the name-based v3 API is
        # used whenever the context offers it, with v2 kept as a fallback.
        if hasattr(self.context, "execute_async_v3"):
            for name, address in zip(self.engine, bindings):
                self.context.set_tensor_address(name, address)
            ok = self.context.execute_async_v3(stream_handle=self.stream.handle)
        else:
            ok = self.context.execute_async_v2(
                bindings=bindings, stream_handle=self.stream.handle)
        if not ok:
            raise RuntimeError("TensorRT failed to enqueue inference")

        for out in outputs:
            cuda.memcpy_dtoh_async(out.host, out.device, self.stream)

        # Block until uploads, inference and downloads have all finished.
        self.stream.synchronize()
        return [out.host.reshape(batch_size, -1) for out in outputs]
    

    

In [None]:
import time

def encode_trt(texts, model, tokenizer, batch_size, max_length=128, hidden_size=768):
    """Tokenize ``texts``, run ``model`` and return first-token embeddings.

    Args:
        texts: Input strings passed to ``tokenizer``.
        model: Callable taking ``(input_ids, attention_mask, token_type_ids,
            batch_size)`` and returning a sequence whose first item is the
            output array.
        tokenizer: Callable returning a mapping with ``input_ids``,
            ``attention_mask`` and ``token_type_ids``.
        batch_size: Leading dimension used when reshaping the output.
        max_length: Length every sequence is padded/truncated to
            (default 128, the previously hard-coded value).
        hidden_size: Width of one token embedding (default 768, the
            previously hard-coded value).

    Returns:
        ``(embeddings, elapsed)``: the embedding at sequence position 0 for
        each batch row, and the seconds spent inside the ``model`` call only
        (tokenization is not timed).
    """
    # Pad every sequence to exactly max_length so the input shape is fixed.
    encoded_input = tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='np'
    )

    # perf_counter is monotonic, unlike time.time(), so the measured interval
    # cannot be distorted by wall-clock adjustments.
    start_time = time.perf_counter()
    embeddings = model(
        encoded_input['input_ids'],
        encoded_input['attention_mask'],
        encoded_input['token_type_ids'],
        batch_size
    )[0]
    elapsed = time.perf_counter() - start_time

    # (batch, seq * hidden) -> (batch, seq, hidden); keep position 0 only.
    return embeddings.reshape(batch_size, -1, hidden_size)[:, 0], elapsed

### prepare datasets

In [None]:
import datasets
from datasets import concatenate_datasets
# Evaluation data: the last 500 rows of the 'train' split, cached in ./datahub/.
# NOTE(review): en_eval and vi_eval load the same dataset id and the same slice,
# so the concatenation below contains every row twice. Presumably en_eval was
# meant to come from an English corpus — confirm the intended dataset.
en_eval = datasets.load_dataset('tiennv/mmarco-passage-vi', split='train[-500:]', cache_dir="./datahub/")
vi_eval = datasets.load_dataset('tiennv/mmarco-passage-vi', split='train[-500:]', cache_dir="./datahub/")

dataset_eval = concatenate_datasets([en_eval, vi_eval])
# Bare expression: shows the dataset summary as the notebook cell output.
dataset_eval

### run

In [None]:
import numpy as np

# Alternative engine pairs, labelled by the author as TensorRT 8.6.1 (disabled).
# Uncomment one qry/ctx pair to evaluate it instead of the active pair below.
# trt_engine_qry_path = "onnx_convert_outputs/mbert-retrieve-qry-onnx/model_fp32_dynamic_shape.engine"
# trt_engine_ctx_path = "onnx_convert_outputs/mbert-retrieve-ctx-onnx/model_fp32_dynamic_shape.engine"

# trt_engine_qry_path = "onnx_convert_outputs/mbert-retrieve-qry-onnx/model_fp32_int8_dynamic_shape.engine"
# trt_engine_ctx_path = "onnx_convert_outputs/mbert-retrieve-ctx-onnx/model_fp32_int8_dynamic_shape.engine"

# trt_engine_qry_path = "onnx_convert_outputs/mbert-retrieve-qry-onnx/model_calib_percential_fp32_dynamic_shape.engine"
# trt_engine_ctx_path = "onnx_convert_outputs/mbert-retrieve-ctx-onnx/model_calib_percential_fp32_dynamic_shape.engine"

# trt_engine_qry_path = "onnx_convert_outputs/mbert-retrieve-qry-onnx/model_calib_percential_fp32_int8_dynamic_shape.engine"
# trt_engine_ctx_path = "onnx_convert_outputs/mbert-retrieve-ctx-onnx/model_calib_percential_fp32_int8_dynamic_shape.engine"

# Engine pairs labelled by the author as TensorRT 10.3.0.
# Active pair: the float16 engines.

trt_engine_qry_path = "./mbert-retrieve-qry-base_float16_tllm_checkpoint/BertModel_float16_tp1_rank0.engine"
trt_engine_ctx_path = "./mbert-retrieve-ctx-base_float16_tllm_checkpoint/BertModel_float16_tp1_rank0.engine"

# float32 counterpart of the active pair (disabled).
# trt_engine_qry_path = "./mbert-retrieve-qry-base_float32_tllm_checkpoint/BertModel_float32_tp1_rank0.engine"
# trt_engine_ctx_path = "./mbert-retrieve-ctx-base_float32_tllm_checkpoint/BertModel_float32_tp1_rank0.engine"

# dtype=np.int32 is the type TrtModel casts the token inputs to before upload.
# max_batch_size is only stored on the instance; TrtModel never reads it.
model_query = TrtModel(trt_engine_qry_path, max_batch_size=1, dtype=np.int32)
model_ctx = TrtModel(trt_engine_ctx_path, max_batch_size=10, dtype=np.int32)
# Tokenizers are loaded from the ONNX export directories, not the engine directories.
tokenizer_qry = AutoTokenizer.from_pretrained("onnx_convert_outputs/mbert-retrieve-qry-onnx/")
tokenizer_ctx = AutoTokenizer.from_pretrained("onnx_convert_outputs/mbert-retrieve-ctx-onnx/")

In [None]:
# Run the retrieval evaluation and print accuracy plus the four timing figures.
# NOTE(review): eval_accuracy_trt is not defined in the visible part of this
# notebook — it must be defined or imported before this cell runs.
# device = 'cuda:1' if torch.cuda.is_available() else 'cpu'
accuracy, time_query_run, time_passage_run, time_query_total, time_passage_total = eval_accuracy_trt(
    dataset_eval, 
    encode_trt,
    num_passages=10, 
    model_ctx=model_ctx,
    model_qry=model_query, 
    tokenizer_ctx=tokenizer_ctx,
    tokenizer_query=tokenizer_qry,
    device="cpu"
)
print(f"Accuracy: {accuracy}")
print(f"Time Query Run: {time_query_run}")
print(f"Time Passage Run: {time_passage_run}")
print(f"Time Query Total: {time_query_total}")
print(f"Time Passage Total: {time_passage_total}")

### run with attention plugin
Use the run script (`run.py`) from TensorRT-LLM.

In [None]:
# trt llm run.py

# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import json
import os

# isort: off
import torch
import tensorrt as trt
# isort: on

import tensorrt_llm
from tensorrt_llm import logger
from tensorrt_llm.runtime import Session, TensorInfo

from build import get_engine_name  # isort:skip


def trt_dtype_to_torch(dtype):
    """Translate a TensorRT data type into the matching torch dtype.

    Only float16, float32 and int32 are handled; any other value raises
    ``TypeError``.
    """
    known_pairs = (
        (trt.float16, torch.float16),
        (trt.float32, torch.float32),
        (trt.int32, torch.int32),
    )
    for trt_type, torch_type in known_pairs:
        if dtype == trt_type:
            return torch_type
    raise TypeError("%s is not supported" % dtype)


def parse_arguments():
    """Parse the command line: ``--log_level`` and ``--engine_dir``."""
    option_defaults = (
        ('--log_level', 'info'),
        ('--engine_dir', 'bert_outputs'),
    )
    cli = argparse.ArgumentParser()
    for flag, fallback in option_defaults:
        cli.add_argument(flag, type=str, default=fallback)
    return cli.parse_args()


# Script entry point: load the engine described by <engine_dir>/config.json and
# run three inference rounds on random inputs.
if __name__ == '__main__':
    args = parse_arguments()

    tensorrt_llm.logger.set_level(args.log_level)

    # The build configuration is read from config.json in the engine directory.
    config_path = os.path.join(args.engine_dir, 'config.json')
    with open(config_path, 'r') as f:
        config = json.load(f)

    # This script only handles engines built without remove_input_padding.
    assert config["plugin_config"]["remove_input_padding"] == False, \
        "Please refer to run_remove_input_padding.py for running BERT models with remove_input_padding enabled"

    dtype = config['builder_config']['precision']
    world_size = config['builder_config']['tensor_parallel']
    # The engine's tensor-parallel size must equal the MPI world size at runtime.
    assert world_size == tensorrt_llm.mpi_world_size(), \
        f'Engine world size ({world_size}) != Runtime world size ({tensorrt_llm.mpi_world_size()})'

    model_name = config['builder_config']['name']
    # The MPI rank is only queried when more than one rank is expected.
    runtime_rank = tensorrt_llm.mpi_rank() if world_size > 1 else 0

    runtime_mapping = tensorrt_llm.Mapping(world_size,
                                           runtime_rank,
                                           tp_size=world_size)
    # Pick the CUDA device from the rank, wrapping at gpus_per_node.
    torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node)

    # The engine file name is derived from model name, precision, world size and rank.
    serialize_path = get_engine_name(model_name, dtype, world_size,
                                     runtime_rank)
    serialize_path = os.path.join(args.engine_dir, serialize_path)

    # Raw handle of torch's current CUDA stream, passed to session.run below.
    stream = torch.cuda.current_stream().cuda_stream
    logger.info(f'Loading engine from {serialize_path}')
    with open(serialize_path, 'rb') as f:
        engine_buffer = f.read()
    logger.info(f'Creating session from engine')
    session = Session.from_serialized_engine(engine_buffer)

    # Three rounds with growing shapes: batch 4/8/12 and sequence length 32/64/96.
    for i in range(3):
        batch_size = (i + 1) * 4
        seq_len = (i + 1) * 32
        # Random int32 ids in [0, 100), created on CPU and moved to the GPU.
        input_ids = torch.randint(100, (batch_size, seq_len)).int().cuda()
        # Every sequence is declared to be full length (seq_len).
        input_lengths = seq_len * torch.ones(
            (batch_size, ), dtype=torch.int32, device='cuda')
        # Token type ids are drawn from the same [0, 100) range as input_ids.
        token_type_ids = torch.randint(100, (batch_size, seq_len)).int().cuda()

        inputs = {
            'input_ids': input_ids,
            'input_lengths': input_lengths,
            'token_type_ids': token_type_ids
        }
        # Ask the session for output names, dtypes and shapes given these input shapes.
        output_info = session.infer_shapes([
            TensorInfo('input_ids', trt.DataType.INT32, input_ids.shape),
            TensorInfo('input_lengths', trt.DataType.INT32,
                       input_lengths.shape),
            TensorInfo('token_type_ids', trt.DataType.INT32,
                       token_type_ids.shape),
        ])
        # Preallocate one uninitialized CUDA tensor per reported output.
        outputs = {
            t.name: torch.empty(tuple(t.shape),
                                dtype=trt_dtype_to_torch(t.dtype),
                                device='cuda')
            for t in output_info
        }
        # The name of the output tensor to read depends on the model head.
        if (model_name == 'BertModel' or model_name == 'RobertaModel'):
            output_name = 'hidden_states'
        elif (model_name == 'BertForQuestionAnswering'
              or model_name == 'RobertaForQuestionAnswering'):
            output_name = 'logits'
        elif (model_name == 'BertForSequenceClassification'
              or model_name == 'RobertaForSequenceClassification'):
            output_name = 'logits'
        else:
            assert False, f"Unknown BERT model {model_name}"

        assert output_name in outputs, f'{output_name} not found in outputs, check if build.py set the name correctly'

        ok = session.run(inputs, outputs, stream)
        assert ok, "Runtime execution failed"
        # Wait for the GPU before reading the result.
        torch.cuda.synchronize()
        # The result is only bound to a name; nothing consumes it afterwards.
        res = outputs[output_name]

## Appendix

combine calibration (API settings default)

STATUS: 
- [Not supported model](https://github.com/NVIDIA/TensorRT-LLM/issues/1614#issuecomment-2122086630) 
- The model type config has no Bert entry - [line 110](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/quantization/quantize_by_modelopt.py#L550)


### prepare

In [None]:
from transformers import DataCollatorWithPadding, AutoTokenizer

# Module-level tokenizers; the two collate functions below read them as globals.
query_tokenizer = AutoTokenizer.from_pretrained('mbert-retrieve-qry-base/')
ctx_tokenizer = AutoTokenizer.from_pretrained('mbert-retrieve-ctx-base/')

def query_collate_fn(examples):
    """Collate a batch into tokenized queries.

    Takes the ``'query'`` field of every example and tokenizes the list with
    the module-level ``query_tokenizer``, padded/truncated to 512 tokens and
    returned as PyTorch tensors.
    """
    queries = [item['query'] for item in examples]
    return query_tokenizer(
        queries,
        max_length=512,
        truncation=True,
        padding='max_length',
        return_tensors='pt',
    )


def ctx_collate_fn(examples):
    """Collate a batch into tokenized passages.

    For every example the positive passage is followed by its first nine
    negatives (fewer if the example has fewer); all passages of the batch are
    flattened into one list and tokenized with the module-level
    ``ctx_tokenizer``, padded/truncated to 512 tokens, as PyTorch tensors.
    """
    passages = [
        passage
        for item in examples
        for passage in [item['positive']] + item['negatives'][:9]
    ]
    return ctx_tokenizer(
        passages,
        max_length=512,
        truncation=True,
        padding='max_length',
        return_tensors='pt',
    )


In [None]:
# Prepare the calibration set and define a forward loop
import torch

from torch.utils.data import DataLoader
from transformers import AutoModel, AutoTokenizer
import modelopt.torch.quantization as atq

# Calibration setup: data loaders and the two HF models to be quantized.
# NOTE(review): number_samples and dataset_calib are defined in a later cell of
# this notebook (the dataset-loading cell); run that cell before this one.
batch_size = 4
# dataset_calib concatenates two splits of number_samples rows each, hence *2.
calib_batches = number_samples*2 // batch_size

num_workers = 4
device = "cuda:0" if torch.cuda.is_available() else "cpu"  # inside the container started with --gpus "device=1" the exposed GPU is presumably index 0 — confirm

# One query per example: each batch holds batch_size sequences.
calib_query_loader = torch.utils.data.DataLoader(
    dataset_calib, 
    batch_size=batch_size,
    collate_fn=query_collate_fn,
    num_workers=num_workers, 
    pin_memory=True
)

# ctx_collate_fn emits the positive plus up to nine negatives per example, so a
# batch here can hold up to batch_size * 10 sequences.
calib_ctx_loader = torch.utils.data.DataLoader(
    dataset_calib, 
    batch_size=batch_size,
    collate_fn=ctx_collate_fn,
    num_workers=num_workers, 
    pin_memory=True
)


query_model = AutoModel.from_pretrained('mbert-retrieve-qry-base/', return_dict=True)
ctx_model = AutoModel.from_pretrained('mbert-retrieve-ctx-base/', return_dict=True)

query_model.to(device)
ctx_model.to(device)
print("Initialize ...")

In [None]:
from tqdm import tqdm
def calibrate_loop_query():
    """Feed at most ``calib_batches`` query batches through ``query_model``.

    Reads the module-level ``calib_query_loader``, ``calib_batches``,
    ``device`` and ``query_model``. Model outputs are discarded; the forward
    passes exist only so calibration statistics can be collected.
    """
    for i, encode_input in tqdm(enumerate(calib_query_loader), total=calib_batches):
        # Check the limit *before* the forward pass: testing it afterwards let
        # one batch too many (calib_batches + 1) through the model.
        if i >= calib_batches:
            break
        # Move every tensor of the batch to the target device.
        for k, v in encode_input.items():
            encode_input[k] = v.to(device)
        query_model(**encode_input)

def calibrate_loop_ctx():
    """Feed at most ``calib_batches`` passage batches through ``ctx_model``.

    Reads the module-level ``calib_ctx_loader``, ``calib_batches``,
    ``device`` and ``ctx_model``. Model outputs are discarded; the forward
    passes exist only so calibration statistics can be collected.
    """
    for i, encode_input in tqdm(enumerate(calib_ctx_loader), total=calib_batches):
        # Check the limit *before* the forward pass: testing it afterwards let
        # one batch too many (calib_batches + 1) through the model.
        if i >= calib_batches:
            break
        # Move every tensor of the batch to the target device.
        for k, v in encode_input.items():
            encode_input[k] = v.to(device)
        ctx_model(**encode_input)

### quantize

In [None]:
import tensorrt_llm

In [None]:
import datasets
from datasets import concatenate_datasets 

# Calibration data: the first number_samples rows of the 'train' split, cached in ./datahub.
number_samples = 250 
# NOTE(review): en and vi load the same dataset id and the same slice, so the
# concatenation below contains every row twice. Presumably `en` was meant to
# come from an English corpus — confirm the intended dataset.
en = datasets.load_dataset('tiennv/mmarco-passage-vi', split=f'train[:{number_samples}]',
                          cache_dir="./datahub")
vi = datasets.load_dataset('tiennv/mmarco-passage-vi', split=f'train[:{number_samples}]', cache_dir="./datahub")

dataset_calib = concatenate_datasets([en, vi])
# Bare expression: shows the dataset summary as the notebook cell output.
dataset_calib

In [None]:
# Quantize the query encoder in place with the FP8 default configuration.
# `atq` is modelopt.torch.quantization, imported in the calibration-setup cell.
# NOTE(review): recent modelopt releases document forward_loop as a callable
# that receives the model; calibrate_loop_query takes no argument — confirm
# this matches the installed modelopt version.
config = atq.FP8_DEFAULT_CFG
# PTQ with in-place replacement to quantized modules
with torch.no_grad():
    atq.quantize(query_model, config, forward_loop=calibrate_loop_query)

In [None]:
from modelopt.torch.export import export_tensorrt_llm_checkpoint

# Export settings for the quantized query encoder.
# Only dtype, export_dir, tp_size and pp_size are passed to the export call
# below; decoder_type is given there as a literal, and qformat,
# DEFAULT_MAX_SEQ_LEN, awq_block_size and kv_cache_dtype are not used in this cell.
decoder_type="bert"
dtype=torch.float32
export_dir="./mbert-retrieve-qry-base-quantize-trtllm-fp8"
# ["fp8", "int8_sq", "int4_awq", "w4a8_awq", "int8_wo", "int4_wo", "full_prec"]
qformat="fp8"
DEFAULT_MAX_SEQ_LEN=512
tp_size=1
pp_size=1
awq_block_size=128
# ["int8", "fp8", None]
kv_cache_dtype="int8"


# Write the TensorRT-LLM checkpoint for query_model into export_dir.
with torch.inference_mode():
    export_tensorrt_llm_checkpoint(
        model=query_model,  # The quantized model.
        decoder_type="bert",
        # decoder_type,  # The type of the model as str, e.g gptj, llama or gptnext.
        dtype=dtype,  # The exported weights data type as torch.dtype.
        export_dir=export_dir,  # The directory where the exported files will be stored.
        inference_tensor_parallel=tp_size,  # The tensor parallelism size for inference.
        inference_pipeline_parallel=pp_size,  # The pipeline parallelism size for inference.
    )

In [None]:
# Quantize the context encoder in place with the FP8 default configuration.
# `atq` is modelopt.torch.quantization, imported in the calibration-setup cell.
# NOTE(review): recent modelopt releases document forward_loop as a callable
# that receives the model; calibrate_loop_ctx takes no argument — confirm
# this matches the installed modelopt version.
config = atq.FP8_DEFAULT_CFG
# PTQ with in-place replacement to quantized modules
with torch.no_grad():
    atq.quantize(ctx_model, config, forward_loop=calibrate_loop_ctx)

In [None]:
# Export the quantized context encoder as a TensorRT-LLM checkpoint.
# ctx_model was quantized with modelopt (`atq`), so it is exported with
# modelopt's exporter as well — the same call the query encoder uses — rather
# than the legacy `ammo.torch.export.export_model_config`.
from modelopt.torch.export import export_tensorrt_llm_checkpoint

decoder_type = "bert"
dtype = torch.float32
export_dir = "./mbert-retrieve-ctx-base-quantize-trtllm-fp8"
# ["fp8", "int8_sq", "int4_awq", "w4a8_awq", "int8_wo", "int4_wo", "full_prec"]
qformat = "fp8"
DEFAULT_MAX_SEQ_LEN = 512
tp_size = 1
pp_size = 1
awq_block_size = 128
# ["int8", "fp8", None]
kv_cache_dtype = "int8"

with torch.inference_mode():
    export_tensorrt_llm_checkpoint(
        model=ctx_model,  # The quantized model.
        decoder_type=decoder_type,  # The type of the model as str, e.g gptj, llama or gptnext.
        dtype=dtype,  # The exported weights data type as torch.dtype.
        export_dir=export_dir,  # The directory where the exported files will be stored.
        inference_tensor_parallel=tp_size,  # The tensor parallelism size for inference.
        inference_pipeline_parallel=pp_size,  # The pipeline parallelism size for inference.
    )