# Logger Severity

In [1]:
from polygraphy.logger import G_LOGGER

# Set verbosity level. Keep exactly ONE assignment active: a later assignment
# overrides an earlier one, so with both lines enabled only EXTRA_VERBOSE ever
# took effect. Swap the comment marker to pick the less detailed level.
# G_LOGGER.severity = G_LOGGER.VERBOSE  # Basic verbose output
G_LOGGER.severity = G_LOGGER.EXTRA_VERBOSE  # More detailed output



# Construct input data

In [2]:
"""Copyright(c) 2023 lyuwenyu. All Rights Reserved.
"""

import os 
import sys

import torch.utils
import torch.utils.data
import torch.utils.data.dataloader 
# sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))

import torch
import torch.nn as nn 

# from src.core import YAMLConfig


from torch.utils.data import Dataset
import torchvision.transforms as T
from torchvision.io import read_image
from PIL import Image
import numpy as np


class CalibrationDateset(Dataset):
    """Dataset backed by the ``jpg`` files of a calibration directory.

    ``__len__`` is the number of regular files in ``img_dir`` whose name ends
    with ``jpg``. ``__getitem__`` currently ignores the requested index and
    always loads one hand-picked image (see the note inside the method).
    """

    def __init__(self, img_dir, transform):
        self.img_dir = img_dir
        self.transform = transform
        # Collect the names of regular files ending in 'jpg'; sub-directories
        # and other file types are skipped.
        self.img_names = []
        for entry in os.listdir(self.img_dir):
            candidate = os.path.join(self.img_dir, entry)
            if os.path.isfile(candidate) and entry.endswith('jpg'):
                self.img_names.append(entry)

    def __len__(self):
        return len(self.img_names)

    def __getitem__(self, idx):
        # The per-index lookup is deliberately bypassed: a single image known
        # to contain objects is returned for every sample.
        img_path = "/root/workspace/coco_calib/COCO_train2014_000000556709.jpg"
        print("[INFO] Loading image %d : %s" % (idx, img_path))
        pil_image = Image.open(img_path).convert('RGB')
        # Capture the original (width, height) of the image as loaded; the
        # transform only sees the PIL image afterwards.
        orig_size = np.array((pil_image.width, pil_image.height))
        transformed = self.transform(pil_image)
        # Add a leading axis so the size has shape (1, 2).
        return transformed, orig_size[np.newaxis, :]
        

        

# Preprocessing applied to each PIL image: resize to 640x640, then convert to a tensor.
pre_transforms = T.Compose([
    T.Resize(( 640,640 )),
    T.ToTensor()
])
# Directory scanned by CalibrationDateset for 'jpg' files.
cali_set = "/root/workspace/coco_calib"
cali_dataset = CalibrationDateset(cali_set, transform=pre_transforms)
# One sample per batch, no shuffling.
dataloader = torch.utils.data.DataLoader(
    cali_dataset, batch_size=1, shuffle=False
)

# Single module-level iterator; `load_data` pulls its batch from here via next().
data_iterator = iter(dataloader)

# size = torch.tensor([[640, 640]]).numpy()
    
    
def load_data():
    """Yield one feed dict built from the next batch of ``data_iterator``.

    The dict maps ``"images"`` to the image batch converted with ``.numpy()``
    and ``'orig_target_sizes'`` to the size entry exactly as the loader
    returned it (no conversion is applied here).
    """
    batch_images, batch_sizes = next(data_iterator)
    feed = {
        "images": batch_images.numpy(),
        'orig_target_sizes': batch_sizes,
    }
    yield feed  # Still totally real data
        
# Materialize the generator once so the same feed dicts can be handed to every comparison run.
input_data = list(load_data())

[INFO] Loading image 0 : /root/workspace/coco_calib/COCO_train2014_000000556709.jpg


# Compare onnx model and engine

In [13]:
from polygraphy.backend.onnxrt import OnnxrtRunner, SessionFromOnnx
from polygraphy.backend.trt import EngineFromNetwork, NetworkFromOnnxPath, TrtRunner, SaveEngine
from polygraphy.comparator import Comparator, CompareFunc
import tensorrt as trt


# The OnnxrtRunner requires an ONNX-RT session.
# We can use the SessionFromOnnx lazy loader to construct one easily:

def compare(onnx_model_path: str, engine_path: str, dataloader, use_fp16):
    """Run one ONNX model through TensorRT and ONNX Runtime and compare outputs.

    Args:
        onnx_model_path: Path of the ONNX model given to both backends.
        engine_path: Path handed to ``SaveEngine`` for the built TensorRT engine.
        dataloader: Forwarded to ``Comparator.run`` as ``data_loader``. Per the
            Polygraphy docs this should be a generator or iterable yielding
            ``Dict[str, np.ndarray]``; callers in this notebook pass ``None``
            to fall back to Polygraphy's default input data.
        use_fp16: When truthy, ``trt.BuilderFlag.FP16`` is set on the builder config.

    Returns:
        A ``(run_results, compare_result)`` tuple: the value returned by
        ``Comparator.run`` and the value returned by
        ``Comparator.compare_accuracy`` using ``CompareFunc.simple(atol=1e-8)``.
    """
    # The OnnxrtRunner needs an ONNX-RT session; SessionFromOnnx is the lazy
    # loader that builds one.
    onnxrt_session_loader = SessionFromOnnx(onnx_model_path)

    # Builder configuration for the TensorRT engine: verbose TRT logger,
    # detailed profiling verbosity and a 1 GiB workspace pool.
    trt_logger = trt.Logger(trt.Logger.VERBOSE)
    trt_builder = trt.Builder(trt_logger)
    builder_config = trt_builder.create_builder_config()
    builder_config.profiling_verbosity = trt.ProfilingVerbosity.DETAILED
    builder_config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)
    if use_fp16:
        print("use fp16")
        builder_config.set_flag(trt.BuilderFlag.FP16)

    # ONNX file -> TensorRT network -> engine, wrapped so the engine is also
    # written to `engine_path`.
    network_loader = NetworkFromOnnxPath(onnx_model_path)
    engine_loader = EngineFromNetwork(network_loader, config=builder_config)
    saved_engine_loader = SaveEngine(engine_loader, path=engine_path)

    # TensorRT runner first, ONNX Runtime runner second.
    runners = [
        TrtRunner(saved_engine_loader),
        OnnxrtRunner(onnxrt_session_loader),
    ]

    # `Comparator.run()` executes each runner and returns a `RunResults`
    # instance (see `polygraphy/comparator/struct.py`).
    # TIP: `run_results.save("inference_results.json")` stores the results as
    #   JSON if they should be compared separately later.
    run_results = Comparator.run(runners, data_loader=dataloader)

    # `Comparator.compare_accuracy()` checks that outputs match between
    # runners; the comparison function is built explicitly to set the tolerance.
    tolerance_check = CompareFunc.simple(atol=1e-8)
    compare_result = Comparator.compare_accuracy(
        run_results, compare_func=tolerance_check
    )
    return run_results, compare_result


## Mark outputs to be compared by modifying the output nodes of onnx

In [14]:
import onnx
from polygraphy.backend.onnx import modify_outputs
from polygraphy import constants
import pprint
import random


def modify_model(MODEL_PATH: str, modified_model_name: str, expected_outputs: list = None):
    """Save a copy of an ONNX model with extra tensors marked as graph outputs.

    The saved model's outputs are ``expected_outputs`` followed by the model's
    original output names. The original outputs get their recorded type
    information copied back after the outputs are re-marked.

    Args:
        MODEL_PATH: Path of the ONNX model to load.
        modified_model_name: Path the modified model is written to.
        expected_outputs: Names of additional tensors to expose as outputs.
            ``None`` (the default) or an empty list keeps only the original
            outputs. The caller's list is never mutated.

    Raises:
        Propagates any error from ``onnx.checker.check_model`` if the modified
        model fails validation (and any error from loading/saving).
    """
    # Avoid a shared mutable default; work on a private copy of the caller's list.
    requested_outputs = [] if expected_outputs is None else list(expected_outputs)

    model = onnx.load(MODEL_PATH)

    # NOTE: record the original type info of the output nodes, since polygraphy
    # was observed to lose it when the outputs are modified. Each entry is an
    # independent TypeProto snapshot rather than a reference into `model`, so
    # it stays valid even if `modify_outputs` rewrites the model's output list.
    original_outputs_meta = {}
    for output in model.graph.output:
        type_snapshot = onnx.TypeProto()
        type_snapshot.CopyFrom(output.type)
        original_outputs_meta[output.name] = type_snapshot
    original_output_name = list(original_outputs_meta)
    print("\033[96m[INFO] Origianl output \033[0m")
    print(original_output_name)

    # Always keep the original outputs so the end-to-end result is compared too.
    expected_outputs = requested_outputs + original_output_name
    print("\033[96m[INFO] output to be compared \033[0m")
    pprint.pprint(expected_outputs)
    model = modify_outputs(model, outputs=expected_outputs)

    # NOTE: restore the type info of the original outputs that was lost when
    # the outputs were re-marked.
    for output in model.graph.output:
        if output.name in original_outputs_meta:
            output.type.CopyFrom(original_outputs_meta[output.name])

    onnx.checker.check_model(model)
    onnx.save(model, modified_model_name)

def modify_and_compare(setting, onnx_model, input_data, if_modify: bool, use_custome_data: bool, expected_outputs: list, use_fp16: bool):
    """Optionally re-mark the model's outputs, then compare TensorRT vs ONNX Runtime.

    Args:
        setting: Tag concatenated into the generated ONNX/engine file names.
        onnx_model: Path of the source ONNX model.
        input_data: Feed data passed to ``compare`` when ``use_custome_data`` is true.
        if_modify: When true, ``modify_model`` writes a modified ONNX file that is
            compared; otherwise the source model is compared as-is.
        use_custome_data: When false, ``None`` is passed to ``compare`` instead
            of ``input_data``.
        expected_outputs: Extra output names forwarded to ``modify_model``.
        use_fp16: Forwarded to ``compare``.

    Returns:
        The ``(run_result, compare_result)`` pair produced by ``compare``.
    """
    # Output file names are derived from the model's base name (no extension).
    stem = os.path.splitext(os.path.basename(onnx_model))[0]
    modified_onnx_path = stem + setting + "-output_modified.onnx"
    saved_engine_name = stem + setting + "-output_modified.engine"

    # Remove artifacts left over from a previous run with the same setting.
    stale_artifacts = (
        (modified_onnx_path, "\033[96m[INFO] remove existing onnx file \033[0m"),
        (saved_engine_name, "\033[96m[INFO] remove existing engine file \033[0m"),
    )
    for artifact_path, notice in stale_artifacts:
        if os.path.exists(artifact_path):
            print(notice)
            os.remove(artifact_path)

    if if_modify:
        modify_model(onnx_model, modified_onnx_path, expected_outputs)
        model_to_run = modified_onnx_path
    else:
        model_to_run = onnx_model

    feed = input_data if use_custome_data else None
    run_result, compare_result = compare(model_to_run, saved_engine_name, feed, use_fp16)
    return run_result, compare_result


# Comparison Results of Original Output

In [None]:
from contextlib import redirect_stdout, redirect_stderr
# onnx_model = "../benchmark_models/default_mtq_int8_q_qint8.onnx"
onnx_model = "../../benchmark_models/default_mtq_int8_q_qint8.onnx"
# only allow fp32
use_fp16 = False

# Baseline: compare only the model's original outputs. Later cells mark
# additional outputs to prevent automatic fusion while the engine is built.
setting = 'baseline'
expected_outputs = []
output_dir = "polygraphy_results"
# exist_ok=True creates the directory when missing and is a no-op otherwise,
# without a separate (racy) existence check.
os.makedirs(output_dir, exist_ok=True)
log_file = os.path.join(output_dir, f'{setting}.log')
print(f"\033[96m[INFO] log file: {log_file}\033[0m")
# Python-level stdout/stderr produced during the run go to the log file.
# NOTE(review): the captured cell output still shows [TRT] lines, so the
# TensorRT logger does not appear to be covered by this redirection — confirm.
with open(log_file, "w") as f:
    with redirect_stdout(f), redirect_stderr(f):
        run_result, compare_result = modify_and_compare(setting, onnx_model, input_data, True, True, expected_outputs, use_fp16)

[96m[INFO] log file: polygraphy_results/baseline.log[0m
[05/19/2025-15:03:55] [TRT] [I] [MemUsageChange] Init CUDA: CPU -18, GPU +0, now: CPU 435, GPU 393 (MiB)
[05/19/2025-15:03:55] [TRT] [V] Trying to load shared library libnvinfer_builder_resource.so.10.7.0
[05/19/2025-15:03:55] [TRT] [V] Loaded shared library libnvinfer_builder_resource.so.10.7.0
[05/19/2025-15:03:57] [TRT] [I] [MemUsageChange] Init builder kernel library: CPU +2283, GPU +440, now: CPU 2594, GPU 833 (MiB)
[05/19/2025-15:03:57] [TRT] [V] CUDA lazy loading is enabled.
[05/19/2025-15:03:57] [TRT] [W] profileSharing0806 is on by default in TensorRT 10.0. This flag is deprecated and has no effect.


# Comparison Results of Problematic Nodes
Here are the problematic nodes that get fused during the conversion process.
```python
fusion
0.00511952 ms
__myl_FcMulAdd_myl85_40
[ONNX Layer: /model/decoder/decoder/layers.0/cross_attn/attention_weights/MatMul]
[ONNX Layer: /model/decoder/decoder/layers.0/cross_attn/sampling_offsets/input_quantizer/DequantizeLinear]
[ONNX Layer: /model/decoder/decoder/layers.0/cross_attn/attention_weights/weight_quantizer/DequantizeLinear]
[ONNX Layer: /model/decoder/decoder/layers.0/cross_attn/attention_weights/Add]
[ONNX Layer: /model/decoder/decoder/layers.0/cross_attn/sampling_offsets/MatMul]
[ONNX Layer: /model/decoder/decoder/layers.0/cross_attn/sampling_offsets/weight_quantizer/DequantizeLinear]
[ONNX Layer: /model/decoder/decoder/layers.0/cross_attn/sampling_offsets/Add]
```

The following nodes are directly connected to those problematic fused nodes:
```python
# sampling_offsets part
'/model/decoder/decoder/layers.0/cross_attn/Reshape_1_output_0',
'/model/decoder/decoder/layers.1/cross_attn/Reshape_1_output_0',
'/model/decoder/decoder/layers.2/cross_attn/Reshape_1_output_0',

# attn weights part
'/model/decoder/decoder/layers.0/cross_attn/Reshape_2_output_0',
'/model/decoder/decoder/layers.1/cross_attn/Reshape_2_output_0',
'/model/decoder/decoder/layers.2/cross_attn/Reshape_2_output_0',
```


In [None]:
from contextlib import redirect_stdout, redirect_stderr
# onnx_model = "../benchmark_models/default_mtq_int8_q_qint8.onnx"
onnx_model = "../../benchmark_models/default_mtq_int8_q_qint8.onnx"
use_fp16 = False
expected_outputs = [
    # sampling_offsets part
    '/model/decoder/decoder/layers.0/cross_attn/Reshape_1_output_0',
    '/model/decoder/decoder/layers.1/cross_attn/Reshape_1_output_0',
    '/model/decoder/decoder/layers.2/cross_attn/Reshape_1_output_0',

    # attn weights part
    '/model/decoder/decoder/layers.0/cross_attn/Reshape_2_output_0',
    '/model/decoder/decoder/layers.1/cross_attn/Reshape_2_output_0',
    '/model/decoder/decoder/layers.2/cross_attn/Reshape_2_output_0',
]
# Second experiment: additionally expose the tensors directly connected to the
# problematic fused nodes so their values can be compared between backends.
setting = 'mark_outputs_of_fused_nodes'
# Define and create the output directory here so this cell does not depend on
# another cell having been executed first.
output_dir = "polygraphy_results"
os.makedirs(output_dir, exist_ok=True)
log_file = os.path.join(output_dir, f"{setting}.log")
print(f"log_file: {log_file}")
# Python-level stdout/stderr produced during the run go to the log file.
with open(log_file, "w") as f:
    with redirect_stdout(f), redirect_stderr(f):
        run_result, compare_result = modify_and_compare(setting, onnx_model, input_data, True, True, expected_outputs, use_fp16)

log_file: polygraphy_results/mark_outputs_of_fused_nodes.log
[05/19/2025-15:35:50] [TRT] [V] Trying to load shared library libnvinfer_builder_resource.so.10.7.0
[05/19/2025-15:35:50] [TRT] [V] Loaded shared library libnvinfer_builder_resource.so.10.7.0
[05/19/2025-15:35:51] [TRT] [I] [MemUsageChange] Init builder kernel library: CPU -2275, GPU +432, now: CPU 7124, GPU 3109 (MiB)
[05/19/2025-15:35:51] [TRT] [V] CUDA lazy loading is enabled.
[05/19/2025-15:35:51] [TRT] [W] profileSharing0806 is on by default in TensorRT 10.0. This flag is deprecated and has no effect.


# Forcibly Break the Fusion
Directly marking the outputs of some fused nodes as final model outputs breaks the original fusion.

In [16]:

from contextlib import redirect_stdout, redirect_stderr
onnx_model = "../../benchmark_models/default_mtq_int8_q_qint8.onnx"
use_fp16 = False
expected_outputs = [
    # sampling_offsets part
    '/model/decoder/decoder/layers.0/cross_attn/Reshape_1_output_0',
    '/model/decoder/decoder/layers.1/cross_attn/Reshape_1_output_0',
    '/model/decoder/decoder/layers.2/cross_attn/Reshape_1_output_0',

    # attn weights part
    '/model/decoder/decoder/layers.0/cross_attn/Reshape_2_output_0',
    '/model/decoder/decoder/layers.1/cross_attn/Reshape_2_output_0',
    '/model/decoder/decoder/layers.2/cross_attn/Reshape_2_output_0',

    # marking these intermediate tensors as outputs breaks the original fusion
    '/model/decoder/decoder/layers.0/cross_attn/attention_weights/Add_output_0',
    '/model/decoder/decoder/layers.1/cross_attn/attention_weights/Add_output_0',
    '/model/decoder/decoder/layers.2/cross_attn/attention_weights/Add_output_0',

    '/model/decoder/decoder/layers.0/cross_attn/sampling_offsets/Add_output_0',
    '/model/decoder/decoder/layers.1/cross_attn/sampling_offsets/Add_output_0',
    '/model/decoder/decoder/layers.2/cross_attn/sampling_offsets/Add_output_0',
]
# Third experiment: on top of the tensors next to the fused nodes, expose the
# outputs of the Add nodes inside the fusion so the fusion cannot be formed.
setting = 'break_fusion'
output_dir = "polygraphy_results"
# Make sure the log directory exists before opening a file inside it;
# otherwise open() fails when this cell is the first one to write there.
os.makedirs(output_dir, exist_ok=True)
log_file = os.path.join(output_dir, f"{setting}.log")
print(f"log_file: {log_file}")
# Python-level stdout/stderr produced during the run go to the log file.
with open(log_file, "w") as f:
    with redirect_stdout(f), redirect_stderr(f):
        run_result, compare_result = modify_and_compare(setting, onnx_model, input_data, True, True, expected_outputs, use_fp16)

log_file: polygraphy_results/break_fusion.log
[05/21/2025-15:15:03] [TRT] [V] Trying to load shared library libnvinfer_builder_resource.so.10.7.0
[05/21/2025-15:15:03] [TRT] [V] Loaded shared library libnvinfer_builder_resource.so.10.7.0
[05/21/2025-15:15:04] [TRT] [I] [MemUsageChange] Init builder kernel library: CPU -2274, GPU +424, now: CPU 10114, GPU 3121 (MiB)
[05/21/2025-15:15:04] [TRT] [V] CUDA lazy loading is enabled.
[05/21/2025-15:15:04] [TRT] [W] profileSharing0806 is on by default in TensorRT 10.0. This flag is deprecated and has no effect.
