In [2]:
import atexit
import time

import tensorrt as trt
import pycuda.driver as cuda
import numpy as np

In [1]:
# Notebook configuration.
# image_size is unpacked as (height, width) by the cells that consume it.
image_size = (224, 224)
# image_path is only referenced by commented-out code in this notebook.
image_path = ""
# Serialized TensorRT engine file that get_engine() reads.
engine_path = "sample.trt"

In [3]:
class HostDeviceMem(object):
    """Pair a host-side buffer with its device-side counterpart.

    Attributes:
        host: the host buffer object passed to the constructor.
        device: the device allocation passed to the constructor.
    """

    def __init__(self, host_mem, device_mem):
        self.host, self.device = host_mem, device_mem

    def __str__(self):
        # Four lines: a label followed by the str() of each member.
        parts = ("Host:", str(self.host), "Device:", str(self.device))
        return "\n".join(parts)

    def __repr__(self):
        # The repr deliberately mirrors the human-readable form.
        return str(self)

In [4]:
def get_engine(engine_path):
    """Load a serialized TensorRT engine from disk.

    Args:
        engine_path: path of the serialized engine file; it is opened in
            binary mode and read in full.

    Returns:
        The engine object produced by ``runtime.deserialize_cuda_engine``.

    Raises:
        OSError: propagated from ``open`` when the file cannot be read.
        RuntimeError: when deserialization yields ``None`` instead of an
            engine (e.g. a corrupt file or one built by an incompatible
            TensorRT version).
    """
    print("Reading engine from file {}".format(engine_path))
    with open(engine_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())

    # Fail here with a clear message; otherwise the caller's
    # `with get_engine(...)` dies on None with an unrelated-looking error.
    if engine is None:
        raise RuntimeError(
            "Failed to deserialize a TensorRT engine from {}".format(engine_path)
        )
    return engine

In [5]:
def allocate_buffers(engine, batch_size):
    """Allocate a host/device buffer pair for every binding of ``engine``.

    Args:
        engine: object that yields its bindings when iterated and exposes
            ``get_binding_shape``, ``get_binding_dtype`` and
            ``binding_is_input`` for each of them.
        batch_size: multiplier applied to every binding's element count.

    Returns:
        ``(inputs, outputs, bindings, stream)`` where ``inputs`` and
        ``outputs`` are lists of ``HostDeviceMem``, ``bindings`` holds the
        integer device addresses in binding order, and ``stream`` is a new
        ``cuda.Stream``.

    Note:
        A dynamic dimension (reported as -1) is counted as 1 when sizing a
        buffer, so only ``batch_size`` scales the allocation.
        NOTE(review): callers using dynamic non-batch dimensions must make
        sure the resulting buffers are large enough for the shapes they set.
    """
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()

    for binding in engine:
        dims = engine.get_binding_shape(binding)

        # Each -1 flips the sign of the volume. abs() normalises it however
        # many dynamic dimensions there are and wherever they appear, and it
        # also avoids indexing dims[0] on a zero-rank binding.
        size = abs(trt.volume(dims)) * batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))

        # Host buffer and a device allocation of the same byte size.
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)

        # The execution context consumes raw device addresses.
        bindings.append(int(device_mem))

        pair = HostDeviceMem(host_mem, device_mem)
        if engine.binding_is_input(binding):
            inputs.append(pair)
        else:
            outputs.append(pair)

    return inputs, outputs, bindings, stream

In [6]:
def do_inference(context, bindings, inputs, outputs, stream):
    """Run one inference pass on ``stream`` and return the host outputs.

    Args:
        context: execution context; its ``execute_async`` is invoked.
        bindings: device addresses handed to ``execute_async``.
        inputs: objects with ``host``/``device`` members, copied host -> device.
        outputs: objects with ``host``/``device`` members, copied device -> host.
        stream: stream on which all copies and the execution are queued.

    Returns:
        A list with the ``host`` member of every entry in ``outputs``.
    """
    # Queue the host -> device copy of each input.
    for mem in inputs:
        cuda.memcpy_htod_async(mem.device, mem.host, stream)

    # Queue the network execution behind the uploads.
    # NOTE(review): execute_async looks like the implicit-batch entry point;
    # confirm it is the intended call for the engine in use.
    context.execute_async(bindings=bindings, stream_handle=stream.handle)

    # Queue the device -> host copy of each output.
    for mem in outputs:
        cuda.memcpy_dtoh_async(mem.host, mem.device, stream)

    # Block until everything queued on the stream has finished.
    stream.synchronize()

    return [mem.host for mem in outputs]

In [7]:
def detect(context, buffers, image_src, image_size, num_classes,
           output_shape=(1, 672, 672)):
    """Run the network on ``image_src`` and return its first output, reshaped.

    ``image_src`` is only converted to float32; no resizing, colour
    conversion or normalisation is applied here.

    Args:
        context: execution context forwarded to ``do_inference``.
        buffers: ``(inputs, outputs, bindings, stream)`` tuple.
        image_src: array-like network input.
        image_size: unused; kept so existing callers keep working.
        num_classes: unused; kept so existing callers keep working.
        output_shape: shape given to the first output before returning it.
            The default matches the previously hard-coded value.

    Returns:
        The first output of ``do_inference`` reshaped to ``output_shape``.

    Raises:
        ValueError: if ``image_src`` has more elements than the first input's
            host buffer can hold.
    """
    ta = time.time()

    img_in = np.ascontiguousarray(image_src, dtype=np.float32)
    print("Shape of the network input: ", img_in.shape)

    inputs, outputs, bindings, stream = buffers
    print('Length of inputs: ', len(inputs))

    # Copy into the existing host buffer instead of rebinding `.host`, so the
    # buffer allocated up front is still the one uploaded.
    host_in = inputs[0].host
    flat = img_in.ravel()
    if flat.size > host_in.size:
        # Uploading more elements than were allocated would overrun the
        # matching device buffer.
        raise ValueError(
            "input has {} elements but the input buffer holds only {}".format(
                flat.size, host_in.size
            )
        )
    host_in[:flat.size] = flat

    trt_outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)

    print('Len of outputs: ', len(trt_outputs))

    tb = time.time()

    print('-----------------------------------')
    print('    TRT inference time: %f' % (tb - ta))
    print('-----------------------------------')

    boxes = trt_outputs[0].reshape(output_shape)
    return boxes

In [8]:
# Logger passed to trt.Runtime when the engine is deserialized.
TRT_LOGGER = trt.Logger()

# Initialise the CUDA driver and create a context on the chosen device.
cuda.init()
device = cuda.Device(0)  # enter your GPU id here
ctx = device.make_context()

# The context must stay on the stack while later cells run CUDA or TensorRT
# operations, so it cannot be popped in this cell. Register the pop to run at
# interpreter exit so the context is not left dangling.
atexit.register(ctx.pop)

In [9]:
# Load the engine, size the buffers for a single batch, and run the network
# on a dummy input whose shape is derived from `image_size`.
with get_engine(engine_path) as engine, engine.create_execution_context() as context:
    buffers = allocate_buffers(engine, 1)
    IN_IMAGE_H, IN_IMAGE_W = image_size

    # One shape drives both the binding and the dummy input, so changing
    # `image_size` in the config cell cannot make them disagree.
    input_shape = (1, 1, IN_IMAGE_H, IN_IMAGE_W)
    context.set_binding_shape(0, input_shape)

    # All-ones placeholder input; no image is read from disk.
    image_src = np.ones(input_shape)

    num_classes = 80

    # Two iterations for a speed check: the first one is usually slower.
    for i in range(2):
        boxes = detect(context, buffers, image_src, image_size, num_classes)

Reading engine from file sample.trt
Shape of the network input:  (1, 1, 224, 224)
Length of inputs:  1
Len of outputs:  1
-----------------------------------
    TRT inference time: 0.001116
-----------------------------------
Shape of the network input:  (1, 1, 224, 224)
Length of inputs:  1
Len of outputs:  1
-----------------------------------
    TRT inference time: 0.001394
-----------------------------------


In [10]:
# Display the array returned by the last detect() call in the previous cell.
boxes

array([[[0.76464844, 0.87353516, 0.9038086 , ..., 0.85595703,
         0.7841797 , 0.7133789 ],
        [0.82421875, 1.0136719 , 1.1044922 , ..., 1.1005859 ,
         1.0097656 , 0.79589844],
        [0.8491211 , 1.0742188 , 1.140625  , ..., 1.1074219 ,
         1.0810547 , 0.8432617 ],
        ...,
        [0.92529297, 1.0283203 , 1.0556641 , ..., 1.0234375 ,
         1.0800781 , 0.9033203 ],
        [0.8520508 , 0.9140625 , 0.9482422 , ..., 0.9824219 ,
         1.0068359 , 0.86816406],
        [0.75683594, 0.7729492 , 0.77734375, ..., 0.75390625,
         0.8017578 , 0.77246094]]], dtype=float32)