# TensorRT Import

## Import TensorRT dependencies

In [1]:
try:
    import uff
    import tensorrt as trt
    from tensorrt.parsers import uffparser
except ImportError as err:
    raise ImportError("""ERROR: Failed to import module ({})
Please make sure you have the TensorRT Library installed
and accessible in your LD_LIBRARY_PATH""".format(err))

## Python dependencies import

In [2]:
from __future__ import division
from random import randint
import numpy as np
import sys
import os

try:
    from PIL import Image
except ImportError as err:
    raise ImportError("""ERROR: Failed to import module ({})
Please make sure you have Pillow installed.
For installation instructions, see:
http://pillow.readthedocs.io/en/stable/installation.html""".format(err))

try:
    import pycuda.driver as cuda
    import pycuda.gpuarray as gpuarray
    import pycuda.autoinit
    import argparse
except ImportError as err:
    raise ImportError("""ERROR: failed to import module ({})
Please make sure you have pycuda and the example dependencies installed.
https://wiki.tiker.net/PyCuda/Installation/Linux
pip(3) install tensorrt[examples]
""".format(err))

## Create some Global items

### Logger creation
**Log severity** sets the logger's verbosity level.

In [3]:
# Console logger constructed with ERROR severity.
# NOTE(review): G_LOGGER is rebound with INFO severity in the
# "Operational constants" cell below, so this instance is replaced once
# that cell runs.
G_LOGGER = trt.infer.ConsoleLogger(trt.infer.LogSeverity.ERROR)

## Profiler definition

In [4]:
class Profiler(trt.infer.Profiler):
    """
    Example implementation of a per-layer profiler.

    Accumulates the time reported for every layer name and prints the
    per-layer average over ``timing_iter`` iterations.

    Per the original note: this is identical to the Profiler class in
    trt.infer, so that class can be used directly when no further
    functionality is needed.

    Attributes:
        timing_iterations: divisor used when averaging the accumulated times.
        profile: list of ``(layer_name, accumulated_ms)`` tuples, in the
            order each layer name was first reported.
    """
    def __init__(self, timing_iter):
        trt.infer.Profiler.__init__(self)
        self.timing_iterations = timing_iter
        self.profile = []
        # layer name -> position of its record in self.profile; avoids
        # scanning the whole list on every callback.
        self._positions = {}

    def _find(self, layerName):
        """Return the position of ``layerName`` in ``self.profile`` or None."""
        pos = self._positions.get(layerName)
        if (pos is not None and pos < len(self.profile)
                and self.profile[pos][0] == layerName):
            return pos
        # Unknown name, or self.profile was changed from outside: rebuild
        # the cache from the list, keeping the first record of each name
        # (the record the original linear search would have found).
        self._positions = {}
        for i, record in enumerate(self.profile):
            self._positions.setdefault(record[0], i)
        return self._positions.get(layerName)

    def report_layer_time(self, layerName, ms):
        """Add ``ms`` to the accumulated time recorded for ``layerName``."""
        pos = self._find(layerName)
        if pos is None:
            self._positions[layerName] = len(self.profile)
            self.profile.append((layerName, ms))
        else:
            name, total = self.profile[pos]
            self.profile[pos] = (name, total + ms)

    def print_layer_times(self):
        """Print each layer's average time and the average total over all layers."""
        totalTime = 0
        for name, total in self.profile:
            print("{:40.40} {:4.3f}ms".format(name, total / self.timing_iterations))
            totalTime += total
        print("Time over all layers: {:4.3f}".format(totalTime / self.timing_iterations))


### Operational constants
These include the batch size, the logger, the model's input and output layer names, and the directory containing the frozen models.

In [5]:
# Batch size: passed to time_inference and used as the engine's
# max_batch_size when the engine is built.
BATCH_SIZE = 32

# NOTE(review): this rebinds G_LOGGER, replacing the ERROR-severity logger
# created in the "Logger creation" cell with an INFO-severity one — confirm
# which verbosity is intended.
G_LOGGER = trt.infer.ConsoleLogger(trt.infer.LogSeverity.INFO)
# Names of the graph nodes registered as the parser's input and output.
INPUT_LAYERS = ["input"]
OUTPUT_LAYERS = ['resnet_v2_50/predictions/Reshape_1']

# Directory holding the frozen model files (*.pb).
DATA_DIR = './frozen_models'

In [6]:
%%bash -s "$DATA_DIR"
# List the directory passed as the first script argument. The argument is
# quoted so a path containing spaces or glob characters is not split or
# expanded, and "--" keeps a path starting with "-" from being read as an
# option.
ls -- "$1"

inception_resnet_v2_frozen.pb
inception_v1_frozen.pb
inception_v2_frozen.pb
inception_v3_frozen.pb
inception_v4_frozen.pb
resnetV150_frozen.pb
resnet_v2_101_frozen.pb
resnet_v2_152_frozen.pb
resnet_v2_50_frozen.pb


In [7]:
# Path of the frozen model file that is converted to UFF and benchmarked.
FROZENMODEL = os.path.join(DATA_DIR, "resnet_v2_50_frozen.pb")

In [8]:
# Create the shared profiler only the first time this cell runs; on later
# runs TIMING_INTERATIONS already exists, so both names are left untouched.
if "TIMING_INTERATIONS" not in globals():
    TIMING_INTERATIONS = 10000
    G_PROFILER = Profiler(TIMING_INTERATIONS)

### Model location

In [9]:
def infer(context, input_img, output_size, batch_size):
    """Run one asynchronous inference pass and return the raw output.

    Args:
        context: execution context; must provide ``get_engine()`` and
            ``enqueue()``, and its engine must have exactly two bindings.
        input_img: array-like input data. It is converted to a
            C-contiguous float32 array before being uploaded.
        output_size: shape (int or tuple) of the host output array.
        batch_size: batch size handed to ``context.enqueue``; also used to
            scale both device allocations.

    Returns:
        A float32 numpy array of shape ``output_size`` copied back from the
        device output buffer.
    """
    # The two-entry bindings list below relies on the engine having exactly
    # one input and one output binding.
    engine = context.get_engine()
    assert engine.get_nb_bindings() == 2

    # astype() keeps the source memory layout by default (order='K'), so a
    # transposed or strided image would not be laid out in C order for the
    # raw copy; ascontiguousarray guarantees a dense C-ordered float32 buffer.
    input_img = np.ascontiguousarray(input_img, dtype=np.float32)
    # Host array that receives the predictions
    output = np.empty(output_size, dtype=np.float32)

    # Allocate device memory (sized for batch_size copies of each array)
    d_input = cuda.mem_alloc(batch_size * input_img.nbytes)
    d_output = cuda.mem_alloc(batch_size * output.nbytes)

    bindings = [int(d_input), int(d_output)]

    stream = cuda.Stream()

    # Transfer input data to the device
    cuda.memcpy_htod_async(d_input, input_img, stream)
    # Execute the model
    context.enqueue(batch_size, bindings, stream.handle, None)
    # Transfer predictions back
    cuda.memcpy_dtoh_async(output, d_output, stream)

    # Wait for the copies and the execution queued on the stream to finish
    stream.synchronize()

    return output

### Run Inference on device

In [10]:
def time_inference(engine, batch_size):
    """Execute the engine TIMING_INTERATIONS times with G_PROFILER attached.

    The input buffer is zero-filled; only the per-layer timings reported to
    G_PROFILER are of interest, so the output is never copied back.

    Args:
        engine: engine with exactly two bindings, named INPUT_LAYERS[0] and
            OUTPUT_LAYERS[0].
        batch_size: batch size passed to every ``context.execute`` call.

    Returns:
        None.
    """
    assert engine.get_nb_bindings() == 2

    input_index = engine.get_binding_index(INPUT_LAYERS[0])
    output_index = engine.get_binding_index(OUTPUT_LAYERS[0])
    # Both names must resolve, and to different slots of the two bindings.
    assert {input_index, output_index} == {0, 1}, \
        "input/output layer names do not match the engine bindings"

    input_dim = engine.get_binding_dimensions(input_index).to_DimsCHW()
    output_dim = engine.get_binding_dimensions(output_index).to_DimsCHW()

    print('dbg:', batch_size, input_dim.C(), input_dim.H(), input_dim.W(), TIMING_INTERATIONS)
    # Buffer sizes in bytes; the factor 4 is presumably sizeof(float32) —
    # TODO confirm against the binding data type.
    insize = batch_size * input_dim.C() * input_dim.H() * input_dim.W() * 4
    outsize = batch_size * output_dim.C() * output_dim.H() * output_dim.W() * 4

    d_input = cuda.mem_alloc(insize)
    d_output = cuda.mem_alloc(outsize)

    # Place each device pointer at its own binding index instead of assuming
    # the input is binding 0 and the output is binding 1.
    bindings = [0, 0]
    bindings[input_index] = int(d_input)
    bindings[output_index] = int(d_output)

    context = engine.create_execution_context()
    try:
        context.set_profiler(G_PROFILER)

        # Zero the input buffer; memset_d32 counts 32-bit words, hence // 4.
        cuda.memset_d32(d_input, 0, insize // 4)

        for _ in range(TIMING_INTERATIONS):
            context.execute(batch_size, bindings)
    finally:
        # Release the context even if profiling or execution raised.
        context.destroy()
    return

In [11]:
from tensorrt.infer import Dims

In [12]:
def normalize(data):
    """Map pixel values to ``1.0 - value / 255.0`` and reshape to (3, 224, 224).

    The computation is done in floating point on a new array. Writing the
    results back into ``data`` element by element, as before, truncated them
    when ``data`` had an integer dtype (e.g. uint8 pixels collapsed to 0/1)
    and modified the caller's array.

    Args:
        data: array-like holding 3 * 224 * 224 values.

    Returns:
        A new floating-point numpy array of shape (3, 224, 224). A floating
        input keeps its dtype; integer input yields NumPy's default float
        result. ``data`` itself is left unchanged.

    Raises:
        ValueError: if ``data`` does not contain 3 * 224 * 224 elements.
    """
    pixels = np.asarray(data)
    normalized = 1.0 - pixels / 255.0
    return normalized.reshape(3, 224, 224)

# Apply argmax to a single inference result to get the predicted class index
def argmax(res):
    """Return the index of the largest value in ``res``.

    ``res`` is treated as flattened, so any shape and any number of classes
    is accepted (the previous ``reshape(10)`` only worked for exactly ten
    values, which does not match this notebook's model output).
    """
    return np.argmax(res)

In [13]:
# Convert the frozen TensorFlow graph in FROZENMODEL to a UFF model, using
# OUTPUT_LAYERS as the output nodes. The result is consumed by the engine
# builder in the next cell.
uff_model = uff.from_tensorflow_frozen_model(FROZENMODEL, OUTPUT_LAYERS)

Using output node resnet_v2_50/predictions/Reshape_1
Converting to UFF graph
No. nodes: 475


In [14]:
# NOTE(review): path / dir_path are not used in the cells visible here; they
# are kept in case later cells rely on them.
path = dir_path = "./"

# Report the model that is actually loaded (the banner used to say
# "GoogleNet" although FROZENMODEL points at a ResNet v2 50 graph).
print("Building and running GPU inference for %s, N=%d" % (os.path.basename(FROZENMODEL), BATCH_SIZE))

# Register the graph's input (CHW 3x224x224) and output with the UFF parser.
uff_parser = uffparser.create_uff_parser()
uff_parser.register_input(INPUT_LAYERS[0], (3, 224, 224), 0)
uff_parser.register_output(OUTPUT_LAYERS[0])

# Build the engine from the UFF model converted in the previous cell.
# NOTE(review): max_workspace_size is 1 << 20 (1 MiB if the unit is bytes) —
# confirm this is large enough for the builder.
engine = trt.utils.uff_to_trt_engine(
    logger=G_LOGGER,
    stream=uff_model,
    parser=uff_parser,
    max_batch_size=BATCH_SIZE,
    max_workspace_size=1 << 20,
    datatype=trt.infer.DataType.HALF,
    plugin_factory=None,
    calibrator=None
)

runtime = trt.infer.create_infer_runtime(G_LOGGER)

try:
    print("Bindings after deserializing")
    for bi in range(engine.get_nb_bindings()):
        direction = "Input" if engine.binding_is_input(bi) else "Output"
        print("Binding " + str(bi) + " (" + engine.get_binding_name(bi) + "): " + direction)

    time_inference(engine, BATCH_SIZE)
finally:
    # Release the engine and runtime even if listing or timing failed.
    engine.destroy()
    runtime.destroy()

G_PROFILER.print_layer_times()

print("Done")

Building and running GPU inference for GoogleNet, N=32
Bindings after deserializing
Binding 0 (input): Input
Binding 1 (resnet_v2_50/predictions/Reshape_1): Output
dbg: 32 3 224 224 10000
resnet_v2_50/conv1/BiasAdd               0.579ms
(Unnamed Layer* 1)                       0.296ms
resnet_v2_50/pool1/MaxPool               0.175ms
resnet_v2_50/block1/unit_1/bottleneck_v2 0.070ms
resnet_v2_50/block1/unit_1/bottleneck_v2 0.056ms
resnet_v2_50/block1/unit_1/bottleneck_v2 0.131ms
resnet_v2_50/block1/unit_1/bottleneck_v2 0.051ms
resnet_v2_50/block1/unit_1/bottleneck_v2 0.124ms
resnet_v2_50/block1/unit_1/bottleneck_v2 0.176ms
resnet_v2_50/block1/unit_2/bottleneck_v2 0.131ms
resnet_v2_50/block1/unit_2/bottleneck_v2 0.169ms
resnet_v2_50/block1/unit_2/bottleneck_v2 0.168ms
resnet_v2_50/block1/unit_2/bottleneck_v2 0.104ms
resnet_v2_50/block1/unit_2/bottleneck_v2 0.122ms
resnet_v2_50/block1/unit_2/bottleneck_v2 0.178ms
resnet_v2_50/block1/unit_3/bottleneck_v2 0.045ms
resnet_v2_50/block1/unit_3/b