## TensorRT Import

In [1]:
# TensorRT is a hard requirement for every cell below, so fail right away
# with installation guidance rather than a bare ImportError.
try:
    import tensorrt as trt
    from tensorrt import parsers
except ImportError as err:
    raise ImportError("""ERROR: Failed to import module ({})
Please make sure you have the TensorRT Library installed
and accessible in your LD_LIBRARY_PATH""".format(err))

## Python dependencies import

In [2]:
from __future__ import division
from random import randint
import numpy as np
import sys
import os

try:
    from PIL import Image
except ImportError as err:
    raise ImportError("""ERROR: Failed to import module ({})
Please make sure you have Pillow installed.
For installation instructions, see:
http://pillow.readthedocs.io/en/stable/installation.html""".format(err))

# CUDA reads CUDA_VISIBLE_DEVICES when it is initialised, and importing
# pycuda.autoinit below initialises CUDA and creates a context. The variable
# therefore has to be set *before* that import to have any effect.
# NOTE(review): "3" requires a machine with at least four GPUs - adjust as needed.
os.environ["CUDA_VISIBLE_DEVICES"]="3" #selects a specific device

try:
    import pycuda.driver as cuda
    import pycuda.gpuarray as gpuarray
    import pycuda.autoinit
    import argparse
except ImportError as err:
    raise ImportError("""ERROR: failed to import module ({})
Please make sure you have pycuda and the example dependencies installed.
https://wiki.tiker.net/PyCuda/Installation/Linux
pip(3) install tensorrt[examples]
""".format(err))

## Create some Global items

### Logger creation
**Log severity** sets the verbosity level of the logger.

In [3]:
# Console logger created with LogSeverity.ERROR.
# NOTE(review): the "Operational constants" cell rebinds G_LOGGER with
# LogSeverity.INFO, so this instance is replaced before any later cell uses it.
G_LOGGER = trt.infer.ConsoleLogger(trt.infer.LogSeverity.ERROR)

## Profiler definition

In [4]:
class Profiler(trt.infer.Profiler):
    """
    Example implementation of a per-layer profiler.

    Accumulates the time reported for each layer name and prints the
    per-iteration average. The original author notes that this mirrors the
    Profiler class in trt.infer, so that class can be used directly when no
    extra functionality is needed.
    """
    def __init__(self, timing_iter):
        """
        timing_iter -- number of timed iterations; used as the divisor when
                       averaging in print_layer_times().
        """
        trt.infer.Profiler.__init__(self)
        self.timing_iterations = timing_iter
        # (layer name, accumulated ms) tuples in first-seen order.
        self.profile = []

    def report_layer_time(self, layerName, ms):
        """
        Add `ms` to the running total kept for `layerName`, creating the
        entry the first time the layer is seen.
        """
        for slot, (seen_name, accumulated) in enumerate(self.profile):
            if seen_name == layerName:
                self.profile[slot] = (seen_name, accumulated + ms)
                return
        self.profile.append((layerName, ms))

    def print_layer_times(self):
        """
        Print each layer's accumulated time divided by the iteration count,
        followed by the same average summed over all layers.
        """
        for layer_label, elapsed_ms in self.profile:
            print("{:40.40} {:4.3f}ms".format(layer_label, elapsed_ms / self.timing_iterations))
        totalTime = sum(elapsed_ms for _, elapsed_ms in self.profile)
        print("Time over all layers: {:4.3f}".format(totalTime / self.timing_iterations))


### Operational constants
These constants hold the batch size, the model's input and output layer names, and the data directory.

In [5]:
# Batch size handed to time_inference() and shown in the build banner.
BATCH_SIZE = 8

# NOTE(review): this rebinds the G_LOGGER created earlier with
# LogSeverity.ERROR; from here on the logger is the LogSeverity.INFO one.
# Confirm which severity is intended and drop the other definition.
G_LOGGER = trt.infer.ConsoleLogger(trt.infer.LogSeverity.INFO)
# Binding name looked up on the engine in time_inference().
INPUT_LAYERS = ["data"]
# Binding name looked up in time_inference(); also passed to
# caffe_to_trt_engine() when the engine is built.
OUTPUT_LAYERS = ['prob']

# Root directory from which the googlenet model paths are derived.
DATA_DIR = '/workspace/tensorrt/python/data'

In [6]:
# Make sure G_PROFILER is created only once: re-running this cell keeps the
# existing profiler instead of replacing it with a fresh one.
if "TIMING_INTERATIONS" not in globals():
    TIMING_INTERATIONS = 1000
    G_PROFILER = Profiler(TIMING_INTERATIONS)

### Model location

In [7]:
# Directory holding the googlenet files (kept with its trailing slash), and
# the two model files inside it.
DATA = DATA_DIR + '/googlenet/'
MODEL_PROTOTXT = DATA + "googlenet.prototxt"
CAFFEMODEL = DATA + "googlenet.caffemodel"

In [8]:
def infer(context, input_img, output_size, batch_size):
    """
    Run one asynchronous inference pass and return the predictions.

    context     -- execution context; must expose get_engine() and enqueue().
    input_img   -- array offering .astype(); converted to float32 before upload.
    output_size -- shape handed to np.empty() for the host result array.
    batch_size  -- batch size passed to enqueue(); also scales both device
                   allocations.

    Returns the float32 host array that received the device output.

    NOTE(review): the device buffers are sized as batch_size times the host
    arrays, while the copies are issued with the host arrays themselves -
    confirm whether input_img/output_size describe one sample or the batch.
    """
    # Exactly one input and one output buffer are bound below.
    trt_engine = context.get_engine()
    assert trt_engine.get_nb_bindings() == 2

    # Host-side buffers: float32 copy of the input, uninitialised output.
    host_input = input_img.astype(np.float32)
    host_output = np.empty(output_size, dtype=np.float32)

    # Device-side buffers (nbytes == size * itemsize).
    dev_input = cuda.mem_alloc(batch_size * host_input.nbytes)
    dev_output = cuda.mem_alloc(batch_size * host_output.nbytes)
    device_pointers = [int(dev_input), int(dev_output)]

    # Upload, execute and download are all queued on the same stream.
    work_stream = cuda.Stream()
    cuda.memcpy_htod_async(dev_input, host_input, work_stream)
    context.enqueue(batch_size, device_pointers, work_stream.handle, None)
    cuda.memcpy_dtoh_async(host_output, dev_output, work_stream)

    # Wait for the queued work before handing the host array back.
    work_stream.synchronize()
    return host_output

### Run Inference on device

In [9]:
def time_inference(engine, batch_size):
    """
    Execute the engine TIMING_INTERATIONS times on a zero-filled input so
    that G_PROFILER can collect per-layer timings.

    engine     -- engine with exactly two bindings, named INPUT_LAYERS[0]
                  and OUTPUT_LAYERS[0].
    batch_size -- batch size passed to every execute() call; also scales the
                  device buffers.

    Returns None. Raises ValueError when the two layer names do not resolve
    to binding indices 0 and 1. The execution context is destroyed even if
    an execute() call raises.
    """
    assert(engine.get_nb_bindings() == 2)

    input_index = engine.get_binding_index(INPUT_LAYERS[0])
    output_index = engine.get_binding_index(OUTPUT_LAYERS[0])

    # With two bindings the names must map onto slots 0 and 1, in some order.
    if sorted((input_index, output_index)) != [0, 1]:
        raise ValueError(
            "Could not resolve bindings {!r} / {!r} on the engine "
            "(got indices {} and {})".format(
                INPUT_LAYERS[0], OUTPUT_LAYERS[0], input_index, output_index))

    input_dim = engine.get_binding_dimensions(input_index).to_DimsCHW()
    output_dim = engine.get_binding_dimensions(output_index).to_DimsCHW()

    # Buffer sizes in bytes; 4 bytes per element (the engine is built with
    # DataType.FLOAT elsewhere in this notebook).
    insize = batch_size * input_dim.C() * input_dim.H() * input_dim.W() * 4
    outsize = batch_size * output_dim.C() * output_dim.H() * output_dim.W() * 4

    d_input = cuda.mem_alloc(insize)
    d_output = cuda.mem_alloc(outsize)

    # Place each device pointer at the slot the engine assigned to it rather
    # than assuming the input is binding 0 and the output is binding 1.
    bindings = [0, 0]
    bindings[input_index] = int(d_input)
    bindings[output_index] = int(d_output)

    context = engine.create_execution_context()
    try:
        context.set_profiler(G_PROFILER)

        # memset_d32 counts 32-bit words, hence the division by 4.
        cuda.memset_d32(d_input, 0, insize // 4)

        for _ in range(TIMING_INTERATIONS):
            context.execute(batch_size, bindings)
    finally:
        # Release the context even when a run fails part-way through.
        context.destroy()
    return

In [10]:
# NOTE(review): `path` and `dir_path` are not referenced by any other cell
# visible in this notebook - confirm before removing.
path = dir_path = "./"

print("Building and running GPU inference for GoogleNet, N=%d" % (BATCH_SIZE))
# Convert the Caffe model (prototxt + caffemodel) to a TensorRT engine.
# NOTE(review): the literal 10 is presumably the maximum batch size and
# 16 << 20 (= 16 MiB) the workspace size in bytes - confirm against the
# caffe_to_trt_engine signature. BATCH_SIZE (8) is not what is passed here.
engine = trt.utils.caffe_to_trt_engine(G_LOGGER,
    MODEL_PROTOTXT,
    CAFFEMODEL,
    10,
    16 << 20,
    OUTPUT_LAYERS,
    trt.infer.DataType.FLOAT)


Building and running GPU inference for GoogleNet, N=8


In [11]:
# NOTE(review): `runtime` is only created here and destroyed in the last
# cell; nothing visible in this notebook deserializes an engine with it.
runtime = trt.infer.create_infer_runtime(G_LOGGER)

# List every binding of the engine together with its direction.
print("Bindings after deserializing")
for bi in range(engine.get_nb_bindings()):
    print("Binding " + str(bi) + " (" + engine.get_binding_name(bi) + "): "
          + ("Input" if engine.binding_is_input(bi) == True else "Output"))

# Run the timed executions, then dump what the profiler accumulated.
time_inference(engine, BATCH_SIZE)
G_PROFILER.print_layer_times()

print("Done")

Bindings after deserializing
Binding 0 (data): Input
Binding 1 (prob): Output
conv1/7x7_s2 + conv1/relu_7x7            0.171ms
pool1/3x3_s2                             0.051ms
pool1/norm1                              0.037ms
conv2/3x3_reduce + conv2/relu_3x3_reduce 0.040ms
conv2/3x3 + conv2/relu_3x3               0.259ms
conv2/norm2                              0.101ms
pool2/3x3_s2                             0.041ms
inception_3a/1x1 + inception_3a/relu_1x1 0.054ms
inception_3a/3x3 + inception_3a/relu_3x3 0.093ms
inception_3a/5x5 + inception_3a/relu_5x5 0.060ms
inception_3a/pool                        0.021ms
inception_3a/pool_proj + inception_3a/re 0.040ms
inception_3a/1x1 copy                    0.012ms
inception_3b/1x1 + inception_3b/relu_1x1 0.109ms
inception_3b/3x3 + inception_3b/relu_3x3 0.148ms
inception_3b/5x5 + inception_3b/relu_5x5 0.157ms
inception_3b/pool                        0.026ms
inception_3b/pool_proj + inception_3b/re 0.048ms
inception_3b/1x1 copy                   

In [None]:
# Tear down the TensorRT objects created by the earlier cells: the engine
# first, then the runtime.
engine.destroy()
runtime.destroy()