This notebook walks through implementing a basic network using TensorRT's Python API. The network is simple and illustrates layer fusion, tactic selection, concat elision and mixed precision optimizations.

First we will import the needed python libraries.  

In [None]:
import numpy as np
import time
import pycuda.driver as cuda
import pycuda.autoinit

import tensorrt as trt
import numpy
import sys, os
sys.path.insert(1, os.path.join(sys.path[0], ".."))


This model is a dummy model; it doesn't have a purpose other than illustrating some of the underlying TensorRT concepts.



In [None]:

# Logger handed to the TensorRT builder and runtime objects created in this notebook.
# Change the logger severity to control what messages are displayed.
TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)

# NOTE(review): BIAS is not referenced by any cell visible in this notebook;
# it may be left over from an earlier revision - confirm before relying on it.
BIAS = 64

# Constants describing the network's input tensor; read by populate_network.
class ModelData(object):
    # Name given to the network input.
    INPUT_NAME = "data"
    # Dimensions passed to network.add_input for the input tensor.
    INPUT_SHAPE = (1,3,224,224)
    # NOTE(review): OUTPUT_NAME and OUTPUT_SIZE are not referenced by the
    # cells visible in this notebook - confirm whether they are still needed.
    OUTPUT_NAME = "prob"
    OUTPUT_SIZE = 1000
    # Data type of the network input.
    DTYPE = trt.float32

# Builds one randomly initialised convolution -> constant multiply -> ReLU
# block, used to explore layer fusion and scale fusion.
def add_cbr(network, input_tensor):
    """Append a convolution / elementwise-product / ReLU block to ``network``.

    Every weight tensor is freshly drawn from ``rand`` and cast to float32,
    so the block has no meaning beyond giving the optimizer something to fuse.

    Args:
        network: TensorRT network definition the layers are added to.
        input_tensor: tensor that feeds the convolution.

    Returns:
        The ReLU activation layer that terminates the block.
    """
    # 64 output maps, 7x7 kernel over 3 input channels, stride 2, padding 3.
    kernel = trt.Weights(np.random.rand(64, 3, 7, 7).astype(np.float32))
    conv_bias = trt.Weights(np.random.rand(64,).astype(np.float32))
    conv = network.add_convolution(
        input=input_tensor,
        num_output_maps=64,
        kernel_shape=(7, 7),
        kernel=kernel,
        bias=conv_bias,
    )
    conv.stride = (2, 2)
    conv.padding = (3, 3)

    # Constant layer multiplied element-wise with the convolution output.
    # NOTE(review): the hard-coded shape presumably matches the convolution
    # output for the notebook's 1x3x224x224 input - confirm if the input changes.
    const_shape = (1, 64, 112, 112)
    const_values = np.random.rand(*const_shape).astype(np.float32)
    const_layer = network.add_constant(const_shape, trt.Weights(const_values))

    product = network.add_elementwise(
        conv.get_output(0),
        const_layer.get_output(0),
        trt.ElementWiseOperation.PROD,
    )
    return network.add_activation(
        input=product.get_output(0),
        type=trt.ActivationType.RELU,
    )

# Assemble the demo network with the TensorRT Python API. It contains
# layers that can be fused, a layer that gets eliminated (dead layer),
# and layers that can be optimized with mixed precision.
def populate_network(network):
    """Add the input, three CBR blocks and a concatenation to ``network``.

    Two of the CBR blocks feed a concatenation layer whose output is marked
    as the network output; the third block is intentionally left unused.

    Args:
        network: TensorRT network definition to fill in; modified in place.
    """
    input_tensor = network.add_input(
        name=ModelData.INPUT_NAME,
        dtype=ModelData.DTYPE,
        shape=ModelData.INPUT_SHAPE,
    )

    # First CBR block; its output takes part in the concatenation.
    first_block = add_cbr(network, input_tensor)

    # Dead layer: the output of cbr_output1 is deliberately not used anywhere.
    # It becomes a live layer if cbr_output1.get_output(0) is added to outputs.
    cbr_output1 = add_cbr(network, input_tensor)

    # Second CBR block that takes part in the concatenation.
    second_block = add_cbr(network, input_tensor)

    # Tensors handed to the concat layer.
    outputs = [first_block.get_output(0), second_block.get_output(0)]

    #uncomment after first run
    #for i in range(10):
    #  outputs.append(add_cbr(network, input_tensor).get_output(0))

    concat_layer = network.add_concatenation(outputs)
    network.mark_output(tensor=concat_layer.get_output(0))

def build_engine(enable_fp16=False):
    """Build a TensorRT engine for the demo network.

    Args:
        enable_fp16: request FP16 mode. It is only switched on when the
            builder reports fast FP16 support on this platform; otherwise a
            notice is printed and the engine is built without FP16 mode.

    Returns:
        Whatever ``builder.build_cuda_engine`` returns for the populated
        network.
        NOTE(review): callers in this notebook call ``.serialize()`` on the
        result directly; check the TensorRT docs for what is returned when
        the build fails.
    """
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network() as network:
        builder.max_workspace_size = 1 << 30

        if enable_fp16:
            if builder.platform_has_fast_fp16:
                print("fast fp16 enabled")
                builder.fp16_mode = True
            else:
                # Previously this case was silent, so an fp32 engine could be
                # saved under an fp16 file name without any hint.
                print("fp16 requested but fast fp16 is not available on this platform; building without fp16")

        # Populate the network with some random data and layers
        # which demonstrate layer fusion, dead layer elimination, and horizontal fusion.
        populate_network(network)
        # Build and return an engine.
        return builder.build_cuda_engine(network)



Now build the TRT engine - we need a different engine for each precision type we plan on using. For this example there are only two - fp32|fp16.  Then we serialize the optimized model onto the disk.

In [None]:

# Build the engine with build_engine's default arguments (enable_fp16=False).
engine = build_engine()
# Serialize the engine to the file 'cbr_engine'; the fp32 inference cell reads it back.
with open('cbr_engine', 'wb') as f:
   f.write(engine.serialize())


Look at the terminal screen and read through the output from the build_engine() function. There should be things like:

[TensorRT] INFO: Original: 13 layers
[TensorRT] INFO: After dead-layer removal: 9 layers
...
[TensorRT] INFO: After vertical fusions: 5 layers
...
[TensorRT] INFO: After tensor merging: 4 layers


and the graph optimization section finishes with a section like:
[TensorRT] INFO: After concat removal: 3 layers
[TensorRT] INFO: Graph construction and optimization completed in 0.00046373 seconds.

Which shows the final layer count - 3 in this case from 13.



and the timing routines:
[TensorRT] INFO: --------------- Timing (Unnamed Layer* 0) [Convolution] || (Unnamed Layer* 8) [Convolution](2)
[TensorRT] INFO: Tactic 1 time 0.118112
[TensorRT] INFO: Tactic 49 time 0.119808
[TensorRT] INFO: Tactic 128 time 0.12016


Go back up to the function populate_network, uncomment the for loop which adds 10 additional layers, and re-execute that cell and the build engine cell above.


In [None]:
#build an engine with fp16
# build_engine only turns fp16 mode on when the builder reports fast fp16 support.
enginefp16 = build_engine(enable_fp16=True)
# Serialize to 'cbr_engine16', the file the fp16 inference cell reads back.
with open('cbr_engine16', 'wb') as f2:
    f2.write(enginefp16.serialize())

In [None]:






# List the serialized engine files written by the cells above (cbr_engine, cbr_engine16).
!ls cbr_engine*


In [None]:
# Print the engine's device memory size, then the shape of each of its bindings.
print(engine.device_memory_size)
for i in range(engine.num_bindings):
    print(engine.get_binding_shape(i))

Then create two helper functions - allocate_buffers and do_inference to move data to and from the engine once it has been loaded into the GPU.


In [None]:
# Element type used for the host buffers created by allocate_buffers.
DTYPE = trt.float32


# We need to provide memory locations to move data between the GPU and
# system memory: one host/device buffer pair for the input and one for the
# output, plus a CUDA stream for the copies.
def allocate_buffers(engine):
    """Allocate host and device buffers sized from the engine's bindings.

    NOTE(review): binding 0 is treated as the input and binding 1 as the
    output - confirm this holds for any engine passed in.

    Args:
        engine: TensorRT engine whose binding shapes determine buffer sizes.

    Returns:
        Tuple ``(h_input, d_input, h_output, d_output, stream)``: page-locked
        host arrays, matching device allocations, and a new CUDA stream.
    """
    host_dtype = trt.nptype(DTYPE)
    input_elems = trt.volume(engine.get_binding_shape(0))
    output_elems = trt.volume(engine.get_binding_shape(1))

    # Page-locked host memory first, then device memory of the same byte size.
    h_input = cuda.pagelocked_empty(input_elems, dtype=host_dtype)
    h_output = cuda.pagelocked_empty(output_elems, dtype=host_dtype)
    d_input = cuda.mem_alloc(h_input.nbytes)
    d_output = cuda.mem_alloc(h_output.nbytes)

    return h_input, d_input, h_output, d_output, cuda.Stream()


def do_inference(context, h_input, d_input, h_output, d_output,stream):
    """Run one inference and return the host output buffer with the results.

    Args:
        context: TensorRT execution context used to run the engine.
        h_input: host buffer holding the input data to upload.
        d_input: device allocation that receives the input.
        h_output: host buffer that receives the results.
        d_output: device allocation the engine writes its output to.
        stream: CUDA stream used for the host<->device copies.

    Returns:
        ``h_output``, filled with the contents of ``d_output`` after inference.
    """
    # Transfer input data to the GPU.
    #cuda.memcpy_htod(d_input, h_input)
    #Try it async
    cuda.memcpy_htod_async(d_input, h_input, stream)
    # The upload was only queued on `stream`; wait for it so the synchronous
    # execute() below cannot start before the input has arrived, and so the
    # timing covers inference only.
    stream.synchronize()
    # Run inference.
    st = time.time()
    #change from sync to async
    #context.execute_async(batch_size=1, bindings=[int(d_input), int(d_output)], stream_handle = stream.handle)
    #stream.synchronize()
    context.execute(batch_size=1, bindings=[int(d_input), int(d_output)])
    print('Inference time: {} [msec]'.format((time.time() - st)*1000))
    # Transfer predictions back from the GPU. Without this copy the returned
    # host buffer never receives the engine's output.
    cuda.memcpy_dtoh_async(h_output, d_output, stream)
    # Wait for the download to finish before handing the buffer to the caller.
    stream.synchronize()

    return h_output



Let's allocate some buffers for input and output to the GPU.

In [None]:

 h_input, d_input, h_output, d_output,stream = allocate_buffers(engine)

Then we will create a randomly initialized tensor for the input.

In [None]:
# Random float32 tensor with the same dimensions as ModelData.INPUT_SHAPE.
# NOTE(review): the name `input` shadows Python's builtin input(); it is kept
# because the inference cells further down also use a global named `input`.
input = np.random.rand(1,3,224,224).astype(numpy.float32)
# Byte size of the tensor, then the element count of engine binding 0, for comparison.
print(input.nbytes)
print(trt.volume(engine.get_binding_shape(0)))

In [None]:
# File written earlier by the fp16 build cell.
engine_file16 = "cbr_engine16"

#make sure we pull from the prebuilt engine
with open(engine_file16, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
    engine16 = runtime.deserialize_cuda_engine(f.read())

# Flattened so it can be copied into the 1-D host buffer h_input.
input = np.random.rand(1,3,224,224).astype(numpy.float32).flatten()
# NOTE(review): h_input/d_input/h_output/d_output were allocated from `engine`,
# not `engine16`; this presumes both engines have identical binding shapes.
with engine16.create_execution_context() as context:
    # Run 10 inferences on the same input; do_inference prints the time of each.
    for i in range(10):
      #input = np.random.rand(1,3,224,224).astype(numpy.float32).flatten()
      np.copyto(h_input, input)
      output = do_inference(context, h_input, d_input, h_output, d_output,stream)     


In [None]:
# File written earlier by the fp32 build cell.
engine_file = "cbr_engine"
# Rebinds the global `engine` to the engine deserialized from disk.
with open(engine_file, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())

# NOTE(review): `input` is not assigned in this cell; it is whatever an earlier
# cell left in the global - np.copyto needs it to fit h_input's shape.
with engine.create_execution_context() as context:
    # Run 10 inferences on the same input; do_inference prints the time of each.
    for i in range(10):
      #input = np.random.rand(1,3,224,224).astype(numpy.float32).flatten()
      np.copyto(h_input, input)
      output = do_inference(context, h_input, d_input, h_output, d_output,stream)
      
      


Now we will show how to create a profile dump for use with Nsight Systems. Nsight Systems can be downloaded here: https://developer.nvidia.com/nsight-systems.  This will be very useful for understanding the whats and whys of model performance on the system.

First we will want to clean up any old work, then we execute the nsys profile command line, first for the fp32 engine we have serialized, then for the fp16 implementation.


In [None]:
# Remove reports left by earlier runs; -f keeps the first run quiet when nothing matches.
!rm -f cbrengine_fp32*
# Profile do_inference.py running the serialized fp32 engine; output is named cbrengine_fp32.
!nsys profile --show-output true --output cbrengine_fp32 --trace osrt,cuda,cudnn,cublas,nvtx  python do_inference.py cbr_engine

In [None]:
# Remove reports left by earlier runs; -f keeps the first run quiet when nothing matches.
!rm -f cbrengine_fp16*
# Profile do_inference.py running the serialized fp16 engine; output is named cbrengine_fp16.
!nsys profile --show-output true --output cbrengine_fp16 --trace osrt,cuda,cudnn,cublas,nvtx  python do_inference.py cbr_engine16

If you want to download these files, remember that this is executing in a container; they are located in the directory exposed to the container here: ~/ubuntu/amlc-2019/cbr_fusion_sample/




If you want, you can download the Nsight Compute GUI from https://developer.nvidia.com/nsight-compute. It is not required to complete the notebook, but it is a useful tool for model optimization and will allow you to look at the profile created by the nsys command line tool.


In [None]:
# Run do_inference.py on the fp16 engine under the Nsight Compute CLI.
# NOTE(review): -k presumably filters kernels by name ("scale"), -s the number of
# matching launches to skip and -c the number to capture - confirm in the Nsight Compute CLI docs.
!nv-nsight-cu-cli -k scale -s 11 -c 1 '/usr/bin/python' do_inference.py cbr_engine16

In [None]:
# Run do_inference.py on the fp32 engine under the Nsight Compute CLI, targeting all processes.
# NOTE(review): -k presumably selects the named convolution kernel, -s the number of
# matching launches to skip and -c the number to capture - confirm in the Nsight Compute CLI docs.
!nv-nsight-cu-cli --target-processes all -k trt_volta_scudnn_128x64_relu_medium_nn_v1 -s 11 -c 1 '/usr/bin/python' do_inference.py cbr_engine