In [None]:
import os
import matplotlib.pyplot as plt
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
from skimage.io import imread, imshow
from skimage.transform import resize
import tensorrt as trt; print('TensorRT Version: {}'.format(trt.__version__))
import time

In [None]:
# Root directory that holds both the datasets and the model artifacts
base_path = os.path.join('/', 'workspace', 'optimization')

# Serialized engine: the '{}' placeholders in the file name are filled in
# later (architecture, batch size, precision) when the engine is read
engine_file_name = 'Jan_2019_99_w_rejects_{}_b{}_{}.engine'
artifacts_path = os.path.join(base_path, 'artifacts')
engine_file_path = os.path.join(artifacts_path, engine_file_name)

# Image and mask folders sit side by side under the datasets directory
datasets_path = os.path.join(base_path, 'datasets')
images_path, masks_path = (os.path.join(datasets_path, sub) for sub in ('images', 'masks'))

In [None]:
# Network settings: input geometry first, then the values that select the engine file
n_channel = 1
n_height = 512
n_width = 384
dimensions = [n_channel, n_height, n_width]  # channel, height, width
batch_size = 1
precision = 'fp16'  # options are 'fp16' (default), 'int8', and 'fp32'
architecture = 'v100'  # options are 't4' (default), 'v100' and 'xavier'

In [None]:
# Logger handed to the TensorRT runtime below.
# Raise the severity to suppress messages, or lower it to display more, e.g.:
# TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
TRT_LOGGER = trt.Logger(trt.Logger.INFO)

In [None]:
# Create the TensorRT runtime (it is constructed with TRT_LOGGER);
# the runtime is what deserializes the engine file later on.
runtime = trt.Runtime(TRT_LOGGER)
print(runtime)

In [None]:
# Read engine
# Build the path from the untouched file-name template instead of formatting
# engine_file_path in place: the in-place version consumed its own '{}'
# placeholders, so re-running this cell after changing architecture,
# batch_size or precision kept loading the previously selected file.
engine_file_path = os.path.join(
    artifacts_path,
    engine_file_name.format(architecture, batch_size, precision),
)
with open(engine_file_path, 'rb') as f:
    engine = runtime.deserialize_cuda_engine(f.read())

# deserialize_cuda_engine signals failure by returning None rather than
# raising, so fail here with a clear message instead of at first use.
if engine is None:
    raise RuntimeError('Failed to deserialize TensorRT engine: {}'.format(engine_file_path))

In [None]:
# Report the shapes of the first two engine bindings
# (index 0 is treated as the input and index 1 as the output)
input_dimensions, output_dimensions = (engine.get_binding_shape(index) for index in range(2))
print('Input Dimensions:', input_dimensions)
print('Output Dimensions:', output_dimensions)

In [None]:
# Create the execution context on the deserialized engine;
# inference is launched through this object further down.
context = engine.create_execution_context()

In [None]:
# Load test images and masks
# os.listdir returns entries in arbitrary order, so both listings are sorted:
# the same index then refers to the same-ranked file in each directory on
# every run. NOTE(review): this assumes image and mask file names sort into
# matching order - confirm against the dataset naming scheme.
images_file_names = sorted(os.listdir(images_path))
images_file_paths = [os.path.join(images_path, f) for f in images_file_names]
masks_file_names = sorted(os.listdir(masks_path))
masks_file_paths = [os.path.join(masks_path, f) for f in masks_file_names]

# Print first 2
print('Images:', images_file_paths[:2])
print('Masks:', masks_file_paths[:2])

In [None]:
# Pick the image / mask pair at position j of the two path lists
j = 0
image_file_path, mask_file_path = images_file_paths[j], masks_file_paths[j]
print('Image:', image_file_path)
print('Mask:', mask_file_path)

In [None]:
# Read both files from disk, then report each array's shape and dtype
image, mask = imread(image_file_path), imread(mask_file_path)
for label, array in (('Image:', image), ('Mask:', mask)):
    print(label, array.shape, array.dtype)

In [None]:
# Reduce the image to a single channel by keeping channel 0.
# The ndim guard leaves an image that is already 2-D untouched, so a
# grayscale file - or running this cell a second time - no longer raises
# IndexError on the third index.
if image.ndim == 3:
    image = image[:, :, 0]
print('Image:', image.shape, image.dtype)
print('Mask:', mask.shape, mask.dtype)

In [None]:
# Display the image with the 'binary' colormap
imshow(image, cmap='binary')
plt.show()

In [None]:
# Display the mask with the 'binary' colormap
imshow(mask, cmap='binary')
plt.show()

In [None]:
# Resize the image to the configured network height and width
target_shape = (n_height, n_width)
image = resize(image, target_shape)
print('Image:', image.shape, image.dtype)

In [None]:
# Prepend a length-1 channel axis in front of the existing axes
image = np.expand_dims(image, axis=0)
print('Image:', image.shape, image.dtype)

In [None]:
# Cast the pixel data to 32-bit floats
image = image.astype('float32')
print('Image:', image.shape, image.dtype)

In [None]:
# Prepend a length-1 batch axis in front of the existing axes
image = np.expand_dims(image, axis=0)
print('Image:', image.shape, image.dtype)

In [None]:
# Repeat data along batch axis
# batch_size comes from the network settings cell. It is deliberately not
# reassigned here: a local `batch_size = 1` would silently override that
# setting and could disagree with the engine file that was loaded.
# Slicing to the first sample keeps the cell idempotent, so re-running it
# does not multiply the batch again.
image = np.repeat(image[:1], batch_size, axis=0)
print('Image:', image.shape, image.dtype)

In [None]:
# Create host buffers
# The asynchronous copies issued later (memcpy_htod_async / memcpy_dtoh_async)
# are documented by PyCUDA as requiring page-locked host memory, so both host
# arrays are allocated with pagelocked_empty instead of plain numpy arrays.
# The input is copied as-is, without dividing by 255.
# NOTE(review): presumably the earlier resize step already rescaled the
# pixel values - confirm the value range before relying on this.
new_data = cuda.pagelocked_empty(image.shape, dtype=np.float32)
np.copyto(new_data, image)
# NOTE(review): assumes the engine emits (batch, height, width, channel) -
# confirm against the output dimensions printed from the engine bindings.
output_dimensions = (batch_size, n_height, n_width, n_channel)
output = cuda.pagelocked_empty(output_dimensions, dtype=np.float32)

In [None]:
# Reserve one device buffer per host array, each sized to that array's byte count
d_input, d_output = (cuda.mem_alloc(host_array.nbytes) for host_array in (new_data, output))

In [None]:
# Bindings are the device allocations converted to plain ints, input first then output
bindings = [int(device_buffer) for device_buffer in (d_input, d_output)]
# Stream on which the copies and the inference are queued
stream = cuda.Stream()

In [None]:
# Queue an asynchronous host-to-device copy of the input (new_data -> d_input) on the stream
cuda.memcpy_htod_async(d_input, new_data, stream)

In [None]:
# Execute model
# Pass the configured batch_size rather than a literal 1, so every sample in
# the input buffer is inferred when batch_size > 1.
context.execute_async(batch_size, bindings, stream.handle, None)

In [None]:
# Queue an asynchronous device-to-host copy of the predictions (d_output -> output) on the stream
cuda.memcpy_dtoh_async(output, d_output, stream)

In [None]:
# Synchronize the stream: wait until the work queued on it has finished
stream.synchronize()

In [None]:
# Report the shape of the prediction array
prediction_shape = output.shape
print("Prediction Shape: {}".format(prediction_shape))

In [None]:
# Show prediction: only the first sample when the batch holds several,
# otherwise the whole output with its singleton axes squeezed away
prediction = output[0] if batch_size > 1 else output
imshow(np.squeeze(prediction), cmap='binary')
plt.show()

## Benchmarking

In [None]:
# Time n complete round trips (upload, inference, download, synchronize).
n = 2000
# perf_counter is the clock intended for measuring short intervals;
# time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
for _ in range(n):
    # Transfer input data to device
    cuda.memcpy_htod_async(d_input, new_data, stream)

    # Execute model with the configured batch size (not a literal 1), so the
    # throughput computed afterwards as batch_size / latency reflects the
    # number of samples actually inferred per iteration.
    context.execute_async(batch_size, bindings, stream.handle, None)

    # Transfer predictions back
    cuda.memcpy_dtoh_async(output, d_output, stream)

    # Wait for the stream to finish before starting the next iteration
    stream.synchronize()
end = time.perf_counter()

In [None]:
# Derive per-inference latency and examples-per-second from the timed loop
delta = end - start
average_latency = delta / n
average_throughput = batch_size * (1 / average_latency)

# Assemble the report first, then emit it in one go
report = (
    'Inference: {} seconds'.format(delta),
    'Number of Inferences: {}'.format(n),
    'Average Latency: {} seconds'.format(average_latency),
    'Average Throughput w/ Batch Size {}: {} examples per second'.format(batch_size, average_throughput),
)
print('\n'.join(report))