In [1]:
import tensorflow as tf
import tensorrt as trt
from tensorrt.parsers import uffparser
import uff

# Display the installed TensorFlow / TensorRT / UFF versions used for this run.
tf.__version__, trt.__version__, uff.__version__

('1.8.0', '4.0.1.6', '0.3.0')

In [2]:
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
from random import randint # generate a random test case
from PIL import Image
from matplotlib.pyplot import imshow # To show test case
import time
import os

## Load frozen graph

In [3]:
# Checkpoint location; no later cell in this notebook reads ckpt_path.
ckpt_path = '/home/trt/inferences/handol'
# Frozen GraphDef (.pb) that is parsed in the next cell.
pb_path = '/home/trt/inferences/hd-nod.pb'

In [4]:
# Deserialize the frozen graph: create an empty GraphDef, then fill it from
# the raw bytes of the .pb file.
graph_def = tf.GraphDef()
with open(pb_path, mode='rb') as f:
    graph_def.ParseFromString(f.read())

In [5]:
# Rebind graph_def to the result of remove_training_nodes (per its name, this
# prunes nodes that are only needed for training).
graph_def = tf.graph_util.remove_training_nodes(graph_def)

In [6]:
# Start from an empty default graph, then import the nodes with an empty name
# prefix so the tensors can be looked up below by their original names.
tf.reset_default_graph()
tf.import_graph_def(graph_def, name="")

In [7]:
# Sanity check: node count of the GraphDef vs. the default graph after import.
len(graph_def.node), len(tf.get_default_graph().as_graph_def().node)

(396, 396)

In [8]:
# Handle to the default graph that now holds the imported nodes.
graph = tf.get_default_graph()

In [9]:
# state = graph.get_tensor_by_name('state:0') # not really needed
# Look up the tensors to fetch by name (tensor names carry the ':0' output suffix).
logits = graph.get_tensor_by_name("tower0/policy_head/logits/BiasAdd:0")
probs = graph.get_tensor_by_name("tower0/policy_head/softmax:0")
value = graph.get_tensor_by_name("tower0/value_head/value/Tanh:0")

In [10]:
# Deterministic all-ones float32 test input of shape (1, 18, 19, 19).
# Random alternative:
#   s = np.random.randint(0, 2, size=(1, 18, 19, 19)).astype(np.float32)
s = np.ones((1, 18, 19, 19), dtype=np.float32)

In [11]:
# Create the TensorFlow session with allow_growth enabled in its GPU options.
gpu_options = tf.GPUOptions(allow_growth=True)
session_config = tf.ConfigProto(gpu_options=gpu_options)
sess = tf.Session(config=session_config)

In [12]:
# Reference TensorFlow outputs for the test input; p and v are compared with
# the TensorRT results further down (lg is fetched but not used afterwards).
lg, p, v = sess.run(
    fetches=[logits, probs, value],
    feed_dict={'state:0': s},
)

In [13]:
# Value output of the TensorFlow reference run.
v

array([[-0.4598145]], dtype=float32)

In [14]:
# write this graph to pb file
# tf.train.write_graph(graph_def, os.path.dirname(pb_path), 'hd_rm_trn.pb', as_text=False)

## Converting the TensorFlow Model to UFF

- 일단 이 frozen graph 를 serialized UFF model 로 컨버팅해야 한다.
- `uff.from_tensorflow()` 를 사용할 것이며 output node name 만 알면 됨.
    - `quiet` mode to suppress conversion logging
    - `input_nodes` to allow you to define a set of input nodes in the graph (the defaults are Placeholder nodes)
    - `output_nodes`
    - `text` will let you save a human readable version of UFF model alongside the binary UFF
    - `list_nodes` will list the nodes in the graph
    - `output_filename` will write the model out to the filepath specified in addition to returning a serialized model
- `uff.from_tensorflow_frozen_model()` 은 pb 파일로부터 직접 읽어서 변경하는 함수인듯.

In [15]:
# [!] Caution: drop the ":0" suffix here. These must name operations, not
# tensors (e.g. "state", not "state:0").
input_node_names = ["state"]  # state
output_node_names = [
    "tower0/policy_head/softmax",    # probs
    "tower0/value_head/value/Tanh",  # values
]

# Convert the in-memory GraphDef to a serialized UFF model.
uff_model = uff.from_tensorflow(
    graph_def,
    input_nodes=input_node_names,
    output_nodes=output_node_names,
)

Using output node tower0/policy_head/softmax
Using output node tower0/value_head/value/Tanh
Converting to UFF graph
DEBUG: convert reshape to flatten node
DEBUG: convert reshape to flatten node
No. nodes: 384


## Importing the UFF Model into TensorRT and Building an Engine

In [16]:
# We now have a UFF model stream (why is it a stream?). Let's build a TRT engine from it.
# First, set up the logger.
G_LOGGER = trt.infer.ConsoleLogger(trt.infer.LogSeverity.ERROR)

In [17]:
# Create the UFF parser and register the input/output nodes.
parser = uffparser.create_uff_parser()
# Input is registered without the batch dimension: (18, 19, 19).
# NOTE(review): the trailing 0 presumably selects the input order (NCHW) - confirm against the UFF parser docs.
parser.register_input("state", (18, 19, 19), 0)
for op_node_name in output_node_names:
    parser.register_output(op_node_name)

In [18]:
# 1 << 30 == 2 ** 30 bytes == 1 GiB, so the 2 << 30 passed below is a 2 GiB workspace limit (GPU memory).
# Build the TensorRT engine.
engine = trt.utils.uff_to_trt_engine(G_LOGGER, uff_model, parser, 
                                     max_batch_size=1,
                                     max_workspace_size=2<<30,
                                     datatype=trt.infer.DataType.FLOAT)

In [19]:
# Print name, data type and CHW dimensions of every engine binding.
num_bindings = engine.get_nb_bindings()
for binding_idx in range(num_bindings):
    print("for binding {},".format(binding_idx))
    print("name  :", engine.get_binding_name(binding_idx))
    # The raw value is wrapped in trt.infer.DataType for a readable repr.
    binding_dtype = trt.infer.DataType(engine.get_binding_data_type(binding_idx))
    print("dtype :", binding_dtype)
    chw = engine.get_binding_dimensions(binding_idx).to_DimsCHW()
    print("dim   :", chw.C(), chw.H(), chw.W())
    print()

for binding 0,
name  : state
dtype : DataType.FLOAT
dim   : 18 19 19

for binding 1,
name  : tower0/policy_head/softmax
dtype : DataType.FLOAT
dim   : 1 1 362

for binding 2,
name  : tower0/value_head/value/Tanh
dtype : DataType.FLOAT
dim   : 1 1 1



In [20]:
# The engine has been built, so the parser is released here.
parser.destroy()

## TensorRT Inference

In [21]:
# Create the runtime and the engine's execution context.
# The runtime is only created and later destroyed in this notebook; inference goes through `context`.
runtime = trt.infer.create_infer_runtime(G_LOGGER)
context = engine.create_execution_context()

In [22]:
# Shapes of the TensorFlow reference outputs, used to size the host buffers below.
p.shape, v.shape

((1, 362), (1, 1))

In [23]:
# GPU memory allocation for input/output
# Host-side float32 buffers that receive the engine outputs (362 probabilities, 1 value).
trt_probs = np.zeros(362, dtype=np.float32)
trt_value = np.zeros(1, dtype=np.float32)

# allocation
B = 1 # batch size
# `s` already includes its batch dimension of 1, so B * s.nbytes is the exact
# input size only while B == 1.
d_input = cuda.mem_alloc(B * s.nbytes)
d_probs = cuda.mem_alloc(B * trt_probs.nbytes)
d_value = cuda.mem_alloc(B * trt_value.nbytes)

In [24]:
# The TRT engine needs the GPU memory pointers passed as bindings.
# Order matches the binding indices printed earlier: state, softmax, Tanh.
bindings = [int(d_input), int(d_probs), int(d_value)] # int() casts each allocation to its pointer address

In [25]:
stream = cuda.Stream()
# Move the input data to the GPU, into the pre-allocated d_input (htod is presumably host-to-device).
cuda.memcpy_htod_async(d_input, s, stream)
# Run the model
context.enqueue(B, bindings, stream.handle, None)
# Bring the results back to the CPU
cuda.memcpy_dtoh_async(trt_probs, d_probs, stream)
cuda.memcpy_dtoh_async(trt_value, d_value, stream)
# synchronize: wait for the stream before reading trt_probs / trt_value
stream.synchronize()

In [26]:
# Value output copied back from the TensorRT run.
trt_value

array([-0.45982668], dtype=float32)

In [27]:
# Policy probabilities copied back from the TensorRT run (362 entries).
trt_probs

array([1.51043246e-03, 6.33322692e-04, 2.56291678e-04, 2.19770081e-04,
       2.00626411e-04, 1.83451484e-04, 1.53308400e-04, 1.36889910e-04,
       1.20051969e-04, 1.10205699e-04, 9.74680297e-05, 8.83465254e-05,
       8.56449333e-05, 8.11104619e-05, 7.72784260e-05, 8.17085456e-05,
       7.99662375e-05, 1.78305665e-04, 2.15001302e-04, 8.68667208e-04,
       1.77537018e-04, 9.63208731e-05, 7.90810154e-05, 7.12442561e-05,
       6.35947363e-05, 5.69060030e-05, 5.53523168e-05, 4.76644673e-05,
       4.34321992e-05, 3.99638302e-05, 3.55924312e-05, 3.25350120e-05,
       3.15799443e-05, 3.04895530e-05, 3.15419602e-05, 3.17104386e-05,
       4.26606821e-05, 1.64073586e-04, 1.36321061e-03, 2.67722644e-04,
       9.27539368e-05, 6.57643541e-05, 5.94052144e-05, 4.49812069e-05,
       4.53172688e-05, 3.62654682e-05, 3.39658072e-05, 2.78475491e-05,
       2.51144866e-05, 2.11071747e-05, 2.03853051e-05, 1.57005761e-05,
       1.82831864e-05, 1.68487077e-05, 1.75055557e-05, 3.02435710e-05,
      

In [28]:
# Largest absolute difference between the TensorRT and TensorFlow value outputs.
np.max(np.abs(trt_value - v))

1.218915e-05

In [29]:
# Largest absolute difference between the policy outputs; p is (1, 362) and
# trt_probs is (362,), so the subtraction broadcasts.
np.max(np.abs(p - trt_probs))

4.142523e-06

## Save TRT engine to file

In [30]:
# This is PLAN file!
# trt.utils.write_engine_to_file('./trt_hd.engine', engine.serialize())

In [31]:
# also you can load like this:
# new_engine = trt.utils.load_engine(G_LOGGER, "./trt_hd.engine")

## Finalize

In [32]:
# Release the TensorRT objects created above: context first, then engine, then runtime.
context.destroy()
engine.destroy()
runtime.destroy()

# Runtime comparison

In [33]:
def tf_run(s):
    """Evaluate the policy and value tensors in the TensorFlow session.

    Feeds `s` to 'state:0' and returns the fetched `(probs, value)` pair.
    """
    feed = {'state:0': s}
    policy_out, value_out = sess.run([probs, value], feed)
    return policy_out, value_out

In [34]:
# Rebuild the whole TensorRT pipeline for the benchmark; the parser, context,
# engine and runtime from the walkthrough above have already been destroyed.

# logger
G_LOGGER = trt.infer.ConsoleLogger(trt.infer.LogSeverity.ERROR)

# parser
parser = uffparser.create_uff_parser()
parser.register_input("state", (18, 19, 19), 0)
for op_node_name in output_node_names:
    parser.register_output(op_node_name)

# engine (no datatype argument here, unlike the first build, which passed FLOAT explicitly)
engine = trt.utils.uff_to_trt_engine(G_LOGGER, uff_model, parser, 
                                     max_batch_size=1,
                                     max_workspace_size=2<<30)

# runtime & context
runtime = trt.infer.create_infer_runtime(G_LOGGER)
context = engine.create_execution_context()

# GPU memory allocation for input/output
# Host-side float32 buffers that trt_run() fills on every call.
trt_probs = np.zeros(362, dtype=np.float32)
trt_value = np.zeros(1, dtype=np.float32)

# allocation
B = 1 # batch size
# `s` already includes its batch dimension of 1, so B * s.nbytes is exact only while B == 1.
d_input = cuda.mem_alloc(B * s.nbytes)
d_probs = cuda.mem_alloc(B * trt_probs.nbytes)
d_value = cuda.mem_alloc(B * trt_value.nbytes)

# The TRT engine needs the GPU memory pointers passed as bindings.
bindings = [int(d_input), int(d_probs), int(d_value)] # int() casts each allocation to its pointer address

# Single stream shared by every trt_run() call.
stream = cuda.Stream()

def trt_run(s):
    """Run one TensorRT inference and return `(probs, value)` as host arrays.

    Uploads `s` into `d_input`, enqueues the engine on the shared `stream`,
    copies both outputs back into the staging buffers and waits for the
    stream to finish.

    Args:
        s: Input state. It is coerced to a C-contiguous float32 array before
            upload, because `d_input` was sized from the byte count of a
            float32 array.

    Returns:
        A `(probs, value)` tuple of freshly allocated float32 arrays. The
        module-level `trt_probs` / `trt_value` buffers are reused by every
        call, so copies are returned to keep earlier results intact.
    """
    # Returns `s` itself (no copy) when it is already contiguous float32.
    host_input = np.ascontiguousarray(s, dtype=np.float32)

    cuda.memcpy_htod_async(d_input, host_input, stream)
    context.enqueue(B, bindings, stream.handle, None)
    cuda.memcpy_dtoh_async(trt_probs, d_probs, stream)
    cuda.memcpy_dtoh_async(trt_value, d_value, stream)
    # Wait for the queued copies and execution before reading the host buffers.
    stream.synchronize()

    return trt_probs.copy(), trt_value.copy()

In [35]:
import time

In [36]:
# Number of timed inference calls in each benchmark loop below.
N = 1600

In [37]:
# Benchmark the TensorFlow session on a fixed all-ones input.
s = np.ones((1, 18, 19, 19), dtype=np.float32)

# Warm-up: these calls are not timed.
for i in range(10):
    tf_run(s)

# Timed run. perf_counter() is monotonic and uses the highest-resolution clock
# available, so wall-clock adjustments cannot distort the measurement.
st = time.perf_counter()
for i in range(N):
    # Optional random input (generating it here would be counted in the timing):
    # s = np.random.randint(0, 2, size=(1, 18, 19, 19)).astype(np.float32)
    p, v = tf_run(s)
elapsed = time.perf_counter() - st

print("elapsed: {:.2f}s".format(elapsed))
print("{} n/s".format(N / elapsed))

elapsed: 7.66s
208.83917188374068 n/s


In [38]:
# Benchmark the TensorRT engine on the same fixed all-ones input.
s = np.ones((1, 18, 19, 19), dtype=np.float32)

# Warm-up: these calls are not timed.
for i in range(10):
    trt_run(s)

# Timed run. perf_counter() is monotonic and uses the highest-resolution clock
# available, so wall-clock adjustments cannot distort the measurement.
st = time.perf_counter()
for i in range(N):
    # Optional random input (generating it here would be counted in the timing):
    # s = np.random.randint(0, 2, size=(1, 18, 19, 19)).astype(np.float32)
    p, v = trt_run(s)
elapsed = time.perf_counter() - st

print("elapsed: {:.2f}s".format(elapsed))
print("{} n/s".format(N / elapsed))

elapsed: 3.42s
468.2764970064268 n/s


In [39]:
# Release the TensorRT objects created for the benchmark.
parser.destroy()
context.destroy()
engine.destroy()
runtime.destroy()