In [1]:
import os
import tensorflow as tf
import numpy as np
import copy
from tensorflow.core.framework import graph_pb2
from tensorflow.python.client import session
from tensorflow.python.framework import importer
from tensorflow.python.framework import ops
from tensorflow.python.platform import gfile
from tensorflow.python.summary import summary

# Optional wildcard import: if the TensorRT contrib module cannot be imported,
# the ImportError is swallowed and the notebook continues without it.
# NOTE(review): presumably imported for its side effect of registering the
# TRT engine op so that graphs containing it can be loaded — confirm.
try:
  from tensorflow.contrib.tensorrt.ops.gen_trt_engine_op import *
except ImportError:
  pass

# --- Paths -------------------------------------------------------------------
# Working directory with a trailing slash, so relative names can be appended.
cwd = '{}/'.format(os.getcwd())
# Absolute path of the .tflite flavour of the model.
model_path = ''.join([cwd, 'squeezenet_tflite_pretrained/squeezenet.tflite'])
# Despite its name this is a *file* path: later cells open it with gfile.GFile.
model_dir = 'squeezenet_tflite_pretrained/squeezenet.pb'
log_dir = './logs/'

# --- Model input -------------------------------------------------------------
# Dimensions of the random input built as (batch_size, H, W, C) in later cells.
H, W, C = 224, 224, 3
input_name = 'Placeholder'

# --- Experiment settings -----------------------------------------------------
batch_size, n_experiments = 1, 10
use_gpu = False
filename_suffix = '_raw_graph_new'

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [25]:
'''
Modify the graph. It is important to disable eager execution since it adds concurrency into profiling...
'''


# File name the rewritten GraphDef (input placeholder replaced by a constant)
# is serialized to by the graph-rewriting code in this cell.
gpu_profiling_model_name = 'gpu_profiling_squeezenet.pb'

def save_graph(sess, log_dir):
    """Write the graph of ``sess`` to an event file under ``log_dir``.

    Args:
        sess: object whose ``graph`` attribute is handed to the writer.
        log_dir: directory passed to ``summary.FileWriter``.

    The writer is always closed before returning so that the graph event is
    flushed to disk and the event file handle is released.
    """
    pb_visual_writer = summary.FileWriter(log_dir)
    try:
        pb_visual_writer.add_graph(sess.graph)
    finally:
        # close() flushes pending events; without it the graph may stay
        # buffered and the file handle is leaked.
        pb_visual_writer.close()

## Load a graph, cut off the placeholder, and replace it with a tf.constant.
#   This approach ensures that the input data is allocated at the GPU side.
with session.Session(graph=ops.Graph()) as sess:
    # Parse the frozen graph; the file handle is only needed for the read.
    with gfile.GFile(model_dir, 'rb') as f:
        graph_def = graph_pb2.GraphDef()
        graph_def.ParseFromString(f.read())
    importer.import_graph_def(graph_def)

    # Constant node that takes the place of the input placeholder.
    tf_const_input = tf.constant(np.random.rand(batch_size, H, W, C), name='const_input', dtype=tf.float32)

    # Both spellings of the placeholder's first output that may appear in a node's input list.
    placeholder_refs = (input_name, input_name + ':0')

    new_graph_def = graph_pb2.GraphDef()
    for node in graph_def.node:
        # Exact match: a substring test would also replace unrelated nodes
        # whose names merely contain the placeholder name.
        if node.name == input_name:
            new_graph_def.node.extend([tf_const_input.op.node_def])
            continue

        # Copy first, then rewire the copy, so the parsed graph_def stays untouched.
        new_node = new_graph_def.node.add()
        new_node.CopyFrom(node)
        # Rewire every consumer of the placeholder (not just one hard-coded
        # node); otherwise the removed placeholder leaves a dangling input.
        for i, n in enumerate(list(new_node.input)):
            if n in placeholder_refs:
                new_node.input[i] = tf_const_input.name

    with gfile.GFile(gpu_profiling_model_name, 'wb') as fn:
        fn.write(new_graph_def.SerializeToString())

In [2]:
'''
Profile scopes
'''
from tensorflow.python.client import timeline

# File name for a CPU timeline trace.
# NOTE(review): not referenced by the cells visible here — confirm it is still needed.
cpu_tline = 'cpu_time_line.json'

def save_graph_with_suffix(sess, log_dir, suffix):
    """Write the graph of ``sess`` to an event file whose name ends in ``suffix``.

    Args:
        sess: object whose ``graph`` attribute is handed to the writer.
        log_dir: directory passed to ``summary.FileWriter``.
        suffix: forwarded as ``filename_suffix`` to ``summary.FileWriter``.

    The writer is always closed before returning so that the graph event is
    flushed to disk and the event file handle is released.
    """
    pb_visual_writer = summary.FileWriter(log_dir, filename_suffix=suffix)
    try:
        pb_visual_writer.add_graph(sess.graph)
    finally:
        # close() flushes pending events; without it the graph may stay
        # buffered and the file handle is leaked.
        pb_visual_writer.close()
    
def test_inference(batch_size, model_dir, timeline_fname, session_conf=None):
    with session.Session(graph=ops.Graph(), config=session_conf) as sess:
        with gfile.GFile(model_dir, 'rb') as f:
            graph_def = graph_pb2.GraphDef()
            graph_def.ParseFromString(f.read())
            importer.import_graph_def(graph_def)
            sess.graph.as_default()
            
            options = tf.compat.v1.RunOptions(trace_level=tf.compat.v1.RunOptions.FULL_TRACE) 
            run_metadata = tf.compat.v1.RunMetadata()
            g = sess.graph
            output_tensor = g.get_tensor_by_name("import/ArgMax:0")
            try:
                input_tensor = g.get_tensor_by_name("import/Placeholder:0")
                np_val = np.random.rand(batch_size, H, W, C) 
                _ = sess.run(
                    output_tensor, feed_dict={input_tensor: np_val}, options=options, run_metadata=run_metadata
                )
            except KeyError:
                _ = sess.run(
                    output_tensor, feed_dict=dict(), options=options, run_metadata=run_metadata
                )
            
            fetched_timeline = timeline.Timeline(run_metadata.step_stats)
            serialized_trace = fetched_timeline.generate_chrome_trace_format()
            with open(timeline_fname, 'w') as flog:
                flog.write(serialized_trace)
                

In [3]:
# TODO: TensorFlow appears to constant-fold the placeholder-free graph, since
#   none of its nodes depend on fed data; the measured runtime is then spent
#   evaluating a single const op. The call below is disabled for that reason:
# test_inference(1, gpu_profiling_model_name, './logs/cpu_timeline_no_placeholder.json', session_conf)

# Restrict the session to one intra-op and one inter-op thread for profiling.
session_conf = tf.compat.v1.ConfigProto(
    intra_op_parallelism_threads=1,
    inter_op_parallelism_threads=1,
)
test_inference(
    batch_size=1,
    model_dir=model_dir,
    timeline_fname='./logs/cpu_timeline.json',
    session_conf=session_conf,
)