In [1]:
import os
import tensorflow as tf
import numpy as np
import tvm
import tvm.relay as relay
import tvm.contrib.graph_runtime as runtime
from tvm.relay.expr_functor import ExprMutator
from tvm.contrib import util
from tvm.contrib.util import tempdir
import riptide.models
from riptide.get_models import get_model
from tvm import autotvm
from riptide.binary.binary_layers import Config, DQuantize, XQuantize

In [2]:
os.environ["CUDA_VISIBLE_DEVICES"] = ''

In [3]:
config = Config(actQ=DQuantize, weightQ=XQuantize, bits=1, use_act=False, use_bn=False, use_maxpool=True)
#config = Config(actQ=None, weightQ=None, bits=None, use_act=True, use_bn=True, use_maxpool=True)

In [12]:
with config:
    model = get_model('squeezenet')
#model = riptide.models.vggnet_normal.vggnet()
#model = tf.keras.models.Sequential(model.layers[:30])
#model = tf.keras.models.Sequential()
#model.add(tf.keras.layers.Conv2D(filters=96, strides=2, padding='same', kernel_size=7, use_bias=False, data_format='channels_last'))
#model.add(tf.keras.layers.BatchNormalization(center=False, scale=False))

In [13]:
test_input = tf.keras.Input(shape=[224, 224, 3], batch_size=1, dtype='float32')
output = model(test_input)

In [14]:
conv_ops = 0
glue_ops = 0
for i, layer in enumerate(model.layers):
    if 'conv2d' in layer.name:
        _, h, w, f = layer.output_shape
        k, _, c, _ = layer.weights[0].shape
        #print("conv: ", k*k*h*w*f*c)
        #print("glue: ", 14*h*w*f)
        #print("ratio: ", k*k*c / 14)
        conv_ops += k*k*h*w*f*c
        glue_ops += 14 * h*w*f

In [15]:
print(conv_ops)
print(glue_ops)
print(conv_ops / glue_ops)

786057216
51806720
15.172881355932203


In [6]:
model.summary()

Model: "squeeze_net"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (1, 112, 112, 96)         14208     
_________________________________________________________________
batch_normalization (BatchNo (1, 112, 112, 96)         384       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (1, 56, 56, 96)           0         
_________________________________________________________________
enter_integer (EnterInteger) (1, 56, 56, 96)           0         
_________________________________________________________________
binary_conv2d (BinaryConv2D) (1, 56, 56, 16)           1536      
_________________________________________________________________
unfused_batch_norm (UnfusedB (1, 56, 56, 16)           64        
_________________________________________________________________
binary_conv2d_1 (BinaryConv2 (1, 56, 56, 64)           

In [7]:
func, params = relay.frontend.from_keras(model, shape={'input_1': [1, 224, 224, 3]}, layout='NHWC')

In [8]:
target = tvm.target.arm_cpu("rasp3b")
target_host = 'llvm -device=arm_cpu -target=arm-linux-gnueabihf -mattr=+neon'

In [9]:
with relay.build_config(opt_level=3):
    graph, lib, params = relay.build(func, target=target, params=params)
    #out = intrp.evaluate(func)(np.random.uniform(size=(1, 3, 28, 28)))

W0829 21:21:01.339347 139789975549760 dispatcher.py:381] Cannot find config for target=llvm -device=arm_cpu -model=bcm2837 -target=armv7l-linux-gnueabihf -mattr=+neon, workload=('dense', (1, 64, 'float32'), (1000, 64, 'float32'), 0, 'float32'). A fallback configuration is used, which may bring great performance regression.
W0829 21:21:01.352967 139789975549760 dispatcher.py:381] Cannot find config for target=llvm -device=arm_cpu -model=bcm2837 -target=armv7l-linux-gnueabihf -mattr=+neon, workload=('bitserial_conv2d_nhwc', (1, 14, 14, 32, 'int16'), (1, 1, 1, 4, 32, 'uint8'), (1, 1), (0, 0), 1, 1, 'uint8', 'int16', 1). A fallback configuration is used, which may bring great performance regression.
W0829 21:21:01.369346 139789975549760 dispatcher.py:381] Cannot find config for target=llvm -device=arm_cpu -model=bcm2837 -target=armv7l-linux-gnueabihf -mattr=+neon, workload=('bitserial_conv2d_nhwc', (1, 14, 14, 96, 'int16'), (1, 1, 1, 12, 32, 'uint8'), (1, 1), (0, 0), 1, 1, 'uint8', 'int16'

In [10]:
tmp = util.tempdir()
lib_fname = tmp.relpath('net.tar')
lib.export_library(lib_fname)

remote = autotvm.measure.request_remote(
    'rpi3b', 'fleet.cs.washington.edu', 9190, timeout=10000)

In [11]:
# upload the library to remote device and load it
remote.upload(lib_fname)
rlib = remote.load_module('net.tar')

# create the remote runtime module
ctx = remote.cpu(0)
module = runtime.create(graph, rlib, ctx)
# set parameter (upload params to the remote device. This may take a while)
module.set_input(**params)

In [12]:
#module.set_input(0, np.random.uniform(size=(1, 3, 224, 224)))
module.set_input(0, np.random.uniform(size=(1, 224, 224, 3)))
module.run()
print(module.get_output(0).shape)

(1, 1000)


In [13]:
 # Evaluate
print("Evaluate inference time cost...")
ftimer = module.module.time_evaluator("run", ctx, number=10, repeat=1)
prof_res = np.array(ftimer().results) * 1000  # Convert to milliseconds
print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
      (np.mean(prof_res), np.std(prof_res)))

Evaluate inference time cost...
Mean inference time (std dev): 159.68 ms (0.00 ms)
