# TVM, load transformer

In [1]:
!pwd

/Users/isong/Downloads/ml/sc/xilinx/Github/transformer_simple/src/python


## Test run

In [2]:
!python ./experiments/classify.py -e 1 -t -d1 -H1 -D -m single_transformer.pt

OPTIONS  Namespace(batch_size=4, debug=True, depth=1, embedding_size=128, final=False, gradient_clipping=1.0, lr=0.0001, lr_warmup=10000, max_length=512, max_pool=False, model_name='single_transformer.pt', num_epochs=1, num_heads=1, seed=1, tb_dir='./runs', tiny=True, vocab_size=50000)
- nr. of training examples 63
- nr. of validation examples 63
Model's state_dict:
token_embedding.weight 	 torch.Size([50000, 128])
pos_embedding.weight 	 torch.Size([512, 128])
trfm_blocks.0.mha.attentions.0.toqueries.weight 	 torch.Size([128, 128])
trfm_blocks.0.mha.attentions.0.toqueries.bias 	 torch.Size([128])
trfm_blocks.0.mha.attentions.0.tokeys.weight 	 torch.Size([128, 128])
trfm_blocks.0.mha.attentions.0.tokeys.bias 	 torch.Size([128])
trfm_blocks.0.mha.attentions.0.tovalues.weight 	 torch.Size([128, 128])
trfm_blocks.0.mha.attentions.0.tovalues.bias 	 torch.Size([128])
trfm_blocks.0.mha.w_o.0.weight 	 torch.Size([128, 128])
trfm_blocks.0.mha.w_o.0.bias 	 torch.Size([128])
trfm_blocks.0.norm1.w

100%|███████████████████████████████████████████| 63/63 [00:04<00:00, 13.18it/s]
-- validation accuracy 0.504
Save model to saved_model/single_transformer.pt
Load model to saved_model/single_transformer1.pt
output_test
tensor([[-0.7327, -0.6551],
        [-0.7253, -0.6620],
        [-0.7330, -0.6548],
        [-0.7324, -0.6554]], grad_fn=<LogSoftmaxBackward>)
output_load
tensor([[-0.7327, -0.6551],
        [-0.7253, -0.6620],
        [-0.7330, -0.6548],
        [-0.7324, -0.6554]])


## Load pytorch model to tvm 
- [tvm reference](https://tvm.apache.org/docs/tutorials/frontend/from_pytorch.html#sphx-glr-tutorials-frontend-from-pytorch-py)

In [3]:
# tvm modules


import numpy as np

from tvm.contrib.download import download_testdata

# PyTorch imports
import torch
import torchvision


import tvm
from tvm import te
from tvm import rpc, autotvm, relay
from tvm.contrib import graph_runtime, download
from tvm.contrib.debugger import debug_runtime
from tvm.relay import transform
from tvm import relay

import vta
from vta.testing import simulator
from vta.top import graph_pack

# Make sure that TVM was compiled with RPC=1
assert tvm.runtime.enabled("rpc")

In [4]:
# transformer modules

import transformer_simple
import classifier
import util

In [5]:
model_name = "transformer"
# single transformer
mx = 512
embedding_size = 128
vocab_size = 50000
NUM_CLS = 2
max_pool = False
num_heads = 1
depth = 1

PATH = 'saved_model/single_transformer1.pt'

model = classifier.TransformerSimpleClassify(n_seq=mx, dim_emb=embedding_size, dim_internal=embedding_size, \
                                                         num_tokens=vocab_size, num_classes=NUM_CLS, max_pool=max_pool, \
                                                         heads=num_heads, depth=depth)
model.load_state_dict(torch.load(PATH))
model = model.eval()

In [6]:
# We grab the TorchScripted model via tracing
input_shape = [4, 498]
input_data = torch.randint(0, vocab_size, input_shape)
scripted_model = torch.jit.trace(model, input_data).eval()

  assert e == self.dim_emb, f'Input embedding ({e}) should match the layer embedding ({self.dim_emb})'


## Import the graph to Relay

In [7]:
input_name = "input0"
shape_list = [(input_name, input_shape)]
mod, params = relay.frontend.from_pytorch(scripted_model, shape_list)

## VTA testing
from https://tvm.apache.org/docs/vta/tutorials/frontend/deploy_classification.html#sphx-glr-vta-tutorials-frontend-deploy-classification-py


## Loading VTA parameters


In [8]:
env = vta.get_env()

## define the platform and model targets

In [9]:
# Load VTA parameters from the 3rdparty/vta-hw/config/vta_config.json file
env = vta.get_env()

# Set ``device=arm_cpu`` to run inference on the CPU
# or ``device=vta`` to run inference on the FPGA.
device = "vta"
target = env.target if device == "vta" else env.target_vta_cpu

## FPGA programming

In [10]:
if env.TARGET not in ["sim", "tsim"]:

    # Get remote from tracker node if environment variable is set.
    # To set up the tracker, you'll need to follow the "Auto-tuning
    # a convolutional network for VTA" tutorial.
    tracker_host = os.environ.get("TVM_TRACKER_HOST", None)
    tracker_port = os.environ.get("TVM_TRACKER_PORT", None)
    # Otherwise if you have a device you want to program directly from
    # the host, make sure you've set the variables below to the IP of
    # your board.
    device_host = os.environ.get("VTA_RPC_HOST", "192.168.2.99")
    device_port = os.environ.get("VTA_RPC_PORT", "9091")
    if not tracker_host or not tracker_port:
        remote = rpc.connect(device_host, int(device_port))
    else:
        remote = autotvm.measure.request_remote(
            env.TARGET, tracker_host, int(tracker_port), timeout=10000
        )

    # Reconfigure the JIT runtime and FPGA.
    # You can program the FPGA with your own custom bitstream
    # by passing the path to the bitstream file instead of None.
    reconfig_start = time.time()
    vta.reconfig_runtime(remote)
    vta.program_fpga(remote, bitstream=None)
    reconfig_time = time.time() - reconfig_start
    print("Reconfigured FPGA and RPC runtime in {0:.2f}s!".format(reconfig_time))

# In simulation mode, host the RPC server locally.
else:
    remote = rpc.LocalSession()

# Get execution context from remote
ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0)

## Input placeholders

In [11]:

input_vta = tvm.te.placeholder(input_shape, name="input", dtype=env.acc_dtype)


In [12]:
target

ext_dev -keys=vta,cpu -device=vta -model=sim_1x16_i8w8a32_15_15_18_17

## build the inference graph runtime

In [13]:

shape_dict = {}
dtype_dict = {}
shape_dict.update({k: v.shape for k, v in params.items()})
dtype_dict.update({k: str(v.dtype) for k, v in params.items()})

In [14]:
shape_dict

{'toprobs.bias': (2,),
 'toprobs.weight': (2, 128),
 'trfm_blocks.0.norm2.bias': (128,),
 'trfm_blocks.0.norm2.weight': (128,),
 'trfm_blocks.0.ff.2.bias': (128,),
 'trfm_blocks.0.ff.2.weight': (128, 512),
 'trfm_blocks.0.ff.0.bias': (512,),
 'trfm_blocks.0.ff.0.weight': (512, 128),
 'trfm_blocks.0.norm1.bias': (128,),
 'trfm_blocks.0.norm1.weight': (128,),
 'trfm_blocks.0.mha.w_o.0.bias': (128,),
 'trfm_blocks.0.mha.w_o.0.weight': (128, 128),
 'trfm_blocks.0.mha.attentions.0.tovalues.bias': (128,),
 'trfm_blocks.0.mha.attentions.0.tovalues.weight': (128, 128),
 'trfm_blocks.0.mha.attentions.0.tokeys.bias': (128,),
 'trfm_blocks.0.mha.attentions.0.tokeys.weight': (128, 128),
 'trfm_blocks.0.mha.attentions.0.toqueries.bias': (128,),
 'trfm_blocks.0.mha.attentions.0.toqueries.weight': (128, 128),
 'pos_embedding.weight': (512, 128),
 'token_embedding.weight': (50000, 128)}

In [15]:
dtype_dict

{'toprobs.bias': 'float32',
 'toprobs.weight': 'float32',
 'trfm_blocks.0.norm2.bias': 'float32',
 'trfm_blocks.0.norm2.weight': 'float32',
 'trfm_blocks.0.ff.2.bias': 'float32',
 'trfm_blocks.0.ff.2.weight': 'float32',
 'trfm_blocks.0.ff.0.bias': 'float32',
 'trfm_blocks.0.ff.0.weight': 'float32',
 'trfm_blocks.0.norm1.bias': 'float32',
 'trfm_blocks.0.norm1.weight': 'float32',
 'trfm_blocks.0.mha.w_o.0.bias': 'float32',
 'trfm_blocks.0.mha.w_o.0.weight': 'float32',
 'trfm_blocks.0.mha.attentions.0.tovalues.bias': 'float32',
 'trfm_blocks.0.mha.attentions.0.tovalues.weight': 'float32',
 'trfm_blocks.0.mha.attentions.0.tokeys.bias': 'float32',
 'trfm_blocks.0.mha.attentions.0.tokeys.weight': 'float32',
 'trfm_blocks.0.mha.attentions.0.toqueries.bias': 'float32',
 'trfm_blocks.0.mha.attentions.0.toqueries.weight': 'float32',
 'pos_embedding.weight': 'float32',
 'token_embedding.weight': 'float32'}

In [16]:
mod["main"]

FunctionNode([Var(token_embedding.weight, ty=TensorType([50000, 128], float32)), Var(input0, ty=TensorType([4, 498], int64)), Var(pos_embedding.weight, ty=TensorType([512, 128], float32)), Var(trfm_blocks.0.mha.attentions.0.toqueries.weight, ty=TensorType([128, 128], float32)), Var(trfm_blocks.0.mha.attentions.0.toqueries.bias, ty=TensorType([128], float32)), Var(trfm_blocks.0.mha.attentions.0.tokeys.weight, ty=TensorType([128, 128], float32)), Var(trfm_blocks.0.mha.attentions.0.tokeys.bias, ty=TensorType([128], float32)), Var(trfm_blocks.0.mha.attentions.0.tovalues.weight, ty=TensorType([128, 128], float32)), Var(trfm_blocks.0.mha.attentions.0.tovalues.bias, ty=TensorType([128], float32)), Var(trfm_blocks.0.mha.w_o.0.weight, ty=TensorType([128, 128], float32)), Var(trfm_blocks.0.mha.w_o.0.bias, ty=TensorType([128], float32)), Var(trfm_blocks.0.norm1.weight, ty=TensorType([128], float32)), Var(trfm_blocks.0.norm1.bias, ty=TensorType([128], float32)), Var(trfm_blocks.0.ff.0.weight, ty=T

In [17]:
mod

IRModuleNode( {GlobalVar(main): FunctionNode([Var(token_embedding.weight, ty=TensorType([50000, 128], float32)), Var(input0, ty=TensorType([4, 498], int64)), Var(pos_embedding.weight, ty=TensorType([512, 128], float32)), Var(trfm_blocks.0.mha.attentions.0.toqueries.weight, ty=TensorType([128, 128], float32)), Var(trfm_blocks.0.mha.attentions.0.toqueries.bias, ty=TensorType([128], float32)), Var(trfm_blocks.0.mha.attentions.0.tokeys.weight, ty=TensorType([128, 128], float32)), Var(trfm_blocks.0.mha.attentions.0.tokeys.bias, ty=TensorType([128], float32)), Var(trfm_blocks.0.mha.attentions.0.tovalues.weight, ty=TensorType([128, 128], float32)), Var(trfm_blocks.0.mha.attentions.0.tovalues.bias, ty=TensorType([128], float32)), Var(trfm_blocks.0.mha.w_o.0.weight, ty=TensorType([128, 128], float32)), Var(trfm_blocks.0.mha.w_o.0.bias, ty=TensorType([128], float32)), Var(trfm_blocks.0.norm1.weight, ty=TensorType([128], float32)), Var(trfm_blocks.0.norm1.bias, ty=TensorType([128], float32)), Var

In [18]:
# start_name="nn.batch_matmul"
# end_name="reshape"

In [19]:
    if target.device_name == "vta":
        # Perform quantization in Relay
        # Note: We set opt_level to 3 in order to fold batch norm
        with tvm.transform.PassContext(opt_level=3):
            with relay.quantize.qconfig(global_scale=8.0, skip_conv_layers=[0]):
                mod = relay.quantize.quantize(mod, params=params)
            # Perform graph packing and constant folding for VTA target
            assert env.BLOCK_IN == env.BLOCK_OUT
#             relay_prog = graph_pack(
#                 mod["main"],
#                 env.BATCH,
#                 env.BLOCK_OUT,
#                 env.WGT_WIDTH,
#                 start_name=start_name,
#                 stop_name=end_name,

#             )
    else:
        relay_prog = mod["main"]
        


In [20]:
# Compile Relay program with AlterOpLayout disabled
if target.device_name != "vta":
    with tvm.transform.PassContext(opt_level=3, disabled_pass={"AlterOpLayout"}):
        graph, lib, params = relay.build(
            mod, target=target, params=params, target_host=env.target_host
        )
else:
    with vta.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
        lib = relay.build(mod, target=target, params=params, target_host=env.target_host)

"-target" is deprecated, use "-mtriple" instead.
"-target" is deprecated, use "-mtriple" instead.
"-target" is deprecated, use "-mtriple" instead.
"-target" is deprecated, use "-mtriple" instead.
"-target" is deprecated, use "-mtriple" instead.
"-target" is deprecated, use "-mtriple" instead.
Cannot find config for target=ext_dev -keys=vta,cpu -device=vta -model=sim_1x16_i8w8a32_15_15_18_17, workload=('dense_nopack.x86', ('TENSOR', (4, 128), 'float32'), ('TENSOR', (2, 128), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression.
Cannot find config for target=ext_dev -keys=vta,cpu -device=vta -model=sim_1x16_i8w8a32_15_15_18_17, workload=('batch_matmul.x86', ('TENSOR', (4, 498, 512), 'float32'), ('TENSOR', (4, 128, 512), 'float32')). A fallback configuration is used, which may bring great performance regression.
Cannot find config for target=ext_dev -keys=vta,cpu -device=vta -model=sim_1x16_i8w8a32_15_15_18_17, workload=('batch_matmu

In [24]:
    # Send the inference library over to the remote RPC server
    from tvm.contrib import util
    temp = util.tempdir()
    lib.export_library(temp.relpath("graphlib.tar"))
    remote.upload(temp.relpath("graphlib.tar"))
    lib = remote.load_module("graphlib.tar")

    # Graph runtime
    m = graph_runtime.GraphModule(lib["default"](ctx))

In [37]:
print(lib)

Module(rpc, 7f84d0edc5b8)


## perform the inference

In [25]:
# Set inputs
m.set_input(input_name, tvm.nd.array(input_data))
# Execute
m.run()
# Get outputs
tvm_output = m.get_output(0)

In [26]:
tvm_output

<tvm.nd.NDArray shape=(4, 2), remote[0]:ext_dev(0)>
array([[-0.7481267 , -0.6410334 ],
       [-0.75686437, -0.63324785],
       [-0.7323994 , -0.6553777 ],
       [-0.76960146, -0.6221253 ]], dtype=float32)

In [27]:
input_data

tensor([[42453, 23662, 24983,  ...,  1561,  1524, 43533],
        [36655, 11660, 41342,  ..., 30393,  6298, 16810],
        [25529,  4065, 21512,  ..., 36405, 39718, 45385],
        [47098,  4221, 36690,  ..., 47497, 22007,  7896]])

In [28]:
type(input_data)

torch.Tensor

In [29]:
INPUT_DATA_PATH='saved_model/input_data_vat.pt'
torch.save(input_data, INPUT_DATA_PATH)

In [30]:
np_output = tvm_output.asnumpy()

In [31]:
torch_output = torch.from_numpy(np_output)

In [32]:
torch_output

tensor([[-0.7481, -0.6410],
        [-0.7569, -0.6332],
        [-0.7324, -0.6554],
        [-0.7696, -0.6221]])

In [33]:
OUTPUT_DATA_PATH='saved_model/output_data_vta.pt'
torch.save(torch_output, OUTPUT_DATA_PATH)

## Generate code (WIP)

In [34]:
#mhost = tvm.build(mod, target=target)
print(mod.astext(show_meta_data=True))

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



## Save and Load Compiled Module

In [35]:
# save the graph, lib and params into separate files
from tvm.contrib import util

LIB_PATH='saved_model/deploy_lib_vta.tar'
lib.export_library(LIB_PATH)


RuntimeError: Tar error:
tar: no files or directories specified


In [None]:
LIB_PATH 

In [None]:
# load the module back.
loaded_lib = tvm.runtime.load_module(LIB_PATH)


m = graph_runtime.GraphModule(loaded_lib["default"](ctx))
# Set inputs
m.set_input(input_name, tvm.nd.array(input_data))
# Execute
m.run()
# Get outputs
tvm_output = m.get_output(0)


In [None]:
tvm_output