# TVM, load transformer

In [1]:
# Show the working directory; the relative paths used below (./experiments, saved_model/) resolve against it.
!pwd

/Users/isong/Downloads/ml/sc/xilinx/Github/transformer_simple/src/python


## Test run

In [2]:
# Run the classifier script once as a smoke test; it saves the model under saved_model/ (see output).
# Presumably -e/-t/-d/-H/-D/-m map to num_epochs/tiny/depth/num_heads/debug/model_name in the
# OPTIONS echo below — confirm against experiments/classify.py.
!python ./experiments/classify.py -e 1 -t -d1 -H1 -D -m single_transformer.pt

OPTIONS  Namespace(batch_size=4, debug=True, depth=1, embedding_size=128, final=False, gradient_clipping=1.0, lr=0.0001, lr_warmup=10000, max_length=512, max_pool=False, model_name='single_transformer.pt', num_epochs=1, num_heads=1, seed=1, tb_dir='./runs', tiny=True, vocab_size=50000)
- nr. of training examples 63
- nr. of validation examples 63
Model's state_dict:
token_embedding.weight 	 torch.Size([50000, 128])
pos_embedding.weight 	 torch.Size([512, 128])
trfm_blocks.0.mha.attentions.0.toqueries.weight 	 torch.Size([128, 128])
trfm_blocks.0.mha.attentions.0.toqueries.bias 	 torch.Size([128])
trfm_blocks.0.mha.attentions.0.tokeys.weight 	 torch.Size([128, 128])
trfm_blocks.0.mha.attentions.0.tokeys.bias 	 torch.Size([128])
trfm_blocks.0.mha.attentions.0.tovalues.weight 	 torch.Size([128, 128])
trfm_blocks.0.mha.attentions.0.tovalues.bias 	 torch.Size([128])
trfm_blocks.0.mha.w_o.0.weight 	 torch.Size([128, 128])
trfm_blocks.0.mha.w_o.0.bias 	 torch.Size([128])
trfm_blocks.0.norm1.w

100%|███████████████████████████████████████████| 63/63 [00:04<00:00, 13.64it/s]
-- validation accuracy 0.544
Save model to saved_model/single_transformer.pt
Load model to saved_model/single_transformer1.pt
output_test
tensor([[-0.7327, -0.6551],
        [-0.7253, -0.6620],
        [-0.7330, -0.6548],
        [-0.7324, -0.6554]], grad_fn=<LogSoftmaxBackward>)
output_load
tensor([[-0.7327, -0.6551],
        [-0.7253, -0.6620],
        [-0.7330, -0.6548],
        [-0.7324, -0.6554]])


## Load the PyTorch model into TVM
- [tvm reference](https://tvm.apache.org/docs/tutorials/frontend/from_pytorch.html#sphx-glr-tutorials-frontend-from-pytorch-py)

In [40]:
# tvm modules


import os
import time

import numpy as np

from tvm.contrib.download import download_testdata

# PyTorch imports
import torch
import torchvision


import tvm
from tvm import te
from tvm import rpc, autotvm, relay
from tvm.contrib import graph_runtime, download
from tvm.contrib.debugger import debug_runtime
from tvm.relay import transform
from tvm import relay

import vta
from vta.testing import simulator
from vta.top import graph_pack

# Fail fast if this TVM build lacks RPC support (TVM must be compiled with RPC=1);
# the rpc.connect / rpc.LocalSession calls further down depend on it.
assert tvm.runtime.enabled("rpc")

In [41]:
# transformer modules

import transformer_simple
import classifier
import util

In [77]:
# Hyper-parameters of the single-block transformer; they mirror the values echoed
# by the test run above (depth=1, num_heads=1, embedding_size=128, ...).
model_name = "transformer"
mx = 512
embedding_size = 128
vocab_size = 50000
NUM_CLS = 2
max_pool = False
num_heads = 1
depth = 1

# Checkpoint to restore.
PATH = 'saved_model/single_transformer1.pt'

# Rebuild the network with the same configuration, then restore the saved weights.
model = classifier.TransformerSimpleClassify(
    n_seq=mx,
    dim_emb=embedding_size,
    dim_internal=embedding_size,
    num_tokens=vocab_size,
    num_classes=NUM_CLS,
    max_pool=max_pool,
    heads=num_heads,
    depth=depth,
)
state_dict = torch.load(PATH)
model.load_state_dict(state_dict)
# Switch to evaluation mode before tracing.
model = model.eval()

In [78]:
# We grab the TorchScripted model via tracing.
# The example input is a batch of random token ids in [0, vocab_size).
input_shape = [4, 498]
# Draw the ids from a locally seeded generator so the traced input (which is later
# saved to disk) is identical on every Restart & Run All, without touching the
# global torch RNG state.
input_rng = torch.Generator().manual_seed(1)
input_data = torch.randint(0, vocab_size, input_shape, generator=input_rng)
scripted_model = torch.jit.trace(model, input_data).eval()

  assert e == self.dim_emb, f'Input embedding ({e}) should match the layer embedding ({self.dim_emb})'


## Import the graph to Relay

In [79]:
# Import the traced model into Relay. `input_name` labels the graph input (it is reused
# by m.set_input later); `shape_list` pairs that name with the shape used for tracing.
input_name = "input0"
shape_list = [(input_name, input_shape)]
# `mod` is the Relay module; `params` maps parameter names to the imported weight arrays.
mod, params = relay.frontend.from_pytorch(scripted_model, shape_list)

In [80]:
# Dump the imported Relay module as text (long output; it begins with type definitions).
print(mod)

type tensor_int8_t {
  tensor_nil_int8,
  tensor0_int8(int8),
  tensor1_int8(Tensor[(?), int8]),
  tensor2_int8(Tensor[(?, ?), int8]),
  tensor3_int8(Tensor[(?, ?, ?), int8]),
  tensor4_int8(Tensor[(?, ?, ?, ?), int8]),
  tensor5_int8(Tensor[(?, ?, ?, ?, ?), int8]),
  tensor6_int8(Tensor[(?, ?, ?, ?, ?, ?), int8]),
}

type tensor_uint16_t {
  tensor_nil_uint16,
  tensor0_uint16(uint16),
  tensor1_uint16(Tensor[(?), uint16]),
  tensor2_uint16(Tensor[(?, ?), uint16]),
  tensor3_uint16(Tensor[(?, ?, ?), uint16]),
  tensor4_uint16(Tensor[(?, ?, ?, ?), uint16]),
  tensor5_uint16(Tensor[(?, ?, ?, ?, ?), uint16]),
  tensor6_uint16(Tensor[(?, ?, ?, ?, ?, ?), uint16]),
}

type Option[A] {
  Some(A),
  None,
}

type tensor_uint8_t {
  tensor_nil_uint8,
  tensor0_uint8(uint8),
  tensor1_uint8(Tensor[(?), uint8]),
  tensor2_uint8(Tensor[(?, ?), uint8]),
  tensor3_uint8(Tensor[(?, ?, ?), uint8]),
  tensor4_uint8(Tensor[(?, ?, ?, ?), uint8]),
  tensor5_uint8(Tensor[(?, ?, ?, ?, ?), uint8]),
  tensor

## VTA testing
Adapted from the TVM VTA tutorial: https://tvm.apache.org/docs/vta/tutorials/frontend/deploy_classification.html#sphx-glr-vta-tutorials-frontend-deploy-classification-py


## Loading VTA parameters


In [81]:
# Load the VTA environment (the next cell loads it again and selects the target).
env = vta.get_env()

## Define the platform and model targets

In [82]:
# Read the VTA configuration (3rdparty/vta-hw/config/vta_config.json).
env = vta.get_env()

# ``device="vta"`` runs inference on the FPGA;
# ``device="arm_cpu"`` runs inference on the CPU.
device = "vta"
if device == "vta":
    target = env.target
else:
    target = env.target_vta_cpu

## FPGA programming

In [83]:
# Obtain an RPC session (`remote`) and an execution context (`ctx`).
#   * simulation targets ("sim"/"tsim"): host the RPC server locally;
#   * any other target: connect to the board (directly, or through a tracker when
#     TVM_TRACKER_HOST/TVM_TRACKER_PORT are set) and reprogram the FPGA.
# `os` and `time` are provided by the notebook's import cell.
if env.TARGET in ["sim", "tsim"]:
    # In simulation mode, host the RPC server locally.
    remote = rpc.LocalSession()
else:
    # Get remote from tracker node if environment variable is set.
    # To set up the tracker, you'll need to follow the "Auto-tuning
    # a convolutional network for VTA" tutorial.
    tracker_host = os.environ.get("TVM_TRACKER_HOST", None)
    tracker_port = os.environ.get("TVM_TRACKER_PORT", None)
    # Otherwise if you have a device you want to program directly from
    # the host, make sure you've set the variables below to the IP of
    # your board.
    device_host = os.environ.get("VTA_RPC_HOST", "192.168.2.99")
    device_port = os.environ.get("VTA_RPC_PORT", "9091")
    if tracker_host and tracker_port:
        remote = autotvm.measure.request_remote(
            env.TARGET, tracker_host, int(tracker_port), timeout=10000
        )
    else:
        # No (complete) tracker configuration: talk to the board directly.
        remote = rpc.connect(device_host, int(device_port))

    # Reconfigure the JIT runtime and FPGA.
    # You can program the FPGA with your own custom bitstream
    # by passing the path to the bitstream file instead of None.
    reconfig_start = time.time()
    vta.reconfig_runtime(remote)
    vta.program_fpga(remote, bitstream=None)
    reconfig_time = time.time() - reconfig_start
    print("Reconfigured FPGA and RPC runtime in {0:.2f}s!".format(reconfig_time))

# Get execution context from remote
ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0)

## Input placeholders

In [84]:

# TE placeholder with the traced input's shape and the VTA environment's acc_dtype.
# NOTE(review): `input_vta` is not referenced by any later cell shown here — confirm it is still needed.
input_vta = tvm.te.placeholder(input_shape, name="input", dtype=env.acc_dtype)


In [85]:
# Inspect the compilation target selected above.
target

ext_dev -keys=vta,cpu -device=vta -model=sim_1x16_i8w8a32_15_15_18_17

## Build the inference graph runtime

In [86]:

# Record the shape and dtype of every imported parameter, keyed by parameter name.
# The dicts are created here rather than updated in place: their initialisers were
# commented out, so on a fresh kernel `.update` raised NameError.
shape_dict = {name: value.shape for name, value in params.items()}
dtype_dict = {name: str(value.dtype) for name, value in params.items()}

In [87]:
# Display the parameter-name -> shape mapping.
shape_dict

{'toprobs.bias': (2,),
 'toprobs.weight': (2, 128),
 'trfm_blocks.0.norm2.bias': (128,),
 'trfm_blocks.0.norm2.weight': (128,),
 'trfm_blocks.0.ff.2.bias': (128,),
 'trfm_blocks.0.ff.2.weight': (128, 512),
 'trfm_blocks.0.ff.0.bias': (512,),
 'trfm_blocks.0.ff.0.weight': (512, 128),
 'trfm_blocks.0.norm1.bias': (128,),
 'trfm_blocks.0.norm1.weight': (128,),
 'trfm_blocks.0.mha.w_o.0.bias': (128,),
 'trfm_blocks.0.mha.w_o.0.weight': (128, 128),
 'trfm_blocks.0.mha.attentions.0.tovalues.bias': (128,),
 'trfm_blocks.0.mha.attentions.0.tovalues.weight': (128, 128),
 'trfm_blocks.0.mha.attentions.0.tokeys.bias': (128,),
 'trfm_blocks.0.mha.attentions.0.tokeys.weight': (128, 128),
 'trfm_blocks.0.mha.attentions.0.toqueries.bias': (128,),
 'trfm_blocks.0.mha.attentions.0.toqueries.weight': (128, 128),
 'pos_embedding.weight': (512, 128),
 'token_embedding.weight': (50000, 128)}

In [88]:
# Display the parameter-name -> dtype mapping.
dtype_dict

{'toprobs.bias': 'float32',
 'toprobs.weight': 'float32',
 'trfm_blocks.0.norm2.bias': 'float32',
 'trfm_blocks.0.norm2.weight': 'float32',
 'trfm_blocks.0.ff.2.bias': 'float32',
 'trfm_blocks.0.ff.2.weight': 'float32',
 'trfm_blocks.0.ff.0.bias': 'float32',
 'trfm_blocks.0.ff.0.weight': 'float32',
 'trfm_blocks.0.norm1.bias': 'float32',
 'trfm_blocks.0.norm1.weight': 'float32',
 'trfm_blocks.0.mha.w_o.0.bias': 'float32',
 'trfm_blocks.0.mha.w_o.0.weight': 'float32',
 'trfm_blocks.0.mha.attentions.0.tovalues.bias': 'float32',
 'trfm_blocks.0.mha.attentions.0.tovalues.weight': 'float32',
 'trfm_blocks.0.mha.attentions.0.tokeys.bias': 'float32',
 'trfm_blocks.0.mha.attentions.0.tokeys.weight': 'float32',
 'trfm_blocks.0.mha.attentions.0.toqueries.bias': 'float32',
 'trfm_blocks.0.mha.attentions.0.toqueries.weight': 'float32',
 'pos_embedding.weight': 'float32',
 'token_embedding.weight': 'float32'}

In [54]:
# Inspect the module's "main" function (its repr lists the parameter variables and their types).
mod["main"]

FunctionNode([Var(token_embedding.weight, ty=TensorType([50000, 128], float32)), Var(input0, ty=TensorType([4, 498], int64)), Var(pos_embedding.weight, ty=TensorType([512, 128], float32)), Var(trfm_blocks.0.mha.attentions.0.toqueries.weight, ty=TensorType([128, 128], float32)), Var(trfm_blocks.0.mha.attentions.0.toqueries.bias, ty=TensorType([128], float32)), Var(trfm_blocks.0.mha.attentions.0.tokeys.weight, ty=TensorType([128, 128], float32)), Var(trfm_blocks.0.mha.attentions.0.tokeys.bias, ty=TensorType([128], float32)), Var(trfm_blocks.0.mha.attentions.0.tovalues.weight, ty=TensorType([128, 128], float32)), Var(trfm_blocks.0.mha.attentions.0.tovalues.bias, ty=TensorType([128], float32)), Var(trfm_blocks.0.mha.w_o.0.weight, ty=TensorType([128, 128], float32)), Var(trfm_blocks.0.mha.w_o.0.bias, ty=TensorType([128], float32)), Var(trfm_blocks.0.norm1.weight, ty=TensorType([128], float32)), Var(trfm_blocks.0.norm1.bias, ty=TensorType([128], float32)), Var(trfm_blocks.0.ff.0.weight, ty=T

In [55]:
# Print the whole Relay module as text (long output).
print(mod)

type tensor_int64_t {
  tensor_nil_int64,
  tensor0_int64(int64),
  tensor1_int64(Tensor[(?), int64]),
  tensor2_int64(Tensor[(?, ?), int64]),
  tensor3_int64(Tensor[(?, ?, ?), int64]),
  tensor4_int64(Tensor[(?, ?, ?, ?), int64]),
  tensor5_int64(Tensor[(?, ?, ?, ?, ?), int64]),
  tensor6_int64(Tensor[(?, ?, ?, ?, ?, ?), int64]),
}

type Option[A] {
  Some(A),
  None,
}

type tensor_float32_t {
  tensor_nil_float32,
  tensor0_float32(float32),
  tensor1_float32(Tensor[(?), float32]),
  tensor2_float32(Tensor[(?, ?), float32]),
  tensor3_float32(Tensor[(?, ?, ?), float32]),
  tensor4_float32(Tensor[(?, ?, ?, ?), float32]),
  tensor5_float32(Tensor[(?, ?, ?, ?, ?), float32]),
  tensor6_float32(Tensor[(?, ?, ?, ?, ?, ?), float32]),
}

type List[A] {
  Cons(A, List[A]),
  Nil,
}

type Tree[A] {
  Rose(A, List[Tree[A]]),
}

type tensor_int16_t {
  tensor_nil_int16,
  tensor0_int16(int16),
  tensor1_int16(Tensor[(?), int16]),
  tensor2_int16(Tensor[(?, ?), int16]),
  tensor3_int16(Tensor[(?

In [89]:
# Operator names intended as the start/stop markers for VTA graph packing.
# NOTE(review): only the commented-out graph_pack call in the next cell references these.
start_name="nn.batch_matmul"
end_name="add"

In [90]:
    # Quantize the Relay module when targeting VTA; otherwise take "main" unchanged.
    # NOTE(review): on the VTA path `mod` is overwritten by its quantized version, so
    # re-running this cell quantizes an already-quantized module.
    # NOTE(review): graph_pack is commented out, so `relay_prog` is only assigned on the
    # non-VTA path; presumably this is related to the all-zero VTA counters reported
    # later in the notebook — confirm.
    if target.device_name == "vta":
        # Perform quantization in Relay
        # Note: We set opt_level to 3 in order to fold batch norm
        with tvm.transform.PassContext(opt_level=3):
#             with relay.quantize.qconfig(global_scale=8.0, skip_conv_layers=[0], target_vta=True):
            with relay.quantize.qconfig(global_scale=8.0, skip_conv_layers=[0]):
                mod = relay.quantize.quantize(mod, params=params)
            # Perform graph packing and constant folding for VTA target
#             assert env.BLOCK_IN == env.BLOCK_OUT
#             relay_prog = graph_pack(
#                 mod["main"],
#                 env.BATCH,
#                 env.BLOCK_OUT,
#                 env.WGT_WIDTH,
#                 start_name=start_name,
#                 stop_name=end_name,

#             )
    else:
        relay_prog = mod["main"]
        


In [58]:
# Print the module after the quantization cell (the output now starts at @main).
print(mod)

def @main(%input0: Tensor[(4, 498), int64]) -> Tensor[(4, 2), float32] {
  %0 = cast(%input0, dtype="int32") /* ty=Tensor[(4, 498), int32] */;
  %1 = take(meta[relay.Constant][0] /* ty=Tensor[(50000, 128), float32] */, %0, axis=0) /* ty=Tensor[(4, 498, 128), float32] */;
  %2 = add(%1, meta[relay.Constant][1] /* ty=Tensor[(4, 498, 128), float32] */) /* ty=Tensor[(4, 498, 128), float32] */;
  %3 = zeros_like(%2) /* ty=Tensor[(4, 498, 128), float32] */;
  %4 = reshape(%2, newshape=[-1, 498, 128]) /* ty=Tensor[(4, 498, 128), float32] */;
  %5 = nn.batch_matmul(%4, meta[relay.Constant][2] /* ty=Tensor[(4, 128, 128), float32] */) /* ty=Tensor[(4, 498, 128), float32] */;
  %6 = reshape(%5, newshape=[4, 498, 128]) /* ty=Tensor[(4, 498, 128), float32] */;
  %7 = add(%6, meta[relay.Constant][3] /* ty=Tensor[(128), float32] */) /* ty=Tensor[(4, 498, 128), float32] */;
  %8 = reshape(%7, newshape=[-1, 498, 128]) /* ty=Tensor[(4, 498, 128), float32] */;
  %9 = reshape(%2, newshape=[-1, 498, 128]) 

In [91]:
# Compile Relay program with AlterOpLayout disabled.
# Both targets keep the single object returned by relay.build in `lib`: the cells
# below call lib.export_library(...) and lib["default"](ctx) on it. The CPU path
# previously unpacked the result into (graph, lib, params), which left `lib` bound
# to a different kind of object than on the VTA path and overwrote `params`.
if target.device_name == "vta":
    build_ctx = vta.build_config(opt_level=3, disabled_pass={"AlterOpLayout"})
else:
    build_ctx = tvm.transform.PassContext(opt_level=3, disabled_pass={"AlterOpLayout"})

with build_ctx:
    lib = relay.build(mod, target=target, params=params, target_host=env.target_host)

"-target" is deprecated, use "-mtriple" instead.
"-target" is deprecated, use "-mtriple" instead.
"-target" is deprecated, use "-mtriple" instead.
"-target" is deprecated, use "-mtriple" instead.
"-target" is deprecated, use "-mtriple" instead.
"-target" is deprecated, use "-mtriple" instead.


In [92]:
    # Send the inference library over to the remote RPC server
    # NOTE(review): this import rebinds the name `util`, which an earlier cell bound to
    # the project's own `util` module.
    from tvm.contrib import util
    temp = util.tempdir()
    lib.export_library(temp.relpath("graphlib.tar"))
    remote.upload(temp.relpath("graphlib.tar"))
    # `lib` is rebound here from the relay.build result to the module loaded through
    # `remote`; every later cell that uses `lib` sees the remote module.
    lib = remote.load_module("graphlib.tar")

    # Graph runtime
    m = graph_runtime.GraphModule(lib["default"](ctx))

In [93]:
# Show which module `lib` currently refers to.
print(lib)

Module(rpc, 7fbcae67a068)


## Perform the inference

In [95]:
# Feed the same token ids that were used for tracing, under the Relay input name.
m.set_input(input_name, tvm.nd.array(input_data))
# Execute one forward pass.
m.run()
# Fetch the first (index 0) output of the graph.
tvm_output = m.get_output(0)

In [96]:
# Display the TVM result.
tvm_output

<tvm.nd.NDArray shape=(4, 2), remote[2]:ext_dev(0)>
array([[-0.7253047 , -0.66199154],
       [-0.7016208 , -0.68474466],
       [-0.73683256, -0.65129066],
       [-0.7204744 , -0.666547  ]], dtype=float32)

In [97]:
# Display the token ids that were fed to the model.
input_data

tensor([[47722, 37967, 44802,  ...,  1658,  7251, 32717],
        [43559, 46033, 12556,  ..., 15122, 49643,  8711],
        [31860, 20540, 38504,  ..., 49632, 45607, 46800],
        [21973,  5912, 45115,  ..., 26376, 42772, 39109]])

In [98]:
# Check the Python type of the input.
type(input_data)

torch.Tensor

In [99]:
# Save the input token ids used for this run.
# NOTE(review): the file name says "vat" while the output file saved below says "vta" —
# possibly a typo; confirm nothing reads this path before renaming.
INPUT_DATA_PATH='saved_model/input_data_vat.pt'
torch.save(input_data, INPUT_DATA_PATH)

In [100]:
# Convert the TVM NDArray result to a NumPy array.
np_output = tvm_output.asnumpy()

In [101]:
# Wrap the NumPy result as a torch tensor.
torch_output = torch.from_numpy(np_output)

In [102]:
# Display the result as a torch tensor.
torch_output

tensor([[-0.7253, -0.6620],
        [-0.7016, -0.6847],
        [-0.7368, -0.6513],
        [-0.7205, -0.6665]])

In [103]:
# Save the TVM output (as a torch tensor) alongside the saved input.
OUTPUT_DATA_PATH='saved_model/output_data_vta.pt'
torch.save(torch_output, OUTPUT_DATA_PATH)

### Checking VTA operations

It seems that no operations are sent to VTA.

In [104]:
# Graph runtime: create the graph module again from `lib` for the timing run below.
m = graph_runtime.GraphModule(lib["default"](ctx))

In [105]:
# The re-created module needs its input set again before it is timed.
m.set_input(input_name, tvm.nd.array(input_data))

In [106]:
# Perform inference and gather execution statistics
# More on: :py:method:`tvm.runtime.Module.time_evaluator`
# Only the timer is built here; it is invoked (timer()) in the next cell.
num = 4  # number of times we run module for a single measurement
rep = 3  # number of measurements (we derive std dev from this)
timer = m.module.time_evaluator("run", ctx, number=num, repeat=rep)


In [107]:
# Run the timer and report either simulator counters (sim/tsim) or wall-clock
# latency (real hardware).
if env.TARGET in ["sim", "tsim"]:
    simulator.clear_stats()
    timer()
    sim_stats = simulator.stats()
    print("\nExecution statistics:")
    for k, v in sim_stats.items():
        # Since we execute the workload many times, we need to normalize stats
        # Note that there is always one warm up run
        # Therefore we divide the overall stats by (num * rep + 1)
        print("\t{:<16}: {:>16}".format(k, v // (num * rep + 1)))
else:
    tcost = timer()
    std = np.std(tcost.results) * 1000
    mean = tcost.mean * 1000
    # The timed input is the traced batch, so the sample count is its leading
    # dimension (input_shape[0]), not the VTA hardware batch env.BATCH.
    n_samples = input_shape[0]
    print("\nPerformed inference in %.2fms (std = %.2f) for %d samples" % (mean, std, n_samples))
    print("Average per sample inference time: %.2fms" % (mean / n_samples))



Execution statistics:
	inp_load_nbytes :                0
	wgt_load_nbytes :                0
	acc_load_nbytes :                0
	uop_load_nbytes :                0
	out_store_nbytes:                0
	gemm_counter    :                0
	alu_counter     :                0


## Scheduling the computation (WIP)

In [119]:
# WIP. NOTE(review): this call fails — m.get_output(0) is a runtime NDArray, while the
# error below reports that te.create_schedule expects Array[Operation].
s = te.create_schedule(m.get_output(0))

TVMError: Traceback (most recent call last):
  [bt] (6) 7   ???                                 0x00007ffee123ab30 0x0 + 140732675631920
  [bt] (5) 6   libffi.7.dylib                      0x000000010f56bead ffi_call_unix64 + 85
  [bt] (4) 5   libtvm.dylib                        0x00000001251d6bb6 TVMFuncCall + 70
  [bt] (3) 4   libtvm.dylib                        0x00000001248a4151 void tvm::runtime::TypedPackedFunc<tvm::te::Schedule (tvm::runtime::Array<tvm::te::Operation, void>)>::AssignTypedLambda<tvm::te::Schedule (*)(tvm::runtime::Array<tvm::te::Operation, void>)>(tvm::te::Schedule (*)(tvm::runtime::Array<tvm::te::Operation, void>))::'lambda'(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*)::operator()(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*) const + 337
  [bt] (2) 3   libtvm.dylib                        0x0000000124861c6c tvm::runtime::TVMMovableArgValue_::operator tvm::runtime::Array<tvm::te::Operation, void><tvm::runtime::Array<tvm::te::Operation, void>, void>() const + 172
  [bt] (1) 2   libtvm.dylib                        0x0000000124862160 tvm::runtime::Array<tvm::te::Operation, void> tvm::runtime::TVMPODValue_::AsObjectRef<tvm::runtime::Array<tvm::te::Operation, void> >() const + 1104
  [bt] (0) 1   libtvm.dylib                        0x00000001244af50f dmlc::LogMessageFatal::~LogMessageFatal() + 111
  File "/Users/isong/Downloads/ml/sc/xilinx/Github/tvm/include/tvm/runtime/packed_func.h", line 1391
TVMError: Check failed: ObjectTypeChecker<TObjectRef>: :Check(ptr): Expect Array[Operation] but get Array

## TVM compilation (WIP)

In [None]:
# build GEMM VTA kernel
# NOTE(review): placeholder — vta.build() is called without arguments and this cell
# has never been executed (In [None]).
my_gemm = vta.build()

## Generate code (WIP)

In [33]:
#mhost = tvm.build(mod, target=target)
# Print the Relay module text with show_meta_data=True.
# NOTE(review): the text is large enough to trip the notebook's IOPub data-rate limit (see output).
print(mod.astext(show_meta_data=True))

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



## Save and Load Compiled Module

In [71]:
# Export `lib` to a single tar archive at LIB_PATH.
# NOTE(review): `lib` was rebound to the module returned by remote.load_module in the
# upload cell (print(lib) shows "Module(rpc, ...)"), which is presumably why this export
# fails with "tar: no files or directories specified". Exporting the object returned by
# relay.build, before it is rebound, is likely what is needed — confirm.
from tvm.contrib import util

LIB_PATH='saved_model/deploy_lib_vta.tar'
lib.export_library(LIB_PATH)


RuntimeError: Tar error:
tar: no files or directories specified


In [None]:
# Display the export path.
LIB_PATH 

In [72]:
# load the module back.
# NOTE(review): this fails in the recorded run because the export cell above never
# produced LIB_PATH (see the "Failed to open" error below).
loaded_lib = tvm.runtime.load_module(LIB_PATH)


# Repeat the inference steps with the reloaded library; this rebinds `m` and `tvm_output`.
m = graph_runtime.GraphModule(loaded_lib["default"](ctx))
# Set inputs
m.set_input(input_name, tvm.nd.array(input_data))
# Execute
m.run()
# Get outputs
tvm_output = m.get_output(0)


RuntimeError: Tar error:
tar: Error opening archive: Failed to open 'saved_model/deploy_lib_vta.tar'


In [None]:
# Display the output produced with the reloaded library.
tvm_output