In [1]:
import tvm

In [2]:
from tvm import relay

In [3]:
import scipy.sparse as sp
import numpy as np

In [28]:
with open("./prunebert.params", "rb") as fi:
    param = relay.load_param_dict(fi.read())

In [29]:
with open("./prunebert.json") as fi:
    func = tvm.ir.load_json(fi.read())["main"]

In [30]:
new_f, new_param = relay.data_dep_optimization.simplify_fc_transpose.convert(func, param)

In [7]:
input_name = "input_ids"
input_shape = (1, 128)
input_dtype = "int64"

In [8]:
inputs = {input_name : tvm.nd.array(np.random.randint(0, 10000, size=input_shape).astype(input_dtype))}

In [9]:
def time_running(mod, params, inputs):
    with relay.build_config(opt_level=3):
        graph, lib, new_params = relay.build(mod, "llvm -mcpu=core-avx2", params=params)

    from tvm.contrib import graph_runtime
    #from tvm.contrib.debugger import debug_runtime as graph_runtime
    ctx = tvm.cpu(0)
    dtype = 'float32'
    m = graph_runtime.create(graph, lib, ctx)
    # set inputs
    m.set_input(**inputs)
    m.set_input(**new_params)
    # execute
    m.run()
    # get outputs
    tvm_output = m.get_output(0)
    ftimer = m.module.time_evaluator("run", tvm.cpu(), min_repeat_ms=1000, repeat=20)
    args = [v for _, v in inputs.items()]
    results = ftimer(*args)
    return results, tvm_output.asnumpy()

In [10]:
dense_t_time, dense_t_output = time_running(new_f, new_param, inputs)
print("Dense time: {tt} ms".format(tt=dense_t_time.mean * 10 ** 3))

Cannot find config for target=llvm -mcpu=core-avx2, workload=('dense_nopack.x86', ('TENSOR', (1, 768), 'float32'), ('TENSOR', (2, 768), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression.
Cannot find config for target=llvm -mcpu=core-avx2, workload=('dense_nopack.x86', ('TENSOR', (1, 768), 'float32'), ('TENSOR', (768, 768), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression.
Cannot find config for target=llvm -mcpu=core-avx2, workload=('dense_nopack.x86', ('TENSOR', (128, 3072), 'float32'), ('TENSOR', (768, 3072), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression.
Cannot find config for target=llvm -mcpu=core-avx2, workload=('dense_pack.x86', ('TENSOR', (128, 3072), 'float32'), ('TENSOR', (768, 3072), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression.
Cann

In [11]:
blocksize = (16, 1)

In [16]:
bsr_expr, bsr_params = relay.data_dep_optimization.bsr_dense.convert(new_f,
                                                                     new_param,
                                                                     blocksize,
                                                                     sparsity_threshold=0.8)

KeyError: '1852.T'

In [13]:
bsr_time, bsr_output = time_running(bsr_expr, bsr_params, inputs)
print("BSR {bs} time: {tt} ms".format(bs=blocksize, tt=bsr_time.mean * 10 ** 3))

BSR (16, 1) time: 131.3992558 ms


In [26]:
blocksize = (2, 1)
bsr_expr, bsr_params = relay.data_dep_optimization.bsr_dense.convert(new_f,
                                                                     new_param,
                                                                     blocksize,
                                                                     sparsity_threshold=0.8)
bsr_time, bsr_output = time_running(bsr_expr, bsr_params, inputs)
print("BSR {bs} time: {tt} ms".format(bs=blocksize, tt=bsr_time.mean * 10 ** 3))

BSR (2, 1) time: 69.3001828625 ms


In [22]:
blocksize = (32, 1)
bsr_expr, bsr_params = relay.data_dep_optimization.bsr_dense.convert(new_f,
                                                                     new_param,
                                                                     blocksize,
                                                                     sparsity_threshold=0.8)
bsr_time, bsr_output = time_running(bsr_expr, bsr_params, inputs)
print("BSR {bs} time: {tt} ms".format(bs=blocksize, tt=bsr_time.mean * 10 ** 3))

BSR (32, 1) time: 79.802192384375 ms


In [26]:
blocksize = (8, 1)
bsr_expr, bsr_params = relay.data_dep_optimization.bsr_dense.convert(new_f,
                                                                     new_param,
                                                                     blocksize,
                                                                     sparsity_threshold=0.8)
bsr_time, bsr_output = time_running(bsr_expr, bsr_params, inputs)
print("BSR {bs} time: {tt} ms".format(bs=blocksize, tt=bsr_time.mean * 10 ** 3))

BSR (8, 1) time: 84.4655857375 ms


In [30]:
blocksize = (1, 16)
bsr_expr, bsr_params = relay.data_dep_optimization.bsr_dense.convert(new_f,
                                                                     new_param,
                                                                     blocksize,
                                                                     sparsity_threshold=0.8)
bsr_time, bsr_output = time_running(bsr_expr, bsr_params, inputs)
print("BSR {bs} time: {tt} ms".format(bs=blocksize, tt=bsr_time.mean * 10 ** 3))

BSR (1, 16) time: 463.27865783 ms


In [31]:
blocksize = (1, 1)
bsr_expr, bsr_params = relay.data_dep_optimization.bsr_dense.convert(new_f,
                                                                     new_param,
                                                                     blocksize,
                                                                     sparsity_threshold=0.8)
bsr_time, bsr_output = time_running(bsr_expr, bsr_params, inputs)
print("BSR {bs} time: {tt} ms".format(bs=blocksize, tt=bsr_time.mean * 10 ** 3))

BSR (1, 1) time: 60.39342264411764 ms


In [38]:
blocksize = (4, 4)
bsr_expr, bsr_params = relay.data_dep_optimization.bsr_dense.convert(new_f,
                                                                     new_param,
                                                                     blocksize,
                                                                     sparsity_threshold=0.8)
bsr_time, bsr_output = time_running(bsr_expr, bsr_params, inputs)
print("BSR {bs} time: {tt} ms".format(bs=blocksize, tt=bsr_time.mean * 10 ** 3))

BSR (4, 4) time: 174.13454849000004 ms
