# How to write an AutoTVM template?

In this tutorial, we are going to show how to convert the schedule we just created into an AutoTVM template.

In [1]:
import sys
import numpy
import logging
import numpy as np
import tvm
from tvm import te
from tvm import autotvm

# Problem size: an (M, K) matrix multiplied by a (K, N) matrix.
# Feel free to experiment with other shapes; for some of them the code
# optimized by TVM outperforms numpy backed by MKL.
M, K, N = 1024, 1024, 1024

# Default tensor element type in TVM.
dtype = "float32"

# Compile for CPUs with the Intel AVX2 (Advanced Vector Extensions) SIMD ISA.
# For best performance, adjust -mcpu to the specific CPU you run on
# (core-avx2 is the setting used throughout this tutorial).
target = "llvm -mcpu=core-avx2"
ctx = tvm.context(target, 0)

The following code defines a manual schedule for matrix multiplication.

In [2]:
def matmul_v0(M, K, N):
    """Build a manually scheduled (M, K) x (K, N) matrix multiplication.

    The output axes are both split by a fixed factor of 32 and the reduction
    axis by a fixed factor of 4; the loops are then reordered, the innermost
    output axis is vectorized and the outermost one is parallelized.

    Returns the schedule together with the tensor list [A, B, C].
    """
    # Computation: C[y, x] = sum over k of A[y, k] * B[k, x]
    A = te.placeholder((M, K), name='A')
    B = te.placeholder((K, N), name='B')
    red = te.reduce_axis((0, K), 'k')
    C = te.compute(
        (M, N),
        lambda y, x: te.sum(A[y, red] * B[red, x], axis=red),
        name='C',
    )

    # Schedule: all primitives below act on the stage that computes C.
    s = te.create_schedule(C.op)
    stage = s[C]

    tile = 32  # tiling factor shared by both output axes
    row, col = stage.op.axis
    (red_axis,) = stage.op.reduce_axis
    row_outer, row_inner = stage.split(row, tile)
    col_outer, col_inner = stage.split(col, tile)
    red_outer, red_inner = stage.split(red_axis, 4)

    # Loop order: outer tiles first, then the loops inside a tile.
    stage.reorder(row_outer, col_outer, red_outer,
                  row_inner, red_inner, col_inner)
    stage.vectorize(col_inner)
    stage.parallel(row_outer)

    return s, [A, B, C]

In the previous schedule code, we use the constants 32 and 4 as tiling factors. However, they might not be the best choices for your hardware.

To solve this problem, we can convert the constants into tunable parameters and let AutoTVM learn which values work best.

In [3]:
# 1. use a decorator
@autotvm.template("example/matmul_v1")
def matmul_v1(M, K, N):
    """AutoTVM template for an (M, K) x (K, N) matrix multiplication.

    Same computation and loop structure as the manual schedule, but the
    three split factors are taken from the AutoTVM config instead of being
    hard-coded.

    Returns the schedule together with the tensor list [A, B, C].
    """
    # Computation: C[y, x] = sum over k of A[y, k] * B[k, x]
    A = te.placeholder((M, K), name='A')
    B = te.placeholder((K, N), name='B')
    red = te.reduce_axis((0, K), 'k')
    C = te.compute(
        (M, N),
        lambda y, x: te.sum(A[y, red] * B[red, x], axis=red),
        name='C',
    )

    # schedule
    s = te.create_schedule(C.op)
    stage = s[C]
    row, col = stage.op.axis
    (red_axis,) = stage.op.reduce_axis

    # 2. get the config object
    cfg = autotvm.get_config()

    # 3. define search space
    # Each knob splits one axis in two; for tile_x only inner factors
    # from the listed set are kept.
    cfg.define_split("tile_y", M, num_outputs=2)
    cfg.define_split(
        "tile_x", N, num_outputs=2,
        filter=lambda split: split.size[1] in (1, 2, 4, 8, 16),
    )
    cfg.define_split("tile_k", K, num_outputs=2)

    # 4. apply the config
    row_outer, row_inner = cfg["tile_y"].apply(s, C, row)
    col_outer, col_inner = cfg["tile_x"].apply(s, C, col)
    red_outer, red_inner = cfg["tile_k"].apply(s, C, red_axis)

    # 5. Finish the rest of schedule
    stage.reorder(row_outer, col_outer, red_outer,
                  row_inner, red_inner, col_inner)
    stage.vectorize(col_inner)
    stage.parallel(row_outer)

    return s, [A, B, C]

We then create a tuning task, and we can inspect the search space.

In [4]:
# Create the tuning task for the registered template. The `target` variable
# defined in the setup cell is reused so that the task is tuned for the same
# target as the rest of the notebook, instead of repeating the string literal.
task = autotvm.task.create("example/matmul_v1", args=(M, K, N), target=target)

# Show the tunable knobs and the total number of candidate configurations.
print(task.config_space)

ConfigSpace (len=605, space_map=
   0 tile_y: Split(policy=factors, product=1024, num_outputs=2) len=11
   1 tile_x: Split(policy=factors, product=1024, num_outputs=2) len=5
   2 tile_k: Split(policy=factors, product=1024, num_outputs=2) len=11
)


There are four different tuners predefined in AutoTVM: random, grid search, genetic algorithm, and XGBoost. Usually we use the XGBoost tuner to tune a task. In this tutorial, since the search space is small, we use the random tuner and run only 10 trials for demonstration.

In [None]:
# Logging setup: send AutoTVM's DEBUG-level tuning log to the screen (stdout).
_autotvm_logger = logging.getLogger('autotvm')
_autotvm_logger.setLevel(logging.DEBUG)
_autotvm_logger.addHandler(logging.StreamHandler(sys.stdout))

# Measuring a config consists of two steps: build and run.
# By default, all CPU cores are used to compile the programs, which are then
# measured sequentially. Each one is measured 5 times and the average is
# taken to reduce variance.
measure_option = autotvm.measure_option(
    runner=autotvm.LocalRunner(number=5),
    builder='local',
)

# Tune with RandomTuner and write the records to the file `matmul.log`.
# Alternatives such as XGBTuner can be used instead.
tuner = autotvm.tuner.RandomTuner(task)
tuner.tune(
    n_trial=10,
    measure_option=measure_option,
    callbacks=[autotvm.callback.log_to_file('matmul.log')],
)

Sample output:
```
No: 1	GFLOPS: 70.62/70.62	result: MeasureResult(costs=(0.030408550200000002,), error_no=0, all_cost=0.9211561679840088, timestamp=1587486930.823651)	[('tile_y', [-1, 32]), ('tile_x', [-1, 128]), ('tile_k', [-1, 8])],None,445
No: 2	GFLOPS: 29.38/70.62	result: MeasureResult(costs=(0.07308577720000001,), error_no=0, all_cost=1.4772090911865234, timestamp=1587486932.216256)	[('tile_y', [-1, 8]), ('tile_x', [-1, 16]), ('tile_k', [-1, 4])],None,289
No: 3	GFLOPS: 43.27/70.62	result: MeasureResult(costs=(0.0496339182,), error_no=0, all_cost=1.098529577255249, timestamp=1587486933.2311969)	[('tile_y', [-1, 64]), ('tile_x', [-1, 8]), ('tile_k', [-1, 8])],None,402
No: 4	GFLOPS: 0.00/70.62	result: MeasureResult(costs=(RuntimeError("Traceback (most recent call last):\n  [bt] (5) 6   ???                                 0x00007ffee33395f0 0x0 + 140732710229488\n  [bt] (4) 5   libffi.6.dylib                      0x000000010f1e4884 ffi_call_unix64 + 76\n  [bt] (3) 4   libtvm.dylib                        0x000000011d949988 TVMFuncCall + 72\n  [bt] (2) 3   libtvm.dylib                        0x000000011d985c0d std::__1::__function::__func<tvm::runtime::RPCModuleNode::WrapRemote(void*)::'lambda'(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*), std::__1::allocator<tvm::runtime::RPCModuleNode::WrapRemote(void*)::'lambda'(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)>, void (tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)>::operator()(tvm::runtime::TVMArgs&&, tvm::runtime::TVMRetValue*&&) + 93\n  [bt] (1) 2   libtvm.dylib                        0x000000011d98a856 tvm::runtime::RPCSession::CallFunc(void*, tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*, void* (*)(int, tvm::runtime::TVMArgValue const&), tvm::runtime::PackedFunc const*) + 310"),), error_no=4, all_cost=10.226943254470825, timestamp=1587486943.3877)	[('tile_y', [-1, 16]), ('tile_x', [-1, 1]), ('tile_k', [-1, 1])],None,4
No: 5	GFLOPS: 49.93/70.62	result: MeasureResult(costs=(0.0430116264,), error_no=0, all_cost=0.9452111721038818, timestamp=1587486947.725454)	[('tile_y', [-1, 8]), ('tile_x', [-1, 16]), ('tile_k', [-1, 16])],None,531
No: 6	GFLOPS: 21.08/70.62	result: MeasureResult(costs=(0.1018855638,), error_no=0, all_cost=1.9517230987548828, timestamp=1587486949.593319)	[('tile_y', [-1, 128]), ('tile_x', [-1, 4]), ('tile_k', [-1, 8])],None,392
No: 7	GFLOPS: 30.83/70.62	result: MeasureResult(costs=(0.069658234,), error_no=0, all_cost=4.6122565269470215, timestamp=1587486950.92193)	[('tile_y', [-1, 256]), ('tile_x', [-1, 1024]), ('tile_k', [-1, 32])],None,723
No: 8	GFLOPS: 17.86/70.62	result: MeasureResult(costs=(0.1202257842,), error_no=0, all_cost=2.2247397899627686, timestamp=1587486953.050207)	[('tile_y', [-1, 256]), ('tile_x', [-1, 64]), ('tile_k', [-1, 1])],None,74
No: 9	GFLOPS: 12.10/70.62	result: MeasureResult(costs=(0.17751151,), error_no=0, all_cost=2.986374616622925, timestamp=1587486956.2598379)	[('tile_y', [-1, 2]), ('tile_x', [-1, 8]), ('tile_k', [-1, 2])],None,155
No: 10	GFLOPS: 31.15/70.62	result: MeasureResult(costs=(0.068937509,), error_no=0, all_cost=1.3671948909759521, timestamp=1587486957.581886)	[('tile_y', [-1, 4]), ('tile_x', [-1, 256]), ('tile_k', [-1, 2])],None,211
```

After the tuning job finishes, we can apply history best from the log file and check its correctness.

In [5]:
# apply history best from log file
# Build under the same `target` the task was tuned for (defined in the setup
# cell) rather than plain "llvm", so the selected config is compiled with the
# same code generation options (e.g. AVX2) that were used while measuring it.
with autotvm.apply_history_best('matmul.log'):
    with tvm.target.create(target):
        s, arg_bufs = matmul_v1(M, K, N)
        func = tvm.build(s, arg_bufs)

# check correctness against numpy as the reference implementation
a_np = np.random.uniform(size=(M, K)).astype(np.float32)
b_np = np.random.uniform(size=(K, N)).astype(np.float32)
c_np = a_np.dot(b_np)

# Allocate the TVM arrays on `ctx`, the same device the evaluator below uses.
a_tvm = tvm.nd.array(a_np, ctx=ctx)
b_tvm = tvm.nd.array(b_np, ctx=ctx)
c_tvm = tvm.nd.empty(c_np.shape, ctx=ctx)
func(a_tvm, b_tvm, c_tvm)

tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)

# Time the tuned kernel (number=10) and report the mean of the measurement.
evaluator = func.time_evaluator(func.entry_name, ctx, number=10)
print('Opt: %f' % evaluator(a_tvm, b_tvm, c_tvm).mean)

Opt: 0.088083
