In [None]:
"""
Taken directly from https://tvm.apache.org/docs/how_to/tutorials/e2e_opt_model.html
Model Type: CNN
Model Definition: PyTorch
Model Export: torch.export
Model Ingestion: tvm.relax.frontend.torch.from_exported_program
Target: CUDA
Compile and Test Result: FAIL: Did you forget to bind?
"""

'\nTaken directly from https://tvm.apache.org/docs/how_to/tutorials/e2e_opt_model.html\nModel Type: CNN\nModel Definition: PyTorch\nModel Export: torch.export\nModel Ingestion: tvm.relax.frontend.torch.from_exported_program\nTarget: CUDA\nCompile and Test Result: FAIL:\n'

In [2]:
import sys
import os
import torch

# Add TVM path
os.environ['PYTHONPATH'] = "/ssd1/htalendr/tvm/python:" + os.environ.get('PYTHONPATH', '')

# Verify it's set
print(os.environ['PYTHONPATH'])

# Reload sys.path
sys.path.append("/ssd1/htalendr/tvm/python")

# Test import
import tvm
from tvm import relax
print("TVM successfully imported!")


/ssd1/htalendr/tvm/python:
TVM successfully imported!




# End-to-End Optimize Model
This tutorial demonstrates how to optimize a machine learning model using Apache TVM. We will
use a pre-trained ResNet-18 model from PyTorch and end-to-end optimize it using TVM's Relax API.
Please note that default end-to-end optimization may not suit complex models.


## Preparation
First, we prepare the model and input information. We use a pre-trained ResNet-18 model from
PyTorch.



In [3]:
import os
import numpy as np
import torch
from torch import nn
from torch.export import export
from torchvision.models.resnet import ResNet18_Weights, resnet18

class TorchModel(nn.Module):
    def __init__(self):
        super(TorchModel, self).__init__()
        self.fc1 = nn.Linear(784, 256)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(256, 10)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        return x

torch_model = TorchModel().eval()


## Review Overall Flow
The overall flow consists of the following steps:

- **Construct or Import a Model**: Construct a neural network model or import a pre-trained
  model from other frameworks (e.g. PyTorch, ONNX), and create the TVM IRModule, which contains
  all the information needed for compilation, including high-level Relax functions for
  computational graph, and low-level TensorIR functions for tensor program.
- **Perform Composable Optimizations**: Perform a series of optimization transformations,
  such as graph optimizations, tensor program optimizations, and library dispatching.
- **Build and Universal Deployment**: Build the optimized model to a deployable module to the
  universal runtime, and execute it on different devices, such as CPU, GPU, or other accelerators.




### Convert the model to IRModule
Next step, we convert the model to an IRModule using the Relax frontend for PyTorch for further
optimization.



In [4]:
import tvm
from tvm import relax
from tvm.relax.frontend.torch import from_exported_program

# Give an example argument to torch.export
example_args = (torch.randn(10, 784, dtype=torch.float32),)

# Convert the model to IRModule
with torch.no_grad():
    exported_program = export(torch_model, example_args)
    mod = from_exported_program(exported_program, keep_params_as_input=True)

mod, params = relax.frontend.detach_params(mod)
mod.show()

## IRModule Optimization
Apache TVM Unity provides a flexible way to optimize the IRModule. Everything centered
around IRModule optimization can be composed with existing pipelines. Note that each
transformation can be combined as an optimization pipeline via ``tvm.ir.transform.Sequential``.

In this tutorial, we focus on the end-to-end optimization of the model via auto-tuning. We
leverage MetaSchedule to tune the model and store the tuning logs to the database. We also
apply the database to the model to get the best performance.




In [5]:
TOTAL_TRIALS = 2  # Change to 20000 for better performance if needed
target = tvm.target.Target("nvidia/geforce-rtx-3090-ti")  # Change to your target device
work_dir = "tuning_logs"

# Skip running in CI environment
IS_IN_CI = os.getenv("CI", "") == "true"
if not IS_IN_CI:
    mod = relax.get_pipeline("static_shape_tuning", target=target, total_trials=TOTAL_TRIALS)(mod)

    # Only show the main function
    mod["main"].show()

2025-02-10 10:10:21 [INFO] Logging directory: tuning_logs/logs


2025-02-10 10:10:35 [INFO] LocalBuilder: max_workers = 32
2025-02-10 10:10:36 [INFO] LocalRunner: max_workers = 1
2025-02-10 10:10:37 [INFO] [task_scheduler.cc:159] Initializing Task #0: "fused_matmul1_add1"
2025-02-10 10:10:37 [INFO] [task_scheduler.cc:159] Initializing Task #1: "transpose1"
2025-02-10 10:10:37 [INFO] [task_scheduler.cc:159] Initializing Task #2: "fused_matmul_add_relu"
2025-02-10 10:10:37 [INFO] [task_scheduler.cc:159] Initializing Task #3: "transpose"


Unnamed: 0,Name,FLOP,Weight,Speed (GFLOPS),Latency (us),Weighted Latency (us),Trials,Done
0,fused_matmul1_add1,51300,1,,,,0,
1,transpose1,1,1,,,,0,
2,fused_matmul_add_relu,4019200,1,,,,0,
3,transpose,1,1,,,,0,


2025-02-10 10:10:37 [DEBUG] [task_scheduler.cc:318] 
 ID |                  Name |    FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done 
-----------------------------------------------------------------------------------------------------------------------
  0 |    fused_matmul1_add1 |   51300 |      1 |            N/A |          N/A |                   N/A |      0 |      
  1 |            transpose1 |       1 |      1 |            N/A |          N/A |                   N/A |      0 |      
  2 | fused_matmul_add_relu | 4019200 |      1 |            N/A |          N/A |                   N/A |      0 |      
  3 |             transpose |       1 |      1 |            N/A |          N/A |                   N/A |      0 |      
-----------------------------------------------------------------------------------------------------------------------
Total trials: 0
Total latency (us): 0


Total trials: 0
Total latency (us): 0

2025-02-10 10:10:37 [INFO] [

Unnamed: 0,Name,FLOP,Weight,Speed (GFLOPS),Latency (us),Weighted Latency (us),Trials,Done
0,fused_matmul1_add1,51300,1,17.5788,2.9183,2.9183,2,
1,transpose1,1,1,,,,0,
2,fused_matmul_add_relu,4019200,1,,,,0,
3,transpose,1,1,,,,0,



Total trials: 2
Total latency (us): 2.91828

2025-02-10 10:10:48 [DEBUG] [task_scheduler.cc:318] 
 ID |                  Name |    FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done 
-----------------------------------------------------------------------------------------------------------------------
  0 |    fused_matmul1_add1 |   51300 |      1 |        17.5788 |       2.9183 |                2.9183 |      2 |      
  1 |            transpose1 |       1 |      1 |            N/A |          N/A |                   N/A |      0 |      
  2 | fused_matmul_add_relu | 4019200 |      1 |            N/A |          N/A |                   N/A |      0 |      
  3 |             transpose |       1 |      1 |            N/A |          N/A |                   N/A |      0 |      
-----------------------------------------------------------------------------------------------------------------------
Total trials: 2
Total latency (us): 2.91828

2025-02-10 10:10

Unnamed: 0,Name,FLOP,Weight,Speed (GFLOPS),Latency (us),Weighted Latency (us),Trials,Done
0,fused_matmul1_add1,51300,1,17.5788,2.9183,2.9183,2,Y
1,transpose1,1,1,,,,0,
2,fused_matmul_add_relu,4019200,1,,,,0,
3,transpose,1,1,,,,0,


2025-02-10 10:10:49 [DEBUG] [task_scheduler.cc:318] 
 ID |                  Name |    FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done 
-----------------------------------------------------------------------------------------------------------------------
  0 |    fused_matmul1_add1 |   51300 |      1 |        17.5788 |       2.9183 |                2.9183 |      2 |    Y 
  1 |            transpose1 |       1 |      1 |            N/A |          N/A |                   N/A |      0 |      
  2 | fused_matmul_add_relu | 4019200 |      1 |            N/A |          N/A |                   N/A |      0 |      
  3 |             transpose |       1 |      1 |            N/A |          N/A |                   N/A |      0 |      
-----------------------------------------------------------------------------------------------------------------------
Total trials: 2
Total latency (us): 2.91828


Total trials: 2
Total latency (us): 2.91828

2025-02-10 10:10

Unnamed: 0,Name,FLOP,Weight,Speed (GFLOPS),Latency (us),Weighted Latency (us),Trials,Done
0,fused_matmul1_add1,51300,1,17.5788,2.9183,2.9183,2,Y
1,transpose1,1,1,,,,0,Y
2,fused_matmul_add_relu,4019200,1,,,,0,
3,transpose,1,1,,,,0,



Total trials: 2
Total latency (us): 2.91828

2025-02-10 10:10:49 [DEBUG] [task_scheduler.cc:318] 
 ID |                  Name |    FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done 
-----------------------------------------------------------------------------------------------------------------------
  0 |    fused_matmul1_add1 |   51300 |      1 |        17.5788 |       2.9183 |                2.9183 |      2 |    Y 
  1 |            transpose1 |       1 |      1 |            N/A |          N/A |                   N/A |      0 |    Y 
  2 | fused_matmul_add_relu | 4019200 |      1 |            N/A |          N/A |                   N/A |      0 |      
  3 |             transpose |       1 |      1 |            N/A |          N/A |                   N/A |      0 |      
-----------------------------------------------------------------------------------------------------------------------
Total trials: 2
Total latency (us): 2.91828

2025-02-10 10:10

Unnamed: 0,Name,FLOP,Weight,Speed (GFLOPS),Latency (us),Weighted Latency (us),Trials,Done
0,fused_matmul1_add1,51300,1,17.5788,2.9183,2.9183,2,Y
1,transpose1,1,1,,,,0,Y
2,fused_matmul_add_relu,4019200,1,,,,0,Y
3,transpose,1,1,,,,0,


2025-02-10 10:10:49 [DEBUG] [task_scheduler.cc:318] 
 ID |                  Name |    FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done 
-----------------------------------------------------------------------------------------------------------------------
  0 |    fused_matmul1_add1 |   51300 |      1 |        17.5788 |       2.9183 |                2.9183 |      2 |    Y 
  1 |            transpose1 |       1 |      1 |            N/A |          N/A |                   N/A |      0 |    Y 
  2 | fused_matmul_add_relu | 4019200 |      1 |            N/A |          N/A |                   N/A |      0 |    Y 
  3 |             transpose |       1 |      1 |            N/A |          N/A |                   N/A |      0 |      
-----------------------------------------------------------------------------------------------------------------------
Total trials: 2
Total latency (us): 2.91828


Total trials: 2
Total latency (us): 2.91828

2025-02-10 10:10

Unnamed: 0,Name,FLOP,Weight,Speed (GFLOPS),Latency (us),Weighted Latency (us),Trials,Done
0,fused_matmul1_add1,51300,1,17.5788,2.9183,2.9183,2,Y
1,transpose1,1,1,,,,0,Y
2,fused_matmul_add_relu,4019200,1,,,,0,Y
3,transpose,1,1,,,,0,Y



Total trials: 2
Total latency (us): 2.91828

2025-02-10 10:10:49 [DEBUG] [task_scheduler.cc:318] 
 ID |                  Name |    FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done 
-----------------------------------------------------------------------------------------------------------------------
  0 |    fused_matmul1_add1 |   51300 |      1 |        17.5788 |       2.9183 |                2.9183 |      2 |    Y 
  1 |            transpose1 |       1 |      1 |            N/A |          N/A |                   N/A |      0 |    Y 
  2 | fused_matmul_add_relu | 4019200 |      1 |            N/A |          N/A |                   N/A |      0 |    Y 
  3 |             transpose |       1 |      1 |            N/A |          N/A |                   N/A |      0 |    Y 
-----------------------------------------------------------------------------------------------------------------------
Total trials: 2
Total latency (us): 2.91828





## Build and Deploy
Finally, we build the optimized model and deploy it to the target device.
We skip this step in the CI environment.



In [6]:
if not IS_IN_CI:
    ex = relax.build(mod, target="cuda")
    dev = tvm.device("cuda", 0)
    vm = relax.VirtualMachine(ex, dev)
    # Need to allocate data and params on GPU device
    gpu_data = tvm.nd.array(np.random.rand(1, 3, 224, 224).astype("float32"), dev)
    gpu_params = [tvm.nd.array(p, dev) for p in params["main"]]
    gpu_out = vm["main"](gpu_data, *gpu_params).numpy()

    print(gpu_out.shape)

TVMError: Traceback (most recent call last):
  4: operator()
        at /ssd1/htalendr/tvm/src/driver/driver_api.cc:531
  3: tvm::TIRToRuntime(tvm::runtime::Map<tvm::Target, tvm::IRModule, void, void> const&, tvm::Target const&)
        at /ssd1/htalendr/tvm/src/driver/driver_api.cc:492
  2: tvm::SplitMixedModule(tvm::IRModule, tvm::Target const&, tvm::Target const&)
        at /ssd1/htalendr/tvm/src/driver/driver_api.cc:418
  1: tvm::ApplyPasses(tvm::IRModule, tvm::transform::Sequential)
        at /ssd1/htalendr/tvm/src/driver/driver_api.cc:291
  0: operator()
        at /ssd1/htalendr/tvm/src/tir/analysis/verify_memory.cc:205
  Did you forget to bind?
    Variable `p_fc2_weight` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
    Variable `T_transpose` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
  File "/ssd1/htalendr/tvm/src/tir/analysis/verify_memory.cc", line 205
RuntimeError: Memory verification failed with the following errors:
# from tvm.script import tir as T

@T.prim_func
def transpose1(p_fc2_weight: T.Buffer((T.int64(10), T.int64(256)), "float32"), T_transpose: T.Buffer((T.int64(256), T.int64(10)), "float32")):
    T.func_attr({"op_pattern": 2, "target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mtriple": "x86_64-conda-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "max_num_threads": 1024, "tag": "", "thread_warp_size": 32}), "tir.noalias": T.bool(True)})
    for ax0, ax1 in T.grid(256, 10):
        T_transpose_1 = T.Buffer((T.int64(2560),), data=T_transpose.data)
        p_fc2_weight_1 = T.Buffer((T.int64(2560),), data=p_fc2_weight.data)
        T_transpose_1[ax0 * 10 + ax1] = p_fc2_weight_1[ax1 * 256 + ax0]