In [1]:
"""
Taken directly from https://tvm.apache.org/docs/how_to/tutorials/e2e_opt_model.html
Model Type: CNN
Model Definition: PyTorch
Model Export: torch.export
Model Ingestion: tvm.relax.frontend.torch.from_exported_program
Target: CUDA
Compile and Test Result: FAIL: Did you forget to bind?
    Variable `p_conv3_weight` is directly accessed by host memory
"""

'\nTaken directly from https://tvm.apache.org/docs/how_to/tutorials/e2e_opt_model.html\nModel Type: CNN\nModel Definition: PyTorch\nModel Export: torch.export\nModel Ingestion: tvm.relax.frontend.torch.from_exported_program\nTarget: CUDA\nCompile and Test Result: FAIL: Did you forget to bind?\n    Variable `p_conv3_weight` is directly accessed by host memory\n'

In [2]:
import sys
import os
import torch

# Add TVM path
os.environ['PYTHONPATH'] = "/ssd1/htalendr/tvm/python:" + os.environ.get('PYTHONPATH', '')

# Verify it's set
print(os.environ['PYTHONPATH'])

# Reload sys.path
sys.path.append("/ssd1/htalendr/tvm/python")

# Test import
import tvm
from tvm import relax
print("TVM successfully imported!")


/ssd1/htalendr/tvm/python:
TVM successfully imported!


In [3]:
!nvidia-smi

Mon Feb 10 10:18:51 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.14              Driver Version: 550.54.14      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        Off |   00000000:01:00.0 Off |                  Off |
|  0%   35C    P8             36W /  450W |     419MiB /  24564MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA GeForce RTX 2070        Off |   00



# End-to-End Optimize Model
This tutorial demonstrates how to optimize a machine learning model using Apache TVM. We will
use a pre-trained ResNet-18 model from PyTorch and end-to-end optimize it using TVM's Relax API.
Please note that default end-to-end optimization may not suit complex models.


## Preparation
First, we prepare the model and input information. We use a pre-trained ResNet-18 model from
PyTorch.



In [4]:
import os
import numpy as np
import torch
from torch.export import export
# from torchvision.models.resnet import ResNet18_Weights, resnet18
import torch
from torch import nn
from torch.export import export
from tvm.relax.frontend.torch import from_exported_program
import torch.nn.functional as F
import numpy as np

# Create a dummy model
class PyTorchCNN(nn.Module):
    def __init__(self, num_classes=3):
        super(PyTorchCNN, self).__init__()

        # Define convolutional layers
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=12, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2)
        self.conv2 = nn.Conv2d(in_channels=12, out_channels=12, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(in_channels=12, out_channels=24, kernel_size=3, stride=1, padding=1)
        # self.drop = nn.Dropout2d(p=0.2) # TODO retrain without dropout?
        
        # Fully connected layer
        self.fc = nn.Linear(in_features=32 * 32 * 24, out_features=num_classes)

    def forward(self, x):
        # # Ensure input is in the correct format (assumes already in NCHW if using PyTorch DataLoader)
        # if not isinstance(x, torch.Tensor):
        #     x = self.transformation(x).float() # Converts HWC -> CHW
        #     x = x.unsqueeze(0)  # Converts CHW -> NCHW
        #     x = Variable(x)

        # Forward pass through CNN layers
        x = F.relu(self.pool(self.conv1(x)))
        x = F.relu(self.pool(self.conv2(x)))
        x = F.relu(self.conv3(x)) # used to be: x = F.relu(self.drop(self.conv3(x)))
        x = F.dropout(x, training=self.training)
        
        # Flatten the tensor before passing to the fully connected layer
        x = x.view(x.size(0), -1)  # Use x.size(0) to handle batch size dynamically
        x = self.fc(x)
        
        # Return log probabilities for classification
        return F.log_softmax(x, dim=1)


torch_model = PyTorchCNN().eval()


## Review Overall Flow
The overall flow consists of the following steps:

- **Construct or Import a Model**: Construct a neural network model or import a pre-trained
  model from other frameworks (e.g. PyTorch, ONNX), and create the TVM IRModule, which contains
  all the information needed for compilation, including high-level Relax functions for
  computational graph, and low-level TensorIR functions for tensor program.
- **Perform Composable Optimizations**: Perform a series of optimization transformations,
  such as graph optimizations, tensor program optimizations, and library dispatching.
- **Build and Universal Deployment**: Build the optimized model to a deployable module to the
  universal runtime, and execute it on different devices, such as CPU, GPU, or other accelerators.




### Convert the model to IRModule
Next step, we convert the model to an IRModule using the Relax frontend for PyTorch for further
optimization.



In [5]:
import tvm
from tvm import relax
from tvm.relax.frontend.torch import from_exported_program

# Give an example argument to torch.export
example_args = (torch.randn(1, 3, 128, 128, dtype=torch.float32),)

# Convert the model to IRModule
with torch.no_grad():
    exported_program = export(torch_model, example_args)
    mod = from_exported_program(exported_program, keep_params_as_input=True)

mod, params = relax.frontend.detach_params(mod)
mod.show()

## IRModule Optimization
Apache TVM Unity provides a flexible way to optimize the IRModule. Everything centered
around IRModule optimization can be composed with existing pipelines. Note that each
transformation can be combined as an optimization pipeline via ``tvm.ir.transform.Sequential``.

In this tutorial, we focus on the end-to-end optimization of the model via auto-tuning. We
leverage MetaSchedule to tune the model and store the tuning logs to the database. We also
apply the database to the model to get the best performance.




In [6]:
# Skipping the optimization

# TOTAL_TRIALS = 8  # Change to 20000 for better performance if needed
# target = tvm.target.Target("nvidia/geforce-rtx-4090")  # Change to your target device
# work_dir = "tuning_logs"

# # Skip running in CI environment
# IS_IN_CI = os.getenv("CI", "") == "true"
# print("IS_IN_CI:", IS_IN_CI)
# if not IS_IN_CI:
#     mod = relax.get_pipeline("static_shape_tuning", target=target, total_trials=TOTAL_TRIALS)(mod)

#     # Only show the main function
#     mod["main"].show()

## Build and Deploy
Finally, we build the optimized model and deploy it to the target device.
We skip this step in the CI environment.



In [7]:
from tvm import dlight as dl

with tvm.target.Target("cuda"):
    gpu_mod = dl.ApplyDefaultSchedule(
        dl.gpu.GEMV(),
        dl.gpu.LowBatchGEMV(),
        dl.gpu.Fallback(),
        dl.gpu.Matmul(),
        dl.gpu.Reduction(),
        dl.gpu.Transpose(),
        dl.gpu.GeneralReduction(),
        dl.gpu.RMSNorm(),
    )(mod)


# if not IS_IN_CI:
ex = relax.build(gpu_mod, target="cuda")
dev = tvm.device("cuda", 0)
vm = relax.VirtualMachine(ex, dev)
# Need to allocate data and params on GPU device
gpu_data = tvm.nd.array(np.random.rand(1, 3, 224, 224).astype("float32"), dev)
gpu_params = [tvm.nd.array(p, dev) for p in params["main"]]
gpu_out = vm["main"](gpu_data, *gpu_params).numpy()

print(gpu_out.shape)

TVMError: Traceback (most recent call last):
  4: operator()
        at /ssd1/htalendr/tvm/src/driver/driver_api.cc:531
  3: tvm::TIRToRuntime(tvm::runtime::Map<tvm::Target, tvm::IRModule, void, void> const&, tvm::Target const&)
        at /ssd1/htalendr/tvm/src/driver/driver_api.cc:492
  2: tvm::SplitMixedModule(tvm::IRModule, tvm::Target const&, tvm::Target const&)
        at /ssd1/htalendr/tvm/src/driver/driver_api.cc:418
  1: tvm::ApplyPasses(tvm::IRModule, tvm::transform::Sequential)
        at /ssd1/htalendr/tvm/src/driver/driver_api.cc:291
  0: operator()
        at /ssd1/htalendr/tvm/src/tir/analysis/verify_memory.cc:205
  Did you forget to bind?
    Variable `lv3` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
    Variable `compute` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
  File "/ssd1/htalendr/tvm/src/tir/analysis/verify_memory.cc", line 205
RuntimeError: Memory verification failed with the following errors:
# from tvm.script import tir as T

@T.prim_func
def relu(lv3: T.Buffer((T.int64(1), T.int64(12), T.int64(64), T.int64(64)), "float32"), compute: T.Buffer((T.int64(1), T.int64(12), T.int64(64), T.int64(64)), "float32")):
    T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mtriple": "x86_64-conda-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "max_num_threads": 1024, "tag": "", "thread_warp_size": 32}), "tir.noalias": T.bool(True)})
    for i1, i2, i3 in T.grid(12, 64, 64):
        cse_var_1: T.int32 = i1 * 4096 + i2 * 64 + i3
        compute_1 = T.Buffer((T.int64(49152),), data=compute.data)
        lv3_1 = T.Buffer((T.int64(49152),), data=lv3.data)
        compute_1[cse_var_1] = T.max(lv3_1[cse_var_1], T.float32(0.0))