# CUDA + ONNX Sanity

This notebook checks:
- PyTorch + CUDA availability and speed
- Exporting a small PyTorch model to ONNX
- Running it with ONNX Runtime (GPU if available)
- Output parity and quick benchmarks

**Prereqs** (already set up per your env):
```bash
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
pip install onnx onnxruntime-gpu  # falls back to CPU if no GPU provider
```


In [None]:

# Report library versions and the execution providers ONNX Runtime can use.
import os, time, pathlib, sys, math
import numpy as np
import torch
import onnx
import onnxruntime as ort

# Framework versions and what accelerator (if any) PyTorch can see.
cuda_ok = torch.cuda.is_available()
print(f"Torch: {torch.__version__}")
print(f"CUDA available: {cuda_ok}")
if cuda_ok:
    # Index 0 is the first visible CUDA device.
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

# ONNX tooling versions and the execution providers this ORT build offers.
print(f"onnx: {onnx.__version__}")
print(f"onnxruntime: {ort.__version__}")
print(f"ORT providers available: {ort.get_available_providers()}")


In [None]:
# GPU sanity check: time a single 2048x2048 matmul on the CUDA device.
if torch.cuda.is_available():
    x = torch.randn(2048, 2048, device="cuda")
    y = torch.randn(2048, 2048, device="cuda")
    # Warmup: run once untimed so one-off initialization work does not
    # end up in the measurement below.
    _ = x @ y
    torch.cuda.synchronize()
    # perf_counter() is a monotonic, high-resolution clock; time.time() can be
    # too coarse (and can jump) for an operation this short.
    t0 = time.perf_counter()
    _ = x @ y
    # Wait for the queued GPU work to finish before reading the clock,
    # otherwise only the launch would be timed.
    torch.cuda.synchronize()
    print("GPU matmul 2048x2048:", round(time.perf_counter() - t0, 4), "s")
else:
    print("CUDA not available; skipping GPU sanity.")


In [None]:
# Two-layer perceptron used as the export subject: Linear -> ReLU -> Linear.
class TinyMLP(torch.nn.Module):
    """Small feed-forward network with a single hidden ReLU layer.

    Args:
        in_dim: Number of input features fed to the first linear layer.
        hidden: Width of the hidden layer.
        out_dim: Number of outputs produced by the second linear layer.
    """

    def __init__(self, in_dim: int = 32, hidden: int = 64, out_dim: int = 10) -> None:
        super().__init__()
        nn = torch.nn
        # Built in forward order; the stack is exposed as ``self.net``.
        layers = [
            nn.Linear(in_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, out_dim),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run ``x`` through the layer stack and return the result."""
        out = self.net(x)
        return out

in_dim, hidden, out_dim = 32, 64, 10

# Seed torch BEFORE constructing the model: the Linear layers draw their
# initial weights from torch's RNG, so without this the weights (and every
# output below) would change on each Restart & Run All.
torch.manual_seed(0)
model = TinyMLP(in_dim, hidden, out_dim).eval()

# Random test input (NumPy RNG is seeded separately from torch's)
np.random.seed(0)
x_np = np.random.randn(128, in_dim).astype("float32")  # batch=128
x_t = torch.from_numpy(x_np)

# Prefer GPU for PyTorch run if available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)
x_t_dev = x_t.to(device)

# Torch forward; the result is brought back to host memory as a NumPy array
# so it can later be compared against the ONNX Runtime output.
with torch.inference_mode():
    y_t = model(x_t_dev).cpu().numpy()

# Show the first two rows as the cell output
y_t[:2]

In [None]:
# Destination: ./models/tiny_mlp.onnx (the directory is created on demand).
models_dir = pathlib.Path("models")
models_dir.mkdir(parents=True, exist_ok=True)
onnx_path = models_dir.joinpath("tiny_mlp.onnx")

# Example input handed to the exporter; it lives on the same device as the model.
dummy = torch.randn(1, in_dim, device=device)

# Axis 0 of both the input and the output is declared dynamic and named
# "batch", so the exported graph is not tied to the batch size of `dummy`.
export_options = dict(
    input_names=["input"],
    output_names=["output"],
    dynamic_axes={"input": {0: "batch"}, "output": {0: "batch"}},
    opset_version=17,
)
torch.onnx.export(model, dummy, onnx_path.as_posix(), **export_options)
print(f"Exported: {onnx_path.as_posix()}")

# Reload the file from disk and run the ONNX structural checker on it.
onnx_model = onnx.load(onnx_path.as_posix())
onnx.checker.check_model(onnx_model)
print("ONNX model structure OK.")


In [None]:
# Prefer CUDA provider if present; else fall back to CPU
providers = ort.get_available_providers()
if "CUDAExecutionProvider" in providers:
    sess_providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
else:
    sess_providers = ["CPUExecutionProvider"]

sess = ort.InferenceSession(onnx_path.as_posix(), providers=sess_providers)
# Print what the session reports rather than what was requested.
# NOTE(review): presumably this can differ from `sess_providers` if the CUDA
# provider fails to load at runtime - confirm against the ORT docs.
print("Using providers:", sess.get_providers())

# Run ORT on the same NumPy batch that was fed to PyTorch
ort_inputs = {"input": x_np}
y_ort = sess.run(["output"], ort_inputs)[0]

# Guard against silent broadcasting: both the subtraction and np.allclose
# would happily compare arrays of different (but broadcastable) shapes.
assert y_ort.shape == y_t.shape, f"shape mismatch: ORT {y_ort.shape} vs Torch {y_t.shape}"

# Compare numerically: ONNX vs PyTorch (this is the MAX absolute difference,
# i.e. the worst-case element, not a mean error)
max_abs_diff = float(np.max(np.abs(y_ort - y_t)))
print("Max |ORT - Torch|:", max_abs_diff)
print("Close? (rtol=1e-4, atol=1e-4):", np.allclose(y_ort, y_t, rtol=1e-4, atol=1e-4))
y_ort[:2]


In [None]:
from collections.abc import Callable

def bench(fn: Callable[[], None], warmup: int = 5, iters: int = 20) -> float:
    """Return the mean wall-clock time, in seconds, of one call to ``fn``.

    ``fn`` is first called ``warmup`` times without timing, then called
    ``iters`` times inside a single timed region; the elapsed time is divided
    by ``iters``.

    Args:
        fn: Zero-argument callable to measure. Its return value is ignored.
        warmup: Number of untimed calls made before measuring.
        iters: Number of timed calls; must be at least 1.

    Returns:
        Average seconds per call over the timed iterations.

    Raises:
        ValueError: If ``iters`` is less than 1 (the average would be undefined).
    """
    if iters < 1:
        raise ValueError(f"iters must be >= 1, got {iters}")
    # Warmup
    for _ in range(warmup):
        fn()
    # perf_counter() is monotonic and high-resolution, which matters because
    # the workloads timed here can be very short; time.time() may be too coarse.
    t0 = time.perf_counter()
    for _ in range(iters):
        fn()
    return (time.perf_counter() - t0) / iters

# Mean seconds per call, keyed by "<runtime>(<device>)".
results: dict[str, float] = {}

# Torch (GPU if available, else CPU). `model` and `x_t_dev` already live on the
# same device, so one function serves both cases; on CPU `x_t_dev` is just the
# CPU tensor.
use_cuda = torch.cuda.is_available()

def run_torch() -> None:
    """One PyTorch forward pass; on CUDA, wait for the GPU work to finish."""
    with torch.inference_mode():
        _ = model(x_t_dev)
    if use_cuda:
        # Without this only the kernel launch would be timed.
        torch.cuda.synchronize()

results["torch(gpu)" if use_cuda else "torch(cpu)"] = bench(run_torch)

# ORT GPU (if available)
# Caveat: the ORT runs are fed the host NumPy array on every call, while the
# Torch run reuses a tensor already placed on `device`, so the GPU numbers are
# not a like-for-like comparison.
if "CUDAExecutionProvider" in sess.get_providers():
    def run_ort_gpu() -> None:
        """One ONNX Runtime inference using the session created earlier."""
        _ = sess.run(["output"], {"input": x_np})
    results["ort(gpu)"] = bench(run_ort_gpu)

# ORT CPU: a dedicated CPU-only session so this number is available even when
# `sess` is using the CUDA provider.
sess_cpu = ort.InferenceSession(onnx_path.as_posix(), providers=["CPUExecutionProvider"])
def run_ort_cpu() -> None:
    """One ONNX Runtime inference on the CPU-only session."""
    _ = sess_cpu.run(["output"], {"input": x_np})
results["ort(cpu)"] = bench(run_ort_cpu)

results

## Notes & Troubleshooting

- **Different numbers between Torch and ORT?**  
  - Ensure `model.eval()` and `torch.inference_mode()` are used.  
  - Export with a recent `opset_version` (we used 17).  
  - Minor differences (up to about `1e-4`) are normal: the two runtimes use different kernels, so floating-point rounding differs.

- **No `CUDAExecutionProvider` in ORT?**  
  - Make sure you installed `onnxruntime-gpu` (not just `onnxruntime`).  
  - Confirm your NVIDIA driver + CUDA runtime are present.

- **Bigger models**  
  - This MLP is trivial. For CNNs/Transformers, export the same way.  
  - As a later step, you can also try ONNX Runtime graph optimizations or the TensorRT execution provider.
