In [1]:
!pip install max==25.4.0 --index-url https://dl.modular.com/public/nightly/python/simple/

Looking in indexes: https://dl.modular.com/public/nightly/python/simple/
Collecting max==25.4.0
  Downloading https://dl.modular.com/public/nightly/python/max-25.4.0-py3-none-manylinux_2_34_x86_64.whl (285.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m285.0/285.0 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: max
Successfully installed max-25.4.0


In [2]:
# Fetch the puzzle sources. The clone is created relative to the current working
# directory; later cells address it as /content/mojo-gpu-puzzles.
!git clone https://github.com/modular/mojo-gpu-puzzles

Cloning into 'mojo-gpu-puzzles'...
remote: Enumerating objects: 6332, done.
remote: Counting objects: 100% (481/481), done.
remote: Compressing objects: 100% (65/65), done.
remote: Total 6332 (delta 449), reused 416 (delta 416), pack-reused 5851 (from 3)
Receiving objects: 100% (6332/6332), 148.64 MiB | 30.80 MiB/s, done.
Resolving deltas: 100% (3923/3923), done.


In [3]:
# Install the `uv` tool by piping its installer script into `sh`.
# NOTE(review): this executes a remotely fetched script as-is — confirm that is acceptable here.
!curl -fsSL https://astral.sh/uv/install.sh | sh

downloading uv 0.8.14 x86_64-unknown-linux-gnu
no checksums to verify
installing to /usr/local/bin
  uv
  uvx
everything's installed!


In [4]:
import max.support.notebook

In [5]:
def save_code_to_file(text: str, filename: str):
    """Write ``text`` to ``filename`` as UTF-8, replacing any previous contents.

    Args:
        text: Full contents to store in the file.
        filename: Destination path; it is opened in write mode.
    """
    sink = open(filename, mode='w', encoding='utf-8')
    with sink:
        sink.write(text)

In [6]:
mojo_code = """
from typing import Optional
from pathlib import Path
import numpy as np
# ANCHOR: conv1d_pytorch
import torch
from max.torch import CustomOpLibrary


def conv1d_pytorch(input_tensor: torch.Tensor, kernel_tensor: torch.Tensor) -> torch.Tensor:
    '''Run the Mojo ``conv1d`` custom op on PyTorch tensors.

    The op library is loaded from the ``op`` directory next to this file, the
    op is specialised with the leading sizes of the two tensors, and the
    result is written into a freshly allocated tensor shaped like
    ``input_tensor``.

    Args:
        input_tensor: Signal to convolve; its first dimension is passed as ``input_size``.
        kernel_tensor: Convolution kernel; its first dimension is passed as ``conv_size``.

    Returns:
        The output tensor handed to the custom op.
    '''
    op_dir = Path(__file__).parent / "op"
    library = CustomOpLibrary(op_dir)

    # The op is called with an explicit output tensor, so allocate it up front.
    result = torch.empty_like(input_tensor)

    # Specialise the op for the sizes of this particular call.
    op_family = library.conv1d
    op_params = {
        "input_size": input_tensor.shape[0],
        "conv_size": kernel_tensor.shape[0],
    }
    specialised_op = op_family[op_params]

    # Puzzle answer: wrap the op in torch.compile and invoke it using the
    # (out, input, kernel) argument order expected by the Mojo signature.
    compiled_op = torch.compile(specialised_op)
    compiled_op(result, input_tensor, kernel_tensor)
    return result

# ANCHOR_END: conv1d_pytorch

def conv1d_max_graph_reference(
    input_array: np.ndarray,
    kernel_array: np.ndarray,
    device: Optional[str] = None
) -> np.ndarray:
    '''Evaluate the ``conv1d`` custom op through a MAX Graph.

    Args:
        input_array: Input samples as a NumPy array.
        kernel_array: Convolution kernel as a NumPy array.
        device: ``"cpu"`` selects the CPU and any other string selects an
            accelerator. When ``None`` the CPU is used only if
            ``accelerator_count()`` reports zero accelerators.

    Returns:
        The first output of the executed graph, moved to the CPU and
        converted to a NumPy array.
    '''
    from max.driver import CPU, Accelerator, Tensor, accelerator_count
    from max.dtype import DType
    from max.engine import InferenceSession
    from max.graph import DeviceRef, Graph, TensorType, ops

    # Decide where to run; the accelerator count is only queried when the
    # caller did not name a device.
    if device is None:
        run_on_cpu = accelerator_count() == 0
    else:
        run_on_cpu = device == "cpu"
    target = CPU() if run_on_cpu else Accelerator()

    session = InferenceSession(devices=[target])

    # Move both operands onto the chosen device.
    signal = Tensor.from_numpy(input_array).to(target)
    taps = Tensor.from_numpy(kernel_array).to(target)

    # One graph-level device reference shared by every type and op below.
    device_ref = DeviceRef.from_device(target)
    graph_input_types = [
        TensorType(DType.float32, shape=signal.shape, device=device_ref),
        TensorType(DType.float32, shape=taps.shape, device=device_ref),
    ]

    with Graph(
        "conv_1d_reference_graph",
        input_types=graph_input_types,
        custom_extensions=[Path(__file__).parent / "op"],
    ) as graph:
        signal_value, taps_value = graph.inputs

        # The output takes its dtype and shape from the first graph input.
        result_type = TensorType(
            dtype=signal_value.tensor.dtype,
            shape=signal_value.tensor.shape,
            device=device_ref,
        )
        op_parameters = {
            "input_size": signal.shape[0],
            "conv_size": taps.shape[0],
            "dtype": DType.float32,
        }
        op_results = ops.custom(
            name="conv1d",
            values=[signal_value, taps_value],
            device=device_ref,
            out_types=[result_type],
            parameters=op_parameters,
        )
        graph.output(op_results[0].tensor)

    model = session.load(graph)
    outputs = model.execute(signal, taps)
    return outputs[0].to(CPU()).to_numpy()


def compute_numpy_reference(input_array: np.ndarray, kernel_array: np.ndarray) -> np.ndarray:
    '''Reference 1D convolution computed with plain Python loops.

    Element ``i`` of the result is the sum of ``input_array[i + j] * kernel_array[j]``
    over every kernel index ``j`` for which ``i + j`` is still inside the
    input; terms that would read past the end are left out.

    Args:
        input_array: Input samples.
        kernel_array: Convolution kernel.

    Returns:
        A float32 array shaped like ``input_array``.
    '''
    reference = np.zeros_like(input_array, dtype=np.float32)
    for start in range(len(input_array)):
        # zip() stops at the shorter operand, which drops exactly the kernel
        # taps that would index beyond the last input sample.
        for sample, weight in zip(input_array[start:], kernel_array):
            reference[start] += sample * weight
    return reference


# Script entry point: run the conv1d kernel through the PyTorch custom-op path
# and the MAX Graph path, checking each against a NumPy reference.
if __name__ == "__main__":
    INPUT_SIZE = 15
    KERNEL_SIZE = 4

    # Create test data (same as p15 for easy comparison)
    input_array = np.arange(INPUT_SIZE, dtype=np.float32)
    kernel_array = np.arange(KERNEL_SIZE, dtype=np.float32)

    # NOTE(review): the banner says 'Puzzle 18' although the notebook saves this
    # script as problems/p20/p20.py — confirm the intended puzzle number.
    print("Puzzle 18: From MAX Graph to PyTorch Custom Ops")
    print("=" * 60)
    print(f"Input array: {input_array}")
    print(f"Convolution kernel: {kernel_array}")
    print()

    # Ground truth that both implementations are compared against.
    numpy_result = compute_numpy_reference(input_array, kernel_array)
    print(f"NumPy reference result: {numpy_result}")
    print()

    # The device name is hard-coded, and these two transfers happen before the
    # try block below, so an exception raised here is not caught by it.
    device = "cuda"
    input_tensor = torch.from_numpy(input_array).to(device)
    kernel_tensor = torch.from_numpy(kernel_array).to(device)

    print(f"Testing PyTorch Custom Op (device: {device})")
    print("-" * 40)

    # Any exception (including a failed assert_allclose) is printed and the
    # result is recorded as None so the MAX Graph section still runs.
    try:
        pytorch_result = conv1d_pytorch(input_tensor, kernel_tensor)
        pytorch_result_cpu = pytorch_result.cpu().numpy()
        print(f"PyTorch custom op result: {pytorch_result_cpu}")

        # Verify PyTorch result
        np.testing.assert_allclose(pytorch_result_cpu, numpy_result, rtol=1e-5)
        print("✅ PyTorch custom op verification PASSED")

    except Exception as e:
        print(f"❌ PyTorch custom op failed: {e}")
        pytorch_result_cpu = None

    print()

    # Compare with MAX Graph approach (like p15)
    print("Comparing with MAX Graph approach (like p15)")
    print("-" * 40)

    try:
        # No device argument is passed, so the function chooses the device itself.
        max_graph_result = conv1d_max_graph_reference(input_array, kernel_array)
        print(f"MAX Graph result: {max_graph_result}")

        # Verify MAX Graph result
        np.testing.assert_allclose(max_graph_result, numpy_result, rtol=1e-5)
        print("✅ MAX Graph verification PASSED")

        # Cross-check the two implementations only when the PyTorch path
        # produced a result.
        if pytorch_result_cpu is not None:
            np.testing.assert_allclose(pytorch_result_cpu, max_graph_result, rtol=1e-5)
            print("✅ PyTorch and MAX Graph results MATCH")

    except Exception as e:
        print(f"❌ MAX Graph comparison failed: {e}")

    # Closing summary printed regardless of whether the checks above passed.
    print()
    print("Key Learning Points:")
    print("• Same Mojo kernel works for both MAX Graph and PyTorch")
    print("• PyTorch CustomOpLibrary requires explicit output tensor allocation")
    print("• Both approaches call the exact same optimized GPU kernel")
    print("• PyTorch tensors can stay on GPU throughout the computation")
"""

In [7]:
# Write the script held in `mojo_code` (Python source, despite the name) to the
# puzzle's p20.py; the file is opened in write mode, so existing contents are replaced.
save_code_to_file(mojo_code, "/content/mojo-gpu-puzzles/problems/p20/p20.py")

In [8]:
# Run the repository's `p20` poe task through uv from inside the clone.
!cd /content/mojo-gpu-puzzles && uv run poe p20

Using CPython 3.12.11 interpreter at: /usr/bin/python3
Creating virtual environment at: .venv
Installed 32 packages in 416ms
Poe => python problems/p20/p20.py
Puzzle 18: From MAX Graph to PyTorch Custom Ops
Input array: [ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14.]
Convolution kernel: [0. 1. 2. 3.]

NumPy reference result: [14. 20. 26. 32. 38. 44. 50. 56. 62. 68. 74. 80. 41. 14.  0.]

Testing PyTorch Custom Op (device: cuda)
----------------------------------------
ERROR:root:Error compiling Mojo at /content/mojo-gpu-puzzles/problems/p20/op. Command: package /content/mojo-gpu-puzzles/problems/p20/op -o /tmp/.modular/mojo_pkg/mojo_pkg_6c850c04a716713d3b3f9f3497021235.mojopkg

Included from /content/mojo-gpu-puzzles/problems/p20/op/__init__.mojo:1:
/content/mojo-gpu-puzzles/problems/p20/op/conv1d.mojo:6:17: error: package 'sys' does not contain 'sizeof'
from sys import sizeof, argv
                ^
/conten

In [9]:
# Report the Mojo version that `uv run` resolves.
# NOTE(review): unlike the p20 cell, this does not `cd` into
# /content/mojo-gpu-puzzles first, so it may be querying a different environment
# than the one that produced the compile error — confirm.
!uv run mojo --version

Mojo 25.4.0 (fbeca2fa)


In [10]:
cat /etc/os-release

PRETTY_NAME="Ubuntu 22.04.4 LTS"
NAME="Ubuntu"
VERSION_ID="22.04"
VERSION="22.04.4 LTS (Jammy Jellyfish)"
VERSION_CODENAME=jammy
ID=ubuntu
ID_LIKE=debian
HOME_URL="https://www.ubuntu.com/"
SUPPORT_URL="https://help.ubuntu.com/"
BUG_REPORT_URL="https://bugs.launchpad.net/ubuntu/"
PRIVACY_POLICY_URL="https://www.ubuntu.com/legal/terms-and-policies/privacy-policy"
UBUNTU_CODENAME=jammy
