In [1]:
!pip install max==25.4.0 --index-url https://dl.modular.com/public/nightly/python/simple/

Looking in indexes: https://dl.modular.com/public/nightly/python/simple/
Collecting max==25.4.0
  Downloading https://dl.modular.com/public/nightly/python/max-25.4.0-py3-none-manylinux_2_34_x86_64.whl (285.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m285.0/285.0 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: max
Successfully installed max-25.4.0


In [2]:
!git clone https://github.com/modular/mojo-gpu-puzzles

Cloning into 'mojo-gpu-puzzles'...
remote: Enumerating objects: 6332, done.[K
remote: Counting objects: 100% (481/481), done.[K
remote: Compressing objects: 100% (65/65), done.[K
remote: Total 6332 (delta 449), reused 416 (delta 416), pack-reused 5851 (from 3)[K
Receiving objects: 100% (6332/6332), 148.64 MiB | 29.51 MiB/s, done.
Resolving deltas: 100% (3923/3923), done.


In [3]:
!curl -fsSL https://astral.sh/uv/install.sh | sh

downloading uv 0.8.14 x86_64-unknown-linux-gnu
no checksums to verify
installing to /usr/local/bin
  uv
  uvx
everything's installed!


In [4]:
import max.support.notebook

In [5]:
def save_code_to_file(text: str, filename: str):
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(text)

In [6]:
mojo_code = """
from memory import UnsafePointer

# ANCHOR: softmax_gpu_kernel
from gpu import thread_idx, block_idx, block_dim, barrier
from gpu.host import DeviceContext, HostBuffer, DeviceBuffer
from layout import Layout, LayoutTensor
from layout.tensor_builder import LayoutTensorBuild as tb
from math import exp
from utils.numerics import max_finite, min_finite


alias SIZE = 128
alias TPB = 128
alias BLOCKS_PER_GRID = (1, 1)
alias THREADS_PER_BLOCK = (TPB, 1)
alias layout = Layout.row_major(SIZE)


fn softmax_gpu_kernel[
    layout: Layout,
    input_size: Int,
    dtype: DType = DType.float32,
](
    output: LayoutTensor[mut=True, dtype, layout],
    input: LayoutTensor[mut=False, dtype, layout],
):
    # FILL IN (roughly 31 lines)
    shared_max = tb[dtype]().row_major[TPB]().shared().alloc()
    shared_sum = tb[dtype]().row_major[TPB]().shared().alloc()
    global_i = block_dim.x * block_idx.x + thread_idx.x
    local_i = thread_idx.x

    var thread_max: Scalar[dtype] = min_finite[dtype]()
    if global_i < input_size:
      thread_max = rebind[Scalar[dtype]](input[global_i])
    shared_max[local_i] = thread_max

    barrier()

    stride = TPB // 2
    while stride > 0:
      if local_i < stride:
        shared_max[local_i] = max(
          shared_max[local_i], shared_max[local_i + stride]
        )
      barrier()
      stride = stride // 2

    block_max = shared_max[0]

    var exp_val: Scalar[dtype] = 0.0
    if global_i < input_size:
      exp_val = rebind[Scalar[dtype]](exp(input[global_i] - block_max))
    shared_sum[local_i] = exp_val
    barrier()

    stride = TPB // 2
    while stride > 0:
      if local_i < stride:
        shared_sum[local_i] += shared_sum[local_i + stride]
      barrier()
      stride = stride // 2

    block_sum = shared_sum[0]

    if global_i < input_size:
      output[global_i] = exp_val / block_sum

# ANCHOR_END: softmax_gpu_kernel


# ANCHOR: softmax_cpu_kernel
fn softmax_cpu_kernel[
    layout: Layout,
    input_size: Int,
    dtype: DType = DType.float32,
](
    output: LayoutTensor[dtype, layout, MutableAnyOrigin],
    input: LayoutTensor[dtype, layout, MutableAnyOrigin],
):
    # FILL IN (roughly 10 lines)
    var max_val: Scalar[dtype] = min_finite[dtype]()
    for i in range(input_size):
      max_val = max(max_val, rebind[Scalar[dtype]](input[i]))

    var sum_exp: Scalar[dtype] = 0.0
    for i in range(input_size):
      var exp_val = rebind[Scalar[dtype]](exp(input[i] - max_val))
      output[i] = exp_val
      sum_exp += exp_val

    for i in range(input_size):
      output[i] = output[i] / sum_exp



# ANCHOR_END: softmax_cpu_kernel

import compiler
from runtime.asyncrt import DeviceContextPtr
from tensor import InputTensor, OutputTensor


@compiler.register("softmax")
struct SoftmaxCustomOp:
    @staticmethod
    fn execute[
        target: StaticString,  # "cpu" or "gpu"
        input_size: Int,
        dtype: DType = DType.float32,
    ](
        output: OutputTensor[rank=1],
        input: InputTensor[rank = output.rank],
        ctx: DeviceContextPtr,
    ) raises:
        # Note: rebind is necessary now but it shouldn't be!
        var output_tensor = rebind[
            LayoutTensor[dtype, layout, MutableAnyOrigin]
        ](output.to_layout_tensor())
        var input_tensor = rebind[
            LayoutTensor[dtype, layout, MutableAnyOrigin]
        ](input.to_layout_tensor())
        alias layout = input_tensor.layout

        @parameter
        if target == "gpu":
            gpu_ctx = ctx.get_device_context()
            # making sure the output tensor is zeroed out before the kernel is called
            gpu_ctx.enqueue_memset(
                DeviceBuffer[output_tensor.dtype](
                    gpu_ctx,
                    rebind[UnsafePointer[Scalar[output_tensor.dtype]]](
                        output_tensor.ptr
                    ),
                    input_size,
                    owning=False,
                ),
                0,
            )

            gpu_ctx.enqueue_function[
                softmax_gpu_kernel[layout, input_size, dtype]
            ](
                output_tensor,
                input_tensor,
                grid_dim=BLOCKS_PER_GRID,
                block_dim=(TPB, 1),
            )

        elif target == "cpu":
            softmax_cpu_kernel[layout, input_size, dtype](
                output_tensor, input_tensor
            )
        else:
            raise Error("Unsupported target: " + target)

"""

In [7]:
save_code_to_file(mojo_code, "/content/mojo-gpu-puzzles/problems/p18/op/softmax.mojo")

In [8]:
!uv pip install scipy

[2mUsing Python 3.12.11 environment at: /usr[0m
[2mAudited [1m1 package[0m [2min 210ms[0m[0m


In [9]:
py_code = """
# ANCHOR: softmax_custom_op_graph
from pathlib import Path
import numpy as np
from max.driver import CPU, Accelerator, Device, Tensor, accelerator_count
from max.dtype import DType
from max.engine import InferenceSession
from max.graph import DeviceRef, Graph, TensorType, ops
from numpy.typing import NDArray
from scipy.special import softmax as scipy_softmax


def softmax(
    input: NDArray[np.float32],
    session: InferenceSession,
    device: Device,
) -> Tensor:
    dtype = DType.float32
    input_tensor = Tensor.from_numpy(input).to(device)
    mojo_kernels = Path(__file__).parent / "op"

    with Graph(
        "softmax_graph",
        input_types=[
            TensorType(
                dtype,
                shape=input_tensor.shape,
                device=DeviceRef.from_device(device),
            ),
        ],
        custom_extensions=[mojo_kernels],
    ) as graph:
        # FILL IN (roughly 4 unformatted lines)
        input_value = graph.inputs[0]
        output = ops.custom(
          name="softmax",
          values=[input_value],
          device=DeviceRef.from_device(device),
          out_types=[
            TensorType(
              dtype=input_value.tensor.dtype,
              shape=input_value.tensor.shape,
              device=DeviceRef.from_device(device),
            )
          ],
          parameters = {
            'target': "gpu" if device == Accelerator() else "cpu",
            'input_size': input_tensor.shape[0],
            'dtype': dtype,
          },
        )[0].tensor
        graph.output(output)

    # ANCHOR_END: softmax_custom_op_graph

    print(f"Compiling softmax graph on {device}")
    model = session.load(graph)
    print(f"Executing softmax on {device}")
    print("="*100)
    result = model.execute(input_tensor)[0]
    assert isinstance(result, Tensor)
    return result.to(CPU()) if device == Accelerator() else result


if __name__ == "__main__":
    INPUT_SIZE = 128
    cpu_session = InferenceSession(devices=[CPU()])
    gpu_session = InferenceSession(devices=[Accelerator()])
    input_array = np.random.randn(INPUT_SIZE).astype(np.float32)
    expected_result = scipy_softmax(input_array)

    print(f"Input shape: {input_array.shape}")
    print(f"First few random input values: {input_array[:5]}")

    cpu_result = softmax(input_array, cpu_session, CPU())
    gpu_result = softmax(input_array, gpu_session, Accelerator())
    print(f"First few softmax results on CPU (custom Mojo kernel): {cpu_result.to_numpy()[:5]}")
    print(f"First few softmax results on GPU (custom Mojo kernel): {gpu_result.to_numpy()[:5]}")
    print(f"First few expected results (SciPy calculation): {expected_result[:5]}")

    np.testing.assert_allclose(cpu_result.to_numpy(), expected_result, rtol=1e-5)
    print("Verification passed: Custom kernel results match SciPy calculation")

    total_prob_cpu = np.round(np.sum(cpu_result.to_numpy()), 5)
    total_prob_gpu = np.round(np.sum(gpu_result.to_numpy()), 5)
    print(f"Sum of all probabilities on CPU: {total_prob_cpu}")
    print(f"Sum of all probabilities on GPU: {total_prob_gpu}")

"""

In [10]:
save_code_to_file(py_code, "/content/mojo-gpu-puzzles/problems/p18/p18.py")

In [11]:
!cd /content/mojo-gpu-puzzles && uv run poe p18

Using CPython 3.12.11 interpreter at: [36m/usr/bin/python3[39m
Creating virtual environment at: [36m.venv[39m
[2K[2mInstalled [1m32 packages[0m [2min 360ms[0m[0m
[37mPoe =>[0m [94mmojo package problems/p18/op -o problems/p18/op.mojopkg[0m
[37mPoe =>[0m [94mpython problems/p18/p18.py[0m
Traceback (most recent call last):
  File "/content/mojo-gpu-puzzles/problems/p18/p18.py", line 10, in <module>
    from scipy.special import softmax as scipy_softmax
ModuleNotFoundError: No module named 'scipy'


In [12]:
!curl -fsSL https://pixi.sh/install.sh | bash


This script will automatically download and install Pixi (latest) for you.
Getting it from this url: https://github.com/prefix-dev/pixi/releases/latest/download/pixi-x86_64-unknown-linux-musl.tar.gz
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 23.2M  100 23.2M    0     0  19.6M      0  0:00:01  0:00:01 --:--:-- 75.5M
The 'pixi' binary is installed into '/root/.pixi/bin'
Updating '/root/.bashrc'
Please restart or source your shell.


In [13]:
!source /root/.bashrc

In [14]:
!pixi install

/bin/bash: line 1: pixi: command not found


In [15]:
!uv run mojo --version

Mojo 25.4.0 (fbeca2fa)
