In [1]:
# Install the MAX nightly into the environment of the running kernel.
# `%pip` (unlike `!pip`) always targets the interpreter this notebook uses.
# This pulls the latest nightly; append `==<version>` to the package name to
# reproduce a specific build.
%pip install max --index-url https://dl.modular.com/public/nightly/python/simple/

Looking in indexes: https://dl.modular.com/public/nightly/python/simple/
Collecting max
  Downloading https://dl.modular.com/public/nightly/python/max-25.5.0.dev2025061705-py3-none-manylinux_2_34_x86_64.whl (303.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m303.3/303.3 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: max
Successfully installed max-25.5.0.dev2025061705


In [2]:
# Clone to the exact path the later cells use, and skip the clone when the
# checkout already exists so that re-running this cell is harmless.
![ -d /content/mojo-gpu-puzzles ] || git clone https://github.com/modular/mojo-gpu-puzzles /content/mojo-gpu-puzzles

Cloning into 'mojo-gpu-puzzles'...
remote: Enumerating objects: 4651, done.
remote: Counting objects: 100% (327/327), done.
remote: Compressing objects: 100% (111/111), done.
remote: Total 4651 (delta 277), reused 234 (delta 213), pack-reused 4324 (from 2)
Receiving objects: 100% (4651/4651), 98.04 MiB | 20.84 MiB/s, done.
Resolving deltas: 100% (2903/2903), done.


In [3]:
# Fetch the uv installer script from astral.sh and run it with sh.
# Flags: fail on HTTP errors, stay quiet except for errors, follow redirects.
!curl --fail --silent --show-error --location https://astral.sh/uv/install.sh | sh

downloading uv 0.7.13 x86_64-unknown-linux-gnu
no checksums to verify
installing to /usr/local/bin
  uv
  uvx
everything's installed!


In [4]:
import max.support.notebook

In [5]:
def save_code_to_file(text: str, filename: str):
    """Write ``text`` to ``filename`` as UTF-8, replacing any existing content.

    Args:
        text: Full contents to store in the file.
        filename: Destination path. It is opened in ``"w"`` mode, so the file
            is created if missing and truncated otherwise.
    """
    with open(filename, mode="w", encoding="utf-8") as sink:
        sink.write(text)

In [31]:
mojo_code = """
from memory import UnsafePointer, stack_allocation
from gpu import thread_idx, block_idx, block_dim, barrier
from gpu.host import DeviceContext
from gpu.memory import AddressSpace
from sys import sizeof
from testing import assert_equal

# ANCHOR: pooling
# Threads launched per block (the x extent of THREADS_PER_BLOCK); also the
# length of the shared-memory buffer allocated inside the kernel.
alias TPB = 8
# Number of elements in the input and output buffers.
alias SIZE = 8
# Launch geometry handed to enqueue_function as grid_dim / block_dim.
alias BLOCKS_PER_GRID = (1, 1)
alias THREADS_PER_BLOCK = (TPB, 1)
# Element type used by the buffers and the kernel.
alias dtype = DType.float32


# Sliding-window sum over the last (up to) three elements:
#
#     output[i] = a[max(i - 2, 0)] + ... + a[i]    for 0 <= i < size
#
# Args:
#     output: Destination buffer; must hold at least `size` elements.
#     a: Input buffer; must hold at least `size` elements.
#     size: Number of valid elements in `a` and `output`.
#
# Each thread produces one output element. A block first stages its slice of
# `a` in a TPB-element shared buffer; neighbours that fall before the start of
# the block's slice are read straight from `a`, so the result does not depend
# on the launch using a single block.
fn pooling(
    output: UnsafePointer[Scalar[dtype]],
    a: UnsafePointer[Scalar[dtype]],
    size: Int,
):
    shared = stack_allocation[
        TPB,
        Scalar[dtype],
        address_space = AddressSpace.SHARED,
    ]()
    global_i = block_dim.x * block_idx.x + thread_idx.x
    local_i = thread_idx.x

    # Threads past the end of the input have nothing to stage.
    if global_i < size:
        shared[local_i] = a[global_i]

    # Reached by every thread (it sits outside the bounds check), so the
    # stores above are done before any thread reads a neighbour's slot.
    barrier()

    # Every write is guarded by `global_i < size`, including the first two
    # elements, so nothing is written out of bounds when size < 2.
    if global_i < size:
        var total = shared[local_i]

        # Element i - 1, if it exists.
        if global_i >= 1:
            if local_i >= 1:
                total += shared[local_i - 1]
            else:
                # Staged by the previous block, not this one: read global memory.
                total += a[global_i - 1]

        # Element i - 2, if it exists.
        if global_i >= 2:
            if local_i >= 2:
                total += shared[local_i - 2]
            else:
                total += a[global_i - 2]

        output[global_i] = total

# ANCHOR_END: pooling


# Host-side driver: runs `pooling` once over the input [0, 1, ..., SIZE - 1]
# and checks the device result against the same sliding-window sum computed
# on the host.
def main():
    with DeviceContext() as ctx:
        # Zero-filled device buffers; the input is then set to a[i] = i.
        out_buf = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(0)
        a_buf = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(0)
        with a_buf.map_to_host() as input_view:
            for idx in range(SIZE):
                input_view[idx] = idx

        ctx.enqueue_function[pooling](
            out_buf.unsafe_ptr(),
            a_buf.unsafe_ptr(),
            SIZE,
            grid_dim=BLOCKS_PER_GRID,
            block_dim=THREADS_PER_BLOCK,
        )

        expected = ctx.enqueue_create_host_buffer[dtype](SIZE).enqueue_fill(0)

        # Synchronize before the host touches any results.
        ctx.synchronize()

        # Reference result: sum of the current element and up to two
        # predecessors, clamped at the start of the buffer.
        with a_buf.map_to_host() as input_view:
            src = input_view.unsafe_ptr()
            for idx in range(SIZE):
                window_start = max(idx - 2, 0)
                window_sum = Scalar[dtype](0)
                for k in range(window_start, idx + 1):
                    window_sum += src[k]

                expected[idx] = window_sum

        with out_buf.map_to_host() as result_view:
            print("out:", result_view)
            print("expected:", expected)
            for idx in range(SIZE):
                assert_equal(result_view[idx], expected[idx])

"""

In [32]:
# Overwrite the p09 problem file in the cloned repo with the Mojo source
# held in `mojo_code`.
save_code_to_file(
    text=mojo_code,
    filename="/content/mojo-gpu-puzzles/problems/p09/p09.mojo",
)

In [33]:
# Run the p09 poe task with the puzzles checkout as the working directory.
!uv run --directory /content/mojo-gpu-puzzles poe p09

[37mPoe =>[0m [94mmojo problems/p09/p09.mojo[0m
out: HostBuffer([0.0, 1.0, 3.0, 6.0, 9.0, 12.0, 15.0, 18.0])
expected: HostBuffer([0.0, 1.0, 3.0, 6.0, 9.0, 12.0, 15.0, 18.0])
