In [11]:
!pip install max --index-url https://dl.modular.com/public/nightly/python/simple/

Looking in indexes: https://dl.modular.com/public/nightly/python/simple/


In [12]:
!git clone https://github.com/modular/mojo-gpu-puzzles

fatal: destination path 'mojo-gpu-puzzles' already exists and is not an empty directory.


In [13]:
!curl -fsSL https://astral.sh/uv/install.sh | sh

downloading uv 0.7.12 x86_64-unknown-linux-gnu
no checksums to verify
installing to /usr/local/bin
  uv
  uvx
everything's installed!


In [14]:
import max.support.notebook

In [15]:
def save_code_to_file(text: str, filename: str):
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(text)

In [59]:
mojo_code = """
from gpu import thread_idx, block_dim, block_idx
from gpu.host import DeviceContext, HostBuffer
from layout import Layout, LayoutTensor
from testing import assert_equal

# ANCHOR: broadcast_add_layout_tensor
alias SIZE = 2
alias BLOCKS_PER_GRID = 1
alias THREADS_PER_BLOCK = (3, 3)
alias dtype = DType.float32
alias out_layout = Layout.row_major(SIZE, SIZE)
alias a_layout = Layout.row_major(1, SIZE)
alias b_layout = Layout.row_major(SIZE, 1)


fn broadcast_add[
    out_layout: Layout,
    a_layout: Layout,
    b_layout: Layout,
](
    output: LayoutTensor[mut=True, dtype, out_layout],
    a: LayoutTensor[mut=False, dtype, a_layout],
    b: LayoutTensor[mut=False, dtype, b_layout],
    size: Int,
):
    row = thread_idx.y
    col = thread_idx.x
    # FILL ME IN (roughly 2 lines)
    if row < size and col < size:
      output[row, col] = a[0, col] + b[row, 0]


# ANCHOR_END: broadcast_add_layout_tensor
def main():
    with DeviceContext() as ctx:
        out_buf = ctx.enqueue_create_buffer[dtype](SIZE * SIZE).enqueue_fill(0)
        out_tensor = LayoutTensor[mut=True, dtype, out_layout](
            out_buf.unsafe_ptr()
        )
        print("out shape:", out_tensor.shape[0](), "x", out_tensor.shape[1]())

        expected_buf = ctx.enqueue_create_host_buffer[dtype](
            SIZE * SIZE
        ).enqueue_fill(0)
        expected_tensor = LayoutTensor[mut=True, dtype, out_layout](
            expected_buf.unsafe_ptr()
        )

        a = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(0)
        b = ctx.enqueue_create_buffer[dtype](SIZE).enqueue_fill(0)
        with a.map_to_host() as a_host, b.map_to_host() as b_host:
            for i in range(SIZE):
                a_host[i] = i
                b_host[i] = i

            for i in range(SIZE):
                for j in range(SIZE):
                    expected_tensor[i, j] = a_host[i] + b_host[j]

        a_tensor = LayoutTensor[dtype, a_layout](a.unsafe_ptr())
        b_tensor = LayoutTensor[dtype, b_layout](b.unsafe_ptr())

        ctx.enqueue_function[broadcast_add[out_layout, a_layout, b_layout]](
            out_tensor,
            a_tensor,
            b_tensor,
            SIZE,
            grid_dim=BLOCKS_PER_GRID,
            block_dim=THREADS_PER_BLOCK,
        )

        ctx.synchronize()

        with out_buf.map_to_host() as out_buf_host:
            print("out:", out_buf_host)
            print("expected:", expected_buf)
            for i in range(SIZE):
                for j in range(SIZE):
                    assert_equal(
                        out_buf_host[i * SIZE + j], expected_buf[i * SIZE + j]
                    )

"""

In [60]:
save_code_to_file(mojo_code, "/content/mojo-gpu-puzzles/problems/p05/p05_layout_tensor.mojo")

In [61]:
!cd /content/mojo-gpu-puzzles && uv run poe p05_layout_tensor

[37mPoe =>[0m [94mmojo problems/p05/p05_layout_tensor.mojo[0m
out shape: 2 x 2
out: HostBuffer([0.0, 1.0, 1.0, 2.0])
expected: HostBuffer([0.0, 1.0, 1.0, 2.0])
