In [61]:
from typing import Any

import numpy as np
from max import engine
from max.graph import Graph, TensorType, ops
from max.driver import accelerator_count, CPU, CUDA, Device, Tensor
from max.dtype import DType

Check whether any accelerators (GPUs) are available through CUDA.

In [62]:
# Ask the driver how many accelerators it can see, then report the outcome.
count = accelerator_count()
if not count:
    print('Sorry, CPU only.')
else:
    print(f'We have a {count} GPUs available!')

We have a 1 GPUs available!


In [63]:
# Fall back to the CPU only when no accelerator is reported; otherwise use CUDA.
if accelerator_count() == 0:
    my_device = CPU()
else:
    my_device = CUDA()
# Last expression of the cell: display the label of the chosen device.
my_device.label

'gpu'

Tensor [addition on the GPU via MAX graphs](https://github.com/modular/max/blob/main/tutorials/max-graph-python/src/max_ops/addition.py)

In [64]:
def add_tensors(a: np.ndarray, b: np.ndarray, device: Device) -> Tensor:
    """Add two float32 arrays by building and executing a MAX graph.

    The graph's input types are derived from the shapes of ``a`` and ``b``
    instead of being fixed to one-element vectors. As a side effect the
    function prints the constructed graph, the metadata of every model
    input, and the result.

    Args:
        a: Left-hand operand. Expected to be float32, because the graph
            inputs are declared as ``DType.float32``.
        b: Right-hand operand, also expected to be float32. Its shape should
            equal ``a``'s shape.
            NOTE(review): differing but broadcast-compatible shapes rely on
            ``ops.add`` broadcasting; confirm for the installed MAX version.
        device: Device passed to the inference session that runs the graph.

    Returns:
        The first output of the executed model. In the run recorded in this
        notebook that is a ``max.driver.Tensor`` whose ``device`` is the GPU,
        not a dict as the previous annotation claimed.
    """
    # 1. Build the graph. Each input type mirrors the shape of its operand.
    lhs_type = TensorType(dtype=DType.float32, shape=np.shape(a))
    rhs_type = TensorType(dtype=DType.float32, shape=np.shape(b))
    with Graph(
        "simple_add_graph", input_types=(lhs_type, rhs_type)
    ) as graph:
        lhs, rhs = graph.inputs
        out = ops.add(lhs, rhs)
        graph.output(out)
        print("final graph:", graph)

    # 2. Create an inference session
    session = engine.InferenceSession(devices=[device])
    model = session.load(graph)

    for tensor in model.input_metadata:
        print(
            f"name: {tensor.name}, shape: {tensor.shape}, dtype: {tensor.dtype}"
        )

    # 3. Execute the graph; only the first output is kept.
    ret = model.execute(a, b)[0]
    print("result:", ret)
    return ret

Start with regular NumPy arrays and pass them to the graph.

In [65]:
# Two one-element float32 vectors, each holding 1.0, serve as the operands.
input0 = np.ones(1, dtype=np.float32)
input1 = np.ones(1, dtype=np.float32)

# Run the addition graph on the selected device and show where the result lives.
ret = add_tensors(input0, input1, my_device)
print(ret.device)

final graph: mo.graph @simple_add_graph(%arg0: !mo.tensor<[1], f32>, %arg1: !mo.tensor<[1], f32>) -> !mo.tensor<[1], f32> attributes {argument_names = ["input0", "input1"], result_names = ["output0"]} {
  %0 = mo.chain.create()
  %1 = rmo.add(%arg0, %arg1) : (!mo.tensor<[1], f32>, !mo.tensor<[1], f32>) -> !mo.tensor<[1], f32>
  mo.output %1 : !mo.tensor<[1], f32>
}
name: input0, shape: [1], dtype: DType.float32
name: input1, shape: [1], dtype: DType.float32
result: max.driver.Tensor(DType.float32, (1,))
Device(type=gpu,gpu_id=0,target_info(triple=nvptx64-nvidia-cuda,arch=sm_86,features=[])


Copy the result tensor back to the CPU and convert it to a NumPy array.

In [66]:
# Transfer the result to a CPU device, then print the device of the new tensor.
cpu_device = CPU()
ret_cpu = ret.to(cpu_device)
print(ret_cpu.device)

Device(type=cpu,target_info(triple=x86_64-unknown-linux-gnu,arch=znver3,features=[adx, aes, avx, avx2, bmi, bmi2, clflushopt, clwb, clzero, crc32, cx16, cx8, f16c, fma, fsgsbase, fxsr, invpcid, lzcnt, mmx, movbe, mwaitx, pclmul, pku, popcnt, prfchw, rdpid, rdpru, rdrnd, rdseed, sahf, sha, sse, sse2, sse3, sse4.1, sse4.2, sse4a, ssse3, vaes, vpclmulqdq, wbnoinvd, x87, xsave, xsavec, xsaveopt, xsaves])


In [67]:
# Convert the CPU-resident tensor to a NumPy array; as the last expression
# of the cell, its value is what the notebook displays.
ret_cpu.to_numpy()

array([2.], dtype=float32)