In [10]:
import time
import torch
import triton
import triton.language as tl


# -----------------------------
# Triton vector add kernel
# Kernel computes z = x + y (its own argument names);
# the Python wrapper below uses it to return y = x + z.
# -----------------------------
@triton.jit
def vector_add_kernel(x_ptr, y_ptr, z_ptr, N, BLOCK_SIZE: tl.constexpr):
    """Elementwise add: store ``x_ptr[i] + y_ptr[i]`` into ``z_ptr[i]`` for ``i < N``.

    Each program instance along axis 0 covers one run of ``BLOCK_SIZE``
    consecutive indices starting at ``program_id * BLOCK_SIZE``.

    Args:
        x_ptr: Base pointer of the first input.
        y_ptr: Base pointer of the second input.
        z_ptr: Base pointer of the output.
        N: Number of valid elements; indices at or beyond N are masked off.
        BLOCK_SIZE: Compile-time number of indices handled per program instance.
    """
    block_start = tl.program_id(axis=0) * BLOCK_SIZE
    idx = block_start + tl.arange(0, BLOCK_SIZE)

    # The last block may run past N; masked lanes load 0.0 and are never stored.
    in_bounds = idx < N

    lhs = tl.load(x_ptr + idx, mask=in_bounds, other=0.0)
    rhs = tl.load(y_ptr + idx, mask=in_bounds, other=0.0)

    tl.store(z_ptr + idx, lhs + rhs, mask=in_bounds)

def triton_vector_add(x: torch.Tensor, z: torch.Tensor, block: int = 1024):
    """Return ``x + z`` computed elementwise by ``vector_add_kernel``.

    Args:
        x: First operand; a contiguous float32 CUDA tensor.
        z: Second operand; same shape, device and requirements as ``x``.
        block: Elements handled per program instance (passed to the kernel as
            ``BLOCK_SIZE``). Must be a positive power of two because the
            kernel builds its index vector with ``tl.arange(0, BLOCK_SIZE)``.

    Returns:
        A newly allocated tensor shaped like ``x`` holding the sum.

    Raises:
        AssertionError: If any of the preconditions above is violated.
    """
    # The kernel indexes both inputs with the same flat offsets, bounded only
    # by x.numel(); a smaller `z` would be read out of bounds, so reject it.
    assert x.shape == z.shape, "x and z must have the same shape."
    assert block > 0 and (block & (block - 1)) == 0, "block must be a positive power of two."
    assert x.is_cuda and z.is_cuda, "Triton example requires CUDA tensors."
    assert x.device == z.device, "x and z must be on the same device."
    assert x.dtype == torch.float32 and z.dtype == torch.float32, "Use float32 for this demo."
    assert x.is_contiguous() and z.is_contiguous(), "Please use contiguous tensors."

    N = x.numel()
    y = torch.empty_like(x)
    if N == 0:
        # Nothing to compute; skip launching a zero-sized grid.
        return y

    # One program instance per `block` elements, rounded up to cover the tail.
    grid = (triton.cdiv(N, block),)
    # Argument order maps (x, z, y) onto the kernel's (x_ptr, y_ptr, z_ptr):
    # the kernel's output pointer is our `y`.
    vector_add_kernel[grid](x, z, y, N=N, BLOCK_SIZE=block)
    return y

In [11]:
# -----------------------------
# Benchmark utils
# -----------------------------
@torch.no_grad()
def bench_cuda(fn, iters=200, warmup=50):
    """Time ``fn`` with CUDA events and return the mean milliseconds per call.

    Args:
        fn: Zero-argument callable to benchmark.
        iters: Number of timed calls.
        warmup: Number of untimed calls made first.

    Returns:
        Elapsed time between the two recorded events divided by ``iters``.
    """
    # Untimed warmup calls, then drain the queue so they don't leak into the timing.
    for _ in range(warmup):
        fn()
    torch.cuda.synchronize()

    start_evt = torch.cuda.Event(enable_timing=True)
    stop_evt = torch.cuda.Event(enable_timing=True)

    # Bracket the whole timed loop with a single pair of events.
    start_evt.record()
    for _ in range(iters):
        fn()
    stop_evt.record()

    # Wait for the stop event to complete before reading the elapsed time.
    torch.cuda.synchronize()

    total_ms = start_evt.elapsed_time(stop_evt)
    return total_ms / iters  # per-iter milliseconds

def gbps_for_vector_add(N, dtype_bytes=4, ms=1.0):
    """Effective bandwidth in GB/s of an N-element vector add that took ``ms`` milliseconds.

    Counts three array passes (two reads, one write) of ``N * dtype_bytes``
    bytes each, and uses 1 GB = 1e9 bytes.
    """
    elapsed_s = ms / 1e3
    # read x + read z + write y
    traffic_bytes = 3 * N * dtype_bytes
    bytes_per_second = traffic_bytes / elapsed_s
    return bytes_per_second / 1e9  # GB/s

In [12]:
# -----------------------------
# Main
# -----------------------------
def main():
    """Benchmark ``torch.add`` against ``triton_vector_add`` and print a table.

    For each problem size, times the PyTorch baseline, times the Triton
    wrapper for every candidate block size and keeps the fastest, measures the
    maximum absolute difference between the two results, and finally prints
    one table row per size. Prints a message and returns early when CUDA is
    unavailable.
    """
    if not torch.cuda.is_available():
        print("CUDA is not available. Please run on a GPU machine.")
        return

    torch.manual_seed(0)
    device = "cuda"
    dtype = torch.float32

    # Try a few sizes
    sizes = [1_000_000, 10_000_000, 50_000_000]  # you can change
    blocks = [256, 512, 1024]  # try different BLOCK sizes

    print(f"Device: {torch.cuda.get_device_name(0)}")
    print(f"dtype: {dtype}\n")

    rows = []
    for N in sizes:
        x = torch.randn(N, device=device, dtype=dtype)
        z = torch.randn(N, device=device, dtype=dtype)
        # Take the element width from the tensor so the bandwidth figure
        # stays right if `dtype` above is ever changed.
        elem_bytes = x.element_size()

        # PyTorch baseline
        def torch_fn():
            return torch.add(x, z)

        torch_ms = bench_cuda(torch_fn)
        torch_gbs = gbps_for_vector_add(N, dtype_bytes=elem_bytes, ms=torch_ms)

        # Triton: pick the best among blocks (simple manual "tuning")
        best = None
        for b in blocks:
            # Bind `b` now so the closure cannot observe a later loop value.
            def triton_fn(b=b):
                return triton_vector_add(x, z, block=b)
            ms = bench_cuda(triton_fn)
            gbs = gbps_for_vector_add(N, dtype_bytes=elem_bytes, ms=ms)
            if (best is None) or (ms < best["ms"]):
                best = {"block": b, "ms": ms, "gbs": gbs}

        # Correctness check (one run)
        y_triton = triton_vector_add(x, z, block=best["block"])
        y_torch = torch.add(x, z)
        max_err = (y_triton - y_torch).abs().max().item()

        rows.append({
            "N": N,
            "torch_ms": torch_ms,
            "torch_GBs": torch_gbs,
            "triton_block": best["block"],
            "triton_ms": best["ms"],
            "triton_GBs": best["gbs"],
            "speedup": torch_ms / best["ms"],
            "max_abs_err": max_err,
        })

    # Print table.
    # 'triton BLOCK' is 12 characters, so that column must be 12 wide in both
    # header and rows; a narrower width shifts every column after it.
    header = (
        f"{'N':>12} | {'torch (ms)':>10} | {'torch GB/s':>10} | "
        f"{'triton BLOCK':>12} | {'triton (ms)':>11} | {'triton GB/s':>11} | "
        f"{'speedup':>8} | {'max_err':>9}"
    )
    print(header)
    print("-" * len(header))
    for r in rows:
        print(
            f"{r['N']:>12} | {r['torch_ms']:>10.4f} | {r['torch_GBs']:>10.2f} | "
            f"{r['triton_block']:>12} | {r['triton_ms']:>11.4f} | {r['triton_GBs']:>11.2f} | "
            f"{r['speedup']:>8.2f} | {r['max_abs_err']:>9.2e}"
        )

# Run the benchmark sweep when this cell/script is executed directly (not on import).
if __name__ == "__main__":
    main()

Device: Tesla T4
dtype: torch.float32

           N | torch (ms) | torch GB/s | triton BLOCK | triton (ms) | triton GB/s |  speedup |   max_err
--------------------------------------------------------------------------------------------------------
     1000000 |     0.0526 |     228.34 |         512 |      0.0521 |      230.14 |     1.01 |  0.00e+00
    10000000 |     0.4915 |     244.15 |         256 |      0.4672 |      256.87 |     1.05 |  0.00e+00
    50000000 |     2.4857 |     241.38 |         256 |      2.3667 |      253.51 |     1.05 |  0.00e+00


In [14]:
# -----------------------------
# Benchmark helper (CUDA events)
# -----------------------------
@torch.no_grad()
def bench_ms(fn, iters=200, warmup=50):
    """Return the average time of one ``fn()`` call in milliseconds.

    Runs ``warmup`` untimed calls, synchronizes, then brackets ``iters`` calls
    with a pair of timing-enabled CUDA events and divides the elapsed time by
    ``iters``.
    """
    for _ in range(warmup):
        fn()
    # Make sure the warmup work has finished before timing starts.
    torch.cuda.synchronize()

    t_begin = torch.cuda.Event(enable_timing=True)
    t_end = torch.cuda.Event(enable_timing=True)

    t_begin.record()
    for _ in range(iters):
        fn()
    t_end.record()

    # The end event must have completed before elapsed_time can be read.
    torch.cuda.synchronize()

    elapsed_total_ms = t_begin.elapsed_time(t_end)
    return elapsed_total_ms / iters  # ms/iter

def eff_gbs(N, ms, dtype_bytes=4):
    """Effective GB/s (1 GB = 1e9 bytes) for an N-element vector add taking ``ms`` ms.

    Traffic is counted as three passes over ``N * dtype_bytes`` bytes:
    read x, read z, write y.
    """
    seconds = ms / 1e3
    total_bytes = 3 * N * dtype_bytes
    return (total_bytes / seconds) / 1e9



In [16]:
# -----------------------------
# Run + print a small table
# -----------------------------
def main():
    """Compare ``torch.add`` with the Triton vector add at one size and print a summary.

    Times the PyTorch baseline, times the Triton wrapper for several block
    sizes and keeps the fastest, measures the maximum absolute difference
    between the two results, then prints a two-row table. Prints a message and
    returns early when CUDA is unavailable.
    """
    if not torch.cuda.is_available():
        print("CUDA not available.")
        return

    torch.manual_seed(0)
    device = "cuda"
    dtype = torch.float32

    # Problem size and inputs
    N = 10_000_000
    x = torch.randn(N, device=device, dtype=dtype)
    z = torch.randn(N, device=device, dtype=dtype)

    # Reference timing: PyTorch's own add
    torch_ms = bench_ms(lambda: torch.add(x, z))
    torch_gbs = eff_gbs(N, torch_ms)

    # Time every candidate block size, then keep the fastest
    # (min() returns the first minimum, so ties go to the earliest candidate).
    candidates = []
    for block_size in (256, 512, 1024, 2048):
        elapsed = bench_ms(lambda block_size=block_size: triton_vector_add(x, z, block=block_size))
        candidates.append({"bs": block_size, "ms": elapsed, "gbs": eff_gbs(N, elapsed)})
    best = min(candidates, key=lambda cand: cand["ms"])

    # Largest elementwise deviation between the two implementations
    y_torch = torch.add(x, z)
    y_triton = triton_vector_add(x, z, block=best["bs"])
    max_err = (y_torch - y_triton).abs().max().item()

    # Report
    row_fmt = "{:<10} | {:>10.4f} | {:>10.2f} | {:>8.2f} | {:<18}"
    header = f"{'Impl':<10} | {'ms/iter':>10} | {'GB/s':>10} | {'speedup':>8} | {'note':<18}"

    print(f"GPU: {torch.cuda.get_device_name(0)} | dtype: {dtype} | N={N}\n")
    print(header)
    print("-" * len(header))
    print(row_fmt.format("PyTorch", torch_ms, torch_gbs, 1.00, "torch.add"))
    print(row_fmt.format("Triton", best["ms"], best["gbs"], torch_ms / best["ms"], "BLOCK=" + str(best["bs"])))
    print(f"\nmax_abs_err = {max_err:.3e}")

# Run the single-size comparison when this cell/script is executed directly (not on import).
if __name__ == "__main__":
    main()

GPU: Tesla T4 | dtype: torch.float32 | N=10000000

Impl       |    ms/iter |       GB/s |  speedup | note              
--------------------------------------------------------------------
PyTorch    |     0.4914 |     244.20 |     1.00 | torch.add         
Triton     |     0.4630 |     259.17 |     1.06 | BLOCK=256         

max_abs_err = 0.000e+00
