In [None]:
# Copyright 2025 by Garmin Ltd. or its subsidiaries.

import time
import typing

import numpy as np
import torch

In [None]:
# Select the accelerator backend once: NVIDIA CUDA first, then Apple Silicon MPS.
if torch.cuda.is_available():
    gpu_backend = "cuda"
elif torch.backends.mps.is_available():
    gpu_backend = "mps"
else:
    # No accelerator found: report it, then fall back to the CPU so that
    # `gpu_backend` and `time_torch_gpu_mm` are still defined for the cells
    # below instead of failing later with a NameError.
    print("GPU Device Not Availible")
    print("Falling back to the CPU; 'Torch GPU' timings below are CPU timings.")
    gpu_backend = "cpu"


def _synchronize_gpu() -> None:
    """Block until the work queued on the selected backend has finished.

    Does nothing when ``gpu_backend`` is the CPU fallback.
    """
    if gpu_backend == "cuda":
        torch.cuda.synchronize()
    elif gpu_backend == "mps":
        # NOTE(review): torch.mps.synchronize needs a PyTorch build that
        # ships the `torch.mps` module -- confirm the minimum version in use.
        torch.mps.synchronize()


def time_torch_gpu_mm(
    a: torch.Tensor, b: torch.Tensor
) -> typing.Tuple[float, torch.Tensor]:
    """Time a single ``torch.mm(a, b)`` on the selected backend.

    Args:
        a: Left operand; must live on the ``gpu_backend`` device.
        b: Right operand; must live on the ``gpu_backend`` device.

    Returns:
        A ``(elapsed_seconds, product)`` tuple.

    Raises:
        AssertionError: If either tensor is not on the ``gpu_backend`` device.
    """
    assert (
        a.device.type == gpu_backend and b.device.type == gpu_backend
    ), "Both tensors must be on the GPU"
    # Accelerator work is queued asynchronously, so drain the queue before
    # starting the clock and again before stopping it; otherwise only the
    # launch overhead would be measured rather than the multiplication.
    _synchronize_gpu()
    start = time.perf_counter()
    val = torch.mm(a, b)
    _synchronize_gpu()
    end = time.perf_counter()
    return end - start, val


def time_torch_cpu_mm(
    a: torch.Tensor, b: torch.Tensor
) -> typing.Tuple[float, torch.Tensor]:
    """Time a single ``torch.mm(a, b)`` on CPU tensors.

    Args:
        a: Left operand; must be a CPU tensor.
        b: Right operand; must be a CPU tensor.

    Returns:
        A ``(elapsed_seconds, product)`` tuple.

    Raises:
        AssertionError: If either tensor is not on the CPU.
    """
    assert a.is_cpu and b.is_cpu, "Both tensors must be on the CPU"
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() can jump when the system clock is adjusted.
    start = time.perf_counter()
    val = torch.mm(a, b)
    end = time.perf_counter()
    return end - start, val


def time_numpy_mm(a: np.ndarray, b: np.ndarray) -> typing.Tuple[float, np.ndarray]:
    """Time a single ``np.matmul(a, b)``.

    Args:
        a: Left operand.
        b: Right operand.

    Returns:
        A ``(elapsed_seconds, product)`` tuple.
    """
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() can jump when the system clock is adjusted.
    start = time.perf_counter()
    val = np.matmul(a, b)
    end = time.perf_counter()
    return end - start, val

In [None]:
def ensure_correctness(
    torch_gpu: torch.Tensor,
    torch_cpu: torch.Tensor,
    numpy: np.ndarray,
    atol: float = 0.01,
) -> None:
    """Assert that the three matmul results agree pairwise.

    Args:
        torch_gpu: Result computed on the accelerator; copied to the host here.
        torch_cpu: Result computed by Torch on the CPU.
        numpy: Result computed by NumPy.
        atol: Absolute tolerance passed to ``np.allclose`` (its default
            relative tolerance still applies on top of this).

    Raises:
        AssertionError: If any pair of results differs beyond the tolerance.
    """
    # copy=False skips a second full-size copy when the data is already float32.
    torch_gpu_as_np = torch_gpu.cpu().numpy().astype(np.float32, copy=False)
    torch_cpu_as_np = torch_cpu.numpy().astype(np.float32, copy=False)

    assert np.allclose(
        torch_gpu_as_np, torch_cpu_as_np, atol=atol
    ), "Torch GPU and CPU results differ"
    assert np.allclose(
        torch_cpu_as_np, numpy, atol=atol
    ), "Torch CPU and NumPy results differ"
    assert np.allclose(
        torch_gpu_as_np, numpy, atol=atol
    ), "Torch GPU and NumPy results differ"

In [None]:
# Seed both libraries' global random generators before creating the inputs.
torch.manual_seed(0)
np.random.seed(0)
SIZE = 10000

# Create the random operands on the selected backend, then mirror them as
# CPU tensors and as NumPy arrays so every implementation multiplies the
# same values.
a = torch.randn(SIZE, SIZE, device=gpu_backend, dtype=torch.float32)
b = torch.randn(SIZE, SIZE, device=gpu_backend, dtype=torch.float32)
a_cpu = a.cpu()
b_cpu = b.cpu()
a_numpy = a_cpu.numpy().astype(np.float32)
b_numpy = b_cpu.numpy().astype(np.float32)

ATOL = 0.0001


def _assert_operand_copies_match(
    on_gpu: torch.Tensor, on_cpu: torch.Tensor, as_numpy: np.ndarray, atol: float
) -> None:
    """Assert that the three copies of one operand hold the same values.

    The backend tensor is brought to the host exactly once and reused for
    every comparison, instead of repeating the full-matrix transfer.
    """
    gpu_as_np = on_gpu.cpu().numpy()
    cpu_as_np = on_cpu.numpy()
    assert np.allclose(
        gpu_as_np, cpu_as_np, atol=atol
    ), f"Torch GPU and CPU results differ: {gpu_as_np[0][0]} vs {cpu_as_np[0][0]}"
    assert np.allclose(
        cpu_as_np, as_numpy, atol=atol
    ), f"Torch CPU and NumPy results differ: {cpu_as_np[0][0]} vs {as_numpy[0][0]}"
    assert np.allclose(
        gpu_as_np, as_numpy, atol=atol
    ), f"Torch GPU and NumPy results differ: {gpu_as_np[0][0]} vs {as_numpy[0][0]}"


# Check both operands, not just `a`, before they are used for timing.
_assert_operand_copies_match(a, a_cpu, a_numpy, ATOL)
_assert_operand_copies_match(b, b_cpu, b_numpy, ATOL)



In [None]:
# Benchmark: run the same product once with each implementation.
print(f"Timing {SIZE:,} x {SIZE:,} matrix multiplication...")
torch_gpu_time, torch_gpu_result = time_torch_gpu_mm(a, b)
torch_cpu_time, torch_cpu_result = time_torch_cpu_mm(a_cpu, b_cpu)
numpy_time, numpy_result = time_numpy_mm(a_numpy, b_numpy)

# The three products must agree before the timings are reported.
ensure_correctness(
    torch_gpu=torch_gpu_result,
    torch_cpu=torch_cpu_result,
    numpy=numpy_result,
)

# Report one line per implementation.
print(
    f"Time taken by Torch GPU: {torch_gpu_time:.6f} seconds",
    f"Time taken by Torch CPU: {torch_cpu_time:.6f} seconds",
    f"Time taken by NumPy: {numpy_time:.6f} seconds",
    sep="\n",
)