In [2]:
import timeit

from numba import njit
import torch
import numpy as np

# 1. Vectorization


### 1.1 Sum of array elements


In [8]:
@njit
def sum_numba(x):
    """Add up x[i] for every index i along the first axis, in a Numba-compiled loop.

    The accumulator starts as the float literal 0.0 and is returned after
    the last element has been added.
    """
    acc = 0.0
    length = x.shape[0]
    for idx in range(length):
        acc += x[idx]
    return acc


def sum_numpy(x):
    """Return the sum of all elements of ``x`` as computed by ``np.sum``."""
    result = np.sum(x)
    return result


@njit
def sum_numpy_numba(x):
    """Return ``np.sum(x)``, with the call compiled through Numba's ``njit``."""
    result = np.sum(x)
    return result


def sum_torch(x):
    """Sum a NumPy array with PyTorch and return the result as a Python number.

    The array is wrapped as a tensor via ``torch.from_numpy`` on every call,
    so that conversion is part of whatever a caller measures.
    """
    return torch.from_numpy(x).sum().item()


def sum_iterative(x):
    """Sum the items of ``x`` with a plain interpreted Python ``for`` loop.

    The running total starts at 0.0, so an empty input returns 0.0.
    """
    acc = 0.0
    for element in x:
        acc += element
    return acc


def sum_builtin(x):
    """Return the result of Python's built-in ``sum`` applied to ``x``."""
    result = sum(x)
    return result


# Implementations under comparison, keyed by the label printed in the report.
# Built from ordered pairs, so iteration order matches the listing below.
sum_functions = dict(
    [
        ("Numba Iterative", sum_numba),
        ("NumPy", sum_numpy),
        ("NumPy with Numba", sum_numpy_numba),
        ("PyTorch", sum_torch),
        ("Python Iterative", sum_iterative),
        ("Python Built-in", sum_builtin),
    ]
)

In [9]:
def benchmark(n=1_000_000, runs=10, functions=None):
    """Time each summation implementation on one random float32 vector.

    Args:
        n: Number of elements in the randomly generated input array.
        runs: Number of timed calls per implementation; the reported value
            is the total time divided by ``runs``.
        functions: Optional mapping of display name -> callable that takes
            the array. When omitted, the module-level ``sum_functions``
            mapping is used.

    Prints a header with the array size, then one line per implementation
    sorted from fastest to slowest, then a blank line.
    """
    if functions is None:
        functions = sum_functions

    print("Benchmarking with array size:", n)
    a = np.random.rand(n).astype(np.float32)
    results = {}
    for name, func in functions.items():
        # Untimed first call, so one-time costs (e.g. JIT compilation of the
        # njit-decorated functions) do not end up in the measurement.
        func(a)
        duration = timeit.timeit(lambda: func(a), number=runs)
        results[name] = duration / runs

    # Report fastest first.
    for name, duration in sorted(results.items(), key=lambda item: item[1]):
        print(f"{name}: {duration:.6f} seconds per run")
    print()

In [10]:
# Run the comparison at several input sizes, in this order.
for n_elements in (1_000_000, 10_000_000, 10_000, 100):
    benchmark(n_elements, runs=10)

Bencharking with array size: 1000000
PyTorch: 0.000063 seconds per run
NumPy: 0.000178 seconds per run
NumPy with Numba: 0.000782 seconds per run
Numba Iterative: 0.000803 seconds per run
Python Built-in: 0.075395 seconds per run
Python Iterative: 0.112011 seconds per run

Bencharking with array size: 10000000
PyTorch: 0.001957 seconds per run
NumPy: 0.002432 seconds per run
Numba Iterative: 0.007900 seconds per run
NumPy with Numba: 0.008032 seconds per run
Python Built-in: 0.709326 seconds per run
Python Iterative: 0.987269 seconds per run

Bencharking with array size: 10000
NumPy: 0.000006 seconds per run
PyTorch: 0.000006 seconds per run
NumPy with Numba: 0.000008 seconds per run
Numba Iterative: 0.000008 seconds per run
Python Built-in: 0.000703 seconds per run
Python Iterative: 0.001013 seconds per run

Bencharking with array size: 100
NumPy with Numba: 0.000000 seconds per run
Numba Iterative: 0.000001 seconds per run
NumPy: 0.000004 seconds per run
PyTorch: 0.000006 seconds per

### 1.2 Sum of rows vs columns


#### PyTorch

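Computing per-row sums (`dim=1`, which reduces along the last dimension) is usually faster than computing per-column sums (`dim=0`), because the tensor is stored in row-major order and the elements of a single row are therefore adjacent in memory.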


In [None]:
def benchmark_torch(rows=1000, cols=1000, runs=10):
    """Compare ``torch.sum`` along dim=1 and dim=0 on a random CPU tensor.

    Args:
        rows: Size of the first tensor dimension.
        cols: Size of the second tensor dimension.
        runs: Number of timed calls per reduction; per-run times are the
            total divided by ``runs``.

    Prints the shape and device, the mean time of each reduction, and the
    ratio of the dim=0 time to the dim=1 time, followed by a blank line.
    """
    device = torch.device("cpu")
    tensor = torch.randn(rows, cols, device=device)
    print(f"Benchmarking sum over array of shape: {rows}x{cols}")
    print(f"Device: {device}")

    # Untimed warm-up of both reductions, so any first-call overhead is not
    # charged to whichever reduction happens to be timed first.
    torch.sum(tensor, dim=1)
    torch.sum(tensor, dim=0)

    t_rows = timeit.timeit(lambda: torch.sum(tensor, dim=1), number=runs)
    print(f"Sum Rows: {t_rows / runs:.6f} seconds per run")

    t_cols = timeit.timeit(lambda: torch.sum(tensor, dim=0), number=runs)
    print(f"Sum Columns: {t_cols / runs:.6f} seconds per run")

    # Both totals cover the same number of runs, so their ratio is the speed-up.
    print(f"Rows faster by: {t_cols / t_rows:.2f}x\n")


runs = 50
# Square tensors from large to small, then a tall and a wide one.
for n_rows, n_cols in [
    (25_000, 25_000),
    (10_000, 10_000),
    (1_000, 1_000),
    (100, 100),
    (100_000, 100),
    (100, 100_000),
]:
    benchmark_torch(n_rows, n_cols, runs=runs)

Benchmarking sum over array of shape: 25000x25000
Device: cpu
Sum Rows: 0.066713 seconds per run
Sum Columns: 0.250001 seconds per run
Rows faster by: 3.75x

Benchmarking sum over array of shape: 10000x10000
Device: cpu
Sum Rows: 0.010016 seconds per run
Sum Columns: 0.020654 seconds per run
Rows faster by: 2.06x

Benchmarking sum over array of shape: 1000x1000
Device: cpu
Sum Rows: 0.000019 seconds per run
Sum Columns: 0.000031 seconds per run
Rows faster by: 1.58x

Benchmarking sum over array of shape: 100x100
Device: cpu
Sum Rows: 0.000005 seconds per run
Sum Columns: 0.000005 seconds per run
Rows faster by: 0.91x

Benchmarking sum over array of shape: 100000x100
Device: cpu
Sum Rows: 0.001189 seconds per run
Sum Columns: 0.003459 seconds per run
Rows faster by: 2.91x

Benchmarking sum over array of shape: 100x100000
Device: cpu
Sum Rows: 0.001066 seconds per run
Sum Columns: 0.001944 seconds per run
Rows faster by: 1.82x



#### Numpy

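Interestingly, NumPy does not show the same gap: in the runs below, summing along either axis takes roughly the same time, and the column sums (`axis=0`) are often slightly faster.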


In [58]:
def benchmark_np(rows=1000, cols=1000, runs=10):
    """Compare ``np.sum`` along axis=1 and axis=0 on a random float32 array.

    Args:
        rows: Size of the first array dimension.
        cols: Size of the second array dimension.
        runs: Number of timed calls per reduction; per-run times are the
            total divided by ``runs``.

    Prints the shape, the mean time of each reduction, and the ratio of the
    axis=0 time to the axis=1 time, followed by a blank line.
    """
    # Draw float32 samples directly. rand(...).astype(np.float32) would first
    # build a float64 array of the same shape and then copy it, which roughly
    # triples peak memory for the largest shapes used here.
    a = np.random.default_rng().random((rows, cols), dtype=np.float32)
    print(f"Benchmarking sum over array of shape ({rows}, {cols})")

    # Untimed warm-up of both reductions, so any first-call overhead is not
    # charged to whichever reduction happens to be timed first.
    np.sum(a, axis=1)
    np.sum(a, axis=0)

    t_rows = timeit.timeit(lambda: np.sum(a, axis=1), number=runs)
    print(f"Sum Rows: {t_rows / runs:.6f} seconds per run")

    t_cols = timeit.timeit(lambda: np.sum(a, axis=0), number=runs)
    print(f"Sum Columns: {t_cols / runs:.6f} seconds per run")

    # Both totals cover the same number of runs, so their ratio is the speed-up.
    print(f"Rows faster by: {t_cols / t_rows:.2f}x\n")


runs = 50
# Square arrays from large to small, then a tall and a wide one.
for n_rows, n_cols in [
    (25_000, 25_000),
    (10_000, 10_000),
    (1_000, 1_000),
    (100, 100),
    (100_000, 100),
    (100, 100_000),
]:
    benchmark_np(n_rows, n_cols, runs=runs)

Benchmarking sum over array of shape (25000, 25000)
Sum Rows: 0.163219 seconds per run
Sum Columns: 0.159681 seconds per run
Rows faster by: 0.98x

Benchmarking sum over array of shape (10000, 10000)
Sum Rows: 0.025280 seconds per run
Sum Columns: 0.022915 seconds per run
Rows faster by: 0.91x

Benchmarking sum over array of shape (1000, 1000)
Sum Rows: 0.000199 seconds per run
Sum Columns: 0.000148 seconds per run
Rows faster by: 0.74x

Benchmarking sum over array of shape (100, 100)
Sum Rows: 0.000007 seconds per run
Sum Columns: 0.000007 seconds per run
Rows faster by: 0.97x

Benchmarking sum over array of shape (100000, 100)
Sum Rows: 0.004621 seconds per run
Sum Columns: 0.004060 seconds per run
Rows faster by: 0.88x

Benchmarking sum over array of shape (100, 100000)
Sum Rows: 0.002556 seconds per run
Sum Columns: 0.002537 seconds per run
Rows faster by: 0.99x

