# THIS IS AI SLOP for testing purposes

Real benchmarks coming soon.

In [2]:
import os
import time
from statistics import mean, stdev
from pathlib import Path
import numpy as np

from matmul import naive, cblas

In [3]:
# Benchmark configuration shared by the functions below.
SIZES = [64, 256, 1024]  # square matrix sizes to test
WARMUPS = 2  # untimed calls made before measurement starts
REPEATS = 5  # timed iterations per (algorithm, size) pair
SEED = 12345  # seed for the NumPy random generator that creates the inputs


# (label, callable) pairs; each callable is invoked as func(A, B).
ALGORITHMS = [("naive", naive), ("cblas", cblas)]


def flops_for(m, k, n):
    """Return the operation count used for an (m x k) by (k x n) product.

    The count is ``2 * m * k * n`` (one multiply and one add per
    inner-product term) and is always returned as a float.
    """
    # Start from a float so the whole product is carried out in floating
    # point, left to right.
    ops = 2.0 * m
    ops = ops * k
    ops = ops * n
    return ops


def ensure_contig(a):
    """Return *a* as a C-contiguous ``float64`` NumPy array.

    The conversion is delegated entirely to ``np.ascontiguousarray``.
    """
    wanted_dtype = np.float64
    contiguous = np.ascontiguousarray(a, dtype=wanted_dtype)
    return contiguous


def bench(func, A, B, warmups=WARMUPS, repeats=REPEATS, batch=1):
    """Time ``func(A, B)`` and return the per-call durations in seconds.

    Args:
        func: Callable invoked as ``func(A, B)``; its result is discarded.
        A: First argument passed to ``func``.
        B: Second argument passed to ``func``.
        warmups: Number of untimed calls made before measuring.
        repeats: Number of timed iterations.
        batch: Number of calls made inside each timed iteration. The
            elapsed time of an iteration is divided by this value, so
            larger batches amortise timer overhead for very fast calls.

    Returns:
        A list with ``repeats`` floats, each the average duration of one
        call within the corresponding timed iteration.

    Raises:
        ValueError: If ``batch`` is smaller than 1 (the per-call average
            would be undefined or meaningless).
    """
    if batch < 1:
        raise ValueError(f"batch must be >= 1, got {batch!r}")

    # Warm-up calls are not timed.
    for _ in range(warmups):
        func(A, B)

    # perf_counter is a high-resolution clock meant for measuring short
    # intervals; time.time() is coarser and can be adjusted while running.
    clock = time.perf_counter
    times = []
    for _ in range(repeats):
        start = clock()
        for _call in range(batch):
            func(A, B)
        elapsed = clock() - start
        times.append(elapsed / batch)
    return times


def choose_batch_for_size(s):
    """Return how many calls to group into one timed iteration for size *s*.

    Smaller sizes get larger batches; any size above the largest bound in
    the table is timed one call at a time.
    """
    # (inclusive upper size bound, calls per timed iteration), ascending.
    tiers = (
        (32, 2000),
        (64, 500),
        (128, 100),
        (256, 20),
    )
    for upper_bound, calls in tiers:
        if s <= upper_bound:
            return calls
    return 1


def perform_benchmarks():
    """Benchmark every entry of ``ALGORITHMS`` on each size in ``SIZES``.

    For each (algorithm, size) pair two random square ``float64`` matrices
    are generated, the algorithm is called once untimed, and then it is
    timed with ``bench``. A one-line summary is printed per pair.

    Returns:
        A dict with two keys:

        * ``"meta"``: the configuration used (sizes, warmups, repeats, seed)
          and the values of the ``OMP_NUM_THREADS``, ``MKL_NUM_THREADS`` and
          ``OPENBLAS_NUM_THREADS`` environment variables (``None`` if unset).
        * ``"results"``: one dict per (algorithm, size) pair with the keys
          ``algorithm``, ``size``, ``batch``, ``mean_s``, ``stdev_s``,
          ``total_time_s`` and ``gflops``. ``gflops`` is ``None`` when the
          measured total time is zero.
    """
    # One generator is shared by all algorithms, so each algorithm is
    # timed on its own freshly drawn inputs; the overall sequence is
    # reproducible because of the fixed seed.
    rng = np.random.default_rng(SEED)
    thread_vars = ("OMP_NUM_THREADS", "MKL_NUM_THREADS", "OPENBLAS_NUM_THREADS")
    env = {var: os.environ.get(var) for var in thread_vars}

    results = {
        "meta": {
            # Copy so later changes to the returned dict cannot alter SIZES.
            "sizes": list(SIZES),
            "warmups": WARMUPS,
            "repeats": REPEATS,
            "seed": SEED,
            "env": env,
        },
        "results": [],
    }

    for name, func in ALGORITHMS:
        print(f"Algorithm: {name}")
        for s in SIZES:
            m = k = n = s
            A = ensure_contig(rng.random((m, k)))
            B = ensure_contig(rng.random((k, n)))

            # Sanity run: only confirms the call succeeds on these inputs
            # before timing; the result is not checked.
            func(A, B)

            batch = choose_batch_for_size(s)
            times = bench(func, A, B, warmups=WARMUPS, repeats=REPEATS, batch=batch)
            mean_t = mean(times)
            # stdev needs at least two samples.
            sd_t = stdev(times) if len(times) > 1 else 0.0
            total_muls = len(times) * batch
            total_flops = flops_for(m, k, n) * total_muls
            # bench returns per-call averages; scale back to elapsed time.
            total_time = sum(t * batch for t in times)
            gflops = (total_flops / total_time) / 1e9 if total_time > 0 else None

            # A zero total time (timer too coarse) leaves gflops as None,
            # which cannot be formatted with ".3f".
            gflops_text = f"{gflops:.3f}" if gflops is not None else "n/a"
            print(f" size={s:4d} mean={mean_t:.6f}s gflops={gflops_text}")

            results["results"].append({
                "algorithm": name,
                "size": s,
                "batch": batch,
                "mean_s": mean_t,
                "stdev_s": sd_t,
                "total_time_s": total_time,
                "gflops": gflops,
            })

    return results

In [4]:
# Run the whole benchmark suite; a summary line is printed per algorithm and size.
perform_benchmarks()

Algorithm: naive
 size=  64 mean=0.000104s gflops=5.053
 size= 256 mean=0.010354s gflops=3.241
 size=1024 mean=0.885629s gflops=2.425
Algorithm: cblas
 size=  64 mean=0.000023s gflops=22.779
 size= 256 mean=0.000305s gflops=110.163
 size=1024 mean=0.020211s gflops=106.253
