In [1]:
import pandas as pd

# Widen pandas' display limits so long kernel names and wide frames are not
# truncated when rendered.
pd.set_option("display.max_colwidth", 100)
pd.set_option("display.max_columns", 100)

# Result CSVs being compared: the default run and the one under ./no_search.
input_file = "4096x5120x4096.gemm.csv"
no_search_input = "./no_search/4096x5120x4096.gemm.csv"

In [2]:
# Column labels removed from every results CSV on load; this list is the
# default value of `load_df`'s `cols_to_drop` parameter.
COLS_TO_DROP = [
    "Problem",
    "Provider",
    "OperationKind",
    "gemm_kind",
    "alpha",
    "beta",
    "enable_sm90_mixed_dtype_shuffle_test",
    "min_cc",
    "max_cc",
    "op_class",
    "Bytes",
    "Flops",
    "Runtime",
    "C",
]


def load_df(path: str, cols_to_drop=COLS_TO_DROP) -> pd.DataFrame:
    """Read a results CSV, drop unneeded columns and rank rows by GFLOPs.

    Args:
        path: Location of the CSV file; passed straight to ``pd.read_csv``.
        cols_to_drop: Column labels to remove. Every label must be present in
            the file, otherwise ``DataFrame.drop`` raises ``KeyError``.

    Returns:
        A new DataFrame without ``cols_to_drop``, sorted by the ``GFLOPs``
        column from highest to lowest. Rows keep the index labels they had in
        the CSV, so the index reflects each row's original position.
    """
    # read_csv already returns a fresh frame, so no defensive copy is needed;
    # drop/sort_values each return new frames, which keeps the chain free of
    # in-place mutation.
    return (
        pd.read_csv(path)
        .drop(columns=cols_to_drop)
        .sort_values(by="GFLOPs", ascending=False)
    )

In [3]:
# Load both result sets with identical preprocessing so they can be compared
# side by side.
df_search, df_no_search = (
    load_df(csv_path) for csv_path in (input_file, no_search_input)
)

In [17]:
# A notebook cell only renders its final expression, so the distinct parameter
# values are printed explicitly instead of being computed and thrown away.
print(
    "swizzle_size:", df_search["swizzle_size"].unique(),
    "raster_order:", df_search["raster_order"].unique(),
)
# GFLOPs of the first five rows (load_df sorts by GFLOPs, highest first).
df_search.head()["GFLOPs"]

512    857336.0
97     853791.0
121    851049.0
153    850498.0
528    844604.0
Name: GFLOPs, dtype: float64

In [16]:
# A notebook cell only renders its final expression, so the distinct parameter
# values are printed explicitly instead of being computed and thrown away.
print(
    "swizzle_size:", df_no_search["swizzle_size"].unique(),
    "raster_order:", df_no_search["raster_order"].unique(),
)
# GFLOPs of the first five rows (load_df sorts by GFLOPs, highest first).
df_no_search.head()["GFLOPs"]

512    851492.0
153    845995.0
520    845690.0
514    836437.0
155    832742.0
Name: GFLOPs, dtype: float64

In [12]:
from triton.testing import do_bench
import torch

# Baseline: time torch.matmul on a problem of the same M x N x K size.
M, N, K = 4096, 5120, 4096
dtype = torch.float16

# Random half-precision operands created on the GPU: A is (M, K), B is (K, N).
A = torch.randn(M, K, dtype=dtype, device="cuda")
B = torch.randn(K, N, dtype=dtype, device="cuda")


def _matmul():
    """Workload handed to do_bench: a single A @ B product."""
    return torch.matmul(A, B)


# time_ms is treated as milliseconds and converted to seconds for the rate.
time_ms = do_bench(_matmul)
time_s = time_ms / 1e3

# One multiply and one add per inner-product term -> 2*M*N*K operations,
# expressed in units of 1e9.
GFLOP = (2 * M * N * K) / 1e9

print(f"Torch GFlops: {GFLOP / time_s:.2f}")


Torch GFlops: 754080.32
