In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from matplotlib import gridspec

os.makedirs("plots", exist_ok=True)

# Data import:

In [2]:
bench_raw = pd.read_csv("build/benchmark.csv")
bench_raw["Test Case"] = "TC " + bench_raw["Test Case"].astype(str)
bench_raw["Cycles"] = pd.to_numeric(bench_raw["Cycles"], errors="coerce")
bench_raw = bench_raw.dropna(subset=["Cycles"])

orig_raw = pd.read_csv(
    "build/original.csv",
    names=["Function", "Test Case", "Iteration", "Cycles"],
    header=None,
)
orig_raw["Test Case"] = "TC " + orig_raw["Test Case"].astype(str)
orig_raw["Cycles"] = pd.to_numeric(orig_raw["Cycles"], errors="coerce")
orig_raw = orig_raw.dropna(subset=["Cycles"])

flops_df = pd.read_csv("build/flops.csv")
# ensure 'Test Case' is labeled consistently
if flops_df["Test Case"].dtype != object:
    flops_df["Test Case"] = "TC " + flops_df["Test Case"].astype(str)

prof_df = pd.read_csv("build/profiling.csv")
if prof_df["Test Case"].dtype != object:
    prof_df["Test Case"] = "TC " + prof_df["Test Case"].astype(str)

bench_spmd = pd.read_csv("build/benchmark_spmd.csv")
# no Test Case here—just Function,Iteration,Cycles
bench_spmd["Cycles"] = pd.to_numeric(bench_spmd["Cycles"], errors="coerce")
bench_spmd = bench_spmd.dropna(subset=["Cycles"])

flops_spmd = pd.read_csv("build/flops_spmd.csv")

prof_spmd = pd.read_csv("build/profiling_spmd.csv")

print(f"bench_raw:   {bench_raw.shape[0]} rows")
print(f"orig_raw:    {orig_raw.shape[0]} rows")
print(f"flops_df:    {flops_df.shape[0]} rows")
print(f"prof_df:     {prof_df.shape[0]} rows")
print(f"bench_spmd:  {bench_spmd.shape[0]} rows")
print(f"flops_spmd:  {flops_spmd.shape[0]} rows")
print(f"prof_spmd:   {prof_spmd.shape[0]} rows")

display(bench_raw.head(), orig_raw.head(), flops_df.head(), prof_df.head(), bench_spmd.head(), flops_spmd.head(), prof_spmd.head())

bench_raw:   2100000 rows
orig_raw:    50000 rows
flops_df:    90 rows
prof_df:     12600000 rows
bench_spmd:  90000 rows
flops_spmd:  18 rows
prof_spmd:   540000 rows


Unnamed: 0,Function,Test Case,Iteration,Cycles
0,scalar Less SQRT + Approx,TC 0,0,66637
1,scalar Less SQRT + Approx,TC 1,0,69671
2,scalar Less SQRT + Approx,TC 2,0,67081
3,scalar Less SQRT + Approx,TC 3,0,70226
4,scalar Less SQRT + Approx,TC 4,0,69745


Unnamed: 0,Function,Test Case,Iteration,Cycles
1,Original,TC 0,0,433899.0
2,Original,TC 0,1,370074.0
3,Original,TC 0,2,360417.0
4,Original,TC 0,3,357827.0
5,Original,TC 0,4,356976.0


Unnamed: 0,Function,Section,Test Case,Flops,Memory,ADDS,MULS,DIVS,SQRT
0,Basic Implementation,collide_balls,TC 0,79218,240,40091,28107,8013,3007
1,Basic Implementation,Initialization,TC 0,106,144,40,57,5,4
2,Basic Implementation,Impulse,TC 0,16016,0,4004,8008,4004,0
3,Basic Implementation,Delta,TC 0,32032,0,22022,6006,4004,0
4,Basic Implementation,Velocity,TC 0,31034,0,14015,14016,0,3003


Unnamed: 0,Function,Section,Test Case,Iteration,Cycles
0,SIMD,collide_balls,TC 0,0,897139
1,SIMD,Initialization,TC 0,0,74
2,SIMD,Impulse,TC 0,0,36858
3,SIMD,Delta,TC 0,0,38073
4,SIMD,Velocity,TC 0,0,36001


Unnamed: 0,Function,Iteration,Cycles
0,4x Recip Sqrt Implementation,0,221815
1,4x Recip Sqrt Implementation,1,221704
2,4x Recip Sqrt Implementation,2,221741
3,4x Recip Sqrt Implementation,3,221630
4,4x Recip Sqrt Implementation,4,221630


Unnamed: 0,Function,Section,Flops,Memory,ADDS,MULS,DIVS,SQRT
0,4x Basic Implementation,collide_balls,313085,960,156472,112504,32072,12037
1,4x Basic Implementation,Initialization,424,576,160,228,20,16
2,4x Basic Implementation,Impulse,60088,0,12016,32048,16024,0
3,4x Basic Implementation,Delta,128224,0,88154,24042,16028,0
4,4x Basic Implementation,Velocity,124229,0,56102,56106,0,12021


Unnamed: 0,Function,Section,Iteration,Cycles
0,4x Basic Implementation,collide_balls,0,3913416
1,4x Basic Implementation,Initialization,0,481
2,4x Basic Implementation,Impulse,0,154180
3,4x Basic Implementation,Delta,0,148152
4,4x Basic Implementation,Velocity,0,148927


# Data filtering:

In [3]:
def drop_top_outliers(df):
    mask = df.groupby(["Function", "Test Case"])["Cycles"].transform(
        lambda x: x <= x.mean() + 3 * x.std()
    )
    return df[mask].reset_index(drop=True)


bench_clean = drop_top_outliers(bench_raw)
orig_clean = drop_top_outliers(orig_raw)

print(f"bench_raw: {len(bench_raw)} rows -> bench_clean: {len(bench_clean)} rows")
print(f"orig_raw:  {len(orig_raw)} rows ->  orig_clean:  {len(orig_clean)} rows")

removed = (
    (
        bench_raw.groupby(["Function", "Test Case"]).size()
        - bench_clean.groupby(["Function", "Test Case"]).size()
    )
    .rename("n_removed")
    .reset_index()
)
print("\nTop-outliers removed (bench):")
display(removed.head())

bench_raw: 2100000 rows -> bench_clean: 2095777 rows
orig_raw:  50000 rows ->  orig_clean:  48862 rows

Top-outliers removed (bench):


Unnamed: 0,Function,Test Case,n_removed
0,Approx + Symmetry,TC 0,24
1,Approx + Symmetry,TC 1,54
2,Approx + Symmetry,TC 2,59
3,Approx + Symmetry,TC 3,78
4,Approx + Symmetry,TC 4,65


In [4]:
# isolate just the collide_balls rows
cb = prof_df[prof_df["Section"] == "collide_balls"]

# compute per-group threshold = mean + 3 * std
thr = cb.groupby(["Function", "Test Case"])["Cycles"].agg(["mean", "std"]).reset_index()
thr["threshold"] = thr["mean"] + 3 * thr["std"]

# find all (Function,TC,Iteration) where collide_balls exceeds that threshold
cb_thr = cb.merge(thr, on=["Function", "Test Case"])
bad_iters = cb_thr[cb_thr["Cycles"] > cb_thr["threshold"]][
    ["Function", "Test Case", "Iteration"]
].drop_duplicates()

# drop all rows in prof_df belonging to those bad iterations
prof_clean = (
    prof_df.merge(
        bad_iters.assign(to_drop=1),
        on=["Function", "Test Case", "Iteration"],
        how="left",
    )
    .query("to_drop != 1")
    .drop(columns="to_drop")
    .reset_index(drop=True)
)

print(f"prof_df:  {len(prof_df)} rows -> prof_clean: {len(prof_clean)} rows")
print("Example removed iterations:")
display(bad_iters.head())


prof_df:  12600000 rows -> prof_clean: 12439188 rows
Example removed iterations:


Unnamed: 0,Function,Test Case,Iteration
6802,SIMD,TC 2,1360
6874,SIMD,TC 4,1374
6882,SIMD,TC 2,1376
7129,SIMD,TC 4,1425
7340,SIMD,TC 0,1468


In [5]:
mask_spmd = (
    bench_spmd
    .groupby("Function")["Cycles"]
    .transform(lambda x: x <= x.mean() + 3*x.std())
)
bench_spmd_clean = bench_spmd[mask_spmd].reset_index(drop=True)

print(f"bench_spmd:        {len(bench_spmd)} rows")
print(f"bench_spmd_clean:  {len(bench_spmd_clean)} rows")

rm = (bench_spmd.groupby("Function").size()
      - bench_spmd_clean.groupby("Function").size()
      ).rename("n_removed").reset_index()
display(rm)

bench_spmd:        90000 rows
bench_spmd_clean:  89857 rows


Unnamed: 0,Function,n_removed
0,4x Basic Implementation,91
1,4x Recip Sqrt Implementation,31
2,SPMD Basic Implementation,21


In [6]:
# isolate just the collide_balls rows
cb = prof_spmd[prof_spmd["Section"] == "collide_balls"]

# compute per-group threshold = mean + 3 * std
thr = cb.groupby(["Function"])["Cycles"].agg(["mean", "std"]).reset_index()
thr["threshold"] = thr["mean"] + 3 * thr["std"]

# find all (Function,TC,Iteration) where collide_balls exceeds that threshold
cb_thr = cb.merge(thr, on=["Function"])
bad_iters = cb_thr[cb_thr["Cycles"] > cb_thr["threshold"]][
    ["Function", "Iteration"]
].drop_duplicates()

# drop all rows in prof_spmd belonging to those bad iterations
prof_clean_spmd = (
    prof_spmd.merge(
        bad_iters.assign(to_drop=1),
        on=["Function", "Iteration"],
        how="left",
    )
    .query("to_drop != 1")
    .drop(columns="to_drop")
    .reset_index(drop=True)
)

print(f"prof_spmd:  {len(prof_spmd)} rows -> prof_clean: {len(prof_clean_spmd)} rows")
print("Example removed iterations:")
display(bad_iters.head())

prof_spmd:  540000 rows -> prof_clean: 535842 rows
Example removed iterations:


Unnamed: 0,Function,Iteration
14,4x Basic Implementation,14
1056,4x Basic Implementation,1056
1222,4x Basic Implementation,1222
1238,4x Basic Implementation,1238
1563,4x Basic Implementation,1563


# Graphs:

In [7]:
# group the cleaned benchmark + original data
bench_grouped = bench_clean.groupby(["Function", "Test Case"], as_index=False)[
    ["Cycles"]
].mean()
orig_grouped = orig_clean.groupby(["Function", "Test Case"], as_index=False)[
    ["Cycles"]
].mean()

# stack them so all plots can include "Original" alongside the bench variants
grouped_all = pd.concat([bench_grouped, orig_grouped], ignore_index=True)

# mean cycles across all test cases per function
mean_cycles_all = (
    grouped_all.groupby("Function", as_index=False)["Cycles"]
    .mean()
    .rename(columns={"Cycles": "MeanCycles_AllTC"})
)
mean_cycles_all = mean_cycles_all.sort_values(
    "MeanCycles_AllTC", ascending=False
).reset_index(drop=True)
mean_cycles_all["MeanCycles_AllTC"] = (
    mean_cycles_all["MeanCycles_AllTC"].round(0).astype(int)
)
print("=== mean_cycles_all ===")
display(mean_cycles_all)


=== mean_cycles_all ===


Unnamed: 0,Function,MeanCycles_AllTC
0,Original,360864
1,SIMD Optimized Impulse,111615
2,SIMD,111371
3,Code Motion,90485
4,Register Relieve,82648
5,SIMD scalar loop,79085
6,SIMD SSA,79032
7,Basic Implementation,77938
8,Scalar Less SQRT,70843
9,scalar Less SQRT + Approx,68842


In [8]:
# Mean cycles per test case (function x Test Case pivot)
mean_cycles_tc = (
    grouped_all.groupby(["Test Case", "Function"], as_index=False)["Cycles"]
    .mean()
    .pivot(index="Test Case", columns="Function", values="Cycles")
)
func_order = mean_cycles_all["Function"].tolist()
mean_cycles_tc = mean_cycles_tc[func_order]
mean_cycles_tc = mean_cycles_tc.round(0).astype(int)

print("=== mean_cycles_tc ===")
display(mean_cycles_tc)

=== mean_cycles_tc ===


Function,Original,SIMD Optimized Impulse,SIMD,Code Motion,Register Relieve,SIMD scalar loop,SIMD SSA,Basic Implementation,Scalar Less SQRT,scalar Less SQRT + Approx,Improved Symmetry,Reciprocal Sqrt Less IF,Reciprocal Sqrt Hoist,Reciprocal Sqrt IF,Approx + Symmetry
Test Case,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
TC 0,358130,111268,111342,91838,83786,75245,78984,74628,71409,67403,67097,60997,53871,59195,56681
TC 1,359104,111708,111423,90023,81754,81608,79587,80072,70211,69716,69738,59426,63120,59345,54391
TC 2,357750,111074,111101,91838,83793,75304,78776,74656,71481,67094,67250,60574,53915,59238,56602
TC 3,360247,112025,111852,89273,82107,81722,79020,80255,70398,70177,69935,59522,63127,59491,54500
TC 4,369086,112003,111136,89453,81801,81546,78791,80078,70716,69820,70159,59426,63041,59382,54390


In [9]:
# Cost of operations per (Function, Test Case)
cost_ops = flops_df.groupby(["Function", "Test Case"], as_index=False)[
    ["ADDS", "MULS", "DIVS", "SQRT"]
].mean()
cost_ops[["ADDS", "MULS", "DIVS", "SQRT"]] = (
    cost_ops[["ADDS", "MULS", "DIVS", "SQRT"]].round(0).astype(int)
)
print("=== cost_ops ===")
display(cost_ops)

=== cost_ops ===


Unnamed: 0,Function,Test Case,ADDS,MULS,DIVS,SQRT
0,Approx + Symmetry,TC 0,15029,15702,336,668
1,Approx + Symmetry,TC 1,14361,15702,336,668
2,Approx + Symmetry,TC 2,15028,15698,336,668
3,Approx + Symmetry,TC 3,14404,15745,337,670
4,Approx + Symmetry,TC 4,14375,15714,336,668
5,Basic Implementation,TC 0,13364,9369,2671,1002
6,Basic Implementation,TC 1,12696,9369,2671,1002
7,Basic Implementation,TC 2,13363,9368,2670,1002
8,Basic Implementation,TC 3,12734,9396,2678,1005
9,Basic Implementation,TC 4,12709,9377,2673,1003


In [10]:
# Mean cycles per section (Function x Section x Test Case)
sec_cycles = prof_clean.groupby(["Function", "Section", "Test Case"], as_index=False)[
    "Cycles"
].mean()
sec_cycles["Cycles"] = sec_cycles["Cycles"].round(0).astype(int)
print("=== sec_cycles ===")
display(sec_cycles)

=== sec_cycles ===


Unnamed: 0,Function,Section,Test Case,Cycles
0,Approx + Symmetry,Delta,TC 0,40810
1,Approx + Symmetry,Delta,TC 1,40810
2,Approx + Symmetry,Delta,TC 2,40333
3,Approx + Symmetry,Delta,TC 3,40366
4,Approx + Symmetry,Delta,TC 4,41431
...,...,...,...,...
415,scalar Less SQRT + Approx,collide_balls,TC 0,921688
416,scalar Less SQRT + Approx,collide_balls,TC 1,920480
417,scalar Less SQRT + Approx,collide_balls,TC 2,919659
418,scalar Less SQRT + Approx,collide_balls,TC 3,922148


In [11]:
section_order = [
    "collide_balls",
    "Initialization",
    "Impulse",
    "Delta",
    "Velocity",
    "Transform to World Frame",
]
tcs = sorted(sec_cycles["Test Case"].unique(),
             key=lambda x: int(x.split()[-1]))

for sec in section_order:
    print(f"\n=== Section: {sec} (ordered by TC 0 desc) ===")
    sub = sec_cycles[sec_cycles["Section"] == sec]
    pivot = (
        sub
        .pivot(index="Function", columns="Test Case", values="Cycles")
        .reindex(columns=tcs)
    )
    pivot = pivot.loc[pivot["TC 0"].sort_values(ascending=False).index]
    display(pivot)


=== Section: collide_balls (ordered by TC 0 desc) ===


Test Case,TC 0,TC 1,TC 2,TC 3,TC 4
Function,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
SIMD Optimized Impulse,943295,942281,943228,947954,943894
SIMD,941537,942850,941804,945059,943869
Approx + Symmetry,940200,941192,945933,943846,950996
SIMD SSA,934388,934616,934802,939567,935341
Improved Symmetry,934334,934386,931224,932889,933000
Scalar Less SQRT,931566,933865,935135,937008,933876
Basic Implementation,930234,927795,929887,930308,931008
SIMD scalar loop,928652,924759,927356,928523,930212
Code Motion,925051,923239,924851,928196,926551
Reciprocal Sqrt IF,922363,925052,922238,926379,928740



=== Section: Initialization (ordered by TC 0 desc) ===


Test Case,TC 0,TC 1,TC 2,TC 3,TC 4
Function,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Scalar Less SQRT,182,360,201,224,225
Basic Implementation,126,118,117,115,115
Improved Symmetry,123,118,118,127,116
scalar Less SQRT + Approx,121,115,116,114,114
SIMD scalar loop,83,72,71,85,83
SIMD SSA,80,414,76,78,72
Reciprocal Sqrt Hoist,53,53,53,40,39
Approx + Symmetry,47,44,44,54,44
Reciprocal Sqrt IF,45,43,44,44,49
Code Motion,44,44,45,62,55



=== Section: Impulse (ordered by TC 0 desc) ===


Test Case,TC 0,TC 1,TC 2,TC 3,TC 4
Function,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Reciprocal Sqrt Less IF,14341385979574688,14341381679215134,14341405706377870,14384364574933730,14355725096999780
Reciprocal Sqrt Hoist,14276865972664190,14276864157635566,14276838962402608,14319623002910278,14291108095265690
Approx + Symmetry,42474,42697,44974,43138,42447
Scalar Less SQRT,41666,42862,43595,43396,42830
Basic Implementation,41524,40373,41286,39988,40467
Improved Symmetry,41428,45812,40816,45045,44119
SIMD scalar loop,41028,43225,40399,43700,44183
SIMD Optimized Impulse,40787,40168,40375,40805,39464
SIMD,40715,41647,40443,40516,41110
scalar Less SQRT + Approx,40427,43288,38983,43307,43028



=== Section: Delta (ordered by TC 0 desc) ===


Test Case,TC 0,TC 1,TC 2,TC 3,TC 4
Function,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
SIMD SSA,41445,41879,40865,42234,40335
SIMD Optimized Impulse,41405,41488,41225,40987,41878
Reciprocal Sqrt Hoist,41206,39561,40558,39750,40319
SIMD,41148,40525,40921,40861,40218
Reciprocal Sqrt Less IF,40873,39545,41216,39668,39590
Approx + Symmetry,40810,40810,40333,40366,41431
Improved Symmetry,40489,41139,40661,40399,40492
Basic Implementation,40339,39986,39939,39980,40304
Scalar Less SQRT,40203,40134,40600,40555,40094
scalar Less SQRT + Approx,39753,39617,39776,39518,39892



=== Section: Velocity (ordered by TC 0 desc) ===


Test Case,TC 0,TC 1,TC 2,TC 3,TC 4
Function,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
SIMD SSA,42243,42113,42235,41941,41483
Scalar Less SQRT,41558,41499,42865,42719,41977
Basic Implementation,41461,40620,41506,41858,41736
SIMD scalar loop,41281,40253,40602,40702,41189
Approx + Symmetry,41236,41726,41678,41551,41674
Reciprocal Sqrt Hoist,41128,40083,40570,39696,40825
Improved Symmetry,40673,40883,40988,41560,41221
SIMD,40396,41586,40516,40853,39760
SIMD Optimized Impulse,40372,40951,40649,41253,40049
Reciprocal Sqrt Less IF,39914,40123,40565,39608,39259



=== Section: Transform to World Frame (ordered by TC 0 desc) ===


Test Case,TC 0,TC 1,TC 2,TC 3,TC 4
Function,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
SIMD,45,41,39,38,37
Scalar Less SQRT,40,63,61,40,40
Improved Symmetry,39,39,81,40,41
scalar Less SQRT + Approx,39,39,44,39,39
Reciprocal Sqrt Hoist,38,37,37,37,38
Code Motion,38,37,38,37,38
SIMD Optimized Impulse,38,69,38,39,38
Reciprocal Sqrt IF,37,37,38,37,37
Approx + Symmetry,37,37,57,39,37
Basic Implementation,37,37,72,53,43


In [12]:

# Section‐level FlopsPerCycle: merge flops_df + sec_cycles
flops_sec = pd.merge(
    flops_df, sec_cycles, on=["Function", "Section", "Test Case"], how="inner"
)
flops_sec["FlopsPerCycle"] = flops_sec["Flops"] / flops_sec["Cycles"]
print("=== flops_sec (section-level FlopsPerCycle) ===")
display(flops_sec[["Function", "Section", "Test Case", "FlopsPerCycle"]])

=== flops_sec (section-level FlopsPerCycle) ===


Unnamed: 0,Function,Section,Test Case,FlopsPerCycle
0,Basic Implementation,collide_balls,TC 0,0.085159
1,Basic Implementation,Initialization,TC 0,0.841270
2,Basic Implementation,Impulse,TC 0,0.385705
3,Basic Implementation,Delta,TC 0,0.794070
4,Basic Implementation,Velocity,TC 0,0.748511
...,...,...,...,...
85,Code Motion,Initialization,TC 4,1.636364
86,Code Motion,Impulse,TC 4,0.399885
87,Code Motion,Delta,TC 4,0.602858
88,Code Motion,Velocity,TC 4,0.578025


In [16]:
# Overall FlopsPerCycle per Function x Test Case
# overall_fp = flops_sec.groupby(["Function", "Test Case"], as_index=False)[
#     ["Flops", "Cycles"]
# ].sum()
# overall_fp["FlopsPerCycle"] = overall_fp["Flops"] / overall_fp["Cycles"]
# print("=== overall_fp (Function x Test Case) ===")
# display(overall_fp)
cb_flops = flops_sec[flops_sec["Section"] == "collide_balls"]
overall_fp = cb_flops.groupby(["Function", "Test Case"], as_index=False)[
    ["Flops", "Cycles"]
].sum()
overall_fp["FlopsPerCycle"] = overall_fp["Flops"] / overall_fp["Cycles"]
print("=== overall_fp ===")
display(overall_fp)

=== overall_fp ===


Unnamed: 0,Function,Test Case,Flops,Cycles,FlopsPerCycle
0,Approx + Symmetry,TC 0,95204,940200,0.101259
1,Approx + Symmetry,TC 1,93202,941192,0.099025
2,Approx + Symmetry,TC 2,95188,945933,0.100629
3,Approx + Symmetry,TC 3,93467,943846,0.099028
4,Approx + Symmetry,TC 4,93281,950996,0.098088
5,Basic Implementation,TC 0,79218,930234,0.085159
6,Basic Implementation,TC 1,77216,927795,0.083225
7,Basic Implementation,TC 2,79210,929887,0.085182
8,Basic Implementation,TC 3,77441,930308,0.083242
9,Basic Implementation,TC 4,77287,931008,0.083014


In [17]:
tcs = sorted(
    overall_fp["Test Case"].unique(),
    key=lambda s: int(s.split()[-1])
)
fp_grid = (
    overall_fp
    .pivot(index="Function", columns="Test Case", values="FlopsPerCycle")
    .reindex(columns=tcs)
)

fp_grid = fp_grid.loc[
    fp_grid["TC 0"].sort_values(ascending=False).index
]

print("=== FlopsPerCycle (Function x Test Case), ordered by TC 0 desc ===")
display(fp_grid)

=== FlopsPerCycle (Function x Test Case), ordered by TC 0 desc ===


Test Case,TC 0,TC 1,TC 2,TC 3,TC 4
Function,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Approx + Symmetry,0.101259,0.099025,0.100629,0.099028,0.098088
Basic Implementation,0.085159,0.083225,0.085182,0.083242,0.083014
Code Motion,0.070467,0.068437,0.070473,0.068268,0.068253


In [18]:
spmd_mean_cycles = (
    bench_spmd_clean.groupby("Function", as_index=False)["Cycles"]
    .mean()
    .rename(columns={"Cycles": "MeanCycles"})
)
sorted_spmd_mean_cycles = spmd_mean_cycles.sort_values(
    "MeanCycles", ascending=False
).reset_index(drop=True)
sorted_spmd_mean_cycles["MeanCycles"] = (
    sorted_spmd_mean_cycles["MeanCycles"].round(0).astype(int)
)
print("=== SPMD mean_cycles_all ===")
display(sorted_spmd_mean_cycles)

=== SPMD mean_cycles_all ===


Unnamed: 0,Function,MeanCycles
0,4x Basic Implementation,311018
1,4x Recip Sqrt Implementation,224139
2,SPMD Basic Implementation,116745


In [19]:
# Cost of operations per Function
cost_ops = flops_spmd.groupby(["Function"], as_index=False)[
    ["ADDS", "MULS", "DIVS", "SQRT"]
].mean()
cost_ops[["ADDS", "MULS", "DIVS", "SQRT"]] = (
    cost_ops[["ADDS", "MULS", "DIVS", "SQRT"]].round(0).astype(int)
)
print("=== SPMD cost_ops ===")
display(cost_ops)

=== SPMD cost_ops ===


Unnamed: 0,Function,ADDS,MULS,DIVS,SQRT
0,4x Basic Implementation,52157,37501,10691,4012
1,4x Recip Sqrt Implementation,58822,62848,1344,2673
2,SPMD Basic Implementation,41573,45614,13391,4021


In [22]:
# Mean cycles per section (Function x Section)
spmd_sec_cycles = prof_clean_spmd.groupby(["Function", "Section"], as_index=False)[
    "Cycles"
].mean()
spmd_sec_cycles["Cycles"] = spmd_sec_cycles["Cycles"].round(0).astype(int)
print("=== SPMD sec_cycles ===")
display(spmd_sec_cycles)

=== SPMD sec_cycles ===


Unnamed: 0,Function,Section,Cycles
0,4x Basic Implementation,Delta,150542
1,4x Basic Implementation,Impulse,155024
2,4x Basic Implementation,Initialization,462
3,4x Basic Implementation,Transform to World Frame,152
4,4x Basic Implementation,Velocity,152282
5,4x Basic Implementation,collide_balls,3605815
6,4x Recip Sqrt Implementation,Delta,150296
7,4x Recip Sqrt Implementation,Impulse,154660
8,4x Recip Sqrt Implementation,Initialization,162
9,4x Recip Sqrt Implementation,Transform to World Frame,149


In [23]:
# Section‐level FlopsPerCycle: merge flops_df + sec_cycles
flops_sec_spmd = pd.merge(
    flops_spmd, spmd_sec_cycles, on=["Function", "Section"], how="inner"
)
flops_sec_spmd["FlopsPerCycle"] = flops_sec_spmd["Flops"] / flops_sec_spmd["Cycles"]
print("=== SPMD flops_sec (section-level FlopsPerCycle) ===")
display(flops_sec_spmd[["Function", "Section", "FlopsPerCycle"]])

=== SPMD flops_sec (section-level FlopsPerCycle) ===


Unnamed: 0,Function,Section,FlopsPerCycle
0,4x Basic Implementation,collide_balls,0.086828
1,4x Basic Implementation,Initialization,0.917749
2,4x Basic Implementation,Impulse,0.387604
3,4x Basic Implementation,Delta,0.851749
4,4x Basic Implementation,Velocity,0.815783
5,4x Basic Implementation,Transform to World Frame,0.789474
6,4x Recip Sqrt Implementation,collide_balls,0.104467
7,4x Recip Sqrt Implementation,Initialization,1.975309
8,4x Recip Sqrt Implementation,Impulse,0.828857
9,4x Recip Sqrt Implementation,Delta,0.799822


In [24]:
# Overall FlopsPerCycle per Function
cb_flops_spmd = flops_sec_spmd[flops_sec_spmd["Section"] == "collide_balls"]
overall_fp_spmd = cb_flops_spmd.groupby(["Function"], as_index=False)[
    ["Flops", "Cycles"]
].sum()
overall_fp_spmd["FlopsPerCycle"] = overall_fp_spmd["Flops"] / overall_fp_spmd["Cycles"]
print("=== SPMD overall_fp ===")
display(overall_fp_spmd)

=== SPMD overall_fp ===


Unnamed: 0,Function,Flops,Cycles,FlopsPerCycle
0,4x Basic Implementation,313085,3605815,0.086828
1,4x Recip Sqrt Implementation,377061,3609385,0.104467
2,SPMD Basic Implementation,313797,946347,0.331588
