In [None]:
import os
import numpy as np
import pandas as pd

# Data import:

In [22]:
bench_raw = pd.read_csv("build/benchmark.csv")
bench_raw["Test Case"] = "TC " + bench_raw["Test Case"].astype(str)
bench_raw["Cycles"] = pd.to_numeric(bench_raw["Cycles"], errors="coerce")
bench_raw = bench_raw.dropna(subset=["Cycles"])

orig_raw = pd.read_csv(
    "build/original.csv",
    names=["Function", "Test Case", "Iteration", "Cycles"],
    header=None,
)
orig_raw["Test Case"] = "TC " + orig_raw["Test Case"].astype(str)
orig_raw["Cycles"] = pd.to_numeric(orig_raw["Cycles"], errors="coerce")
orig_raw = orig_raw.dropna(subset=["Cycles"])

flops_df = pd.read_csv("build/flops.csv")
# ensure 'Test Case' is labeled consistently
if flops_df["Test Case"].dtype != object:
    flops_df["Test Case"] = "TC " + flops_df["Test Case"].astype(str)

prof_df = pd.read_csv("build/profiling.csv")
if prof_df["Test Case"].dtype != object:
    prof_df["Test Case"] = "TC " + prof_df["Test Case"].astype(str)

bench_n_df = pd.read_csv("build/N.csv")

bench_spmd = pd.read_csv("build/benchmark_spmd.csv")
# no Test Case here—just Function,Iteration,Cycles
bench_spmd["Cycles"] = pd.to_numeric(bench_spmd["Cycles"], errors="coerce")
bench_spmd = bench_spmd.dropna(subset=["Cycles"])

flops_spmd = pd.read_csv("build/flops_spmd.csv")

prof_spmd = pd.read_csv("build/profiling_spmd.csv")

bench_n_spmd = pd.read_csv("build/N_spmd.csv")

print(f"bench_raw:   {bench_raw.shape[0]} rows")
print(f"orig_raw:    {orig_raw.shape[0]} rows")
print(f"flops_df:    {flops_df.shape[0]} rows")
print(f"prof_df:     {prof_df.shape[0]} rows")
print(f"bench_n_df:  {bench_n_df.shape[0]} rows")
print(f"bench_spmd:  {bench_spmd.shape[0]} rows")
print(f"flops_spmd:  {flops_spmd.shape[0]} rows")
print(f"prof_spmd:   {prof_spmd.shape[0]} rows")
print(f"bench_n_spmd: {bench_n_spmd.shape[0]} rows")

display(bench_raw.head(), orig_raw.head(), flops_df.head(), prof_df.head(), bench_n_df.head(), bench_spmd.head(), flops_spmd.head(), prof_spmd.head(), bench_n_spmd.head())

bench_raw:   120000 rows
orig_raw:    50000 rows
flops_df:    240 rows
prof_df:     720000 rows
bench_n_df:  1440000 rows
bench_spmd:  15000 rows
flops_spmd:  30 rows
prof_spmd:   90000 rows
bench_n_spmd: 180000 rows


Unnamed: 0,Function,Test Case,Iteration,Cycles
0,Third SIMD impl.,TC 0,0,78514
1,Third SIMD impl.,TC 1,0,78588
2,Third SIMD impl.,TC 2,0,78551
3,Third SIMD impl.,TC 3,0,78773
4,Third SIMD impl.,TC 4,0,78773


Unnamed: 0,Function,Test Case,Iteration,Cycles
1,Original,TC 0,0,433899.0
2,Original,TC 0,1,370074.0
3,Original,TC 0,2,360417.0
4,Original,TC 0,3,357827.0
5,Original,TC 0,4,356976.0


Unnamed: 0,Function,Section,Test Case,Flops,Memory,ADDS,MULS,DIVS,SQRT
0,Compiler Flags,collide_balls,TC 0,79218,240,40092,28106,8013,3007
1,Compiler Flags,Initialization,TC 0,0,144,0,0,0,0
2,Compiler Flags,Impulse,TC 0,0,0,0,0,0,0
3,Compiler Flags,Delta,TC 0,0,0,0,0,0,0
4,Compiler Flags,Velocity,TC 0,0,0,0,0,0,0


Unnamed: 0,Function,Section,Test Case,Iteration,Cycles
0,Compiler Flags,collide_balls,TC 0,0,900210
1,Compiler Flags,Initialization,TC 0,0,111
2,Compiler Flags,Impulse,TC 0,0,36929
3,Compiler Flags,Delta,TC 0,0,37037
4,Compiler Flags,Velocity,TC 0,0,37075


Unnamed: 0,Function,Test Case,Iteration,N,Cycles
0,Best,0,0,100,6105
1,Best,1,0,100,6031
2,Best,2,0,100,6327
3,Best,3,0,100,6808
4,Best,4,0,100,5698


Unnamed: 0,Function,Iteration,Cycles
0,SPMD 3: Recip Sqrt,0,97051
1,SPMD 3: Recip Sqrt,1,97236
2,SPMD 3: Recip Sqrt,2,96718
3,SPMD 3: Recip Sqrt,3,97088
4,SPMD 3: Recip Sqrt,4,97051


Unnamed: 0,Function,Section,Flops,Memory,ADDS,MULS,DIVS,SQRT
0,4x Basic Implementation,collide_balls,313085,960,156476,112500,32072,12037
1,4x Basic Implementation,Initialization,0,576,0,0,0,0
2,4x Basic Implementation,Impulse,0,0,0,0,0,0
3,4x Basic Implementation,Delta,0,0,0,0,0,0
4,4x Basic Implementation,Velocity,0,0,0,0,0,0


Unnamed: 0,Function,Section,Iteration,Cycles
0,4x Basic Implementation,collide_balls,0,3600618
1,4x Basic Implementation,Initialization,0,481
2,4x Basic Implementation,Impulse,0,156786
3,4x Basic Implementation,Delta,0,148189
4,4x Basic Implementation,Velocity,0,148335


Unnamed: 0,Function,Iteration,N,Cycles
0,4x Recip Sqrt Implementation,0,100,221519
1,4x Recip Sqrt Implementation,1,100,221889
2,4x Recip Sqrt Implementation,2,100,221593
3,4x Recip Sqrt Implementation,3,100,221445
4,4x Recip Sqrt Implementation,4,100,221297


# Data filtering:

In [3]:
def drop_top_outliers(df):
    mask = df.groupby(["Function", "Test Case"])["Cycles"].transform(
        lambda x: x <= x.mean() + 3 * x.std()
    )
    return df[mask].reset_index(drop=True)


bench_clean = drop_top_outliers(bench_raw)
orig_clean = drop_top_outliers(orig_raw)

print(f"bench_raw: {len(bench_raw)} rows -> bench_clean: {len(bench_clean)} rows")
print(f"orig_raw:  {len(orig_raw)} rows ->  orig_clean:  {len(orig_clean)} rows")

removed = (
    (
        bench_raw.groupby(["Function", "Test Case"]).size()
        - bench_clean.groupby(["Function", "Test Case"]).size()
    )
    .rename("n_removed")
    .reset_index()
)
print("\nTop-outliers removed (bench):")
display(removed.head())

bench_raw: 120000 rows -> bench_clean: 119525 rows
orig_raw:  50000 rows ->  orig_clean:  48862 rows

Top-outliers removed (bench):


Unnamed: 0,Function,Test Case,n_removed
0,Best,TC 0,6
1,Best,TC 1,8
2,Best,TC 2,14
3,Best,TC 3,15
4,Best,TC 4,9


In [4]:
# isolate just the collide_balls rows
cb = prof_df[prof_df["Section"] == "collide_balls"]

# compute per-group threshold = mean + 3 * std
thr = cb.groupby(["Function", "Test Case"])["Cycles"].agg(["mean", "std"]).reset_index()
thr["threshold"] = thr["mean"] + 3 * thr["std"]

# find all (Function,TC,Iteration) where collide_balls exceeds that threshold
cb_thr = cb.merge(thr, on=["Function", "Test Case"])
bad_iters = cb_thr[cb_thr["Cycles"] > cb_thr["threshold"]][
    ["Function", "Test Case", "Iteration"]
].drop_duplicates()

# drop all rows in prof_df belonging to those bad iterations
prof_clean = (
    prof_df.merge(
        bad_iters.assign(to_drop=1),
        on=["Function", "Test Case", "Iteration"],
        how="left",
    )
    .query("to_drop != 1")
    .drop(columns="to_drop")
    .reset_index(drop=True)
)

print(f"prof_df:  {len(prof_df)} rows -> prof_clean: {len(prof_clean)} rows")
print("Example removed iterations:")
display(bad_iters.head())


prof_df:  720000 rows -> prof_clean: 697014 rows
Example removed iterations:


Unnamed: 0,Function,Test Case,Iteration
117,Compiler Flags,TC 2,23
184,Compiler Flags,TC 4,36
319,Compiler Flags,TC 4,63
453,Compiler Flags,TC 3,90
587,Compiler Flags,TC 2,117


In [24]:
def drop_top_outliers_N(df):
    mask = df.groupby(["Function", "Test Case", "N"])["Cycles"].transform(
        lambda x: x <= x.mean() + 3 * x.std()
    )
    return df[mask].reset_index(drop=True)


bench_n_clean = drop_top_outliers_N(bench_n_df)

print(f"bench_n_df:  {len(bench_n_df)} rows -> bench_n_clean: {len(bench_n_clean)} rows")

removed_n = (
    bench_n_df.groupby(["Function", "Test Case", "N"]).size()
    - bench_n_clean.groupby(["Function", "Test Case", "N"]).size()
)
removed_n = removed_n.rename("n_removed").reset_index()

print("\nTop-outliers removed (bench_n_df):")
display(removed_n.head())

bench_n_df:  1440000 rows -> bench_n_clean: 1428098 rows

Top-outliers removed (bench_n_df):


Unnamed: 0,Function,Test Case,N,n_removed
0,Best,0,100,8
1,Best,0,500,10
2,Best,0,1000,7
3,Best,0,2000,10
4,Best,0,3000,12


In [5]:
mask_spmd = (
    bench_spmd
    .groupby("Function")["Cycles"]
    .transform(lambda x: x <= x.mean() + 3*x.std())
)
bench_spmd_clean = bench_spmd[mask_spmd].reset_index(drop=True)

print(f"bench_spmd:        {len(bench_spmd)} rows")
print(f"bench_spmd_clean:  {len(bench_spmd_clean)} rows")

rm = (bench_spmd.groupby("Function").size()
      - bench_spmd_clean.groupby("Function").size()
      ).rename("n_removed").reset_index()
display(rm)

bench_spmd:        15000 rows
bench_spmd_clean:  14898 rows


Unnamed: 0,Function,n_removed
0,4x Basic Implementation,31
1,4x Recip Sqrt Implementation,31
2,SPMD 2: FMA,2
3,SPMD 3: Recip Sqrt,30
4,SPMD Basic Implementation,8


In [6]:
# isolate just the collide_balls rows
cb = prof_spmd[prof_spmd["Section"] == "collide_balls"]

# compute per-group threshold = mean + 3 * std
thr = cb.groupby(["Function"])["Cycles"].agg(["mean", "std"]).reset_index()
thr["threshold"] = thr["mean"] + 3 * thr["std"]

# find all (Function,TC,Iteration) where collide_balls exceeds that threshold
cb_thr = cb.merge(thr, on=["Function"])
bad_iters = cb_thr[cb_thr["Cycles"] > cb_thr["threshold"]][
    ["Function", "Iteration"]
].drop_duplicates()

# drop all rows in prof_spmd belonging to those bad iterations
prof_clean_spmd = (
    prof_spmd.merge(
        bad_iters.assign(to_drop=1),
        on=["Function", "Iteration"],
        how="left",
    )
    .query("to_drop != 1")
    .drop(columns="to_drop")
    .reset_index(drop=True)
)

print(f"prof_spmd:  {len(prof_spmd)} rows -> prof_clean: {len(prof_clean_spmd)} rows")
print("Example removed iterations:")
display(bad_iters.head())

prof_spmd:  90000 rows -> prof_clean: 87966 rows
Example removed iterations:


Unnamed: 0,Function,Iteration
174,4x Basic Implementation,174
237,4x Basic Implementation,237
464,4x Basic Implementation,464
2008,SPMD 3: Recip Sqrt,8
2018,SPMD 3: Recip Sqrt,18


In [25]:
def drop_top_outliers_spmd_N(df):
    mask = df.groupby(["Function", "N"])["Cycles"].transform(
        lambda x: x <= x.mean() + 3 * x.std()
    )
    return df[mask].reset_index(drop=True)

bench_n_spmd_clean = drop_top_outliers_spmd_N(bench_n_spmd)

print(f"bench_n_spmd: {len(bench_n_spmd)} rows -> bench_n_spmd_clean: {len(bench_n_spmd_clean)} rows")

removed_n_spmd = (
    bench_n_spmd.groupby(["Function", "N"]).size()
    - bench_n_spmd_clean.groupby(["Function", "N"]).size()
)
removed_n_spmd = removed_n_spmd.rename("n_removed").reset_index()

print("\nTop-outliers removed (bench_n_spmd):")
display(removed_n_spmd.head())

bench_n_spmd: 180000 rows -> bench_n_spmd_clean: 178932 rows

Top-outliers removed (bench_n_spmd):


Unnamed: 0,Function,N,n_removed
0,4x Basic Implementation,100,23
1,4x Basic Implementation,500,36
2,4x Basic Implementation,1000,40
3,4x Basic Implementation,2000,38
4,4x Basic Implementation,3000,36


# Graphs:

In [7]:
# group the cleaned benchmark + original data
bench_grouped = bench_clean.groupby(["Function", "Test Case"], as_index=False)[
    ["Cycles"]
].mean()
orig_grouped = orig_clean.groupby(["Function", "Test Case"], as_index=False)[
    ["Cycles"]
].mean()

# stack them so all plots can include "Original" alongside the bench variants
grouped_all = pd.concat([bench_grouped, orig_grouped], ignore_index=True)

# mean cycles across all test cases per function
mean_cycles_all = (
    grouped_all.groupby("Function", as_index=False)["Cycles"]
    .mean()
    .rename(columns={"Cycles": "MeanCycles_AllTC"})
)
mean_cycles_all = mean_cycles_all.sort_values(
    "MeanCycles_AllTC", ascending=False
).reset_index(drop=True)
mean_cycles_all["MeanCycles_AllTC"] = (
    mean_cycles_all["MeanCycles_AllTC"].round(0).astype(int)
)
print("=== mean_cycles_all ===")
display(mean_cycles_all)


=== mean_cycles_all ===


Unnamed: 0,Function,MeanCycles_AllTC
0,Original,360864
1,SIMD,111145
2,FMA,86730
3,"Bitmasks, FMA, RSQRT",83314
4,Third SIMD impl.,79076
5,"Branch, Precompute",78433
6,Compiler Flags,78191
7,Best with Double While,57391
8,Best,55365


In [8]:
# Mean cycles per test case (function x Test Case pivot)
mean_cycles_tc = (
    grouped_all.groupby(["Test Case", "Function"], as_index=False)["Cycles"]
    .mean()
    .pivot(index="Test Case", columns="Function", values="Cycles")
)
func_order = mean_cycles_all["Function"].tolist()
mean_cycles_tc = mean_cycles_tc[func_order]
mean_cycles_tc = mean_cycles_tc.round(0).astype(int)

print("=== mean_cycles_tc ===")
display(mean_cycles_tc)

=== mean_cycles_tc ===


Function,Original,SIMD,FMA,"Bitmasks, FMA, RSQRT",Third SIMD impl.,"Branch, Precompute",Compiler Flags,Best with Double While,Best
Test Case,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
TC 0,358130,111162,84083,82959,78989,75033,75466,61546,56608
TC 1,359104,111016,87903,82916,79139,80132,80235,54492,54662
TC 2,357750,111103,84152,84649,78784,75435,74689,61686,56647
TC 3,360247,111351,89579,83080,79014,80635,80359,54656,54506
TC 4,369086,111091,87935,82967,79454,80931,80208,54574,54405


In [9]:
# Cost of operations per (Function, Test Case)
cost_ops = flops_df.groupby(["Function", "Test Case"], as_index=False)[
    ["ADDS", "MULS", "DIVS", "SQRT"]
].mean()
cost_ops[["ADDS", "MULS", "DIVS", "SQRT"]] = (
    cost_ops[["ADDS", "MULS", "DIVS", "SQRT"]].round(0).astype(int)
)
print("=== cost_ops ===")
display(cost_ops)

=== cost_ops ===


Unnamed: 0,Function,Test Case,ADDS,MULS,DIVS,SQRT
0,Best,TC 0,8015,8018,502,334
1,Best,TC 1,7681,8018,502,334
2,Best,TC 2,8014,8016,501,334
3,Best,TC 3,7704,8040,503,335
4,Best,TC 4,7688,8024,502,334
5,Best with Double While,TC 0,8852,7851,502,334
6,Best with Double While,TC 1,8518,7851,502,334
7,Best with Double While,TC 2,8851,7849,501,334
8,Best with Double While,TC 3,8544,7873,503,335
9,Best with Double While,TC 4,8526,7857,502,334


In [10]:
# Mean cycles per section (Function x Section x Test Case)
sec_cycles = prof_clean.groupby(["Function", "Section", "Test Case"], as_index=False)[
    "Cycles"
].mean()
sec_cycles["Cycles"] = sec_cycles["Cycles"].round(0).astype(int)
print("=== sec_cycles ===")
display(sec_cycles)

=== sec_cycles ===


Unnamed: 0,Function,Section,Test Case,Cycles
0,Best,Delta,TC 0,37886
1,Best,Delta,TC 1,37371
2,Best,Delta,TC 2,38909
3,Best,Delta,TC 3,37259
4,Best,Delta,TC 4,37258
...,...,...,...,...
235,Third SIMD impl.,collide_balls,TC 0,900814
236,Third SIMD impl.,collide_balls,TC 1,909588
237,Third SIMD impl.,collide_balls,TC 2,903247
238,Third SIMD impl.,collide_balls,TC 3,916566


In [11]:
section_order = [
    "collide_balls",
    "Initialization",
    "Impulse",
    "Delta",
    "Velocity",
    "Transform to World Frame",
]
tcs = sorted(sec_cycles["Test Case"].unique(),
             key=lambda x: int(x.split()[-1]))

for sec in section_order:
    print(f"\n=== Section: {sec} (ordered by TC 0 desc) ===")
    sub = sec_cycles[sec_cycles["Section"] == sec]
    pivot = (
        sub
        .pivot(index="Function", columns="Test Case", values="Cycles")
        .reindex(columns=tcs)
    )
    pivot = pivot.loc[pivot["TC 0"].sort_values(ascending=False).index]
    display(pivot)


=== Section: collide_balls (ordered by TC 0 desc) ===


Test Case,TC 0,TC 1,TC 2,TC 3,TC 4
Function,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
SIMD,921047,918008,916176,922263,912042
Compiler Flags,912854,914167,904948,902454,906264
Best,911011,911003,912397,901422,903766
"Branch, Precompute",908221,898148,911214,906999,907284
Third SIMD impl.,900814,909588,903247,916566,910172
FMA,896862,896679,905674,901226,897712
Best with Double While,894821,897928,902642,900089,909403
"Bitmasks, FMA, RSQRT",769556,768741,765701,767987,770847



=== Section: Initialization (ordered by TC 0 desc) ===


Test Case,TC 0,TC 1,TC 2,TC 3,TC 4
Function,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"Branch, Precompute",138,133,135,133,137
FMA,128,122,123,125,123
Compiler Flags,119,114,115,113,111
Third SIMD impl.,76,87,83,78,71
Best,46,43,45,44,64
Best with Double While,45,44,47,44,44
"Bitmasks, FMA, RSQRT",42,42,42,42,43
SIMD,38,38,38,39,38



=== Section: Impulse (ordered by TC 0 desc) ===


Test Case,TC 0,TC 1,TC 2,TC 3,TC 4
Function,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"Bitmasks, FMA, RSQRT",61843993646495544,61843992490055576,61843999888740344,62029334011520128,61905779829450216
Compiler Flags,40898,39859,40314,38506,39285
Best,40411,39556,39838,38523,38563
"Branch, Precompute",39909,37852,40758,38758,38244
SIMD,39545,39821,38504,38930,38507
Best with Double While,38281,38505,38792,38444,39137
FMA,38131,38675,39521,38885,38975
Third SIMD impl.,37945,38445,38142,39626,38560



=== Section: Delta (ordered by TC 0 desc) ===


Test Case,TC 0,TC 1,TC 2,TC 3,TC 4
Function,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
SIMD,39336,38601,38868,39874,38662
"Bitmasks, FMA, RSQRT",39111,38899,38543,39461,38736
Compiler Flags,38781,38302,38034,38067,37986
Third SIMD impl.,38427,38927,38320,39170,39082
"Branch, Precompute",37970,37574,38942,38169,38332
Best,37886,37371,38909,37259,37258
Best with Double While,37442,37661,37947,37325,38231
FMA,36952,37473,37487,38066,37836



=== Section: Velocity (ordered by TC 0 desc) ===


Test Case,TC 0,TC 1,TC 2,TC 3,TC 4
Function,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Compiler Flags,39776,40413,38201,38228,38423
"Bitmasks, FMA, RSQRT",39753,39171,39206,39010,38919
"Branch, Precompute",39496,39282,40197,39893,39886
Best,39037,38948,38933,37667,38166
SIMD,38495,37982,38377,38469,38106
FMA,37984,38293,38862,38418,38012
Best with Double While,37969,37602,38454,37988,38645
Third SIMD impl.,37880,39136,38541,39230,38806



=== Section: Transform to World Frame (ordered by TC 0 desc) ===


Test Case,TC 0,TC 1,TC 2,TC 3,TC 4
Function,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"Branch, Precompute",40,39,39,40,40
Best,37,37,37,37,37
Best with Double While,37,37,37,37,38
Compiler Flags,37,37,37,37,37
SIMD,37,120,37,46,37
FMA,37,37,37,37,37
Third SIMD impl.,37,37,37,37,39
"Bitmasks, FMA, RSQRT",36,36,36,36,36


In [12]:

# Section‐level FlopsPerCycle: merge flops_df + sec_cycles
flops_sec = pd.merge(
    flops_df, sec_cycles, on=["Function", "Section", "Test Case"], how="inner"
)
flops_sec["FlopsPerCycle"] = flops_sec["Flops"] / flops_sec["Cycles"]
print("=== flops_sec (section-level FlopsPerCycle) ===")
display(flops_sec[["Function", "Section", "Test Case", "FlopsPerCycle"]])

=== flops_sec (section-level FlopsPerCycle) ===


Unnamed: 0,Function,Section,Test Case,FlopsPerCycle
0,Compiler Flags,collide_balls,TC 0,0.086781
1,Compiler Flags,Initialization,TC 0,0.000000
2,Compiler Flags,Impulse,TC 0,0.000000
3,Compiler Flags,Delta,TC 0,0.000000
4,Compiler Flags,Velocity,TC 0,0.000000
...,...,...,...,...
235,SIMD,Initialization,TC 4,0.000000
236,SIMD,Impulse,TC 4,0.000000
237,SIMD,Delta,TC 4,0.000000
238,SIMD,Velocity,TC 4,0.000000


In [13]:
cb_flops = flops_sec[flops_sec["Section"] == "collide_balls"]
overall_fp = cb_flops.groupby(["Function", "Test Case"], as_index=False)[
    ["Flops", "Cycles"]
].sum()
overall_fp["FlopsPerCycle"] = overall_fp["Flops"] / overall_fp["Cycles"]
print("=== overall_fp ===")
display(overall_fp)

=== overall_fp ===


Unnamed: 0,Function,Test Case,Flops,Cycles,FlopsPerCycle
0,Best,TC 0,101210,911011,0.111096
1,Best,TC 1,99208,911003,0.1089
2,Best,TC 2,101191,912397,0.110907
3,Best,TC 3,99488,901422,0.110368
4,Best,TC 4,99290,903766,0.109863
5,Best with Double While,TC 0,105234,894821,0.117603
6,Best with Double While,TC 1,103232,897928,0.114967
7,Best with Double While,TC 2,105215,902642,0.116563
8,Best with Double While,TC 3,103524,900089,0.115015
9,Best with Double While,TC 4,103318,909403,0.113611


In [14]:
tcs = sorted(
    overall_fp["Test Case"].unique(),
    key=lambda s: int(s.split()[-1])
)
fp_grid = (
    overall_fp
    .pivot(index="Function", columns="Test Case", values="FlopsPerCycle")
    .reindex(columns=tcs)
)

fp_grid = fp_grid.loc[
    fp_grid["TC 0"].sort_values(ascending=False).index
]

print("=== FlopsPerCycle (Function x Test Case), ordered by TC 0 desc ===")
display(fp_grid)

=== FlopsPerCycle (Function x Test Case), ordered by TC 0 desc ===


Test Case,TC 0,TC 1,TC 2,TC 3,TC 4
Function,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"Bitmasks, FMA, RSQRT",0.154957,0.155121,0.155737,0.155738,0.154852
Third SIMD impl.,0.131388,0.13012,0.131034,0.129516,0.130167
Best with Double While,0.117603,0.114967,0.116563,0.115015,0.113611
SIMD,0.113118,0.113492,0.113719,0.113307,0.114349
Best,0.111096,0.1089,0.110907,0.110368,0.109863
Compiler Flags,0.086781,0.084466,0.08753,0.085812,0.085281
FMA,0.07718,0.074963,0.07642,0.074801,0.074945
"Branch, Precompute",0.070707,0.0715,0.070469,0.071008,0.070844


In [27]:
# 1) Compute mean Cycles per (Function, Test Case, N)
mean_n = (
    bench_n_clean
    .groupby(["Function", "Test Case", "N"], as_index=False)["Cycles"]
    .mean()
    .round(0)
    .astype({"Cycles": int})
)

# 2) Unique, sorted Test Cases and N’s
tcs = sorted(mean_n["Test Case"].unique())   # numeric list like [0,1,2,3,4]
Ns  = sorted(mean_n["N"].unique())           # e.g. [10,100,1000,...]

# 3) Loop over each Test Case
for tc in tcs:
    print(f"\n=== Test Case: TC {tc} (sorted by N=1000 desc) ===")
    sub = mean_n[mean_n["Test Case"] == tc]
    
    # pivot: functions × N
    pivot = sub.pivot(index="Function", columns="N", values="Cycles")
    pivot = pivot.reindex(columns=Ns)  # ensure all N cols in ascending order
    
    # sort rows by the N=1000 column (or highest N if 1000 missing)
    sort_col = 1000 if 1000 in pivot.columns else max(pivot.columns)
    pivot = pivot.loc[pivot[sort_col].sort_values(ascending=False).index]
    
    display(pivot)


=== Test Case: TC 0 (sorted by N=1000 desc) ===


N,100,500,1000,2000,3000,4000,5000,6000,7000,8000,9000,10000
Function,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
SIMD,11337,55620,110995,221734,332617,443442,555142,666050,782308,893413,998329,1109528
FMA,8731,42237,84208,168189,252549,336286,419715,504038,587854,671347,762460,840538
"Bitmasks, FMA, RSQRT",8537,41561,82885,165549,248451,331129,413642,496269,579273,662130,744773,827094
Third SIMD impl.,8302,40073,78793,157302,235954,315020,393665,471942,550636,629237,708570,786650
"Branch, Precompute",7777,37578,74712,149282,223634,297994,372478,446858,521508,596112,671076,745123
Compiler Flags,7764,37517,74642,148868,223349,297660,371939,450679,520765,595678,670327,744625
Best with Double While,6406,30750,61268,122591,183394,244288,305608,371231,428900,489125,549423,610883
Best,5918,28509,56595,112976,169372,225736,282076,343166,400793,451491,508123,568885



=== Test Case: TC 1 (sorted by N=1000 desc) ===


N,100,500,1000,2000,3000,4000,5000,6000,7000,8000,9000,10000
Function,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
SIMD,11326,55642,111234,221590,332652,443106,553968,664914,776470,887568,997789,1111490
FMA,9107,44209,87951,175797,264027,352019,440376,531581,623352,706039,794796,883415
"Bitmasks, FMA, RSQRT",8536,41610,82854,165483,248020,330887,413845,495992,578420,661540,744371,835640
"Branch, Precompute",8349,40507,80029,159790,239254,319570,399190,478748,558786,638106,718400,798227
Compiler Flags,8337,40496,79965,159763,239497,319402,398853,478689,558670,638932,724435,798759
Third SIMD impl.,8151,40244,78783,157306,235847,314255,393654,471405,550249,628869,707740,786894
Best with Double While,5708,27485,54580,109551,164207,217096,271810,331194,383978,434208,488557,542927
Best,5744,27421,54380,108569,162916,216951,271227,325388,380856,436013,487976,542846



=== Test Case: TC 2 (sorted by N=1000 desc) ===


N,100,500,1000,2000,3000,4000,5000,6000,7000,8000,9000,10000
Function,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
SIMD,11326,55600,111014,221829,332728,443307,553933,665322,776156,893207,998673,1108388
FMA,8736,42197,84127,168433,251874,336468,419615,504604,587497,671529,755807,847258
"Bitmasks, FMA, RSQRT",8520,41684,82924,165341,248242,330758,413606,496725,579032,661406,744751,835725
Third SIMD impl.,8262,39889,78846,157248,235952,314428,393411,471412,550646,628737,707785,787286
Compiler Flags,7795,37517,74966,149204,223302,297570,371886,447085,520645,603262,669568,744010
"Branch, Precompute",7792,37576,74705,149212,223946,297989,372403,447232,521269,597567,676226,745046
Best with Double While,6446,30836,61427,123066,183385,248370,305604,370954,428648,494291,550466,611954
Best,5963,28499,56662,113040,169504,225800,282948,343012,396310,451707,507518,570445



=== Test Case: TC 3 (sorted by N=1000 desc) ===


N,100,500,1000,2000,3000,4000,5000,6000,7000,8000,9000,10000
Function,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
SIMD,11344,55735,111334,222343,333820,444688,556137,667343,785954,889405,1000810,1121914
FMA,9123,44256,88313,176223,264303,352793,441600,536874,625033,709072,797911,886986
"Bitmasks, FMA, RSQRT",8510,41652,83117,166020,248885,331652,415257,498483,581118,664020,746686,829908
Compiler Flags,8331,40679,80291,160303,240111,320010,400186,480423,559963,640326,720037,800876
"Branch, Precompute",8313,40255,80266,160189,240099,320488,400156,480168,560740,640015,720172,799921
Third SIMD impl.,8135,40060,79040,157979,236364,315380,394416,472833,551817,630513,709946,788138
Best with Double While,5712,27511,54632,109161,163433,217918,272111,326857,386115,435982,490775,545808
Best,5671,27408,54540,108848,163238,220716,271726,326519,382564,434846,489558,544045



=== Test Case: TC 4 (sorted by N=1000 desc) ===


N,100,500,1000,2000,3000,4000,5000,6000,7000,8000,9000,10000
Function,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
SIMD,11319,55646,111047,222021,336581,443741,554254,666640,776670,887944,999736,1110737
FMA,9087,44116,88115,176064,264142,352366,441181,529374,618185,706558,795381,884715
"Bitmasks, FMA, RSQRT",8537,41615,82973,165871,248395,331106,413836,496748,579525,662211,744650,827879
Compiler Flags,8307,41734,80287,159960,239670,319304,399930,478982,559056,639081,718709,798928
"Branch, Precompute",8300,40183,80064,159855,239694,319431,399090,479095,559141,638865,718574,799102
Third SIMD impl.,8282,39890,78858,157305,236141,314578,393674,471785,550854,629351,707672,787532
Best with Double While,5706,27391,54628,109303,163512,217292,271937,327504,386333,434344,488985,543290
Best,5694,27333,54449,108555,163454,216907,271151,325666,380890,435012,488090,542345


In [15]:
spmd_mean_cycles = (
    bench_spmd_clean.groupby("Function", as_index=False)["Cycles"]
    .mean()
    .rename(columns={"Cycles": "MeanCycles"})
)
sorted_spmd_mean_cycles = spmd_mean_cycles.sort_values(
    "MeanCycles", ascending=False
).reset_index(drop=True)
sorted_spmd_mean_cycles["MeanCycles"] = (
    sorted_spmd_mean_cycles["MeanCycles"].round(0).astype(int)
)
print("=== SPMD mean_cycles_all ===")
display(sorted_spmd_mean_cycles)

=== SPMD mean_cycles_all ===


Unnamed: 0,Function,MeanCycles
0,4x Basic Implementation,311456
1,4x Recip Sqrt Implementation,225398
2,SPMD Basic Implementation,107676
3,SPMD 2: FMA,103643
4,SPMD 3: Recip Sqrt,98095


In [16]:
# Cost of operations per Function
cost_ops = flops_spmd.groupby(["Function"], as_index=False)[
    ["ADDS", "MULS", "DIVS", "SQRT"]
].mean()
cost_ops[["ADDS", "MULS", "DIVS", "SQRT"]] = (
    cost_ops[["ADDS", "MULS", "DIVS", "SQRT"]].round(0).astype(int)
)
print("=== SPMD cost_ops ===")
display(cost_ops)

=== SPMD cost_ops ===


Unnamed: 0,Function,ADDS,MULS,DIVS,SQRT
0,4x Basic Implementation,26079,18750,5345,2006
1,4x Recip Sqrt Implementation,31414,32092,2007,1337
2,SPMD 2: FMA,41567,45612,13391,4021
3,SPMD 3: Recip Sqrt,42906,56322,5359,5360
4,SPMD Basic Implementation,41567,45611,13391,4021


In [17]:
# Mean cycles per section (Function x Section)
spmd_sec_cycles = prof_clean_spmd.groupby(["Function", "Section"], as_index=False)[
    "Cycles"
].mean()
spmd_sec_cycles["Cycles"] = spmd_sec_cycles["Cycles"].round(0).astype(int)
print("=== SPMD sec_cycles ===")
display(spmd_sec_cycles)

=== SPMD sec_cycles ===


Unnamed: 0,Function,Section,Cycles
0,4x Basic Implementation,Delta,153883
1,4x Basic Implementation,Impulse,159923
2,4x Basic Implementation,Initialization,460
3,4x Basic Implementation,Transform to World Frame,149
4,4x Basic Implementation,Velocity,152855
5,4x Basic Implementation,collide_balls,3628179
6,4x Recip Sqrt Implementation,Delta,148461
7,4x Recip Sqrt Implementation,Impulse,158452
8,4x Recip Sqrt Implementation,Initialization,165
9,4x Recip Sqrt Implementation,Transform to World Frame,171


In [18]:
# Section‐level FlopsPerCycle: merge flops_df + sec_cycles
flops_sec_spmd = pd.merge(
    flops_spmd, spmd_sec_cycles, on=["Function", "Section"], how="inner"
)
flops_sec_spmd["FlopsPerCycle"] = flops_sec_spmd["Flops"] / flops_sec_spmd["Cycles"]
print("=== SPMD flops_sec (section-level FlopsPerCycle) ===")
display(flops_sec_spmd[["Function", "Section", "FlopsPerCycle"]])

=== SPMD flops_sec (section-level FlopsPerCycle) ===


Unnamed: 0,Function,Section,FlopsPerCycle
0,4x Basic Implementation,collide_balls,0.086293
1,4x Basic Implementation,Initialization,0.0
2,4x Basic Implementation,Impulse,0.0
3,4x Basic Implementation,Delta,0.0
4,4x Basic Implementation,Velocity,0.0
5,4x Basic Implementation,Transform to World Frame,0.0
6,4x Recip Sqrt Implementation,collide_balls,0.110825
7,4x Recip Sqrt Implementation,Initialization,0.0
8,4x Recip Sqrt Implementation,Impulse,0.0
9,4x Recip Sqrt Implementation,Delta,0.0


In [19]:
# Overall FlopsPerCycle per Function
cb_flops_spmd = flops_sec_spmd[flops_sec_spmd["Section"] == "collide_balls"]
overall_fp_spmd = cb_flops_spmd.groupby(["Function"], as_index=False)[
    ["Flops", "Cycles"]
].sum()
overall_fp_spmd["FlopsPerCycle"] = overall_fp_spmd["Flops"] / overall_fp_spmd["Cycles"]
overall_fp_spmd = overall_fp_spmd.sort_values(
    "FlopsPerCycle", ascending=False
).reset_index(drop=True)
print("=== SPMD overall_fp ===")
display(overall_fp_spmd)

=== SPMD overall_fp ===


Unnamed: 0,Function,Flops,Cycles,FlopsPerCycle
0,SPMD 3: Recip Sqrt,329841,923034,0.357344
1,SPMD 2: FMA,313777,921278,0.340589
2,SPMD Basic Implementation,313773,934671,0.335704
3,4x Recip Sqrt Implementation,401097,3619177,0.110825
4,4x Basic Implementation,313085,3628179,0.086293


In [28]:
mean_n_spmd = (
    bench_n_spmd_clean
    .groupby(["Function", "N"], as_index=False)["Cycles"]
    .mean()
    .round(0)
    .astype({"Cycles": int})
)

Ns_spmd = sorted(mean_n_spmd["N"].unique())

pivot_spmd = (
    mean_n_spmd
    .pivot(index="Function", columns="N", values="Cycles")
    .reindex(columns=Ns_spmd)
)

sort_col_spmd = 1000 if 1000 in pivot_spmd.columns else max(pivot_spmd.columns)
pivot_spmd = pivot_spmd.loc[
    pivot_spmd[sort_col_spmd].sort_values(ascending=False).index
]

print("=== SPMD: mean Cycles per N (sorted by N=1000 desc) ===")
display(pivot_spmd)

=== SPMD: mean Cycles per N (sorted by N=1000 desc) ===


N,100,500,1000,2000,3000,4000,5000,6000,7000,8000,9000,10000
Function,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
4x Basic Implementation,313712,310489,309977,310147,310223,309893,310499,310244,310384,309947,309949,310285
4x Recip Sqrt Implementation,224708,222260,222091,226703,222891,222766,222323,221985,222245,222159,222255,222860
SPMD Basic Implementation,112104,106888,106335,106413,106487,106444,107370,106228,106375,106260,106192,106203
SPMD 2: FMA,101102,103770,100954,103574,104219,101536,103261,101193,101206,100971,100817,100948
SPMD 3: Recip Sqrt,97773,97528,97732,97498,97510,97634,97528,97555,97449,97527,97917,97568
