As the number of groups increases, which algorithm scales best?

In [14]:
import pandas as pd

# Show every row and column when a DataFrame is rendered (no truncation).
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Step 1: Load the data
# Only the four columns used below are read. The filters keep rows where
# dist == "uniform", n_rows == 8_000_000 and attribute == "aggregation_time".
df = pd.read_parquet(
    "results/test.parquet",
    columns=["algorithm", "n_groups", "np", "value"],
    filters=[
        ("dist", "==", "uniform"),
        ("n_rows", "==", 8_000_000),
        ("attribute", "==", "aggregation_time"),
    ],
)

# Step 2: Preprocess
# "value" holds strings; strip a trailing "ms" (if present) and convert to float
# using vectorised string ops instead of a per-row Python lambda.
df["value"] = df["value"].str.removesuffix("ms").astype(float)
# Average the measurements for each (algorithm, n_groups, np) combination.
df = df.groupby(["algorithm", "n_groups", "np"])["value"].mean().reset_index()
df = df.rename(columns={"value": "latency"})

# Step 3: Add Speedup Column
# speedup = latency at np == 1 / latency of the row, within the same
# (algorithm, n_groups). Rows without an np == 1 counterpart get NaN (left join).
# NOTE(review): np is presumably the degree of parallelism -- confirm.
baseline = df[df["np"] == 1][["algorithm", "n_groups", "latency"]]
baseline = baseline.rename(columns={"latency": "baseline_latency"})
# validate= makes pandas raise if the right side ever has duplicate keys,
# which would otherwise silently multiply rows.
df = df.merge(
    baseline, on=["algorithm", "n_groups"], how="left", validate="many_to_one"
)
df["speedup"] = df["baseline_latency"] / df["latency"]
df = df.drop(columns=["baseline_latency"])

# Step 4: Add Slowdown Column
# slowdown = latency of the row / latency at the smallest n_groups,
# within the same (algorithm, np).
# idxmin() yields one row label per (algorithm, np) group: the row with the
# smallest n_groups.
min_group_latency = (
    df.loc[df.groupby(["algorithm", "np"])["n_groups"].idxmin()]
    [["algorithm", "np", "latency"]]
    .rename(columns={"latency": "min_group_latency"})
)

df = df.merge(
    min_group_latency, on=["algorithm", "np"], how="left", validate="many_to_one"
)
df["slowdown"] = df["latency"] / df["min_group_latency"]
df = df.drop(columns=["min_group_latency"])

# Step 5: Display
# One table per algorithm, restricted to the largest np present in the data.
# The np filter does not depend on the algorithm, so it is computed once.
max_np = df["np"].max()
df_at_max_np = df[df["np"] == max_np]
for algorithm in df["algorithm"].unique():
    display(df_at_max_np[df_at_max_np["algorithm"] == algorithm])


Unnamed: 0,algorithm,n_groups,np,latency,speedup,slowdown
4,duckdbish-two-phase,20000,16,15.8,5.278481,1.0
9,duckdbish-two-phase,200000,16,145.2,1.344353,9.189873
14,duckdbish-two-phase,2000000,16,414.8,1.438284,26.253165


Unnamed: 0,algorithm,n_groups,np,latency,speedup,slowdown
19,global-lock,20000,16,954.4,0.095977,1.0
24,global-lock,200000,16,1042.0,0.142418,1.091785
29,global-lock,2000000,16,3402.4,0.120503,3.564962


Unnamed: 0,algorithm,n_groups,np,latency,speedup,slowdown
34,implicit-repartitioning,20000,16,24.4,2.278689,1.0
39,implicit-repartitioning,200000,16,33.6,3.607143,1.377049
44,implicit-repartitioning,2000000,16,118.4,3.702703,4.852459


Unnamed: 0,algorithm,n_groups,np,latency,speedup,slowdown
49,lock-free-hash-table,20000,16,75.6,2.018519,1.0
54,lock-free-hash-table,200000,16,74.6,2.235925,0.986772
59,lock-free-hash-table,2000000,16,87.2,3.213303,1.153439


Unnamed: 0,algorithm,n_groups,np,latency,speedup,slowdown
64,three-phase-radix,20000,16,10.0,4.78,1.0
69,three-phase-radix,200000,16,161.4,0.644362,16.14
74,three-phase-radix,2000000,16,475.4,0.99411,47.54


Unnamed: 0,algorithm,n_groups,np,latency,speedup,slowdown
79,two-phase-central-merge,20000,16,9.2,5.108696,1.0
84,two-phase-central-merge,200000,16,131.8,0.772382,14.326087
89,two-phase-central-merge,2000000,16,672.6,0.58311,73.108696


Unnamed: 0,algorithm,n_groups,np,latency,speedup,slowdown
94,two-phase-central-merge-xxhash,20000,16,9.4,4.93617,1.0
99,two-phase-central-merge-xxhash,200000,16,139.0,0.751079,14.787234
104,two-phase-central-merge-xxhash,2000000,16,683.0,0.584773,72.659574


Unnamed: 0,algorithm,n_groups,np,latency,speedup,slowdown
109,two-phase-radix,20000,16,11.0,6.0,1.0
114,two-phase-radix,200000,16,134.8,0.94362,12.254545
119,two-phase-radix,2000000,16,319.2,1.814536,29.018182


Unnamed: 0,algorithm,n_groups,np,latency,speedup,slowdown
124,two-phase-radix-xxhash,20000,16,14.0,6.342857,1.0
129,two-phase-radix-xxhash,200000,16,98.4,1.784553,7.028571
134,two-phase-radix-xxhash,2000000,16,262.4,2.103659,18.742857
