In [100]:
import pandas as pd

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

In [101]:
from pathlib import Path
import subprocess
from typing import Any, Dict, List

import fire
import pandas as pd
from scipy.stats import hmean
from hf_bench.benchmark import ResultsTableRow


def get_columns() -> List[str]:
    return ResultsTableRow.__annotations__.keys()


def list_tracked_files(dirpath: str) -> List[str]:
    # Run git ls-tree command and capture output
    cmd = ["git", "ls-tree", "-r", "HEAD", "--name-only", dirpath]
    result = subprocess.run(cmd, capture_output=True, text=True, check=True)
    # Split output into list of files
    files = result.stdout.strip().split("\n")
    # Filter out empty strings
    files = [f for f in files if f]
    return files


def list_staged_files(dirpath: str) -> List[str]:
    cmd = ["git", "diff", "--name-only", "--cached", "HEAD", dirpath]
    result = subprocess.run(cmd, capture_output=True, text=True, check=True)
    files = result.stdout.strip().split("\n")
    files = [f for f in files if f]
    return files


def get_df_concat(dirpath: str) -> pd.DataFrame:
    """
    Get a dataframe of all the results in the given directory.
    """
    filepaths = list_tracked_files(dirpath)
    print(f"Found {len(filepaths)} tracked files in {dirpath}.")
    df_first = pd.read_csv(filepaths[0])
    column_dtypes: Dict[str, Any] = {
        "submission_id": str,
        **{col: df_first[col].dtype for col in get_columns()},
    }
    columns = ["submission_id"] + list(get_columns())
    df = pd.DataFrame(columns=columns).astype(column_dtypes)
    for f in filepaths:
        submission_id: str = Path(f).parent.stem
        df_new = pd.read_csv(f)
        df_new["drafter"] = df_new["drafter"].fillna("No Drafter (Autoregressive)")
        df_new["submission_id"] = submission_id
        df_new = df_new[columns]
        df = pd.concat([df, df_new])
    df.sort_values(
        by=columns,
        inplace=True,
    )
    return df


def get_df_concat_filtered(df_concat: pd.DataFrame, minimum_new_toks: int) -> pd.DataFrame:
    df_concat_filtered = df_concat.set_index(["target", "submission_id", "dataset_path", "dataset_name", "dataset_split"])
    df_low_new_toks = (df_concat_filtered[df_concat_filtered["new_toks"] < minimum_new_toks]
                   .set_index("example_id", append=True)
                   .sort_index())
    index_low_new_toks = df_low_new_toks.index.unique() # Multi-index (target, submission_id, dataset_path, dataset_name, dataset_split, example_id) for which new_toks < 64
    # Remove all the rows corresponding to these multi-indices from df_concat_filtered
    df_concat_filtered.set_index("example_id", inplace=True, append=True)
    df_concat_filtered = df_concat_filtered[~df_concat_filtered.index.isin(index_low_new_toks)]
    df_concat_filtered.reset_index(inplace=True)
    return df_concat_filtered


def get_df_summary_of_results(df_concat: pd.DataFrame) -> pd.DataFrame:
    df_concat.reset_index(drop=True, inplace=True)
    columns_for_index: List[str] = [
        "target",
        "submission_id",
        "dataset_path",
        "drafter",
        "temperature",
    ]
    df_concat.set_index(columns_for_index, inplace=True)
    example_id_nunique = df_concat["example_id"].groupby(columns_for_index).nunique()
    df_summary = example_id_nunique.to_frame()
    df_summary.rename(columns={"example_id": "example_id_nunique"}, inplace=True)
    df_mean_vals = df_concat.groupby(columns_for_index)[["new_toks", "ttft_ms"]].mean()
    df_hmean_vals = df_concat.groupby(columns_for_index)[
        ["tpot_ms", "out_toks_per_sec"]
    ].agg(hmean)
    df_summary = pd.concat([df_summary, df_mean_vals, df_hmean_vals], axis=1)
    # Add the speedups
    df_otps = df_summary[["out_toks_per_sec"]]
    df_otps.reset_index(level="drafter", inplace=True)
    mask_ar = df_otps["drafter"] == "No Drafter (Autoregressive)"
    df_ar_otps = df_otps[mask_ar]
    df_ar_otps.drop(columns=["drafter"], inplace=True)
    # Reset the index of both dataframes to make the division operation simpler
    df_otps_reset = df_otps.reset_index()
    df_ar_otps_reset = df_ar_otps.reset_index()
    # Merge the dataframes on the common index columns
    merge_cols = ["target", "dataset_path", "temperature", "submission_id"]
    df_merged = pd.merge(
        df_otps_reset, df_ar_otps_reset, on=merge_cols, suffixes=("", "_ar")
    )
    # Perform the division
    df_merged["speedup"] = (
        df_merged["out_toks_per_sec"] / df_merged["out_toks_per_sec_ar"]
    )
    # Set back the multi-index structure
    df_speedups = df_merged.set_index(merge_cols + ["drafter"])[["speedup"]]
    df_summary.reset_index(inplace=True)
    df_summary.set_index(
        ["target", "dataset_path", "drafter", "temperature", "submission_id"],
        inplace=True,
    )
    df_summary = df_summary.join(df_speedups)
    # Reorder the multi-index columns
    df_summary.reset_index(inplace=True)
    new_index = ["target", "dataset_path", "submission_id", "temperature", "drafter"]
    df_summary.set_index(new_index, inplace=True)
    df_summary.sort_index(level=new_index, inplace=True)
    return df_summary


def get_df_max_speedup(df_summary: pd.DataFrame) -> pd.DataFrame:
    df_summary.reset_index(inplace=True)
    df_max_speedup = df_summary.loc[
        df_summary.groupby(["target", "dataset_path", "submission_id", "temperature"])[
            "speedup"
        ].idxmax()
    ]
    df_max_speedup.rename(columns={"drafter": "drafter_of_max_speedup"}, inplace=True)
    df_max_speedup.set_index(
        [
            "target",
            "temperature",
            "dataset_path",
            "submission_id",
            "drafter_of_max_speedup",
        ],
        inplace=True,
    )
    df_max_speedup.sort_index(inplace=True)
    return df_max_speedup


dirpath = "/home/projects/dharel/nadavt/repos/hf-bench/benchmark_results"
print("Concatenating all the results CSVs into one dataframe...")
df_concat: pd.DataFrame = get_df_concat(dirpath)
# df_concat.to_csv("results_all.csv", index=False)

minimum_new_toks = 128
print(f"Filtering out experiments with less than {minimum_new_toks} new tokens...")
df_concat_filtered = get_df_concat_filtered(df_concat, minimum_new_toks)

print("Counting the number of unique example IDs for each experiment...")
df_summary: pd.DataFrame = get_df_summary_of_results(df_concat_filtered)
# Round the values to 1 decimal place
df_summary["new_toks"] = df_summary["new_toks"].round(1)
df_summary["ttft_ms"] = df_summary["ttft_ms"].round(1)
df_summary["tpot_ms"] = df_summary["tpot_ms"].round(1)
df_summary["out_toks_per_sec"] = df_summary["out_toks_per_sec"].round(1)
df_summary["speedup"] = df_summary["speedup"].round(2)
# df_summary.to_csv("results_summary.csv", index=True)

print("Getting the maximum speedup for each experiment...")
df_max_speedup: pd.DataFrame = get_df_max_speedup(df_summary.copy())
# df_max_speedup.to_csv("results_max_speedup.csv", index=True)

df_summary_deepseek_qwen_14b = df_summary[df_summary.index.get_level_values("target").str.startswith("deepseek-ai/DeepSeek-R1-Distill-Qwen-14B")]
df_summary_deepseek_qwen_14b_temperature_0 = df_summary_deepseek_qwen_14b[df_summary_deepseek_qwen_14b.index.get_level_values("temperature") == 0]

# print(f"Stored both the concatenated dataframe and the summary in {dirpath}.")
# print("Done!")

df_concat.head(5)

Concatenating all the results CSVs into one dataframe...
Found 48 tracked files in /home/projects/dharel/nadavt/repos/hf-bench/benchmark_results.
Filtering out experiments with less than 128 new tokens...
Counting the number of unique example IDs for each experiment...
Getting the maximum speedup for each experiment...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ar_otps.drop(columns=["drafter"], inplace=True)


Unnamed: 0,submission_id,target,dataset_path,dataset_name,dataset_split,num_of_examples,drafter,temperature,example_id,new_toks,ttft_ms,tpot_ms,out_toks_per_sec
0,2025-02-04_01-05-29_4c55336,deepseek-ai/DeepSeek-R1-Distill-Llama-70B,openai/openai_humaneval,openai_humaneval,test,30,No Drafter (Autoregressive),0,0,512,2016.254902,129.010985,7.751278
1,2025-02-04_01-05-29_4c55336,deepseek-ai/DeepSeek-R1-Distill-Llama-70B,openai/openai_humaneval,openai_humaneval,test,30,No Drafter (Autoregressive),0,1,512,236.413956,126.681199,7.893831
2,2025-02-04_01-05-29_4c55336,deepseek-ai/DeepSeek-R1-Distill-Llama-70B,openai/openai_humaneval,openai_humaneval,test,30,No Drafter (Autoregressive),0,2,512,215.317965,119.733867,8.351856
3,2025-02-04_01-05-29_4c55336,deepseek-ai/DeepSeek-R1-Distill-Llama-70B,openai/openai_humaneval,openai_humaneval,test,30,No Drafter (Autoregressive),0,3,512,306.704283,123.782173,8.078708
4,2025-02-04_01-05-29_4c55336,deepseek-ai/DeepSeek-R1-Distill-Llama-70B,openai/openai_humaneval,openai_humaneval,test,30,No Drafter (Autoregressive),0,4,512,296.703815,120.254724,8.315682


In [102]:
df_concat.describe()

Unnamed: 0,num_of_examples,temperature,example_id,new_toks,ttft_ms,tpot_ms,out_toks_per_sec
count,10650.0,10650.0,10650.0,10650.0,10650.0,10650.0,10650.0
mean,30.0,0.498592,14.5,393.104413,429.016698,54.531549,39.5517
std,0.0,0.500021,8.655848,153.698989,553.005558,44.652178,1563.060465
min,30.0,0.0,0.0,8.0,20.835876,0.006199,0.300873
25%,30.0,0.0,7.0,254.0,101.354957,32.555696,14.979037
50%,30.0,0.0,14.5,510.0,212.974191,47.632325,20.994146
75%,30.0,1.0,22.0,512.0,504.370451,66.759968,30.716591
max,30.0,1.0,29.0,512.0,3398.63801,3323.662519,161319.384615


In [103]:
df_concat_filtered.describe()

Unnamed: 0,example_id,num_of_examples,new_toks,ttft_ms,tpot_ms,out_toks_per_sec
count,8945.0,8945.0,8945.0,8945.0,8945.0,8945.0
mean,14.285187,30.0,432.308664,423.365627,53.57071,24.685422
std,8.691103,0.0,119.733231,553.718037,31.205392,13.158204
min,0.0,30.0,128.0,20.835876,10.357687,3.702622
25%,7.0,30.0,374.0,91.443539,32.012641,15.252676
50%,14.0,30.0,512.0,203.604698,47.616901,21.000947
75%,22.0,30.0,512.0,478.106737,65.562265,31.23766
max,29.0,30.0,512.0,3398.63801,270.078875,96.546653


In [104]:
df_summary.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,example_id_nunique,new_toks,ttft_ms,tpot_ms,out_toks_per_sec,speedup
target,dataset_path,submission_id,temperature,drafter,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,cnn_dailymail,2025-02-04_20-52-14_eb664c5,0,No Drafter (Autoregressive),30,494.7,232.3,77.4,12.9,1.0
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,cnn_dailymail,2025-02-04_20-52-14_eb664c5,0,deepseek-ai/DeepSeek-R1-Distill-Llama-8B,30,492.7,461.3,79.3,12.4,0.96
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,cnn_dailymail,2025-02-04_20-52-14_eb664c5,0,deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B,30,496.6,296.1,72.6,13.6,1.05
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,cnn_dailymail,2025-02-04_20-52-14_eb664c5,1,No Drafter (Autoregressive),30,472.6,232.6,77.7,12.9,1.0
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,cnn_dailymail,2025-02-04_20-52-14_eb664c5,1,deepseek-ai/DeepSeek-R1-Distill-Llama-8B,30,462.0,348.0,58.1,17.2,1.34


In [105]:
df_max_speedup.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,example_id_nunique,new_toks,ttft_ms,tpot_ms,out_toks_per_sec,speedup
target,temperature,dataset_path,submission_id,drafter_of_max_speedup,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,0,cnn_dailymail,2025-02-04_20-52-14_eb664c5,deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B,30,496.6,296.1,72.6,13.6,1.05
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,0,cnn_dailymail,2025-02-06_17-46-21_ab73cc7,meta-llama/Llama-3.2-1B,29,493.1,242.3,51.4,19.4,1.51
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,0,openai/openai_humaneval,2025-02-04_01-05-29_4c55336,deepseek-ai/DeepSeek-R1-Distill-Llama-8B,30,512.0,353.5,54.3,18.3,2.25
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,0,openai/openai_humaneval,2025-02-04_20-52-14_eb664c5,deepseek-ai/DeepSeek-R1-Distill-Llama-8B,30,512.0,222.6,52.8,18.8,1.43
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,0,openai/openai_humaneval,2025-02-06_16-28-26_ab73cc7,deepseek-ai/DeepSeek-R1-Distill-Llama-8B,30,512.0,224.3,52.8,18.9,1.44


In [106]:
mask_deepseek_target = df_max_speedup.index.get_level_values("target").str.startswith("deepseek")
df_deepseek = df_max_speedup[mask_deepseek_target]
df_deepseek

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,example_id_nunique,new_toks,ttft_ms,tpot_ms,out_toks_per_sec,speedup
target,temperature,dataset_path,submission_id,drafter_of_max_speedup,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,0,cnn_dailymail,2025-02-04_20-52-14_eb664c5,deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B,30,496.6,296.1,72.6,13.6,1.05
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,0,cnn_dailymail,2025-02-06_17-46-21_ab73cc7,meta-llama/Llama-3.2-1B,29,493.1,242.3,51.4,19.4,1.51
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,0,openai/openai_humaneval,2025-02-04_01-05-29_4c55336,deepseek-ai/DeepSeek-R1-Distill-Llama-8B,30,512.0,353.5,54.3,18.3,2.25
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,0,openai/openai_humaneval,2025-02-04_20-52-14_eb664c5,deepseek-ai/DeepSeek-R1-Distill-Llama-8B,30,512.0,222.6,52.8,18.8,1.43
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,0,openai/openai_humaneval,2025-02-06_16-28-26_ab73cc7,deepseek-ai/DeepSeek-R1-Distill-Llama-8B,30,512.0,224.3,52.8,18.9,1.44
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,0,tau/scrolls,2025-02-04_20-52-14_eb664c5,deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B,30,464.4,1890.9,85.8,11.3,1.47
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,1,cnn_dailymail,2025-02-04_20-52-14_eb664c5,deepseek-ai/DeepSeek-R1-Distill-Llama-8B,30,462.0,348.0,58.1,17.2,1.34
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,1,cnn_dailymail,2025-02-06_17-46-21_ab73cc7,meta-llama/Llama-3.2-1B,29,479.0,248.5,51.3,19.4,1.52
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,1,openai/openai_humaneval,2025-02-04_01-05-29_4c55336,deepseek-ai/DeepSeek-R1-Distill-Llama-8B,30,512.0,358.2,53.1,18.6,2.3
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,1,openai/openai_humaneval,2025-02-04_20-52-14_eb664c5,deepseek-ai/DeepSeek-R1-Distill-Llama-8B,30,512.0,182.2,44.4,22.4,1.71


In [107]:
df_summary_deepseek_qwen_14b_temperature_0

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,example_id_nunique,new_toks,ttft_ms,tpot_ms,out_toks_per_sec,speedup
target,dataset_path,submission_id,temperature,drafter,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
deepseek-ai/DeepSeek-R1-Distill-Qwen-14B,cnn_dailymail,2025-02-06_02-10-28_d8c80fe,0,No Drafter (Autoregressive),16,415.6,176.8,51.7,19.2,1.0
deepseek-ai/DeepSeek-R1-Distill-Qwen-14B,cnn_dailymail,2025-02-06_02-10-28_d8c80fe,0,deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B,16,415.6,287.5,69.9,14.1,0.73
deepseek-ai/DeepSeek-R1-Distill-Qwen-14B,cnn_dailymail,2025-02-06_02-10-28_d8c80fe,0,double7/vicuna-68m,16,414.9,243.0,36.2,27.4,1.43
deepseek-ai/DeepSeek-R1-Distill-Qwen-14B,cnn_dailymail,2025-02-06_06-54-21_ab73cc7,0,No Drafter (Autoregressive),18,379.2,66.9,33.7,29.7,1.0
deepseek-ai/DeepSeek-R1-Distill-Qwen-14B,cnn_dailymail,2025-02-06_06-54-21_ab73cc7,0,deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B,18,386.3,114.1,51.0,19.4,0.65
deepseek-ai/DeepSeek-R1-Distill-Qwen-14B,cnn_dailymail,2025-02-06_06-54-21_ab73cc7,0,double7/vicuna-68m,18,379.2,91.0,26.4,37.6,1.27
deepseek-ai/DeepSeek-R1-Distill-Qwen-14B,openai/openai_humaneval,2025-02-06_02-10-28_d8c80fe,0,No Drafter (Autoregressive),30,451.6,91.2,48.7,20.4,1.0
deepseek-ai/DeepSeek-R1-Distill-Qwen-14B,openai/openai_humaneval,2025-02-06_02-10-28_d8c80fe,0,bigcode/tiny_starcoder_py,30,454.2,113.5,43.9,22.4,1.1
deepseek-ai/DeepSeek-R1-Distill-Qwen-14B,openai/openai_humaneval,2025-02-06_02-10-28_d8c80fe,0,codellama/CodeLlama-7b-Instruct-hf,30,455.6,257.3,77.9,12.3,0.61
deepseek-ai/DeepSeek-R1-Distill-Qwen-14B,openai/openai_humaneval,2025-02-06_02-10-28_d8c80fe,0,deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B,30,452.8,292.5,70.4,13.7,0.67


In [108]:
def get_df_summary_for_latex_strings(df: pd.DataFrame, columns_to_drop: List[str] = []) -> pd.DataFrame:
    df = df.reset_index()
    df.drop(columns=columns_to_drop, inplace=True)
    # Split the columns by "/" and take the last element
    columns_to_split = ["dataset_path", "drafter", "target"]
    for col in columns_to_split:
        if col in df.columns:
            df[col] = df[col].str.split("/").str[-1]
    columns_to_rename = {
        "submission_id": "Hardware", "dataset_path": "Dataset", "ttft_ms": "TTFT (ms)", "tpot_ms": "TPOT (ms)", "out_toks_per_sec": "T/s",
         **{col: col.capitalize() for col in ["target", "temperature", "method", "drafter", "speedup"]}
    }
    df.rename(columns=columns_to_rename, inplace=True)
    columns_to_drop_rename = [columns_to_rename.get(col, col) for col in columns_to_drop]
    cols_to_index = [col for col in ["Temperature", "Target", "Dataset", "Hardware", "Method", "Drafter"] if col in df.columns and col not in columns_to_drop_rename]
    df.set_index(cols_to_index, inplace=True)
    return df

columns_to_drop = ["target", "temperature", "example_id_nunique", "new_toks"]
df_summary_deepseek_qwen_14b_temperature_0 = get_df_summary_for_latex_strings(df_summary_deepseek_qwen_14b_temperature_0.copy(), columns_to_drop)
df_summary_deepseek_qwen_14b_temperature_0

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,TTFT (ms),TPOT (ms),T/s,Speedup
Dataset,Hardware,Drafter,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
cnn_dailymail,2025-02-06_02-10-28_d8c80fe,No Drafter (Autoregressive),176.8,51.7,19.2,1.0
cnn_dailymail,2025-02-06_02-10-28_d8c80fe,DeepSeek-R1-Distill-Qwen-1.5B,287.5,69.9,14.1,0.73
cnn_dailymail,2025-02-06_02-10-28_d8c80fe,vicuna-68m,243.0,36.2,27.4,1.43
cnn_dailymail,2025-02-06_06-54-21_ab73cc7,No Drafter (Autoregressive),66.9,33.7,29.7,1.0
cnn_dailymail,2025-02-06_06-54-21_ab73cc7,DeepSeek-R1-Distill-Qwen-1.5B,114.1,51.0,19.4,0.65
cnn_dailymail,2025-02-06_06-54-21_ab73cc7,vicuna-68m,91.0,26.4,37.6,1.27
openai_humaneval,2025-02-06_02-10-28_d8c80fe,No Drafter (Autoregressive),91.2,48.7,20.4,1.0
openai_humaneval,2025-02-06_02-10-28_d8c80fe,tiny_starcoder_py,113.5,43.9,22.4,1.1
openai_humaneval,2025-02-06_02-10-28_d8c80fe,CodeLlama-7b-Instruct-hf,257.3,77.9,12.3,0.61
openai_humaneval,2025-02-06_02-10-28_d8c80fe,DeepSeek-R1-Distill-Qwen-1.5B,292.5,70.4,13.7,0.67


In [None]:
def get_df_summary_sort_by_speedup(df_summary: pd.DataFrame, columns_to_group_by: List[str]) -> pd.DataFrame:
    # Get maximum speedup for each combination of grouping columns
    max_speedups = df_summary.groupby(columns_to_group_by)['Speedup'].max()
    
    # Sort hierarchically from outer to inner columns
    sorted_index = []
    
    def sort_level(current_index: tuple, level: int = 0):
        if level == len(columns_to_group_by):
            # Get all rows matching the current grouping columns
            mask = pd.Series(True, index=df_summary.index)
            for i, val in enumerate(current_index):
                mask &= df_summary.index.get_level_values(i) == val
            sorted_index.extend(df_summary[mask].index)
            return
            
        # Get all values at current level that match the current_index
        if level == 0:
            current_max_speedups = max_speedups
        else:
            # Create a boolean mask for all previous levels
            mask = pd.Series(True, index=max_speedups.index)
            for i, val in enumerate(current_index):
                mask &= max_speedups.index.get_level_values(i) == val
            current_max_speedups = max_speedups[mask]
            
        # Get unique values at current level and sort by their maximum speedups
        current_level_values = current_max_speedups.index.get_level_values(level).unique()
        level_speedups = {val: current_max_speedups[current_max_speedups.index.get_level_values(level) == val].max() 
                         for val in current_level_values}
        sorted_level_values = sorted(level_speedups.keys(), key=lambda x: level_speedups[x], reverse=True)
        
        # Recurse to next level for each sorted value
        for value in sorted_level_values:
            new_index = current_index + (value,) if level > 0 else (value,)
            sort_level(new_index, level + 1)
    
    # Start recursive sorting from the outermost level
    sort_level(())
    
    # Reindex the dataframe with the sorted index
    df_summary = df_summary.reindex(dict.fromkeys(sorted_index))  # dict.fromkeys preserves order while removing duplicates
    return df_summary

df_summary_deepseek_qwen_14b_temperature_0 = get_df_summary_sort_by_speedup(df_summary_deepseek_qwen_14b_temperature_0, columns_to_group_by=["Dataset", "Hardware"])
df_summary_deepseek_qwen_14b_temperature_0

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,TTFT (ms),TPOT (ms),T/s,Speedup
Dataset,Hardware,Drafter,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
scrolls,2025-02-06_02-10-28_d8c80fe,No Drafter (Autoregressive),1481.0,87.5,10.9,1.0
scrolls,2025-02-06_02-10-28_d8c80fe,DeepSeek-R1-Distill-Qwen-1.5B,1665.4,59.1,16.0,1.48
scrolls,2025-02-06_02-10-28_d8c80fe,vicuna-68m,1566.8,56.0,17.3,1.59
scrolls,2025-02-06_06-54-21_ab73cc7,No Drafter (Autoregressive),464.2,52.1,18.4,1.0
scrolls,2025-02-06_06-54-21_ab73cc7,DeepSeek-R1-Distill-Qwen-1.5B,533.0,48.7,19.8,1.08
scrolls,2025-02-06_06-54-21_ab73cc7,vicuna-68m,484.6,41.4,23.3,1.27
cnn_dailymail,2025-02-06_02-10-28_d8c80fe,No Drafter (Autoregressive),176.8,51.7,19.2,1.0
cnn_dailymail,2025-02-06_02-10-28_d8c80fe,DeepSeek-R1-Distill-Qwen-1.5B,287.5,69.9,14.1,0.73
cnn_dailymail,2025-02-06_02-10-28_d8c80fe,vicuna-68m,243.0,36.2,27.4,1.43
cnn_dailymail,2025-02-06_06-54-21_ab73cc7,No Drafter (Autoregressive),66.9,33.7,29.7,1.0


In [110]:
def get_df_summary_for_latex_floats(df_summary: pd.DataFrame, columns_to_bold_max: List[str]) -> pd.DataFrame:
    """
    For each column in the dataframe, find the index of the maximum or minimum value.
    If the column is in the list of columns to bold, bold the maximum value, otherwise bold the minimum value.
    """
    df_summary["Speedup"] = df_summary["Speedup"].round(2)
    cols_to_groupby: List[str] = df_summary.index.names.difference(["Drafter"])
    for col in df_summary.columns:
        g = df_summary.groupby(cols_to_groupby)[col]
        index_to_bold = g.idxmax() if col in columns_to_bold_max else g.idxmin()
        df_summary[col] = df_summary[col].astype(str)
        df_summary.loc[index_to_bold, col] = df_summary.loc[index_to_bold, col].apply(lambda x: "\\textbf{" + x + "}")
    return df_summary

# For each multi-index (Dataset, Hardware), find the maximum Speedup
columns_to_bold_max = ["T/s", "Speedup"]
df_summary_deepseek_qwen_14b_temperature_0 = get_df_summary_for_latex_floats(df_summary_deepseek_qwen_14b_temperature_0, columns_to_bold_max)
df_summary_deepseek_qwen_14b_temperature_0

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,TTFT (ms),TPOT (ms),T/s,Speedup
Dataset,Hardware,Drafter,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
scrolls,2025-02-06_02-10-28_d8c80fe,No Drafter (Autoregressive),\textbf{1481.0},87.5,10.9,1.0
scrolls,2025-02-06_02-10-28_d8c80fe,DeepSeek-R1-Distill-Qwen-1.5B,1665.4,59.1,16.0,1.48
scrolls,2025-02-06_02-10-28_d8c80fe,vicuna-68m,1566.8,\textbf{56.0},\textbf{17.3},\textbf{1.59}
scrolls,2025-02-06_06-54-21_ab73cc7,No Drafter (Autoregressive),\textbf{464.2},52.1,18.4,1.0
scrolls,2025-02-06_06-54-21_ab73cc7,DeepSeek-R1-Distill-Qwen-1.5B,533.0,48.7,19.8,1.08
scrolls,2025-02-06_06-54-21_ab73cc7,vicuna-68m,484.6,\textbf{41.4},\textbf{23.3},\textbf{1.27}
cnn_dailymail,2025-02-06_02-10-28_d8c80fe,No Drafter (Autoregressive),\textbf{176.8},51.7,19.2,1.0
cnn_dailymail,2025-02-06_02-10-28_d8c80fe,DeepSeek-R1-Distill-Qwen-1.5B,287.5,69.9,14.1,0.73
cnn_dailymail,2025-02-06_02-10-28_d8c80fe,vicuna-68m,243.0,\textbf{36.2},\textbf{27.4},\textbf{1.43}
cnn_dailymail,2025-02-06_06-54-21_ab73cc7,No Drafter (Autoregressive),\textbf{66.9},33.7,29.7,1.0


In [111]:
import re


def get_latex_table(df: pd.DataFrame) -> str:
    table: str = df.to_latex(escape=True)
    table = re.sub(r'\\textbackslash textbf\\\{([\d.]+)\\\}', r'\\textbf{\1}', table)
    return table

print(get_latex_table(df_summary_deepseek_qwen_14b_temperature_0))

\begin{tabular}{lllllll}
\toprule
 &  &  & TTFT (ms) & TPOT (ms) & T/s & Speedup \\
Dataset & Hardware & Drafter &  &  &  &  \\
\midrule
\multirow[t]{6}{*}{scrolls} & \multirow[t]{3}{*}{2025-02-06\_02-10-28\_d8c80fe} & No Drafter (Autoregressive) & \textbf{1481.0} & 87.5 & 10.9 & 1.0 \\
 &  & DeepSeek-R1-Distill-Qwen-1.5B & 1665.4 & 59.1 & 16.0 & 1.48 \\
 &  & vicuna-68m & 1566.8 & \textbf{56.0} & \textbf{17.3} & \textbf{1.59} \\
\cline{2-7}
 & \multirow[t]{3}{*}{2025-02-06\_06-54-21\_ab73cc7} & No Drafter (Autoregressive) & \textbf{464.2} & 52.1 & 18.4 & 1.0 \\
 &  & DeepSeek-R1-Distill-Qwen-1.5B & 533.0 & 48.7 & 19.8 & 1.08 \\
 &  & vicuna-68m & 484.6 & \textbf{41.4} & \textbf{23.3} & \textbf{1.27} \\
\cline{1-7} \cline{2-7}
\multirow[t]{6}{*}{cnn\_dailymail} & \multirow[t]{3}{*}{2025-02-06\_02-10-28\_d8c80fe} & No Drafter (Autoregressive) & \textbf{176.8} & 51.7 & 19.2 & 1.0 \\
 &  & DeepSeek-R1-Distill-Qwen-1.5B & 287.5 & 69.9 & 14.1 & 0.73 \\
 &  & vicuna-68m & 243.0 & \textbf{36.

In [112]:
def is_method_sd(target: str, drafter: str) -> str:
    target = target.lower()
    drafter = drafter.lower()
    family = target.split("-")[0]
    if "phi" in family:
        return False
    if "deepseek" in family:
        if "llama" in target and "llama" in drafter:
            return True
        if "qwen" in target and "qwen" in drafter:
            return True
        return False
    if family in drafter:
        return True
    return False

def get_method_name(target: str, drafter: str, temperature: float) -> str:
    if drafter == "No Drafter (Autoregressive)":
        return "AR"
    if is_method_sd(target, drafter):
        return "SD"
    if temperature == 0:
        return "SLEM"
    return "TLI"


def enrich_df_summary_with_method_name(df_summary: pd.DataFrame) -> pd.DataFrame:
    # We will insert the "Method" column between `left_index_cols` and `right_index_cols`
    cols_index_right = ["drafter"]
    cols_index_left = df_summary.index.names.difference(cols_index_right)
    df_summary.reset_index(inplace=True)
    df_summary["method"] = df_summary.apply(lambda x: get_method_name(x["target"], x["drafter"], x["temperature"]), axis=1)
    df_summary.set_index(cols_index_left + ["method"] + cols_index_right, inplace=True)
    return df_summary


df_summary = enrich_df_summary_with_method_name(df_summary)
df_summary.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,example_id_nunique,new_toks,ttft_ms,tpot_ms,out_toks_per_sec,speedup
target,dataset_path,submission_id,temperature,method,drafter,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,cnn_dailymail,2025-02-04_20-52-14_eb664c5,0,AR,No Drafter (Autoregressive),30,494.7,232.3,77.4,12.9,1.0
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,cnn_dailymail,2025-02-04_20-52-14_eb664c5,0,SD,deepseek-ai/DeepSeek-R1-Distill-Llama-8B,30,492.7,461.3,79.3,12.4,0.96
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,cnn_dailymail,2025-02-04_20-52-14_eb664c5,0,SLEM,deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B,30,496.6,296.1,72.6,13.6,1.05
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,cnn_dailymail,2025-02-04_20-52-14_eb664c5,1,AR,No Drafter (Autoregressive),30,472.6,232.6,77.7,12.9,1.0
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,cnn_dailymail,2025-02-04_20-52-14_eb664c5,1,SD,deepseek-ai/DeepSeek-R1-Distill-Llama-8B,30,462.0,348.0,58.1,17.2,1.34
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,cnn_dailymail,2025-02-04_20-52-14_eb664c5,1,TLI,deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B,30,496.0,329.0,59.3,16.8,1.3
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,cnn_dailymail,2025-02-06_17-46-21_ab73cc7,0,AR,No Drafter (Autoregressive),29,494.1,231.2,77.8,12.8,1.0
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,cnn_dailymail,2025-02-06_17-46-21_ab73cc7,0,SD,deepseek-ai/DeepSeek-R1-Distill-Llama-8B,29,492.0,452.5,78.6,12.5,0.98
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,cnn_dailymail,2025-02-06_17-46-21_ab73cc7,0,SLEM,deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B,29,496.0,296.8,72.3,13.6,1.06
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,cnn_dailymail,2025-02-06_17-46-21_ab73cc7,0,SLEM,double7/vicuna-68m,29,496.0,263.8,51.5,19.3,1.5


In [113]:
df_summary_for_latex = get_df_summary_for_latex_strings(df_summary, columns_to_drop=["example_id_nunique", "new_toks"])
df_summary_for_latex.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,TTFT (ms),TPOT (ms),T/s,Speedup
Temperature,Target,Dataset,Hardware,Method,Drafter,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,DeepSeek-R1-Distill-Llama-70B,cnn_dailymail,2025-02-04_20-52-14_eb664c5,AR,No Drafter (Autoregressive),232.3,77.4,12.9,1.0
0,DeepSeek-R1-Distill-Llama-70B,cnn_dailymail,2025-02-04_20-52-14_eb664c5,SD,DeepSeek-R1-Distill-Llama-8B,461.3,79.3,12.4,0.96
0,DeepSeek-R1-Distill-Llama-70B,cnn_dailymail,2025-02-04_20-52-14_eb664c5,SLEM,DeepSeek-R1-Distill-Qwen-1.5B,296.1,72.6,13.6,1.05
1,DeepSeek-R1-Distill-Llama-70B,cnn_dailymail,2025-02-04_20-52-14_eb664c5,AR,No Drafter (Autoregressive),232.6,77.7,12.9,1.0
1,DeepSeek-R1-Distill-Llama-70B,cnn_dailymail,2025-02-04_20-52-14_eb664c5,SD,DeepSeek-R1-Distill-Llama-8B,348.0,58.1,17.2,1.34
1,DeepSeek-R1-Distill-Llama-70B,cnn_dailymail,2025-02-04_20-52-14_eb664c5,TLI,DeepSeek-R1-Distill-Qwen-1.5B,329.0,59.3,16.8,1.3
0,DeepSeek-R1-Distill-Llama-70B,cnn_dailymail,2025-02-06_17-46-21_ab73cc7,AR,No Drafter (Autoregressive),231.2,77.8,12.8,1.0
0,DeepSeek-R1-Distill-Llama-70B,cnn_dailymail,2025-02-06_17-46-21_ab73cc7,SD,DeepSeek-R1-Distill-Llama-8B,452.5,78.6,12.5,0.98
0,DeepSeek-R1-Distill-Llama-70B,cnn_dailymail,2025-02-06_17-46-21_ab73cc7,SLEM,DeepSeek-R1-Distill-Qwen-1.5B,296.8,72.3,13.6,1.06
0,DeepSeek-R1-Distill-Llama-70B,cnn_dailymail,2025-02-06_17-46-21_ab73cc7,SLEM,vicuna-68m,263.8,51.5,19.3,1.5


In [114]:
df_summary_for_latex = get_df_summary_sort_by_speedup(df_summary_for_latex, columns_to_group_by=["Temperature", "Target", "Dataset", "Hardware"])
df_summary_for_latex.head(40)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,TTFT (ms),TPOT (ms),T/s,Speedup
Temperature,Target,Dataset,Hardware,Method,Drafter,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,gemma-2-9b-it,scrolls,2025-02-06_06-38-32_e14af1d,AR,No Drafter (Autoregressive),584.0,95.3,9.9,1.0
0,gemma-2-9b-it,scrolls,2025-02-06_06-38-32_e14af1d,SLEM,vicuna-68m,592.4,48.3,18.6,1.87
0,gemma-2-9b-it,scrolls,2025-02-06_06-38-32_e14af1d,SD,gemma-2-2b-it,739.0,31.3,30.2,3.05
0,gemma-2-9b-it,openai_humaneval,2025-02-06_06-38-32_e14af1d,AR,No Drafter (Autoregressive),42.6,37.0,27.0,1.0
0,gemma-2-9b-it,openai_humaneval,2025-02-06_06-38-32_e14af1d,SLEM,vicuna-68m,51.6,24.5,40.2,1.49
0,gemma-2-9b-it,openai_humaneval,2025-02-06_06-38-32_e14af1d,SD,gemma-2-2b-it,446.5,29.1,33.2,1.23
0,gemma-2-9b-it,cnn_dailymail,2025-02-06_06-38-32_e14af1d,AR,No Drafter (Autoregressive),73.7,37.3,26.7,1.0
0,gemma-2-9b-it,cnn_dailymail,2025-02-06_06-38-32_e14af1d,SLEM,vicuna-68m,83.9,26.9,37.1,1.39
0,gemma-2-9b-it,cnn_dailymail,2025-02-06_06-38-32_e14af1d,SD,gemma-2-2b-it,125.4,39.7,24.6,0.92
0,Llama-3.1-70B-Instruct,scrolls,2025-02-06_06-38-19_e14af1d,AR,No Drafter (Autoregressive),1804.6,125.3,7.8,1.0


In [115]:
get_df_summary_for_latex_floats(df_summary_for_latex, columns_to_bold_max)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,TTFT (ms),TPOT (ms),T/s,Speedup
Temperature,Target,Dataset,Hardware,Method,Drafter,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,gemma-2-9b-it,scrolls,2025-02-06_06-38-32_e14af1d,AR,No Drafter (Autoregressive),\textbf{584.0},\textbf{95.3},\textbf{9.9},\textbf{1.0}
0,gemma-2-9b-it,scrolls,2025-02-06_06-38-32_e14af1d,SLEM,vicuna-68m,\textbf{592.4},\textbf{48.3},\textbf{18.6},\textbf{1.87}
0,gemma-2-9b-it,scrolls,2025-02-06_06-38-32_e14af1d,SD,gemma-2-2b-it,\textbf{739.0},\textbf{31.3},\textbf{30.2},\textbf{3.05}
0,gemma-2-9b-it,openai_humaneval,2025-02-06_06-38-32_e14af1d,AR,No Drafter (Autoregressive),\textbf{42.6},\textbf{37.0},\textbf{27.0},\textbf{1.0}
0,gemma-2-9b-it,openai_humaneval,2025-02-06_06-38-32_e14af1d,SLEM,vicuna-68m,\textbf{51.6},\textbf{24.5},\textbf{40.2},\textbf{1.49}
0,gemma-2-9b-it,openai_humaneval,2025-02-06_06-38-32_e14af1d,SD,gemma-2-2b-it,\textbf{446.5},\textbf{29.1},\textbf{33.2},\textbf{1.23}
0,gemma-2-9b-it,cnn_dailymail,2025-02-06_06-38-32_e14af1d,AR,No Drafter (Autoregressive),\textbf{73.7},\textbf{37.3},\textbf{26.7},\textbf{1.0}
0,gemma-2-9b-it,cnn_dailymail,2025-02-06_06-38-32_e14af1d,SLEM,vicuna-68m,\textbf{83.9},\textbf{26.9},\textbf{37.1},\textbf{1.39}
0,gemma-2-9b-it,cnn_dailymail,2025-02-06_06-38-32_e14af1d,SD,gemma-2-2b-it,\textbf{125.4},\textbf{39.7},\textbf{24.6},\textbf{0.92}
0,Llama-3.1-70B-Instruct,scrolls,2025-02-06_06-38-19_e14af1d,AR,No Drafter (Autoregressive),\textbf{1804.6},\textbf{125.3},\textbf{7.8},\textbf{1.0}
