In [1]:
from pathlib import Path
import subprocess
from typing import Any, Dict, List

import fire
import pandas as pd
from scipy.stats import hmean
from hf_bench.benchmark import ResultsTableRow


def get_columns() -> List[str]:
    return ResultsTableRow.__annotations__.keys()


def list_tracked_files(dirpath: str) -> Dict[str, str]:
    # Run git ls-tree command and capture output
    cmd = ["git", "ls-tree", "-r", "HEAD", "--name-only", dirpath]
    result = subprocess.run(cmd, capture_output=True, text=True, check=True)
    # Split output into list of files
    files = result.stdout.strip().split("\n")
    # Filter out empty strings
    files = [f for f in files if f]
    return files


def get_df_concat(dirpath: str) -> pd.DataFrame:
    """
    Get a dataframe of all the results in the given directory.
    """
    filepaths = list_tracked_files(dirpath)
    print(f"Found {len(filepaths)} tracked files in {dirpath}.")
    df_first = pd.read_csv(filepaths[0])
    column_dtypes: Dict[str, Any] = {
        "submission_id": str,
        **{col: df_first[col].dtype for col in get_columns()},
    }
    columns = ["submission_id"] + list(get_columns())
    df = pd.DataFrame(columns=columns).astype(column_dtypes)
    for f in filepaths:
        submission_id: str = Path(f).parent.stem
        df_new = pd.read_csv(f)
        df_new["drafter"] = df_new["drafter"].fillna("No Drafter (Autoregressive)")
        df_new["submission_id"] = submission_id
        df_new = df_new[columns]
        df = pd.concat([df, df_new])
    df.sort_values(
        by=columns,
        inplace=True,
    )
    return df


def get_df_summary_of_results(df_concat: pd.DataFrame) -> pd.DataFrame:
    df_concat.reset_index(drop=True, inplace=True)
    columns_for_index: List[str] = [
        "target",
        "submission_id",
        "dataset_path",
        "drafter",
        "temperature",
    ]
    df_concat.set_index(columns_for_index, inplace=True)
    example_id_nunique = df_concat["example_id"].groupby(columns_for_index).nunique()
    df_summary = example_id_nunique.to_frame()
    df_summary.rename(columns={"example_id": "example_id_nunique"}, inplace=True)
    df_mean_vals = df_concat.groupby(columns_for_index)[["new_toks", "ttft_ms"]].mean()
    df_hmean_vals = df_concat.groupby(columns_for_index)[
        ["tpot_ms", "out_toks_per_sec"]
    ].agg(hmean)
    df_summary = pd.concat([df_summary, df_mean_vals, df_hmean_vals], axis=1)
    return df_summary


dirpath = "/home/projects/dharel/nadavt/repos/hf-bench/benchmark_results"
print("Concatenating all the results CSVs into one dataframe...")
df_concat: pd.DataFrame = get_df_concat(dirpath)
# df_concat.to_csv("results_all.csv", index=False)
print("Counting the number of unique example IDs for each experiment...")
df_summary: pd.DataFrame = get_df_summary_of_results(df_concat)
# df_summary.to_csv("results_summary.csv", index=True)
print(f"Stored both the concatenated dataframe and the summary in {dirpath}.")
df_summary

  from .autonotebook import tqdm as notebook_tqdm


Concatenating all the results CSVs into one dataframe...
Found 6 tracked files in /home/projects/dharel/nadavt/repos/hf-bench/benchmark_results.
Counting the number of unique example IDs for each experiment...
Stored both the concatenated dataframe and the summary in /home/projects/dharel/nadavt/repos/hf-bench/benchmark_results.


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,example_id_nunique,new_toks,ttft_ms,tpot_ms,out_toks_per_sec
target,submission_id,dataset_path,drafter,temperature,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,2025-02-04_01-05-29_4c55336,openai/openai_humaneval,No Drafter (Autoregressive),0,30,512.0,297.108555,122.585022,8.153429
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,2025-02-04_01-05-29_4c55336,openai/openai_humaneval,No Drafter (Autoregressive),1,30,512.0,244.010814,123.482814,8.096364
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,2025-02-04_01-05-29_4c55336,openai/openai_humaneval,bigcode/tiny_starcoder_py,0,30,512.0,265.932067,84.592127,11.751581
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,2025-02-04_01-05-29_4c55336,openai/openai_humaneval,bigcode/tiny_starcoder_py,1,30,512.0,258.679827,85.461729,11.655785
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,2025-02-04_01-05-29_4c55336,openai/openai_humaneval,codellama/CodeLlama-7b-Instruct-hf,0,30,512.0,428.676637,101.486216,9.642755
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,2025-02-04_01-05-29_4c55336,openai/openai_humaneval,codellama/CodeLlama-7b-Instruct-hf,1,30,511.866667,317.299342,87.200994,11.403309
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,2025-02-04_01-05-29_4c55336,openai/openai_humaneval,deepseek-ai/DeepSeek-R1-Distill-Llama-8B,0,30,512.0,353.519543,54.292916,18.315643
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,2025-02-04_01-05-29_4c55336,openai/openai_humaneval,deepseek-ai/DeepSeek-R1-Distill-Llama-8B,1,30,512.0,358.236814,53.126245,18.615135
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,2025-02-04_20-52-14_eb664c5,cnn_dailymail,No Drafter (Autoregressive),0,30,494.7,232.321461,77.447808,12.899085
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,2025-02-04_20-52-14_eb664c5,cnn_dailymail,No Drafter (Autoregressive),1,30,472.633333,232.607396,77.73758,12.851543


In [12]:
# 1. First get the autoregressive baseline values into a Series without the drafter level
mask_ar = df_summary.index.get_level_values("drafter") == "No Drafter (Autoregressive)"
ar_otps = df_summary[mask_ar]["out_toks_per_sec"]
ar_otps.index = ar_otps.index.droplevel("drafter")

# # 2. Then we can divide directly, but we need to keep out_toks_per_sec as a Series
# df_with_drafter = df_summary["out_toks_per_sec"].reset_index(level="drafter")["out_toks_per_sec"]
# speedup = df_with_drafter.div(ar_otps)

# # 3. Restore the original MultiIndex that includes the drafter level
# speedup.index = speedup.index.set_index("drafter", append=True)
# speedup

# 2. Then we can divide directly, but we need to keep out_toks_per_sec as a Series
df_with_drafter = df_summary["out_toks_per_sec"].reset_index(level="drafter")
speedup = df_with_drafter["out_toks_per_sec"].div(ar_otps)
# 3. Restore the drafter level to the index
speedup.index = df_with_drafter.set_index("drafter", append=True).index
speedup


target                                     submission_id                dataset_path             temperature  drafter                                  
deepseek-ai/DeepSeek-R1-Distill-Llama-70B  2025-02-04_01-05-29_4c55336  openai/openai_humaneval  0            No Drafter (Autoregressive)                  1.000000
                                                                                                 1            No Drafter (Autoregressive)                  1.441305
                                                                                                 0            bigcode/tiny_starcoder_py                    1.182663
                                                                                                 1            bigcode/tiny_starcoder_py                    2.246373
                                                                                                 0            codellama/CodeLlama-7b-Instruct-hf           1.000000
                            