In [16]:
from collections import Counter
import warnings
import pandas as pd
from hf_bench.summarize_results import list_tracked_files, get_columns


# Validate one benchmark-results CSV: first its schema, then how completely the
# example IDs cover every (target, dataset_path, drafter, temperature) experiment.
filepaths = list_tracked_files("benchmark_results")
expected_columns = get_columns()
print(f"{expected_columns=}")
# Files in which some (target, dataset_path, drafter, temperature) group has fewer unique example IDs than other groups
benign_missing_example_ids_compared_to_others = []
# Files in which some (target, dataset_path, drafter, temperature) group has fewer unique example IDs than declared in the num_of_examples column
benign_missing_example_ids_compared_to_declared = []
# Files in which some example ID has fewer rows than expected, where the expected number of rows is the
# Cartesian product of the numbers of unique targets, dataset paths, drafters and temperatures
catastrophic_missing_example_ids_compared_to_others = []
# NOTE: the loop over all tracked files is currently disabled; a single file is inspected instead.
# for f in filepaths:
f = "benchmark_results/2025-02-04_01-05-29_4c55336/deepseek-ai-DeepSeek-R1-Distill-Llama-70B_openai-openai_humaneval_openai_humaneval_test_30_deepseek-ai-DeepSeek-R1-Distill-Llama-8B-codellama-CodeLlama-7b-Instruct-hf-bigcode-tiny_starcoder_py.csv"
df = pd.read_csv(f)
# Every expected column must be present exactly once
col_counter = Counter(df.columns)
for col in expected_columns:
    assert col_counter[col] == 1, f"Column {col} is missing in the dataframe or appears multiple times.\nFilepath: {f}"
# Label the rows that have no drafter BEFORE any grouping. `groupby` drops NaN keys by default,
# so grouping on the raw column would silently exclude these rows from the checks below.
df["drafter"] = df["drafter"].fillna("No Drafter (Autoregressive)")
# Check that every experiment covers the same number of unique example IDs
columns_for_index = ["target", "dataset_path", "drafter", "temperature"]
df_example_ids_nunique = df.groupby(columns_for_index)["example_id"].nunique()
if df_example_ids_nunique.min() != df_example_ids_nunique.max():
    print(f"File {f} has missing example IDs (example IDs do not appear the same number of times).")
    benign_missing_example_ids_compared_to_others.append(f)
# Check that the smallest number of unique example IDs matches the declared num_of_examples
expected_count = df["num_of_examples"].max()
if df_example_ids_nunique.min() != expected_count:
    print(f"File {f} has wrong number of example IDs (the number of unique example IDs does not match the num_of_examples column).")
    benign_missing_example_ids_compared_to_declared.append(f)
# Calculate the expected number of times each example ID should appear
# This is the Cartesian product of the number of unique (target, dataset_path, drafter, temperature)
expected_num_of_rows_per_example_id: int = df["target"].nunique() * df["dataset_path"].nunique() * df["drafter"].nunique() * df["temperature"].nunique()
print(f"{expected_num_of_rows_per_example_id}={df['target'].nunique()}*{df['dataset_path'].nunique()}*{df['drafter'].nunique()}*{df['temperature'].nunique()}")
# Count the rows of each example ID: `count()` gives the non-null count per index column,
# and the minimum across those columns is compared against the expected number of rows
df_example_ids_count = df[columns_for_index + ["example_id"]].groupby("example_id").count().min(axis=1)
# All the indices for which the value is less than expected_num_of_rows_per_example_id
mask = df_example_ids_count < expected_num_of_rows_per_example_id
catastrophic_missing_example_ids = df_example_ids_count[mask].index.get_level_values("example_id")
catastrophic_missing_example_ids
# if df_example_ids_count.min() != expected_num_of_rows_per_example_id:
#     # Find the example IDs for which (target, dataset_path, drafter, temperature) have less rows than expected
#     mask = df_example_ids_count < expected_num_of_rows_per_example_id
#     catastrophic_missing_example_ids = df_example_ids_count[mask].index.get_level_values("example_id")
#     print(f"File {f} has catastrophic missing example IDs. The following example IDs do not repeat the expected number of times, which is {expected_num_of_rows_per_example_id}={df['target'].nunique()}*{df['dataset_path'].nunique()}*{df['drafter'].nunique()}*{df['temperature'].nunique()}:\n{catastrophic_missing_example_ids.to_list()}")
#     catastrophic_missing_example_ids_compared_to_others.append(f)
# if benign_missing_example_ids_compared_to_others:
#     warnings.warn("Some example IDs do not appear the same number of times in the following files:\n" + "\n".join(benign_missing_example_ids_compared_to_others))
# if benign_missing_example_ids_compared_to_declared:
#     warnings.warn("Some example IDs appear only %d times in the dataframe although they should appear %d times according to the num_of_examples column.\nFilepath: %s" % (df_example_ids_nunique.min(), expected_count, f))
# assert len(catastrophic_missing_example_ids_compared_to_others) == 0, f"Some example IDs do not appear the same number of times in the following files:\n" + "\n".join(catastrophic_missing_example_ids_compared_to_others)

expected_columns=dict_keys(['target', 'dataset_path', 'dataset_name', 'dataset_split', 'num_of_examples', 'drafter', 'temperature', 'example_id', 'new_toks', 'ttft_ms', 'tpot_ms', 'out_toks_per_sec'])
8=1*1*4*2


Index([], dtype='int64', name='example_id')

In [23]:
# Sanity check: evaluates to True only if every value in the four metric columns is strictly positive
df.loc[:, ["new_toks", "ttft_ms", "tpot_ms", "out_toks_per_sec"]].gt(0).all().all().item()

True

In [28]:
from hf_bench.summarize_results import get_df_concat, get_df_summary_of_results


# Combine every results CSV under `dirpath` into one dataframe and store it as a CSV
dirpath = "benchmark_results"
print("Concatenating all the results CSVs into one dataframe...")
df_concat: pd.DataFrame = get_df_concat(dirpath)
df_concat.to_csv("results_all.csv", index=False)

# Summarize the combined results
print("Counting the number of unique example IDs for each experiment...")
df_summary: pd.DataFrame = get_df_summary_of_results(df_concat)
# Keep a single decimal place for each reported metric
for metric in ("new_toks", "ttft_ms", "tpot_ms", "out_toks_per_sec"):
    df_summary[metric] = df_summary[metric].round(1)
# Make `submission_id` the last index level: pull it out of the index, then append it back at the end
df_summary = df_summary.reset_index(level="submission_id").set_index("submission_id", append=True)
df_summary

# Disabled: column reordering, sorting and export of the summary
# # Move the `submission_id` columns
# df_summary = df_summary[
#     [
#         "target",
#         "dataset_path",
#         "drafter",
#         "temperature",
#         "submission_id",
#         "example_id_nunique",
#         "new_toks",
#         "ttft_ms",
#         "tpot_ms",
#         "out_toks_per_sec",
#     ]
# ]
# df_summary.sort_values(
#     by=["target", "dataset_path", "drafter", "temperature", "submission_id"],
#     inplace=True,
# )
# df_summary.to_csv("results_summary.csv", index=True)

# print(f"Stored both the concatenated dataframe and the summary in {dirpath}.")
# print("Done!")

Concatenating all the results CSVs into one dataframe...
Found 49 tracked files in benchmark_results.


Counting the number of unique example IDs for each experiment...


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,example_id_nunique,new_toks,ttft_ms,tpot_ms,out_toks_per_sec
target,dataset_path,drafter,temperature,submission_id,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,openai/openai_humaneval,No Drafter (Autoregressive),0,2025-02-04_01-05-29_4c55336,30,512.0,297.1,122.6,8.2
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,openai/openai_humaneval,No Drafter (Autoregressive),1,2025-02-04_01-05-29_4c55336,30,512.0,244.0,123.5,8.1
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,openai/openai_humaneval,bigcode/tiny_starcoder_py,0,2025-02-04_01-05-29_4c55336,30,512.0,265.9,84.6,11.8
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,openai/openai_humaneval,bigcode/tiny_starcoder_py,1,2025-02-04_01-05-29_4c55336,30,512.0,258.7,85.5,11.7
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,openai/openai_humaneval,codellama/CodeLlama-7b-Instruct-hf,0,2025-02-04_01-05-29_4c55336,30,512.0,428.7,101.5,9.6
...,...,...,...,...,...,...,...,...,...
mistralai/Mixtral-8x22B-Instruct-v0.1,tau/scrolls,No Drafter (Autoregressive),1,2025-02-06_21-32-25_ab73cc7,30,237.2,1325.3,168.0,5.9
mistralai/Mixtral-8x22B-Instruct-v0.1,tau/scrolls,Qwen/Qwen2.5-0.5B-Instruct,0,2025-02-06_21-32-25_ab73cc7,30,188.3,1395.3,75.0,11.0
mistralai/Mixtral-8x22B-Instruct-v0.1,tau/scrolls,Qwen/Qwen2.5-0.5B-Instruct,1,2025-02-06_21-32-25_ab73cc7,30,216.9,1365.3,98.5,9.8
mistralai/Mixtral-8x22B-Instruct-v0.1,tau/scrolls,double7/vicuna-68m,0,2025-02-06_21-32-25_ab73cc7,30,191.6,1336.1,130.2,7.5
