In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick

from pathlib import Path
import sys

sys.path.append('/mnt/c/Users/devmo/Desktop/SciRes/mess-plus')

from classifier.file_reader import read_files_from_folder
from evaluations.utils.wandb_loader import download_log_data, load_all_histories_to_dataframe
from plots.utils.plotting import write_figure_to_disk

# Directory holding this notebook: the relative file name is made absolute against the
# current working directory and its parent is taken.
NOTEBOOK_PATH = Path("experiments.ipynb").absolute().parent

# Folder passed to the W&B download and history-loading helpers.
DATA_DIR = f"{NOTEBOOK_PATH}/data/online"

# Benchmark identifiers searched for inside the run names.
BENCHMARK_NAMES = [
    "arc_challenge",
    "arc_easy",
    "boolq",
    "lambada_standard",
    "logiqa",
    "logiqa2",
    "piqa",
    "sciq",
    "social_iqa",
    "winogrande",
]
# BENCHMARK_NAMES = ["winogrande"]

# Typeset figure text with LaTeX, using Helvetica as the font family.
plt.rcParams["text.usetex"] = True
plt.rcParams["font.family"] = "Helvetica"



In [None]:
# Pull the logs of the given W&B entity/project into DATA_DIR and keep the returned frame.
# NOTE(review): what exactly is downloaded and the meaning of `batch_size` are defined by
# evaluations.utils.wandb_loader.download_log_data — confirm there.
run_summary_df = download_log_data(
    entity="tum-i13",
    project_name="mess-plus-api-pricing",
    save_dir=DATA_DIR,
    batch_size=50
)

In [None]:
display(run_summary_df)
run_df = load_all_histories_to_dataframe(DATA_DIR)

# Tag every row with its benchmark and strip the "<benchmark>_" part from the run name.
# NOTE: matching is by substring, so "logiqa" also matches "logiqa2" runs; those rows end up
# with the right tag only because "logiqa2" comes later in BENCHMARK_NAMES and overwrites it.
for name in BENCHMARK_NAMES:
	# Literal (non-regex) match; rows without a run name are treated as "no match" so the
	# mask stays purely boolean.
	is_benchmark = run_df["run_name"].str.contains(name, regex=False, na=False)
	run_df.loc[is_benchmark, "benchmark_name"] = name
	run_df.loc[is_benchmark, "run_name"] = run_df.loc[is_benchmark, "run_name"].str.replace(f"{name}_", "", regex=False)

# The remaining run name encodes the hyper-parameters as "V=.._a=.._c=.._seed=..".
pat = r'V=([^_]+)_a=([^_]+)_c=([^_]+)_seed=([^_]+)'
run_df[['V', 'alpha', 'c', 'seed']] = run_df['run_name'].str.extract(pat)
# `str.extract` yields strings: cast each column exactly once.
run_df[['V', 'alpha', 'c']] = run_df[['V', 'alpha', 'c']].astype(float)
run_df['seed'] = run_df['seed'].astype(int)

# The per-model "chosen" columns are used numerically by the plots further down.
model_choice_cols = ["models/small_chosen", "models/medium_chosen", "models/large_chosen"]
run_df[model_choice_cols] = run_df[model_choice_cols].astype(float)

display(run_df.head())

In [None]:
# Winogrande runs with c == 1.0, aggregated per (benchmark, alpha, V, c) combination.
analysis_df = run_df.loc[
    (run_df["c"] == 1.0) & (run_df["benchmark_name"] == "winogrande")
].pivot_table(
    index=["benchmark_name", "alpha", "V", "c"],
    values=["avg_accuracy", "running_avg_cost_usd", "mess_plus/q_length", "total_runtime"],
    aggfunc={
        "avg_accuracy": "mean",
        "running_avg_cost_usd": "sum",
        "mess_plus/q_length": "mean",
        "total_runtime": "max",
    },
)

In [None]:
def add_value_labels(axx, spacing=5, convert_to_mj: bool = True):
    """Annotate every bar of a bar chart with its height.

    Arguments:
        axx (matplotlib.axes.Axes): Axes whose ``patches`` are the bars to label.
        spacing (int): Offset, in points, between the end of a bar and its label.
            The offset is negated for bars with a negative height.
        convert_to_mj (bool): If True, the height is divided by 1,000,000 and shown
            with one decimal place; otherwise the raw height is shown with two.
    """
    for bar in axx.patches:
        height = bar.get_height()
        centre_x = bar.get_x() + bar.get_width() / 2

        # Labels sit above positive bars and below negative ones.
        if height < 0:
            offset, valign = spacing * -1, 'top'
        else:
            offset, valign = spacing, 'bottom'

        text = f'{height / 1_000_000:.1f}' if convert_to_mj else f'{height:.2f}'

        axx.annotate(
            text,
            (centre_x, height),          # anchor: end of the bar, horizontally centred
            xytext=(0, offset),          # shift vertically by `offset` ...
            textcoords="offset points",  # ... measured in points
            ha='center',
            va=valign)

def fmt_to_megajoules(x, pos):
    """Tick formatter: return ``x`` divided by 1,000,000 with no decimals (``pos`` is unused)."""
    scaled = x / 1_000_000
    return format(scaled, '.0f')


In [None]:
def get_inference_data(benchmark_name):
	"""Load the inference outputs of one benchmark and shuffle the rows.

	The original row position is kept in ``idx_original`` before shuffling. The shuffle
	uses no fixed seed, so the row order differs between executions.

	Returns an empty DataFrame (and prints a notice) when a ``ValueError`` is raised while
	reading or preparing the data.
	"""
	try:
		input_df = read_files_from_folder(folder_path=f"{NOTEBOOK_PATH.parent}/data/inference_outputs/{benchmark_name}")
		input_df["idx_original"] = input_df.index
		input_df = input_df.sample(frac=1).reset_index(drop=True)

		return input_df
	except ValueError as err:
		# Best effort: a benchmark without usable outputs is skipped, but no longer silently.
		print(f"Skipping inference data for '{benchmark_name}': {err}")
		return pd.DataFrame()

# Collect the per-benchmark frames first and concatenate once instead of growing
# `infer_df` inside the loop.
inference_frames = []
for name in BENCHMARK_NAMES:
	benchmark_frame = get_inference_data(name)
	if not benchmark_frame.empty:
		inference_frames.append(benchmark_frame)

infer_df = pd.concat(inference_frames, ignore_index=True) if inference_frames else pd.DataFrame()

# Keeps the previous behaviour of exposing the row number as an "index" column.
infer_df = infer_df.reset_index()

In [None]:
# V values that are kept per benchmark when filtering runs for the figures.
v_values_per_benchmark = {
    "arc_challenge": [1e-3, 1e-4, 1e-5],
    "arc_easy": [1e-2, 1e-3, 1e-4],
    "boolq": [1e-2, 1e-3, 1e-4],
    # "lambada_standard": [0.01, 0.001, 0.0001],
    "logiqa": [1e-3, 1e-4, 1e-5],
    # "logiqa2": [0.01, 0.001, 0.0001],
    "piqa": [1e-2, 1e-3, 1e-4],
    "sciq": [1e-4, 1e-5, 1e-6],
    "social_iqa": [1e-3, 1e-4, 1e-5],
    "winogrande": [1e-2, 1e-3, 1e-4],
}

In [None]:
import matplotlib as mpl

# Switch LaTeX text rendering off again (the first cell enabled it via plt.rcParams).
mpl.rcParams.update({'text.usetex': False})

In [None]:
# Seaborn theme for this figure. `set_theme` applies context, style and palette in one call,
# so separate `set_style` / `set_palette` calls beforehand are not needed.
sns.set_theme(context='paper', style='whitegrid', palette='dark:#5A9_r', font='sans-serif', font_scale=1.1, color_codes=True, rc=None)

# Grid of 3 rows x 6 columns: one row per alpha value, with a wide first column for the
# accuracy curves followed by five narrow columns.
fig, axes = plt.subplots(nrows=3, ncols=6, figsize=(15, 5), gridspec_kw={'width_ratios': [4, 1, 1, 1, 1, 1]})

name = "winogrande"

# Runs of this benchmark with c == 0.1 and one of the configured V values; rows with
# `_step` <= 10 are excluded. Sorted so that alpha values are visited in ascending order.
subset = run_df.loc[
    (run_df["benchmark_name"] == name)
    & (run_df["c"] == 0.1)
    & (run_df["V"].isin(v_values_per_benchmark[name]))
    & (run_df["_step"] > 10)
].sort_values(by=["alpha"])

def format_v_value(b):
	"""Build the legend label ``"Ours (V=<b>)"`` for a V value.

	Values below 0.0001 are multiplied by 100 and divided by 100 again before formatting.
	"""
	shown = (b * 100) / 100 if b < 0.0001 else b
	return f"Ours (V={shown})"

# Replace the numeric V column by the legend label of each value (used as line-plot hue).
subset["V"] = subset["V"].apply(format_v_value)

# Sort the configured V values once, ascending (the shared dict entry is updated, as before).
v_values_per_benchmark[name] = sorted(v_values_per_benchmark[name], reverse=False)

# Hue levels must be labels that actually occur in `subset`. A hard-coded list that names a
# label the data never contains (e.g. "Ours (V=1e-5)") and leaves out an existing one makes
# seaborn skip that curve, so the order is derived from the configured V values instead.
v_hue_order = [format_v_value(v_value) for v_value in v_values_per_benchmark[name]]

# Mean of the per-model label columns on this benchmark; drawn as reference lines.
raw_inference_accuracies_per_model = infer_df[["benchmark_name", "label_small", "label_medium", "label_large"]].groupby("benchmark_name").mean().loc[name]
reference_models = [("L1B", "label_small"), ("L8B", "label_medium"), ("L70B", "label_large")]

model_choice_cols = ["models/small_chosen", "models/medium_chosen", "models/large_chosen"]
model_colors = ["#2f364d", "#3f758a", "#69cf81"]

# Text labels are right-aligned slightly left of the last step.
label_x = subset["_step"].max() - 20

for iterator, alpha in enumerate(subset["alpha"].unique().tolist()):
	alpha_subset = subset.loc[(subset["alpha"] == alpha)]
	acc_ax = axes[iterator][0]

	# Accuracy Plot
	for model_label, label_col in reference_models:
		acc_ax.text(s=model_label, x=label_x, y=raw_inference_accuracies_per_model[label_col] + 0.01, color='gray', fontsize=9, ha="right")
	for _, label_col in reference_models:
		acc_ax.axhline(y=raw_inference_accuracies_per_model[label_col], color='gray', linestyle='--')

	sns.lineplot(
		data=alpha_subset,
		x="_step",
		y="avg_accuracy",
		hue="V",
		errorbar=None,
		ax=acc_ax,
		legend=iterator == 0,
		palette=model_colors,
		hue_order=v_hue_order,
	)

	# Add alpha marker
	# NOTE(review): the text says "(red line)" but no line at y=alpha is drawn in this cell.
	t1 = acc_ax.text(s=r"$ \alpha = {alpha_val} $ (red line)".format(alpha_val=alpha), x=label_x, y=1.15 * raw_inference_accuracies_per_model["label_large"] - 0.04, color='red', fontsize=9, ha="right")
	t1.set_bbox(dict(facecolor='white', alpha=0.6, edgecolor='black', pad=1.5))

	# Stackplot for Model Call Ratio: one narrow panel per V value.
	for jdx, V in enumerate(v_values_per_benchmark[name]):
		mcr_ax = axes[iterator][1 + jdx]

		# `subset` is already restricted to this benchmark. The V value is selected through
		# its label because the numeric column was replaced above; this keeps the mask on
		# the same index as the frame it filters.
		stack_df = alpha_subset.loc[
			alpha_subset["V"] == format_v_value(V),
			["_step"] + model_choice_cols
		].groupby(["_step"]).mean().reset_index()

		x = stack_df["_step"]
		y = stack_df[model_choice_cols]
		y_stack = np.cumsum(y, axis=1)

		# Stack small / medium / large on top of each other.
		lower = 0
		for col_idx, color in enumerate(model_colors):
			upper = y_stack.iloc[:, col_idx]
			mcr_ax.fill_between(x, lower, upper, color=color, alpha=1.0)
			lower = upper

		mcr_ax.set(xlabel=f"Requests @ V={V}", xlim=[0, stack_df["_step"].max()], ylim=[0, 1])
		mcr_ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))

		# The model names are written into the very first stack panel only.
		if iterator == 0 and jdx == 0:
			mcr_ax.text(s="L70B", x=70, y=0.80, color="black")
			mcr_ax.text(s="L8B", x=70, y=0.40, color="white")
			mcr_ax.text(s="L1B", x=70, y=0.10, color="white")

	acc_ax.set(xlabel="Requests", xlim=[0, alpha_subset["_step"].max()])
	acc_ax.set(ylabel=None)

	# Row labels are attached to the middle row only (first four panels).
	if iterator == 1:
		for ax, col in zip(axes[iterator], [r"Avg. User Satisfaction Rate Over Time (varying $\alpha$)", "Model Call Ratio (MCR)", "", ""]):
			ax.set_ylabel(col, rotation=90, size=10)

fig.tight_layout()
write_figure_to_disk(plt, file_name=f"{name}_all_alpha", chapter_name="evaluations")


In [None]:
# Seaborn theme for this figure. `set_theme` applies context, style and palette in one call,
# so separate `set_style` / `set_palette` calls beforehand are not needed.
sns.set_theme(context='paper', style='whitegrid', palette='dark:#5A9_r', font='sans-serif', font_scale=1.8, color_codes=True, rc=None)

# Three stacked panels (3 rows x 1 column), one per alpha value.
fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(4, 7))

name = "winogrande"

# Runs of this benchmark with c == 0.1 and one of the configured V values; rows with
# `_step` <= 10 are excluded. Sorted so that alpha values are visited in ascending order.
subset = run_df.loc[
    (run_df["benchmark_name"] == name)
    & (run_df["c"] == 0.1)
    & (run_df["V"].isin(v_values_per_benchmark[name]))
    & (run_df["_step"] > 10)
].sort_values(by=["alpha"])

def format_v_value(b):
	"""Build the legend label ``"Ours (V=<b>)"`` for a V value.

	Values below 0.0001 are multiplied by 100 and divided by 100 again before formatting.
	"""
	shown = (b * 100) / 100 if b < 0.0001 else b
	return f"Ours (V={shown})"

# Replace the numeric V column by the legend label of each value (used as line-plot hue).
subset["V"] = subset["V"].apply(format_v_value)

# Hue levels must be labels that actually occur in `subset`. A hard-coded list that names a
# label the data never contains (e.g. "Ours (V=1e-5)") and leaves out an existing one makes
# seaborn skip that curve, so the order is derived from the configured V values (ascending).
v_hue_order = [format_v_value(v_value) for v_value in sorted(v_values_per_benchmark[name])]

# Plotted quantity: `mess_plus/q_length` divided by the step number, computed once for all rows.
subset["sla_violations"] = subset["mess_plus/q_length"] / subset["_step"]

# The alpha label is right-aligned slightly left of the last step.
label_x = subset["_step"].max() - 20

for iterator, alpha in enumerate(subset["alpha"].unique().tolist()):
	alpha_subset = subset.loc[(subset["alpha"] == alpha)]
	sla_ax = axes[iterator]

	# Q Plot for SLA violations
	sns.lineplot(
		data=alpha_subset,
		x="_step",
		y="sla_violations",
		hue="V",
		errorbar=None,
		ax=sla_ax,
		legend=iterator == 0,
		palette=["#2f364d", "#3f758a", "#69cf81"],
		hue_order=v_hue_order,
	)

	# Add alpha marker
	t1 = sla_ax.text(s=r"$ \alpha = {alpha_val} $".format(alpha_val=alpha), x=label_x, y=0.17, color='red', fontsize=14, ha="right")
	t1.set_bbox(dict(facecolor='white', alpha=0.6, edgecolor='black', pad=1.5))

	if iterator == 0:
		sla_ax.legend(ncols=1, title="Method", fontsize=9, title_fontsize=9, loc='upper left')

	sla_ax.set(xlabel="Requests", xlim=[0, alpha_subset["_step"].max()])

# Only the middle panel carries the y-axis label; it is set once, after all panels are
# drawn, so it also replaces the label seaborn derives from the column name.
for ax, col in zip(axes, ["", r"Avg. SLA Violations Over Time (varying $\alpha$ values)", ""]):
	ax.set_ylabel(col, rotation=90, size=18)

fig.tight_layout()
write_figure_to_disk(plt, file_name=f"{name}_sla_violations", chapter_name="evaluations")

In [None]:
print(infer_df.columns)

# Mean of each energy-consumption column per benchmark, printed for large, medium, small.
for model_size in ("large", "medium", "small"):
	print(infer_df.groupby("benchmark_name")[f"energy_consumption_{model_size}"].mean())

In [None]:
# # Plot generator
#
# BENCHMARK_NAME_DICT = {
#     "arc_challenge": "ARC Challenge",
#     "arc_easy": "ARC Easy",
#     "boolq": "BoolQ",
#     # "lambada_standard": "Lambada",
#     "logiqa": "LogiQA",
#     # "logiqa2": "LogiQA2",
#     "piqa": "PiQA",
#     "sciq": "SciQ",
#     "social_iqa": "SocialIQA",
#     "winogrande": "WinoGrande",
# }
#
# # Create a list of all benchmark-alpha combinations
# benchmark_alpha_combinations = []
# for name in v_values_per_benchmark.keys():
#     config_path = Path(f"{NOTEBOOK_PATH.parent}/config/online/{name}.yaml")
#     with config_path.open("r") as f:
#         import yaml
#         CONFIG = yaml.safe_load(f)
#
#     algorithm_config = CONFIG["algorithm"]
#     for alpha in algorithm_config["alpha_values"]:
#         benchmark_alpha_combinations.append((name, alpha))
#
# # Initialize plotting variables
# plot_num = 0
# col_count = 0
#
# # Iterate through all benchmark-alpha combinations
# for combo_idx, (name, alpha) in enumerate(benchmark_alpha_combinations):
#
#     # Create new figure every 6 columns
#     if col_count == 0:
#         sns.set(style="whitegrid")
#         fig, axes = plt.subplots(nrows=7, ncols=6, figsize=(20, 12))
#         plot_num += 1
#
#     # Get current column index
#     col_idx = col_count
#
#     # Skip if this benchmark doesn't have V values configured
#     if name not in v_values_per_benchmark.keys():
#         continue
#
#     # Filter data for current benchmark and alpha
#     subset = run_df.loc[(run_df["benchmark_name"] == name) &
#                        (run_df["c"] == 0.1) &
#                        (run_df["V"].isin(v_values_per_benchmark[name])) &
#                        (run_df["_step"] > 10) &
#                        (run_df["alpha"] == alpha)]
#
#     v_values = subset["V"].unique().tolist()
#
#     # Accuracy Plot
#     raw_inference_accuracies_per_model = infer_df[["benchmark_name", "label_small", "label_medium", "label_large"]].groupby("benchmark_name").mean().loc[name]
#
#     axes[0][col_idx].text(s="Llama 3.1 1B", x=subset["_step"].min() + 20, y=raw_inference_accuracies_per_model["label_small"] + 0.025, color='gray', fontsize=8, ha="left")
#     axes[0][col_idx].text(s="Llama 3.1 8B", x=(subset["_step"].min() + 1/2 * subset["_step"].max()), y=raw_inference_accuracies_per_model["label_medium"] + 0.025, color='gray', fontsize=8, ha="center")
#     axes[0][col_idx].text(s="Llama 3.3 70B", x=subset["_step"].max() - 20, y=raw_inference_accuracies_per_model["label_large"] + 0.025, color='gray', fontsize=8, ha="right")
#     axes[0][col_idx].axhline(y=raw_inference_accuracies_per_model["label_small"], color='gray', linestyle='--')
#     axes[0][col_idx].axhline(y=raw_inference_accuracies_per_model["label_medium"], color='gray', linestyle='--')
#     axes[0][col_idx].axhline(y=raw_inference_accuracies_per_model["label_large"], color='gray', linestyle='--')
#
#     sns.lineplot(
#         data=subset.loc[(subset["alpha"] == alpha)],
#         x="_step",
#         y="avg_accuracy",
#         hue="V",
#         errorbar=None,
#         ax=axes[0][col_idx],
#         legend=True if col_idx == 0 else False,
# 	    palette=["#2f364d", "#3f758a", "#69cf81"]
#     )
#
#     axes[0][col_idx].plot(
# 		baseline_df.loc[(baseline_df["benchmark_name"] == name) & (baseline_df["alpha"] == alpha), "_step"],
# 		baseline_df.loc[(baseline_df["benchmark_name"] == name) & (baseline_df["alpha"] == alpha),"avg_accuracy"],
# 		color="violet", linestyle="dotted", label="Rand."
# 	)
#
#     axes[0][col_idx].axhline(y=alpha, color='red', linestyle='-')
#     axes[0][col_idx].text(s=r"$ \alpha = {alpha_val} $ ".format(alpha_val=alpha), x=subset["_step"].max() - 20, y=alpha + 0.01, color='red', fontsize=8, ha="right")
#
#     axes[0][col_idx].set(ylim=[0.97 * raw_inference_accuracies_per_model["label_small"], 1.15 * raw_inference_accuracies_per_model["label_large"]])
#
#     if col_idx == 0:
#         axes[0][col_idx].legend(ncols=2)
#
#     # Q Plot for SLA violations
#     subset.loc[(subset["alpha"] == alpha), "sla_violations"] = subset.loc[(subset["alpha"] == alpha), "mess_plus/q_length"] / subset.loc[(subset["alpha"] == alpha), "_step"]
#     sns.lineplot(
#         data=subset.loc[(subset["alpha"] == alpha)],
#         x="_step",
#         y="sla_violations",
#         hue="V",
#         errorbar=None,
#         ax=axes[1][col_idx],
#         legend=True if col_idx == 0 else False,
# 	    palette=["#2f364d", "#3f758a", "#69cf81"]
#     )
#
#     if col_idx == 0:
#         axes[1][col_idx].legend(ncols=2)
#
#     # Energy consumption plot
#     random_baseline_energy = baseline_df.loc[baseline_df["alpha"] == alpha, ["benchmark_name", "mess_plus/energy"]].groupby("benchmark_name").sum().loc[name].to_frame()
#     random_baseline_energy["V"] = "Rand."
#     random_baseline_energy["mess_plus/energy"] = random_baseline_energy[name]
#     random_baseline_energy.reset_index(inplace=True)
#
#     raw_inference_energy_data = infer_df[["benchmark_name", "energy_consumption_large", "energy_consumption_medium", "energy_consumption_small"]].groupby("benchmark_name").sum().loc[name].to_frame()
#     raw_inference_energy_data["V"] = raw_inference_energy_data.index
#     raw_inference_energy_data["mess_plus/energy"] = raw_inference_energy_data[name]
#     raw_inference_energy_data.rename({name: "mess_plus/energy"}, inplace=True)
#     raw_inference_energy_data.reset_index(inplace=True)
#
#     raw_inference_energy_data["V"] = raw_inference_energy_data["V"].replace({"energy_consumption_large": "70B", "energy_consumption_medium": "8B", "energy_consumption_small": "1B"}, inplace=False)
#
#     raw_inference_energy_data.drop([name, "index"], inplace=True, axis=1)
#     energy_data = subset.loc[(subset["alpha"] == alpha)].groupby(["_step", "V"]).agg({"mess_plus/energy": "mean"}).groupby("V")["mess_plus/energy"].sum().reset_index()
#
#     energy_data["V"] = energy_data["V"].apply(lambda sample: f"V={sample}")
#
#     energy_data = pd.concat([random_baseline_energy, raw_inference_energy_data, energy_data], ignore_index=True)
#     energy_data.reset_index(inplace=True)
#     energy_data = energy_data.sort_values(by=["mess_plus/energy"], ascending=False)
#
#     sns.barplot(
#         data=energy_data,
#         x="V",
#         y="mess_plus/energy",
#         ax=axes[2][col_idx],
#         errorbar=("ci", 0.95),
#     )
#
#     add_value_labels(axes[2][col_idx])
#     axes[2][col_idx].yaxis.set_major_formatter(plt.FuncFormatter(fmt_to_megajoules))
#     axes[2][col_idx].set(ylim=[0, 2 * energy_data["mess_plus/energy"].max()])
#     axes[2][col_idx].tick_params(axis='x', labelrotation=45)
#
#     # Stackplot for Model Call Ratio
#     for jdx, V in enumerate(v_values_per_benchmark[name]):
#
#         stack_df = subset.loc[
#             (run_df["benchmark_name"] == name) &
#             (run_df["V"] == V) &
#             (subset["alpha"] == alpha),
#             ["_step", "models/small_chosen", "models/medium_chosen", "models/large_chosen"]
#         ].groupby(["_step"]).mean().reset_index()
#
#         x = stack_df["_step"]
#         y = stack_df[["models/small_chosen", "models/medium_chosen", "models/large_chosen"]]
#         y_stack = np.cumsum(y, axis=1)
#
#         axes[3 + jdx][col_idx].fill_between(x, 0, y_stack.iloc[:, 0], color="#2f364d", alpha=0.95)
#         axes[3 + jdx][col_idx].fill_between(x, y_stack.iloc[:, 0], y_stack.iloc[:, 1], color="#3f758a", alpha=0.95)
#         axes[3 + jdx][col_idx].fill_between(x, y_stack.iloc[:, 1], y_stack.iloc[:, 2], color="#69cf81", alpha=0.95)
#         axes[3 + jdx][col_idx].set(xlabel=f"Request @ V={V}", xlim=[0, subset.loc[(subset["alpha"] == alpha), "_step"].max()], ylim=[0, 1])
#         axes[3 + jdx][col_idx].yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
#
#         if jdx == 0 and col_idx == 0:
#             axes[3 + jdx][col_idx].legend(["Llama 3.1 1B", "Llama 3.1 8B", "Llama 3.3 70B"])
#
#     # Add area plot for random baseline with constraint.
#     baseline_stack_df = baseline_df.loc[
#             (baseline_df["benchmark_name"] == name) &
#             (baseline_df["alpha"] == alpha),
#             ["_step", "models/small_chosen", "models/medium_chosen", "models/large_chosen"]
#         ].groupby(["_step"]).mean().reset_index()
#
#     x_base = baseline_stack_df["_step"]
#     y_base = baseline_stack_df[["models/small_chosen", "models/medium_chosen", "models/large_chosen"]]
#     y_stack_base = np.cumsum(y_base, axis=1)
#
#     axes[6][col_idx].fill_between(x_base, 0, y_stack_base.iloc[:, 0], color="#2f364d", alpha=0.95)
#     axes[6][col_idx].fill_between(x_base, y_stack_base.iloc[:, 0], y_stack_base.iloc[:, 1], color="#3f758a", alpha=0.95)
#     axes[6][col_idx].fill_between(x_base, y_stack_base.iloc[:, 1], y_stack_base.iloc[:, 2], color="#69cf81", alpha=0.95)
#     axes[6][col_idx].set(xlabel=f"Requests (Rand.)", xlim=[0, baseline_stack_df["_step"].max()], ylim=[0, 1])
#     axes[6][col_idx].yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
#     axes[6][col_idx].set(xlim=[0, baseline_stack_df["_step"].max()])
#
#     # Set axis properties
#     axes[0][col_idx].set(xlabel="Request", xlim=[0, subset.loc[(subset["alpha"] == alpha), "_step"].max()])
#     axes[1][col_idx].set(xlabel="Request", xlim=[0, subset.loc[(subset["alpha"] == alpha), "_step"].max()])
#     axes[2][col_idx].set(xlabel="")
#
#     # Remove y-labels for columns after the first
#     if col_idx > 0:
#         axes[0][col_idx].set(ylabel=None)
#         axes[1][col_idx].set(ylabel=None)
#         axes[2][col_idx].set(ylabel=None)
#
#     # Set title for each column
#     axes[0][col_idx].set_title(r"{bm_name} ($\alpha = {alpha_val} $)".format(bm_name=BENCHMARK_NAME_DICT[name], alpha_val=alpha))
#
#     # Increment column counter
#     col_count += 1
#
#     # Check if we need to save the current figure and start a new one
#     if col_count == 6 or combo_idx == len(benchmark_alpha_combinations) - 1:
#         # Add row labels
#         for idx, (ax, row) in enumerate(zip(axes[:,0], ["User Satisfaction", "SLA Violations", "Cost (in MJ energy)", "", "", "", ""])):
#             if idx == 5:
#                 fig.text(0.003, 0.225, "Model Call Ratio (MCR)", ha="center", rotation='vertical', fontsize=plt.rcParams['axes.labelsize'])
#             else:
#                 ax.set_ylabel(row, rotation=90, size='large')
#
#         # Save the figure
#         fig.tight_layout()
#         write_figure_to_disk(plt, file_name=f"benchmark_performance_plot_{plot_num}", chapter_name="evaluations")
#
#         # Reset column counter for next figure
#         col_count = 0

In [None]:
def build_pivot_table_for_main_results(input_df: pd.DataFrame, model_cols: list):
	"""Pivot accuracy, cost and model-choice columns per benchmark/alpha (rows) and V (columns).

	Arguments:
		input_df: Frame with the columns ``benchmark_name``, ``alpha``, ``V``,
			``avg_accuracy``, ``running_avg_cost_usd`` and every column in ``model_cols``.
			The frame itself is not modified.
		model_cols: Columns that are reported under the name ``final_<col>``.

	Returns:
		The pivot table indexed by ``(benchmark_name, alpha)`` with one column group per V:
		mean and std of ``avg_accuracy``, sum and std of ``running_avg_cost_usd``, and the
		mean of every ``final_<col>``.

	NOTE(review): the ``final_`` columns are only renamed; their mean is taken over all rows
	of a group, not over the rows of the last step. Confirm that this is the intended
	meaning of "final".
	"""
	final_cols = [f"final_{col}" for col in model_cols]
	renamed_df = input_df.rename(columns=dict(zip(model_cols, final_cols)))
	value_cols = ["avg_accuracy", "running_avg_cost_usd"] + final_cols

	merged_pvt_table = renamed_df.loc[:, ["benchmark_name", "alpha", "V"] + value_cols].pivot_table(
		index=["benchmark_name", "alpha"],
		columns=["V"],
		values=value_cols,
		aggfunc={
			"avg_accuracy": ["mean", "std"],
			"running_avg_cost_usd": ["sum", "std"],
			**{final_col: ['mean'] for final_col in final_cols}
		}
	)

	return merged_pvt_table



In [None]:
import math
import re

# CREATE LATEX TABLE
def multiindex_df_to_latex_chunked(
		df,
		chunk_size=3,
		level2_order=None,
		caption_template="Results Table Part {}",
		label_template="tab:results_part{}", include_index=True
):
	"""Render a frame with three-level columns as a list of LaTeX tables.

	Columns are addressed as ``(level1, level2, statistic)``. The level-1 items are split
	into chunks of ``chunk_size`` and one table is produced per chunk; within a table every
	``(level1, level2)`` pair becomes a single formatted text column.

	Per level-1 item a threshold is taken from its label: the digits of ``\\alpha = <n>\\%``
	if present, otherwise 0 (with a printed notice); the label ``"Mean"`` uses 66.625.

	Formatting per level-2 item:
		* contains ``"final_models/large"``: one "large / medium / small" percentage column
		  built from the ``mean`` of the three ``final_models/*_chosen`` columns.
		* ``"avg_accuracy"``: ``mean ± std``, dark green if ``mean >= threshold``, else red.
		* ``"running_avg_cost_usd"``: ``sum ± std``; bold where the sum equals the minimum
		  over the rows from position 3 onwards; underlined for the first of the first three
		  rows whose ``avg_accuracy`` mean reaches the threshold (nothing is underlined when
		  none of them does).
		* anything else: ``mean ± std``.
	A missing ``mean``/``sum``/``std`` column prints a warning and the pair is skipped.

	Arguments:
		df: Frame with three column levels as described above.
		chunk_size: Number of level-1 items per table.
		level2_order: Level-2 items to render, in this order; defaults to all existing ones.
		caption_template: Caption with one ``{}`` placeholder for the 1-based table number.
		label_template: Label with one ``{}`` placeholder for the 1-based table number.
		include_index: Passed to ``DataFrame.to_latex`` as ``index``.

	Returns:
		List of LaTeX strings, one ``table`` environment per chunk.

	Raises:
		ValueError: if an entry of ``level2_order`` is not a level-2 item of ``df``.
	"""
	# Get unique level 1 items
	level1_items = df.columns.get_level_values(0).unique()

	# Get level 2 items (either in specified order or existing order)
	if level2_order is None:
		level2_items = df.columns.get_level_values(1).unique()
	else:
		# Verify all specified level2 items exist in the DataFrame
		existing_level2 = df.columns.get_level_values(1).unique()
		for item in level2_order:
			if item not in existing_level2:
				raise ValueError(f"Level 2 item '{item}' not found in DataFrame")
		level2_items = level2_order

	# Calculate number of chunks
	num_chunks = math.ceil(len(level1_items) / chunk_size)

	latex_tables = []

	# Process each chunk
	for chunk_idx in range(num_chunks):
		start_idx = chunk_idx * chunk_size
		end_idx = min((chunk_idx + 1) * chunk_size, len(level1_items))

		# Get level 1 items for this chunk
		chunk_level1_items = level1_items[start_idx:end_idx]

		# Output frame: one text column per (level1, display name) pair
		result_df = pd.DataFrame(index=df.index)
		result_cols = []

		for level1 in chunk_level1_items:
			# Threshold encoded in the label, e.g. "\alpha = 50\%" -> 50
			match = re.search(r"\\alpha\s*=\s*(\d+)\\%", level1)
			if match:
				alph = int(match.group(1))
			else:
				print("No number found")
				alph = 0

			if level1 == "Mean":
				alph = 66.625

			for level2 in level2_items:
				if "final_models/large" in level2:
					name = r"\thead{Model Call Ratio \\ (L70B/L8B/L1B)}"
					result_df[(level1, name)] = [f"{x:.0f}\\% / {y:.0f}\\% / {z:.0f}\\%" for x, y, z in zip(df[(level1, "final_models/large_chosen", "mean")], df[(level1, "final_models/medium_chosen", "mean")], df[(level1, "final_models/small_chosen", "mean")])]
					result_cols.append((level1, name))
					continue

				try:
					if level2 == "avg_accuracy":
						name = r"\thead{Request. \\ Satisfaction}"

						mean_val = df[level1, level2, "mean"]
						std_val = df[level1, level2, 'std']

						vals = []
						for m, s in zip(mean_val, std_val):
							color = "darkgreen" if m >= alph else "red"
							vals.append(f"\\textcolor{{{color}}}{{{m:.2f}$\\scriptscriptstyle\\pm{s:.2f}$}}")

					elif level2 == "running_avg_cost_usd":
						name = r"\thead{Operating \\ Cost}"
						mean_val = df[level1, level2, "sum"]
						std_val = df[level1, level2, 'std']
						# Lowest cost among the rows from position 3 onwards
						min_mean_val = mean_val.iloc[3:].min()

						mean_acc = df[level1, "avg_accuracy", "mean"]

						# Positions (within the first three rows) that reach the threshold.
						# The list may be empty, so the first hit is looked up defensively.
						satisfying = np.flatnonzero((mean_acc.iloc[:3] >= alph).to_numpy())
						first_satisfying = satisfying[0] if satisfying.size else None

						vals = []
						for idx, (m, s) in enumerate(zip(mean_val, std_val)):
							if m == min_mean_val:
								vals.append(f"\\textbf{{{m:.2f}$\\scriptscriptstyle\\pm{s:.2f}$}}")
							elif first_satisfying is not None and idx == first_satisfying:
								vals.append(f"\\underline{{{m:.2f}$\\scriptscriptstyle\\pm{s:.2f}$}}")
							else:
								vals.append(f"{m:.2f}$\\scriptscriptstyle\\pm{s:.2f}$")

					else:
						name = level2
						mean_val = df[level1, level2, "mean"]
						std_val = df[level1, level2, 'std']
						vals = [f"{m:.2f}$\\scriptscriptstyle\\pm{s:.2f}$" for m, s in zip(mean_val, std_val)]

					result_df[(level1, name)] = vals
					result_cols.append((level1, name))

				except KeyError:
					# Skip if mean or std not available
					print(f"Warning: Missing mean or std for {level1}, {level2}")

		# Set the columns with multi-index (preserving top 2 levels)
		result_df.columns = pd.MultiIndex.from_tuples(result_cols, names=['Category', 'Subcategory'])

		# Convert to LaTeX with multi-index
		caption = caption_template.format(chunk_idx + 1)
		label = label_template.format(chunk_idx + 1)

		latex_str = result_df.to_latex(escape=False, multicolumn=True, multicolumn_format='c', index=include_index)

		# Wrap the tabular in a table environment carrying caption and label
		latex_str = latex_str.replace('\\begin{tabular}',
		                              f'\\begin{{table}}\n\\caption{{{caption}}}\n\\label{{{label}}}\n\\begin{{tabular}}')
		latex_str = latex_str + '\\end{table}'

		latex_tables.append(latex_str)

	return latex_tables


# Render `selected_pivot` in chunks of three level-1 items and print every table.
# NOTE(review): `selected_pivot` is not defined in the visible part of this notebook.
latex_tables = multiindex_df_to_latex_chunked(
	selected_pivot,
	chunk_size=3,
	level2_order=["mess_plus/energy", "avg_accuracy", "final_models/large_chosen"],
	caption_template="Results Table Part {}: Categories",
	label_template="tab:results_part{}",
)

for t in latex_tables:
	print(t)

In [None]:
sns.set(style="whitegrid")
# Three rows (exploration ratio, router loss, exploration cost) x one column per benchmark.
fig, axes = plt.subplots(nrows=3, ncols=8, figsize=(20, 6.75))

BENCHMARK_NAME_DICT = {
    "arc_challenge": "ARC Challenge",
    "arc_easy": "ARC Easy",
    "boolq": "BoolQ",
    # "lambada_standard": "Lambada",
    "logiqa": "LogiQA",
    # "logiqa2": "LogiQA2",
    "piqa": "PiQA",
    "sciq": "SciQ",
    "social_iqa": "SocialIQA",
    "winogrande": "WinoGrande",
}

# An escaped percent sign is only correct when labels are typeset by LaTeX. With usetex
# disabled the backslash would be printed literally, and "\%" in a normal string literal is
# an invalid escape sequence — hence the raw string and the plain "%" alternative.
exploration_ratio_label = r"Exploration Ratio (\%)" if plt.rcParams["text.usetex"] else "Exploration Ratio (%)"

c_palette = ["#2f364d", "#3f758a", "#69cf81"]

for iterator, (name, display_name) in enumerate(BENCHMARK_NAME_DICT.items()):
	ratio_ax = axes[0][iterator]
	loss_ax = axes[1][iterator]
	cost_ax = axes[2][iterator]
	is_first_column = iterator == 0

	plt_data = run_df.loc[(run_df["benchmark_name"] == name), ["c", "running_avg_cost_usd", "classifier/train_loss", "_step", "mess_plus/exploration_step_ratio", "mess_plus/p_t"]]

	plt_data["exploration_cost"] = plt_data["running_avg_cost_usd"] * plt_data["mess_plus/p_t"]

	# Row 1: exploration ratio over the requests, one line per c.
	sns.lineplot(
		data=plt_data[["_step", "mess_plus/exploration_step_ratio", "c"]],
		x="_step",
		y="mess_plus/exploration_step_ratio",
		hue="c",
		errorbar=("sd", 1),
		ax=ratio_ax,
		legend=is_first_column,
		palette=c_palette
	)

	# The training loss of the c = 0.1 and c = 0.01 runs is divided by c before plotting.
	plt_data.loc[plt_data["c"] == 0.1, "classifier/train_loss"] /= 0.1
	plt_data.loc[plt_data["c"] == 0.01, "classifier/train_loss"] /= 0.01

	# Row 2: router training loss, one line per c.
	sns.lineplot(
		data=plt_data[["_step", "classifier/train_loss", "c"]],
		x="_step",
		y="classifier/train_loss",
		hue="c",
		errorbar=None, # ("sd", 1),
		ax=loss_ax,
		legend=is_first_column,
		palette=c_palette
	)

	# Row 3: summed exploration cost per c.
	# NOTE(review): the value is derived from `running_avg_cost_usd`, yet it is scaled by
	# 1e6 and labelled "MJ" — confirm the intended unit.
	plt_data["exploration_cost"] = plt_data["exploration_cost"] / 1_000_000 # convert to MJ
	sns.barplot(
		data=plt_data,
		x="c",
		y="exploration_cost",
		errorbar=("sd", 1),
		ax=cost_ax,
		legend=False,
		estimator=np.sum
	)

	ratio_ax.set_xlim([0, plt_data["_step"].max()])
	ratio_ax.set_ylim([0, 1])
	ratio_ax.set_xlabel("Request")
	loss_ax.set_xlabel("Request")
	ratio_ax.set_title(display_name, fontsize=14)
	ratio_ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))

	loss_ax.set_ylim([0, 4])
	loss_ax.set_xlim([0, plt_data["_step"].max()])

	# Leave 20 % headroom above the tallest bar for the value labels.
	cost_ax.set_ylim([0, 1.2 * plt_data.groupby("c")["exploration_cost"].sum().max()])
	add_value_labels(cost_ax, convert_to_mj=False)

	# Only the first column carries y-axis labels and the legend title.
	if is_first_column:
		ratio_ax.set_ylabel(exploration_ratio_label)
		loss_ax.set_ylabel("Router Training Loss")
		cost_ax.set_ylabel("Exploration Cost (in MJ)")
		ratio_ax.legend(title="c")
	else:
		ratio_ax.set_ylabel(None)
		loss_ax.set_ylabel(None)
		cost_ax.set_ylabel(None)

plt.tight_layout()
write_figure_to_disk(plt, file_name="c_ablation_study", chapter_name="evaluations")

In [None]:
C_BENCHMARK = "winogrande"

sns.set_theme(context='paper', style='whitegrid', palette='dark:#5A9_r', font='sans-serif', font_scale=1.4, color_codes=True, rc=None)
# Two stacked panels: router training loss (top) and exploration cost (bottom).
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(3, 4.5))

# NOTE(review): the rows of ALL benchmarks are used here although the figure is saved under
# the name of C_BENCHMARK; the benchmark filter below is disabled.
# (run_df["benchmark_name"] == C_BENCHMARK)
plt_data = run_df.loc[:, ["c", "running_avg_cost_usd", "classifier/train_loss", "_step", "mess_plus/exploration_step_ratio", "mess_plus/p_t"]]

plt_data["exploration_cost"] = plt_data["running_avg_cost_usd"] * plt_data["mess_plus/p_t"]

# The training loss of the c = 0.1 and c = 0.01 runs is divided by c before plotting.
plt_data.loc[plt_data["c"] == 0.1, "classifier/train_loss"] /= 0.1
plt_data.loc[plt_data["c"] == 0.01, "classifier/train_loss"] /= 0.01

sns.lineplot(
    data=plt_data[["_step", "classifier/train_loss", "c"]],
    x="_step",
    y="classifier/train_loss",
    hue="c",
	errorbar=None, # ("sd", 1),
	ax=axes[0],
	legend=True,
	palette=["#2f364d", "#3f758a", "#69cf81"]
)

# NOTE(review): the value is derived from `running_avg_cost_usd`, yet it is scaled by 1e6
# and labelled "MJ" — confirm the intended unit.
plt_data["exploration_cost"] = plt_data["exploration_cost"] / 1_000_000 # convert to MJ

# Average per (step, c) first; the bars then show the sum of these averages per c.
# Computed once and reused for the axis limit below.
step_mean = plt_data.groupby(["_step", "c"]).mean()
sns.barplot(
    data=step_mean,
    x="c",
    y="exploration_cost",
	errorbar=("sd", 1),
	ax=axes[1],
	legend=False,
	estimator=np.sum,
	palette=["#2f364d", "#3f758a", "#69cf81"]
)

axes[0].set_xlabel("Request")

axes[0].set_ylim([0, 4])
axes[0].set_xlim([0, 2000])

# Leave 20 % headroom above the tallest bar (sum per c), matching the per-benchmark
# ablation grid; the grand total over all c values would overstate the limit.
axes[1].set_ylim([0, 1.2 * step_mean.groupby("c")["exploration_cost"].sum().max()])
add_value_labels(axes[1], convert_to_mj=False)

axes[0].set_ylabel("Router Training Loss")
axes[1].set_ylabel("Exploration Cost (in MJ)")


plt.tight_layout()
write_figure_to_disk(plt, file_name=f"c_ablation_study_{C_BENCHMARK}", chapter_name="evaluations")

In [None]:
# FINAL PLOT: Report on alpha & V dynamics regarding cost and request satisfaction

# Seaborn theme for this figure. `set_theme` applies context, style and palette in one call,
# so separate `set_style` / `set_palette` calls beforehand are not needed.
sns.set_theme(context='paper', style='whitegrid', palette='dark:#5A9_r', font='sans-serif', font_scale=1.8, color_codes=True, rc=None)

# Grid of 2 rows x 3 columns; the upper row is twice as tall as the lower one.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(14, 4), gridspec_kw={'height_ratios': [2, 1]})

name = "winogrande"

# Runs of this benchmark with c == 0.1 and one of the configured V values; rows with
# `_step` <= 10 are excluded. Sorted so that alpha values are visited in ascending order.
subset = run_df.loc[
    (run_df["benchmark_name"] == name)
    & (run_df["c"] == 0.1)
    & (run_df["V"].isin(v_values_per_benchmark[name]))
    & (run_df["_step"] > 10)
].sort_values(by=["alpha"])

def format_v_value(b):
	"""Return the legend label for a V value, e.g. ``0.01`` -> ``"Ours (V=0.01)"``.

	The value is inserted with Python's default formatting, so very small floats
	appear in scientific notation (``1e-05``). No arithmetic is applied to ``b``:
	scaling it up and back down again cannot change the intended label and would
	only risk floating-point rounding noise in the text.

	Args:
		b: The V value to label.

	Returns:
		The label string ``"Ours (V=<b>)"``.
	"""
	return f"Ours (V={b})"

# Replace the numeric V with its legend label so it can be used directly as the hue.
subset["V"] = subset["V"].apply(format_v_value)

# Values that do not depend on alpha are computed once, outside the loop.
max_step = subset["_step"].max()
raw_inference_accuracies_per_model = infer_df[["benchmark_name", "label_small", "label_medium", "label_large"]].groupby("benchmark_name").mean().loc[name]
# Shared y-range of the accuracy panels, derived from the mean of label_small / label_large.
accuracy_ylim = [0.97 * raw_inference_accuracies_per_model["label_small"], 1.15 * raw_inference_accuracies_per_model["label_large"]]
v_palette = ["#2f364d", "#3f758a", "#69cf81"]
v_hue_order = ["Ours (V=0.01)", "Ours (V=0.001)", "Ours (V=0.0001)"]

# Hand-picked placement of the "SLA satisfied" arrow and text per alpha: (step, y, offset, ha).
sla_annotations = {
	0.65: (740, 0.62, 0.04, "center"),
	0.7: (803, 0.65, 0.04, "center"),
}
default_sla_annotation = (994, 0.65, 0.04, "right")

# Nothing below reads run_df; the sort is kept (once, not per alpha) only so that
# later cells still see the frame ordered by _step.
run_df = run_df.sort_values('_step')

# One column of the subplot grid per alpha value (subset is ordered by alpha).
for iterator, alpha in enumerate(subset["alpha"].unique().tolist()):
	acc_ax = axes[0][iterator]
	cost_ax = axes[1][iterator]
	alpha_subset = subset.loc[(subset["alpha"] == alpha)]

	# Accuracy Plot
	sns.lineplot(
		data=alpha_subset,
		x="_step",
		y="avg_accuracy",
		hue="V",
		errorbar=None,
		ax=acc_ax,
		legend=iterator == 0,
		palette=v_palette,
		hue_order=v_hue_order,
	)

	# Arrow pointing at (step, alpha) plus its caption.
	step, y, offset, ha = sla_annotations.get(alpha, default_sla_annotation)
	acc_ax.annotate('',
		xy=(step, alpha),         # tip of the arrow (endpoint)
		xytext=(step + 100, y),   # start point of the arrow
		arrowprops=dict(
			facecolor='black',
			shrink=0.05,      # how much to shrink the arrow from the endpoints
			width=2,          # width of arrow in points
			headwidth=8,      # width of arrow head in points
			headlength=10     # length of arrow head in points
		))
	acc_ax.text(step + 120, y - offset, f"SLA satisfied @ step {step}", fontsize=12, ha=ha)

	# The legend is rebuilt before the alpha line is drawn, so that line has no legend entry.
	if iterator == 0:
		acc_ax.legend(loc="center", ncol=6, title="Method", fontsize=7, title_fontsize=7)

	acc_ax.axhline(y=alpha, color='red', linestyle='-', label="alpha")
	acc_ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0, decimals=0))

	# Add alpha marker
	t1 = acc_ax.text(s=r"$ \alpha = {alpha_val} $ (red line)".format(alpha_val=alpha), x=max_step - 20, y=accuracy_ylim[1] - 0.04, color='red', fontsize=12, ha="right")
	t1.set_bbox(dict(facecolor='white', alpha=0.6, edgecolor='black', pad=1.5))

	# Step 1: Average running_avg_cost_usd over all rows sharing a (_step, V) pair,
	# Step 2: ordered by step so the cumulative sum below runs forward along the step axis.
	step_averages = (
		alpha_subset
		.groupby(["_step", "V"], as_index=False)['running_avg_cost_usd']
		.mean()
		.sort_values('_step')
	)

	# Step 3: Cumulative sum of the averaged values, separately for each V.
	# NOTE(review): the column names say "energy" but the source column is running_avg_cost_usd - confirm the unit.
	step_averages['cumulative_energy_sum'] = step_averages.groupby("V")["running_avg_cost_usd"].cumsum()

	# Step 4: Calculate the time average (cumulative sum divided by step)
	step_averages['time_average_energy'] = step_averages['cumulative_energy_sum'] / step_averages['_step']

	# Energy plot
	sns.lineplot(
		data=step_averages,
		x="_step",
		y="time_average_energy",
		hue="V",
		errorbar=None,
		ax=cost_ax,
		legend=False,
		palette=v_palette,
		hue_order=v_hue_order,
	)
	acc_ax.set(xlim=[0, max_step], ylim=accuracy_ylim)
	cost_ax.set(xlim=[0, max_step])

	if iterator == 0:
		# Only the first column carries y-axis labels.
		# NOTE(review): the label says "(J)" while the plotted column is running_avg_cost_usd - confirm.
		acc_ax.set(ylabel="Req. Satisfaction", xlabel=None)
		cost_ax.set(ylabel="Avg. Cost (J)", xlabel="Request")

		legend = acc_ax.get_legend()
		legend.set_frame_on(True)                      # Ensure the frame is visible
		legend.get_frame().set_facecolor('white')      # Non-transparent white background
		legend.get_frame().set_edgecolor('darkgray')   # Dark gray frame
		legend.get_frame().set_linewidth(1.5)          # Slightly thicker frame
		legend.get_frame().set_alpha(1.0)              # Fully opaque

	else:
		acc_ax.set(ylabel=None, xlabel=None)
		cost_ax.set(ylabel=None, xlabel="Request")

fig.tight_layout()

# Export legend
def export_legend(legend, filename="legend.png", expand=(-1, -1, 1, 1)):
    """Save only the area covered by ``legend`` to ``filename``.

    The owning figure is drawn first, then the legend's window extent is read,
    widened by ``expand`` and converted with the inverse of the figure's
    ``dpi_scale_trans`` before being passed to ``savefig`` as ``bbox_inches``.

    Args:
        legend: Legend object; must expose ``figure`` and ``get_window_extent()``.
        filename: Target path handed to ``savefig``.
        expand: Four offsets added element-wise to the extent's
            ``(x0, y0, x1, y1)``. The default is an immutable tuple so it can
            never be shared and modified between calls.
    """
    legend_fig = legend.figure
    legend_fig.canvas.draw()

    extent = legend.get_window_extent()
    padded = extent.from_extents(*(extent.extents + np.array(expand)))
    crop_box = padded.transformed(legend_fig.dpi_scale_trans.inverted())

    legend_fig.savefig(filename, dpi="figure", bbox_inches=crop_box)

# Write the legend of the first accuracy panel to its own file, then save the whole figure.
legend = axes[0][0].get_legend()
export_legend(legend, filename="legend_test.pdf")

# NOTE(review): the file name says "no_legend", but the legend is not removed from the figure before saving - confirm intent.
write_figure_to_disk(
	plt,
	file_name=f"{name}_alpha_v_interplay_no_legend",
	chapter_name="evaluations",
)

In [None]:
# Routing overhead table

# Classifier energy = total energy including the classifier minus the inference-only energy.
run_df["classifier_inference_energy"] = run_df["mess_plus/total_energy_incl_classifier"].sub(
	run_df["mess_plus/inference_only_energy"]
)

# Keep only rows where that difference is defined, then average per (_step, benchmark).
overhead_df = run_df.loc[run_df["classifier_inference_energy"].notna()]
avg_overhead_df = (
	overhead_df[["_step", "benchmark_name", "classifier_inference_energy", "running_avg_cost_usd"]]
	.groupby(["_step", "benchmark_name"])
	.mean()
)
# NOTE(review): this divides an energy column by a column named *_cost_usd - confirm both share a unit.
avg_overhead_df["routing_overhead_ratio"] = avg_overhead_df["classifier_inference_energy"].div(
	avg_overhead_df["running_avg_cost_usd"]
)

# Mean and standard deviation over steps for each benchmark.
pvt_overhead = avg_overhead_df.pivot_table(
	index="benchmark_name",
	values=["classifier_inference_energy", "running_avg_cost_usd", "routing_overhead_ratio"],
	aggfunc=["mean", "std"],
)
# Two benchmarks are left out of the table.
pvt_overhead = pvt_overhead.loc[~pvt_overhead.index.isin(["lambada_standard", "logiqa2"])]

# Append a summary row holding the column-wise mean over the remaining benchmarks.
pvt_overhead.loc["mean"] = pvt_overhead.mean(axis=0)

print(pvt_overhead)

In [None]:
# CREATE LATEX TABLE
def multiindex_df_to_latex_simple_for_overhead(
		df,
		chunk_size=3,
		level2_order=None,
		caption_template="Results Table Part {}",
		label_template="tab:results_part{}", include_index=True
):
	"""Render a two-level-column frame as LaTeX tables with ``value ± std`` cells.

	``df`` must have a two-level column index whose first level holds the
	statistic (including ``"std"``) and whose second level holds the metric name.
	The first-level items are split into chunks of ``chunk_size``; each chunk
	that contains at least one non-``"std"`` statistic yields one table in which
	every metric cell is ``<statistic>$\\scriptscriptstyle\\pm<std>$``.
	``running_avg_cost_usd`` and ``classifier_inference_energy`` are printed
	as-is; every other metric is multiplied by 100 first.

	Args:
		df: Frame with a (statistic, metric) column MultiIndex.
		chunk_size: Number of first-level items per table; must be >= 1.
		level2_order: Optional explicit order/selection of metrics.
		caption_template: ``str.format`` template given the 1-based chunk number.
		label_template: ``str.format`` template given the 1-based chunk number.
		include_index: Whether the frame's index is written into the table.

	Returns:
		List of LaTeX strings, one per non-empty chunk.

	Raises:
		ValueError: If ``chunk_size`` is < 1 or ``level2_order`` names a metric
			that is not present in ``df``.
	"""
	if chunk_size < 1:
		raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

	# Get unique level 1 items
	level1_items = df.columns.get_level_values(0).unique()

	# Get level 2 items (either in specified order or existing order)
	existing_level2 = df.columns.get_level_values(1).unique()
	if level2_order is None:
		level2_items = existing_level2
	else:
		# Verify all specified level2 items exist in the DataFrame
		for item in level2_order:
			if item not in existing_level2:
				raise ValueError(f"Level 2 item '{item}' not found in DataFrame")
		level2_items = level2_order

	# Metrics printed without the x100 scaling.
	unscaled_metrics = ("running_avg_cost_usd", "classifier_inference_energy")

	latex_tables = []

	# Walk the level 1 items in steps of chunk_size (no ceil() needed for the chunk count).
	for chunk_idx, start_idx in enumerate(range(0, len(level1_items), chunk_size)):
		chunk_level1_items = level1_items[start_idx:start_idx + chunk_size]

		# "std" is never a table column of its own; it only supplies the ± part.
		statistics = [level1 for level1 in chunk_level1_items if level1 != "std"]
		if not statistics:
			# A chunk holding only "std" would produce a table without any metric column.
			continue

		result_df = pd.DataFrame(index=df.index)

		# NOTE(review): columns are keyed by metric name only, so with several
		# statistics in one chunk the last one overwrites the earlier ones.
		for level1 in statistics:
			for level2 in level2_items:
				mean_val = df[level1, level2]
				std_val = df["std", level2]

				scale = 1 if level2 in unscaled_metrics else 100
				result_df[level2] = [
					f"{(m * scale):.2f}$\\scriptscriptstyle\\pm{(s * scale):.2f}$"
					for m, s in zip(mean_val, std_val)
				]

		# Convert to LaTeX
		caption = caption_template.format(chunk_idx + 1)
		label = label_template.format(chunk_idx + 1)

		latex_str = result_df.to_latex(escape=False, multicolumn=True, multicolumn_format='c', index=include_index)

		# Wrap the tabular in a table environment with caption and label
		latex_str = latex_str.replace('\\begin{tabular}',
		                              f'\\begin{{table}}\n\\caption{{{caption}}}\n\\label{{{label}}}\n\\begin{{tabular}}')
		latex_str = latex_str + '\\end{table}'

		latex_tables.append(latex_str)

	return latex_tables

# Build the LaTeX table(s) for the routing-overhead pivot; every option is spelled out explicitly.
latex_options = dict(
	chunk_size=3,
	level2_order=None,
	caption_template="Results Table Part {}",
	label_template="tab:results_part{}",
	include_index=True,
)
tables = multiindex_df_to_latex_simple_for_overhead(pvt_overhead, **latex_options)


In [None]:
# Print each generated LaTeX table so it can be copied out of the notebook.
for latex_table in tables:
	print(latex_table)