# Set up

In [1]:
from pathlib import Path
from typing import Dict, List, NamedTuple, Optional, Tuple, Union

import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from dotenv import load_dotenv
from matplotlib.axes import Axes

# Load environment variables through python-dotenv.
load_dotenv()

# Global plot styling: seaborn "whitegrid" theme and 150-DPI figures.
sns.set_theme(style="whitegrid")
mpl.rcParams["figure.dpi"] = 150

# Results are read from "<parent of the current working directory>/results".
ROOT_DIR = Path.cwd().parent
ROOT_RESULT_DIR = ROOT_DIR / "results"

# Columns used as the DataFrame index, and the metric columns kept from each CSV.
INDEX_COLS = ["method", "iteration"]
METRIC_COLS = [
    "duration",
    "output_throughput",
    "request_throughput",
    "median_ttft_ms",
    "mean_ttft_ms",
    "gpu_prefix_cache_hit_rate",
    "cpu_prefix_cache_hit_rate",
]

# Shared list of method names. The plotting helpers below use this very list
# object as a default argument, and later cells refill it in place with
# clear()/extend(), so it must be mutated rather than rebound.
METHOD_NAMES = []

In [2]:
class Metrics(NamedTuple):
    """Describe one metric column and how it should be plotted."""

    # Name of the DataFrame column that holds the metric values.
    col: str
    # Axis label shown for the metric.
    name: str
    # Optional y-axis limits; plot_metrics forwards this as ``ylim``.
    ylim: Optional[Tuple[float, float]] = None
    # Whether standard-deviation error bars are drawn for this metric.
    plot_errors: bool = True


def get_method_names(result_dir: Path) -> List[str]:
    """Return the known method names found in a result directory.

    Reads the ``method`` column of the first CSV file (in sorted path order)
    under ``result_dir / "cleaned"``, upper-cases its values, and returns the
    canonical method names that occur there, in canonical order.

    Args:
        result_dir: Directory that contains a ``cleaned`` sub-directory of
            ``.csv`` result files.

    Returns:
        The canonical method names present in the inspected CSV.

    Raises:
        FileNotFoundError: If ``cleaned`` does not exist or holds no ``.csv``
            file.
    """
    default_method_names = [
        "NO-APC",
        "APC",
        "MT-APC",
        "MT-APC-ONLY-MT",
        "MT-APC-ONLY-ASYNC",
        "MT-APC-ONLY-SCHED",
        "MT-APC-NO-PREFETCH",
        "MT-APC-NO-SCHED",
    ]

    cleaned_dir = result_dir / "cleaned"
    # Sort so the inspected file does not depend on directory-listing order.
    result_files = sorted(
        path
        for path in cleaned_dir.iterdir()
        if path.is_file() and path.suffix == ".csv"
    )
    if not result_files:
        raise FileNotFoundError(f"No .csv result files found in {cleaned_dir}")

    methods = pd.read_csv(result_files[0])["method"]
    present = set(methods.str.upper())
    # Filtering the canonical list keeps the plotting order stable.
    return [name for name in default_method_names if name in present]


def plot_results_bar(
    df: pd.DataFrame,
    metric_col: str,
    metric_name: str,
    method_names: List[str] = METHOD_NAMES,
    figsize: Tuple[int, int] = (4, 3),
    rot: int = 0,
    plot_errs: bool = True,
    label: bool = False,
    **kwargs,
):
    """Draw a bar chart of one metric's per-method mean.

    Args:
        df: Results whose index contains the ``INDEX_COLS[0]`` level.
        metric_col: Column of ``df`` to aggregate.
        metric_name: Text used as the y-axis label.
        method_names: Methods to show, in bar order. The default is the
            module-level ``METHOD_NAMES`` list object itself, so in-place
            changes to that list are seen here.
        figsize: Figure size passed to the pandas plot call.
        rot: Rotation of the x tick labels.
        plot_errs: Draw the per-method standard deviation as error bars.
        label: Annotate each bar with its height, formatted to two decimals.
        **kwargs: Extra keyword arguments for ``Series.plot.bar``.
    """
    per_method = df[metric_col].groupby(INDEX_COLS[0])
    mean_by_method = per_method.mean()
    std_by_method = None
    if plot_errs:
        std_by_method = per_method.std()

    selected_means = mean_by_method.loc[method_names]
    ax: Axes = selected_means.plot.bar(
        yerr=std_by_method,
        capsize=4,
        rot=rot,
        xlabel=INDEX_COLS[0].capitalize(),
        ylabel=metric_name,
        figsize=figsize,
        **kwargs,
    )
    # The tick labels already name the methods, so the axis title is blanked.
    ax.set_xlabel("")

    if not label:
        return

    bar_container = ax.containers[-1]
    height_texts = [f"{patch.get_height():.2f}" for patch in bar_container]
    ax.bar_label(
        bar_container,
        labels=height_texts,
        label_type="edge",
        padding=5,
        fontsize=10,
    )


def plot_metrics(
    df: pd.DataFrame,
    metrics: List[Metrics],
    method_names: List[str] = METHOD_NAMES,
    figsize: Tuple[int, int] = (4, 3),
    rot: int = 0,
    label: bool = False,
):
    """Show one bar chart per metric, each in its own figure.

    Args:
        df: Results forwarded to ``plot_results_bar``.
        metrics: Metric descriptions; each supplies the column, axis label,
            y-limits and error-bar flag for one chart.
        method_names: Methods to show, in bar order.
        figsize: Figure size for every chart.
        rot: Rotation of the x tick labels.
        label: Annotate bars with their heights.
    """
    # Options that are identical for every chart.
    shared_options = dict(
        method_names=method_names,
        rot=rot,
        figsize=figsize,
        label=label,
    )
    for spec in metrics:
        plot_results_bar(
            df,
            spec.col,
            spec.name,
            plot_errs=spec.plot_errors,
            ylim=spec.ylim,
            **shared_options,
        )
        plt.show()


def load_benchmark_results(
    result_dir: Path,
    benchmark_name_map: Dict[str, str],
    benchmarks: Optional[List[str]] = None,
    metric: Optional[Metrics] = None,
    baseline: Optional[str] = None,
) -> pd.DataFrame:
    """Load and stack the cleaned results of several benchmarks.

    For every entry of ``benchmark_name_map`` the file
    ``result_dir / "cleaned" / "<bench_id>.csv"`` is read, method names are
    upper-cased, and the frame is indexed by ``INDEX_COLS + ["benchmark"]``
    with only ``METRIC_COLS`` kept.

    Args:
        result_dir: Directory containing the ``cleaned`` sub-directory.
        benchmark_name_map: Maps benchmark ids (CSV stems) to display names.
        benchmarks: If given, only these benchmark ids are loaded.
        metric: If given, ``metric.col`` is divided, per benchmark, by the
            mean of that column over the ``baseline`` method's rows.
        baseline: Method used for normalization; required with ``metric``.

    Returns:
        The concatenated results, or an empty DataFrame when no benchmark is
        selected.

    Raises:
        ValueError: If ``metric`` is given without ``baseline``.
    """
    # A real exception instead of ``assert``, which is stripped under -O.
    if metric is not None and baseline is None:
        raise ValueError("baseline is required when a metric is given")

    if benchmarks is not None:
        benchmark_name_map = {
            k: v for k, v in benchmark_name_map.items() if k in benchmarks
        }

    frames = []
    for bench_id, bench_name in benchmark_name_map.items():
        df = pd.read_csv(result_dir / "cleaned" / f"{bench_id}.csv")
        df[INDEX_COLS[0]] = df[INDEX_COLS[0]].str.upper()
        df["benchmark"] = bench_name
        # NOTE(review): drop_duplicates compares the metric values only, not
        # the index, so rows with identical metrics collapse - confirm intent.
        df = df.set_index(INDEX_COLS + ["benchmark"])[METRIC_COLS].drop_duplicates()
        if metric is not None:
            baseline_mean = df[metric.col].loc[baseline].mean().item()
            df[metric.col] = df[metric.col] / baseline_mean
        frames.append(df)

    if not frames:
        return pd.DataFrame()
    # One concat over real frames: no repeated copying and no empty seed frame.
    return pd.concat(frames)


def plot_normalized_metric(
    result_dir: Path,
    metric: Metrics,
    methods: List[str],
    baseline: str,
    benchmark_name_map: Dict[str, str],
    rotation: int = -15,
    label_padding: float = 5,
    benchmarks: Optional[List[str]] = None,
    **kwargs,
):
    """Plot one baseline-normalized metric for all benchmarks as grouped bars.

    Args:
        result_dir: Directory passed to ``load_benchmark_results``.
        metric: Metric to plot; its column is normalized by ``baseline``.
        methods: Methods to include (one hue per method).
        baseline: Method whose mean the metric is normalized by.
        benchmark_name_map: Maps benchmark ids to display names.
        rotation: Rotation of the x tick labels.
        label_padding: Gap between each bar and its value label.
        benchmarks: Optional subset of benchmark ids to load.
        **kwargs: Overrides for the default ``sns.catplot`` options.
    """
    data = load_benchmark_results(
        result_dir, benchmark_name_map, benchmarks, metric, baseline
    )

    # Caller-supplied keyword arguments win over these defaults.
    catplot_options = {
        "height": 4,
        "aspect": 2,
        "palette": "muted",
        "capsize": 0.2,
        "err_kws": {"linewidth": 1.5},
        **kwargs,
    }
    error_style = "sd" if metric.plot_errors else None

    grid = sns.catplot(
        data=data.loc[methods],
        x="benchmark",
        y=metric.col,
        hue="method",
        kind="bar",
        errorbar=error_style,
        legend_out=False,
        **catplot_options,
    )
    grid.set_axis_labels("", metric.name)
    grid.set_xticklabels(rotation=rotation)

    # Swap the in-axes legend for a single-row legend above the figure.
    legend_handles, legend_labels = grid.axes.item().get_legend_handles_labels()
    grid.legend.remove()
    grid.fig.legend(
        legend_handles,
        legend_labels,
        ncol=len(methods),
        loc="upper center",
        bbox_to_anchor=(0.5, 1.1),
        frameon=False,
    )

    # Annotate every bar with its height, formatted to two decimals.
    axis = grid.facet_axis(0, 0)
    for container in axis.containers:
        axis.bar_label(
            container,
            labels=[f"{bar.get_height():.2f}" for bar in container],
            label_type="edge",
            padding=label_padding,
            fontsize=8,
        )
    plt.show()


def plot_normalized_metrics(
    result_dir: Path,
    metrics: List[Metrics],
    methods: List[str],
    baselines: Union[str, List[str]],
    benchmark_name_map: Dict[str, str],
    **kwargs,
):
    """Plot several normalized metrics, one figure per metric.

    Args:
        result_dir: Directory passed on to ``plot_normalized_metric``.
        metrics: Metrics to plot.
        methods: Methods to include in every plot.
        baselines: Either one baseline method used for every metric, or a
            list giving the baseline for each metric in order.
        benchmark_name_map: Maps benchmark ids to display names.
        **kwargs: Forwarded to ``plot_normalized_metric``.

    Raises:
        ValueError: If ``baselines`` is a list whose length differs from
            ``metrics``.
    """
    if isinstance(baselines, str):
        baselines = [baselines] * len(metrics)
    elif len(baselines) != len(metrics):
        # zip() would otherwise silently skip the unmatched metrics.
        raise ValueError(
            f"Expected {len(metrics)} baselines (one per metric), "
            f"got {len(baselines)}"
        )
    for metric, baseline in zip(metrics, baselines):
        plot_normalized_metric(
            result_dir, metric, methods, baseline, benchmark_name_map, **kwargs
        )


def plot_gpu_cpu_cache_hit_rate_per_benchmark(
    result_dir: Path,
    benchmark_name_map: Dict[str, str],
    benchmarks: Optional[List[str]] = None,
    rotation: int = -15,
    figsize: Tuple[float, float] = (7, 4),
):
    """Compare APC and MT-APC prefix-cache hit rates for each benchmark.

    For every benchmark two bars are drawn side by side: the mean GPU hit
    rate of ``APC``, and the mean GPU hit rate of ``MT-APC`` with its mean
    CPU hit rate stacked on top.

    Args:
        result_dir: Directory passed to ``load_benchmark_results``.
        benchmark_name_map: Maps benchmark ids to display names; its order
            is the order of the bar groups.
        benchmarks: Optional subset of benchmark ids to plot.
        rotation: Rotation of the x tick labels.
        figsize: Size of the created figure.
    """
    df = load_benchmark_results(result_dir, benchmark_name_map, benchmarks)
    df = df.groupby(["benchmark", "method"]).mean().reset_index()

    # Plot exactly the benchmarks that were loaded, in map order, so bars,
    # ticks and tick labels stay aligned when `benchmarks` filters the map.
    plotted_names = [
        name
        for bench_id, name in benchmark_name_map.items()
        if benchmarks is None or bench_id in benchmarks
    ]

    def rows_for(benchmark: str, method: str) -> pd.DataFrame:
        """Select the aggregated row(s) of one benchmark/method pair."""
        return df[(df["benchmark"] == benchmark) & (df["method"] == method)]

    _, ax = plt.subplots(figsize=figsize)
    bar_width = 0.35
    colors = sns.color_palette("muted").as_hex()

    for i, benchmark in enumerate(plotted_names):
        apc_data = rows_for(benchmark, "APC")
        mt_apc_data = rows_for(benchmark, "MT-APC")

        # Only the first group carries labels so each legend entry appears once.
        ax.bar(
            i - bar_width / 2,
            apc_data["gpu_prefix_cache_hit_rate"],
            bar_width,
            label="APC GPU" if i == 0 else "",
            color=colors[0],
        )
        ax.bar(
            i + bar_width / 2,
            mt_apc_data["gpu_prefix_cache_hit_rate"],
            bar_width,
            label="MT-APC GPU" if i == 0 else "",
            color=colors[1],
        )
        # The CPU hit rate is stacked on top of the MT-APC GPU hit rate.
        ax.bar(
            i + bar_width / 2,
            mt_apc_data["cpu_prefix_cache_hit_rate"],
            bar_width,
            bottom=mt_apc_data["gpu_prefix_cache_hit_rate"],
            label="MT-APC CPU" if i == 0 else "",
            color=colors[2],
        )

    ax.set_xticks(range(len(plotted_names)))
    ax.set_xticklabels(plotted_names, rotation=rotation)
    ax.set_ylabel("Prefix Cache Hit Rate (%)")
    ax.set_ylim(0, 100)
    ax.legend(loc="upper right", fontsize=9)

    plt.show()


def plot_gpu_cpu_cache_hit_rate(
    df: pd.DataFrame,
    method_names: List[str] = METHOD_NAMES,
    metric_name: str = "Prefix Cache Hit Rate (%)",
    plot_errors: bool = False,
    rotation: int = -15,
    label_padding: float = 5,
    **kwargs,
):
    """Plot GPU and CPU prefix-cache hit rates side by side for each method.

    Args:
        df: Results indexed by method and iteration that contain the
            ``gpu_prefix_cache_hit_rate`` and ``cpu_prefix_cache_hit_rate``
            columns.
        method_names: Methods to show. The default is the module-level
            ``METHOD_NAMES`` list object itself.
        metric_name: Text used as the y-axis label.
        plot_errors: Draw standard-deviation error bars.
        rotation: Rotation of the x tick labels.
        label_padding: Gap between each bar and its value label.
        **kwargs: Overrides for the default ``sns.catplot`` options.
    """
    # Caller-supplied keyword arguments win over these defaults.
    catplot_options = {
        "height": 4,
        "aspect": 2,
        "palette": "muted",
        "capsize": 0.2,
        "err_kws": {"linewidth": 1.5},
        **kwargs,
    }

    # Long format: one row per (method, iteration, device) with its hit rate.
    long_df = df.reset_index().melt(
        id_vars=["method", "iteration"],
        value_vars=["gpu_prefix_cache_hit_rate", "cpu_prefix_cache_hit_rate"],
        var_name="device",
        value_name="hit_rate",
    )
    # Reduce the column names to the device labels "GPU" and "CPU".
    device_names = long_df["device"].str.replace("_prefix_cache_hit_rate", "")
    long_df["device"] = device_names.str.upper()
    long_df = long_df.set_index(["method", "iteration"])

    error_style = "sd" if plot_errors else None
    grid = sns.catplot(
        data=long_df.loc[method_names],
        x="method",
        y="hit_rate",
        hue="device",
        kind="bar",
        errorbar=error_style,
        **catplot_options,
    )
    grid.set_axis_labels("Method", metric_name)
    grid.set_xticklabels(rotation=rotation)
    grid.legend.set_title("")

    # Annotate every bar with its height, formatted to two decimals.
    axis = grid.facet_axis(0, 0)
    for container in axis.containers:
        axis.bar_label(
            container,
            labels=[f"{bar.get_height():.2f}" for bar in container],
            label_type="edge",
            padding=label_padding,
            fontsize=10,
        )
    plt.show()

# Standard Workloads

In [None]:
# Benchmark id (CSV file stem under "cleaned/") -> display name used in plots.
BENCHMARK_NAME_MAP = {
    "sharegpt": "ShareGPT",
    "generative_agents": "Generative Agents",
    "multiturn_long": "Multi-turn Chat (Long)",
    "multiturn_short": "Multi-turn Chat (Short)",
    "guessing_game": "Guessing Game",
}

RESULT_DIR = ROOT_RESULT_DIR / "standard"
# Refill the shared list in place: the plotting helpers hold a reference to
# this same list object as their default `method_names`.
METHOD_NAMES.clear()
METHOD_NAMES.extend(get_method_names(RESULT_DIR))
METHOD_NAMES

## ShareGPT

In [4]:
# Load the cleaned ShareGPT results, indexed by (method, iteration).
bench_name = "sharegpt"
result_dir = RESULT_DIR / "cleaned"
result_df = pd.read_csv(result_dir / f"{bench_name}.csv")
metric_cols = METRIC_COLS
# Upper-case the method names so they match the entries of METHOD_NAMES.
result_df[INDEX_COLS[0]] = result_df[INDEX_COLS[0]].str.upper()
result_df = result_df.set_index(INDEX_COLS)[metric_cols].drop_duplicates()

In [None]:
# Per-method mean, then per-method standard deviation, of every metric column.
display(result_df.groupby([INDEX_COLS[0]]).mean().loc[METHOD_NAMES])
display(result_df.groupby([INDEX_COLS[0]]).std().loc[METHOD_NAMES])

In [6]:
# Metrics plotted for ShareGPT, each with a fixed y-axis range.
# Median TTFT is drawn without error bars.
metrics = [
    Metrics(
        col="duration",
        name="Job Completion Time (s)",
        ylim=(0, 60),
    ),
    Metrics(
        col="request_throughput",
        name="Request Throughput (request/s)",
        ylim=(0, 3.5),
    ),
    Metrics(
        col="output_throughput",
        name="Output Throughput (tps)",
        ylim=(0, 690),
    ),
    Metrics(
        col="median_ttft_ms",
        name="Median TTFT (ms)",
        ylim=(0, 95),
        plot_errors=False,
    ),
]

In [None]:
# Plot each metric for the first three entries of METHOD_NAMES.
plot_metrics(result_df, metrics=metrics, method_names=METHOD_NAMES[:3], figsize=(4, 3))

In [None]:
# Plot each metric for METHOD_NAMES[2:], on wider figures with tick labels rotated by 15 degrees.
plot_metrics(result_df, metrics=metrics, method_names=METHOD_NAMES[2:], figsize=(6, 3), rot=15)

## Generative Agents

In [9]:
# Load the cleaned Generative Agents results, indexed by (method, iteration).
bench_name = "generative_agents"
result_dir = RESULT_DIR / "cleaned"
result_df = pd.read_csv(result_dir / f"{bench_name}.csv")
metric_cols = METRIC_COLS
# Upper-case the method names so they match the entries of METHOD_NAMES.
result_df[INDEX_COLS[0]] = result_df[INDEX_COLS[0]].str.upper()
result_df = result_df.set_index(INDEX_COLS)[metric_cols].drop_duplicates()

In [None]:
# Per-method mean, then per-method standard deviation, of every metric column.
display(result_df.groupby([INDEX_COLS[0]]).mean().loc[METHOD_NAMES])
display(result_df.groupby([INDEX_COLS[0]]).std().loc[METHOD_NAMES])

In [11]:
# Metrics plotted for Generative Agents, each with a fixed y-axis range.
# Median TTFT is drawn without error bars.
metrics = [
    Metrics(
        col="duration",
        name="Job Completion Time (s)",
        ylim=(0, 25),
    ),
    Metrics(
        col="request_throughput",
        name="Request Throughput (request/s)",
        ylim=(0, 10),
    ),
    Metrics(
        col="output_throughput",
        name="Output Throughput (tps)",
        ylim=(0, 38),
    ),
    Metrics(
        col="median_ttft_ms",
        name="Median TTFT (ms)",
        ylim=(0, 60),
        plot_errors=False,
    ),
]

In [None]:
# Plot each metric for the first three entries of METHOD_NAMES.
plot_metrics(result_df, metrics=metrics, method_names=METHOD_NAMES[:3], figsize=(4, 3))

In [None]:
# Plot each metric for METHOD_NAMES[2:], on wider figures with tick labels rotated by 15 degrees.
plot_metrics(result_df, metrics=metrics, method_names=METHOD_NAMES[2:], figsize=(6, 3), rot=15)

## Multi-turn Chat (Long)

In [14]:
# Load the cleaned multiturn_long results, indexed by (method, iteration).
bench_name = "multiturn_long"
result_dir = RESULT_DIR / "cleaned"
result_df = pd.read_csv(result_dir / f"{bench_name}.csv")
metric_cols = METRIC_COLS
# Upper-case the method names so they match the entries of METHOD_NAMES.
result_df[INDEX_COLS[0]] = result_df[INDEX_COLS[0]].str.upper()
result_df = result_df.set_index(INDEX_COLS)[metric_cols].drop_duplicates()

In [None]:
# Per-method mean, then per-method standard deviation, of every metric column.
display(result_df.groupby([INDEX_COLS[0]]).mean().loc[METHOD_NAMES])
display(result_df.groupby([INDEX_COLS[0]]).std().loc[METHOD_NAMES])

In [16]:
# Metrics plotted for the long multi-turn chat benchmark, each with a fixed
# y-axis range. Median TTFT is drawn without error bars.
metrics = [
    Metrics(
        col="duration",
        name="Job Completion Time (s)",
        ylim=(0, 21),
    ),
    Metrics(
        col="request_throughput",
        name="Request Throughput (request/s)",
        ylim=(0, 2.2),
    ),
    Metrics(
        col="output_throughput",
        name="Output Throughput (tps)",
        ylim=(0, 205),
    ),
    Metrics(
        col="median_ttft_ms",
        name="Median TTFT (ms)",
        ylim=(0, 1100),
        plot_errors=False,
    ),
]

In [None]:
# Plot each metric for the first three entries of METHOD_NAMES.
plot_metrics(result_df, metrics=metrics, method_names=METHOD_NAMES[:3], figsize=(4, 3))

In [18]:
# Redefines `metrics` for the METHOD_NAMES[2:] plots: the entries match the
# earlier multiturn_long definition except for a larger median-TTFT y-limit
# (1250 instead of 1100).
metrics = [
    Metrics(
        col="duration",
        name="Job Completion Time (s)",
        ylim=(0, 21),
    ),
    Metrics(
        col="request_throughput",
        name="Request Throughput (request/s)",
        ylim=(0, 2.2),
    ),
    Metrics(
        col="output_throughput",
        name="Output Throughput (tps)",
        ylim=(0, 205),
    ),
    Metrics(
        col="median_ttft_ms",
        name="Median TTFT (ms)",
        ylim=(0, 1250),
        plot_errors=False,
    ),
]

In [None]:
# Plot each metric for METHOD_NAMES[2:], on wider figures with tick labels rotated by 15 degrees.
plot_metrics(result_df, metrics=metrics, method_names=METHOD_NAMES[2:], figsize=(6, 3), rot=15)

## Multi-turn Chat (Short)

In [20]:
# Load the cleaned multiturn_short results, indexed by (method, iteration).
bench_name = "multiturn_short"
result_dir = RESULT_DIR / "cleaned"
result_df = pd.read_csv(result_dir / f"{bench_name}.csv")
metric_cols = METRIC_COLS
# Upper-case the method names so they match the entries of METHOD_NAMES.
result_df[INDEX_COLS[0]] = result_df[INDEX_COLS[0]].str.upper()
result_df = result_df.set_index(INDEX_COLS)[metric_cols].drop_duplicates()

In [None]:
# Per-method mean, then per-method standard deviation, of every metric column.
display(result_df.groupby([INDEX_COLS[0]]).mean().loc[METHOD_NAMES])
display(result_df.groupby([INDEX_COLS[0]]).std().loc[METHOD_NAMES])

In [22]:
# Metrics plotted for the short multi-turn chat benchmark, each with a fixed
# y-axis range. Median TTFT is drawn without error bars.
metrics = [
    Metrics(
        col="duration",
        name="Job Completion Time (s)",
        ylim=(0, 13),
    ),
    Metrics(
        col="request_throughput",
        name="Request Throughput (request/s)",
        ylim=(0, 7),
    ),
    Metrics(
        col="output_throughput",
        name="Output Throughput (tps)",
        ylim=(0, 42),
    ),
    Metrics(
        col="median_ttft_ms",
        name="Median TTFT (ms)",
        ylim=(0, 1700),
        plot_errors=False,
    ),
]

In [None]:
# Plot each metric for the first three entries of METHOD_NAMES.
plot_metrics(result_df, metrics=metrics, method_names=METHOD_NAMES[:3], figsize=(4, 3))

In [None]:
# Plot each metric for METHOD_NAMES[2:], on wider figures with tick labels rotated by 15 degrees.
plot_metrics(result_df, metrics=metrics, method_names=METHOD_NAMES[2:], figsize=(6, 3), rot=15)

## Guessing Game

In [25]:
# Load the cleaned Guessing Game results, indexed by (method, iteration).
bench_name = "guessing_game"
result_dir = RESULT_DIR / "cleaned"
result_df = pd.read_csv(result_dir / f"{bench_name}.csv")
metric_cols = METRIC_COLS
# Upper-case the method names so they match the entries of METHOD_NAMES.
result_df[INDEX_COLS[0]] = result_df[INDEX_COLS[0]].str.upper()
result_df = result_df.set_index(INDEX_COLS)[metric_cols].drop_duplicates()

In [None]:
# Per-method mean, then per-method standard deviation, of every metric column.
display(result_df.groupby([INDEX_COLS[0]]).mean().loc[METHOD_NAMES])
display(result_df.groupby([INDEX_COLS[0]]).std().loc[METHOD_NAMES])

In [27]:
# Metrics plotted for Guessing Game, each with a fixed y-axis range.
# Median TTFT is drawn without error bars.
metrics = [
    Metrics(
        col="duration",
        name="Job Completion Time (s)",
        ylim=(0, 8.5),
    ),
    Metrics(
        col="request_throughput",
        name="Request Throughput (request/s)",
        ylim=(0, 60),
    ),
    Metrics(
        col="output_throughput",
        name="Output Throughput (tps)",
        ylim=(0, 200),
    ),
    Metrics(
        col="median_ttft_ms",
        name="Median TTFT (ms)",
        ylim=(0, 600),
        plot_errors=False,
    ),
]

In [None]:
# Plot each metric for the first three entries of METHOD_NAMES.
plot_metrics(result_df, metrics=metrics, method_names=METHOD_NAMES[:3], figsize=(4, 3))

In [None]:
# Plot each metric for METHOD_NAMES[2:], on wider figures with tick labels rotated by 15 degrees.
plot_metrics(result_df, metrics=metrics, method_names=METHOD_NAMES[2:], figsize=(6, 3), rot=15)

## Cache hit rate

In [None]:
# Mean GPU prefix-cache hit rate of the "apc" method, one row per benchmark.
apc_frames = []
for bench_id, bench_name in BENCHMARK_NAME_MAP.items():
    df = pd.read_csv(RESULT_DIR / f"cleaned/{bench_id}.csv")
    # The raw CSV still has lower-case method names at this point.
    df = df.loc[df["method"] == "apc", ["method", "gpu_prefix_cache_hit_rate"]]
    df = df.groupby("method").mean()
    df["benchmark"] = bench_name
    apc_frames.append(df)
# Concatenate once instead of growing a frame seeded with an empty DataFrame.
combined_df = pd.concat(apc_frames)

g = sns.catplot(
    data=combined_df,
    x="benchmark",
    y="gpu_prefix_cache_hit_rate",
    kind="bar",
    errorbar=None,
    height=2.75,
    aspect=2,
)
g.set_axis_labels("", "Prefix Cache Hit Rate (%)")
g.set_xticklabels(rotation=-15)
g.set(ylim=(0, 100))

# Annotate every bar with its height, formatted to two decimals.
ax = g.facet_axis(0, 0)
for c in ax.containers:
    labels = [f"{v.get_height():.2f}" for v in c]
    ax.bar_label(c, labels=labels, label_type="edge", padding=5, fontsize=10)
plt.show()

## Normalized

In [31]:
# Metrics for the normalized cross-benchmark plots. No y-limits are set here,
# and median TTFT is drawn without error bars.
metrics = [
    Metrics(
        col="duration",
        name="JCT (normalized)",
    ),
    Metrics(
        col="request_throughput",
        name="Request Throughput\n(normalized)",
    ),
    Metrics(
        col="output_throughput",
        name="Output Throughput\n(normalized)",
    ),
    Metrics(
        col="median_ttft_ms",
        name="Median TTFT\n(normalized)",
        plot_errors=False,
    ),
]

In [None]:
# One grouped bar chart per metric across all benchmarks. baselines[i] names
# the method whose mean is used to normalize metrics[i].
plot_normalized_metrics(
    RESULT_DIR,
    metrics,
    methods=METHOD_NAMES[:3],
    baselines=["NO-APC", "MT-APC", "MT-APC", "NO-APC"],
    benchmark_name_map=BENCHMARK_NAME_MAP,
    height=2.5,
    aspect=2.5,
)

In [None]:
# APC GPU hit rate next to the stacked MT-APC GPU + CPU hit rate, per benchmark.
plot_gpu_cpu_cache_hit_rate_per_benchmark(RESULT_DIR, BENCHMARK_NAME_MAP, figsize=(6, 2.75))

## Ablation Study

In [34]:
# Reload the Guessing Game results for the ablation study.
bench_name = "guessing_game"
result_df = pd.read_csv(RESULT_DIR / f"cleaned/{bench_name}.csv")
metric_cols = METRIC_COLS
result_df[INDEX_COLS[0]] = result_df[INDEX_COLS[0]].str.upper()
result_df = result_df.set_index(INDEX_COLS)[metric_cols].drop_duplicates()

# Normalize every metric except the two cache hit rates by the mean of the
# MT-APC-ONLY-MT method.
for col in metric_cols:
    if col not in ["gpu_prefix_cache_hit_rate", "cpu_prefix_cache_hit_rate"]:
        result_df[col] = result_df[col] / result_df[col].loc["MT-APC-ONLY-MT"].mean().item()

In [None]:
# Per-method mean, then per-method standard deviation, of every metric column.
display(result_df.groupby([INDEX_COLS[0]]).mean().loc[METHOD_NAMES])
display(result_df.groupby([INDEX_COLS[0]]).std().loc[METHOD_NAMES])

In [36]:
# Metrics for the ablation plots, all sharing the y-axis range (0, 1.3).
# This list uses mean TTFT, and every metric keeps its error bars.
metrics = [
    Metrics(
        col="duration",
        name="JCT (normalized)",
        ylim=(0, 1.3),
    ),
    Metrics(
        col="request_throughput",
        name="Request Throughput\n(normalized)",
        ylim=(0, 1.3),
    ),
    Metrics(
        col="output_throughput",
        name="Output Throughput\n(normalized)",
        ylim=(0, 1.3),
    ),
    Metrics(
        col="mean_ttft_ms",
        name="Mean TTFT\n(normalized)",
        ylim=(0, 1.3),
    ),
]

In [None]:
# Relabel methods for the ablation plots: the full system "MT-APC" is shown as
# "ALL" and every variant loses the shared "MT-APC-" prefix. removeprefix()
# leaves names without that prefix (e.g. "NO-APC", "APC") unchanged, instead
# of slicing them down to the same empty string.
def _ablation_label(method: str) -> str:
    full_name = "MT-APC-ALL" if method == "MT-APC" else method
    return full_name.removeprefix("MT-APC-")


# Rename only the method level of the (method, iteration) index.
tmp_df = result_df.rename(index=_ablation_label, level=INDEX_COLS[0])
method_names = [name.removeprefix("MT-APC-") for name in METHOD_NAMES[3:-1]] + ["ALL"]

plot_metrics(
    tmp_df,
    metrics=metrics,
    method_names=method_names,
    figsize=(5.5, 2.44),
    rot=-15,
    label=True,
)

In [None]:
# GPU/CPU hit rates in the order METHOD_NAMES[1], METHOD_NAMES[3:], METHOD_NAMES[2].
plot_gpu_cpu_cache_hit_rate(
    result_df,
    method_names=[METHOD_NAMES[1]] + METHOD_NAMES[3:] + [METHOD_NAMES[2]],
)

# Large-scale Multi-Agent Workloads

In [None]:
# Rebind the benchmark map and result directory for the multi-agent workloads.
BENCHMARK_NAME_MAP = {
    "gptswarm_mmlu": "GPTSwarm-MMLU",
    "guessing_game_e2e": "Guessing Game (DP)",
    "guessing_game_e2e_cot": "Guessing Game (CoT)",
}

RESULT_DIR = ROOT_RESULT_DIR / "multi_agent"
# Refill the shared list in place: the plotting helpers hold a reference to
# this same list object as their default `method_names`.
METHOD_NAMES.clear()
METHOD_NAMES.extend(get_method_names(RESULT_DIR))
METHOD_NAMES

## GPTSwarm-MMLU

In [40]:
# Load the cleaned GPTSwarm-MMLU results, indexed by (method, iteration).
bench_name = "gptswarm_mmlu"
result_dir = RESULT_DIR / "cleaned"
result_df = pd.read_csv(result_dir / f"{bench_name}.csv")
metric_cols = METRIC_COLS
# Upper-case the method names so they match the entries of METHOD_NAMES.
result_df[INDEX_COLS[0]] = result_df[INDEX_COLS[0]].str.upper()
result_df = result_df.set_index(INDEX_COLS)[metric_cols].drop_duplicates()

In [None]:
# Per-method mean, then per-method standard deviation, of every metric column.
display(result_df.groupby([INDEX_COLS[0]]).mean().loc[METHOD_NAMES])
display(result_df.groupby([INDEX_COLS[0]]).std().loc[METHOD_NAMES])

In [42]:
# Metrics plotted for GPTSwarm-MMLU, each with a fixed y-axis range.
# Median TTFT is drawn without error bars.
metrics = [
    Metrics(
        col="duration",
        name="Job Completion Time (s)",
        ylim=(0, 650),
    ),
    Metrics(
        col="request_throughput",
        name="Request Throughput (request/s)",
        ylim=(0, 180),
    ),
    Metrics(
        col="output_throughput",
        name="Output Throughput (tps)",
        ylim=(0, 300),
    ),
    Metrics(
        col="median_ttft_ms",
        name="Median TTFT (ms)",
        ylim=(0, 16000),
        plot_errors=False,
    ),
]

In [None]:
# Plot each metric for the first three entries of METHOD_NAMES.
plot_metrics(result_df, metrics=metrics, method_names=METHOD_NAMES[:3], figsize=(4, 3))

In [None]:
# Plot each metric for METHOD_NAMES[2:], on wider figures with tick labels rotated by 15 degrees.
plot_metrics(result_df, metrics=metrics, method_names=METHOD_NAMES[2:], figsize=(6, 3), rot=15)

## Guessing Game (Direct Prompting)

In [45]:
# Load the cleaned guessing_game_e2e results, indexed by (method, iteration).
bench_name = "guessing_game_e2e"
result_dir = RESULT_DIR / "cleaned"
result_df = pd.read_csv(result_dir / f"{bench_name}.csv")
metric_cols = METRIC_COLS
# Upper-case the method names so they match the entries of METHOD_NAMES.
result_df[INDEX_COLS[0]] = result_df[INDEX_COLS[0]].str.upper()
result_df = result_df.set_index(INDEX_COLS)[metric_cols].drop_duplicates()

In [None]:
# Per-method mean, then per-method standard deviation, of every metric column.
display(result_df.groupby([INDEX_COLS[0]]).mean().loc[METHOD_NAMES])
display(result_df.groupby([INDEX_COLS[0]]).std().loc[METHOD_NAMES])

In [47]:
# Metrics plotted for the direct-prompting Guessing Game, each with a fixed
# y-axis range. Median TTFT is drawn without error bars.
metrics = [
    Metrics(
        col="duration",
        name="Job Completion Time (s)",
        ylim=(0, 300),
    ),
    Metrics(
        col="request_throughput",
        name="Request Throughput (request/s)",
        ylim=(0, 42),
    ),
    Metrics(
        col="output_throughput",
        name="Output Throughput (tps)",
        ylim=(0, 150),
    ),
    Metrics(
        col="median_ttft_ms",
        name="Median TTFT (ms)",
        ylim=(0, 23000),
        plot_errors=False,
    ),
]

In [None]:
# Plot each metric for the first three entries of METHOD_NAMES.
plot_metrics(result_df, metrics=metrics, method_names=METHOD_NAMES[:3], figsize=(4, 3))

In [None]:
# Plot each metric for METHOD_NAMES[2:], on wider figures with tick labels rotated by 15 degrees.
plot_metrics(result_df, metrics=metrics, method_names=METHOD_NAMES[2:], figsize=(6, 3), rot=15)

## Guessing Game (CoT)

In [50]:
# Load the cleaned guessing_game_e2e_cot results, indexed by (method, iteration).
bench_name = "guessing_game_e2e_cot"
result_dir = RESULT_DIR / "cleaned"
result_df = pd.read_csv(result_dir / f"{bench_name}.csv")
metric_cols = METRIC_COLS
# Upper-case the method names so they match the entries of METHOD_NAMES.
result_df[INDEX_COLS[0]] = result_df[INDEX_COLS[0]].str.upper()
result_df = result_df.set_index(INDEX_COLS)[metric_cols].drop_duplicates()

In [None]:
# Per-method mean, then per-method standard deviation, of every metric column.
display(result_df.groupby([INDEX_COLS[0]]).mean().loc[METHOD_NAMES])
display(result_df.groupby([INDEX_COLS[0]]).std().loc[METHOD_NAMES])

In [52]:
# Metrics plotted for the chain-of-thought Guessing Game, each with a fixed
# y-axis range. Median TTFT is drawn without error bars.
metrics = [
    Metrics(
        col="duration",
        name="Job Completion Time (s)",
        ylim=(0, 1250),
    ),
    Metrics(
        col="request_throughput",
        name="Request Throughput (request/s)",
        ylim=(0, 7),
    ),
    Metrics(
        col="output_throughput",
        name="Output Throughput (tps)",
        ylim=(0, 950),
    ),
    Metrics(
        col="median_ttft_ms",
        name="Median TTFT (ms)",
        ylim=(0, 68000),
        plot_errors=False,
    ),
]

In [None]:
# Plot each metric for the first three entries of METHOD_NAMES.
plot_metrics(result_df, metrics=metrics, method_names=METHOD_NAMES[:3], figsize=(4, 3))

In [None]:
# Plot each metric for METHOD_NAMES[2:], on wider figures with tick labels rotated by 15 degrees.
plot_metrics(result_df, metrics=metrics, method_names=METHOD_NAMES[2:], figsize=(6, 3), rot=15)

## Cache hit rate

In [None]:
# Mean GPU prefix-cache hit rate of the "apc" method, one row per benchmark.
apc_frames = []
for bench_id, bench_name in BENCHMARK_NAME_MAP.items():
    df = pd.read_csv(RESULT_DIR / f"cleaned/{bench_id}.csv")
    # The raw CSV still has lower-case method names at this point.
    df = df.loc[df["method"] == "apc", ["method", "gpu_prefix_cache_hit_rate"]]
    df = df.groupby("method").mean()
    df["benchmark"] = bench_name
    apc_frames.append(df)
# Concatenate once instead of growing a frame seeded with an empty DataFrame.
combined_df = pd.concat(apc_frames)

g = sns.catplot(
    data=combined_df,
    x="benchmark",
    y="gpu_prefix_cache_hit_rate",
    kind="bar",
    errorbar=None,
    height=2.75,
    aspect=2,
)
g.set_axis_labels("", "Prefix Cache Hit Rate (%)")
g.set_xticklabels(rotation=-15)
g.set(ylim=(0, 100))

# Annotate every bar with its height, formatted to two decimals.
ax = g.facet_axis(0, 0)
for c in ax.containers:
    labels = [f"{v.get_height():.2f}" for v in c]
    ax.bar_label(c, labels=labels, label_type="edge", padding=5, fontsize=10)
plt.show()

## Normalized

In [56]:
# Metrics for the normalized cross-benchmark plots. No y-limits are set here,
# and median TTFT is drawn without error bars.
metrics = [
    Metrics(
        col="duration",
        name="JCT (normalized)",
    ),
    Metrics(
        col="request_throughput",
        name="Request Throughput\n(normalized)",
    ),
    Metrics(
        col="output_throughput",
        name="Output Throughput\n(normalized)",
    ),
    Metrics(
        col="median_ttft_ms",
        name="Median TTFT\n(normalized)",
        plot_errors=False,
    ),
]

In [None]:
# One grouped bar chart per metric across all benchmarks. baselines[i] names
# the method whose mean is used to normalize metrics[i]; tick labels are not
# rotated here.
plot_normalized_metrics(
    RESULT_DIR,
    metrics,
    methods=METHOD_NAMES[:3],
    baselines=["NO-APC", "MT-APC", "MT-APC", "NO-APC"],
    benchmark_name_map=BENCHMARK_NAME_MAP,
    rotation=0,
    height=2.5,
    aspect=2.5,
)

In [None]:
# APC GPU hit rate next to the stacked MT-APC GPU + CPU hit rate, per benchmark.
plot_gpu_cpu_cache_hit_rate_per_benchmark(
    RESULT_DIR, BENCHMARK_NAME_MAP, rotation=-15, figsize=(6, 2.75)
)

# GPTSwarm Graphs

In [59]:
import os

# Point SERVER_CONFIG_PATH at the absolute path of ROOT_DIR/.env.
# NOTE(review): presumably read by the swarm/agents imports in the next cell,
# so it is set before them - confirm.
os.environ["SERVER_CONFIG_PATH"] = str((ROOT_DIR / ".env").absolute())

In [None]:
from swarm.environment.operations.final_decision import MergingStrategy
from swarm.graph.swarm import Swarm
from swarm.llm import OPENAI_MODEL_PREFIX
from swarm.utils.const import GPTSWARM_ROOT

from agents.config import BaseClientConfig
from agents.gptswarm.guessing_game.guessing_game import GuessTwoThirdGame


# Model identifier passed to the swarm: OPENAI_MODEL_PREFIX followed by the
# model name from the client config.
config = BaseClientConfig()
MODEL_NAME = OPENAI_MODEL_PREFIX + config.model


def display_swarm_graph(swarm: Swarm, file_path: Path):
    """Render a swarm's realized graph and move the output to ``file_path``.

    Args:
        swarm: Object exposing ``connection_dist`` and ``composite_graph``.
        file_path: Destination of the rendered file; only its name is passed
            to ``graph.display``.
    """
    graph, _ = swarm.connection_dist.realize(swarm.composite_graph)
    file_name = file_path.name
    graph.display(file_name=file_name)
    # NOTE(review): assumes graph.display writes its output to
    # GPTSWARM_ROOT / "result" / file_name - confirm against GPTSwarm.
    rendered_path = GPTSWARM_ROOT / "result" / file_name
    # Make sure the destination directory exists before moving the file.
    file_path.parent.mkdir(parents=True, exist_ok=True)
    # replace() overwrites an existing target on every platform, so re-running
    # the cell works; rename() raises on Windows when the target exists.
    rendered_path.replace(file_path)

## GPTSwarm-MMLU

In [None]:
# Swarm of three "IO" agents and three "AdversarialAgent" agents on the MMLU
# domain, merged by majority vote; its graph is saved under the results root.
swarm = Swarm(
    agent_names=["IO"] * 3 + ["AdversarialAgent"] * 3,
    domain="mmlu",
    model_name=MODEL_NAME,
    final_node_class="FinalDecision",
    final_node_kwargs=dict(strategy=MergingStrategy.MajorityVote),
    edge_optimize=False,
)
display_swarm_graph(swarm, ROOT_RESULT_DIR / "gptswarm_mmlu.html")

## Guessing Game

In [None]:
# Guessing game with 3 participants and 5 steps; its graph is saved under the
# results root. The model name is the same expression as MODEL_NAME.
# NOTE(review): display_swarm_graph is annotated to take a Swarm - confirm
# that GuessTwoThirdGame provides connection_dist and composite_graph.
game = GuessTwoThirdGame(
    model_name=OPENAI_MODEL_PREFIX + config.model,
    num_participants=3,
    num_steps=5,
)
display_swarm_graph(game, ROOT_RESULT_DIR / "guessing_game.html")