In [1]:
import json
from pathlib import Path
from typing import List

import numpy as np
import pandas as pd

In [2]:
def get_run_stats(eval_path: Path, threshold: int = 4, num_requests: int = 50):
    """Statistics of a single simulation/data/evaluate.py run.

    Reads every ``*.json`` file directly inside ``eval_path``. Each file must
    contain a ``"final_answer_rating"`` value and a ``"steps"`` entry; every
    step in ``"steps"`` must contain a ``"task_rating"`` value.

    Args:
        eval_path: Directory holding the evaluation JSON files of one run.
        threshold: A rating counts as good when it is ``>= threshold``; a task
            rating counts as bad when it is ``< threshold``.
        num_requests: Number of requests issued in the run. It is the
            denominator of the pass rate, so requests without an evaluation
            file lower the pass rate.

    Returns:
        A tuple ``(pass_rate, bad_task_rate, num_requests_completed)``:
        ``pass_rate`` is the number of good final answers divided by
        ``num_requests``; ``bad_task_rate`` is the number of bad tool steps
        divided by the total number of tool steps over all files (NaN when no
        tool step was recorded at all); ``num_requests_completed`` is the
        number of JSON files found.

    Raises:
        ValueError: If ``num_requests`` is not positive.
    """
    if num_requests <= 0:
        raise ValueError(f"num_requests must be positive, got {num_requests}")

    num_evals = 0
    num_good_answers = 0
    total_bad_tasks = 0
    total_tool_steps = 0

    # A distinct loop variable keeps the `eval_path` argument intact.
    for score_path in Path(eval_path).glob("*.json"):
        num_evals += 1

        with score_path.open("r") as f:
            score_json = json.load(f)

        if score_json["final_answer_rating"] >= threshold:
            num_good_answers += 1

        # A missing/empty step list contributes nothing to either counter.
        tool_steps = score_json["steps"] or []
        total_bad_tasks += sum(1 for step in tool_steps if step["task_rating"] < threshold)
        total_tool_steps += len(tool_steps)

    pass_rate = num_good_answers / num_requests

    # Without any tool step the rate is undefined; return NaN explicitly
    # instead of relying on a 0/0 division that triggers a runtime warning.
    if total_tool_steps:
        bad_task_rate = total_bad_tasks / total_tool_steps
    else:
        bad_task_rate = float("nan")

    num_requests_completed = num_evals

    return pass_rate, bad_task_rate, num_requests_completed


def get_series_stats(output_dirs: List[str], threshold: int = 4, num_requests: int = 50, num_evals: int = 4):
    """Statistics of one or more series of simulation/data/evaluate.py runs.

    For every directory in ``output_dirs`` the sub-directories
    ``evaluations_1`` ... ``evaluations_<num_evals>`` are passed to
    ``get_run_stats``. The per-run pass rate, bad task rate and completion
    rate (completed requests divided by ``num_requests``) are then summarized
    as ``"mean ± std"`` strings.

    Returns:
        A DataFrame with one row per series and the columns ``series``
        (directory name), ``pass_rate``, ``bad_task_rate`` and
        ``completion_rate``.
    """

    def summarize(values):
        # np.std is called with its defaults here.
        return f"{np.mean(values):.2f} ± {np.std(values):.2f}"

    series_names = []
    pass_rate_cells = []
    bad_task_rate_cells = []
    completion_rate_cells = []

    for output_dir in output_dirs:
        series_path = Path(output_dir)

        # One (pass_rate, bad_task_rate, completed) tuple per evaluation run.
        run_results = [
            get_run_stats(
                eval_path=series_path / f"evaluations_{run_number}",
                threshold=threshold,
                num_requests=num_requests,
            )
            for run_number in range(1, num_evals + 1)
        ]

        series_names.append(series_path.name)
        pass_rate_cells.append(summarize([result[0] for result in run_results]))
        bad_task_rate_cells.append(summarize([result[1] for result in run_results]))
        completion_rate_cells.append(
            summarize([result[2] / num_requests for result in run_results])
        )

    return pd.DataFrame(
        {
            "series": series_names,
            "pass_rate": pass_rate_cells,
            "bad_task_rate": bad_task_rate_cells,
            "completion_rate": completion_rate_cells,
        }
    )

In [3]:
# Summarize the four evaluation series listed below. With num_evals=4,
# get_series_stats reads evaluations_1 ... evaluations_4 under each directory
# and reports mean ± std over those runs. The directories are relative paths,
# so the result depends on the notebook's working directory.
get_series_stats(
    output_dirs=[
        "output-eval/zeroshot-8bit",
        "output-eval/finetuned-4bit",
        "output-eval/finetuned-8bit",
        "output-eval/openai",
    ],
    threshold=4,  # ratings >= 4 count as good, task ratings < 4 as bad
    num_requests=50,  # denominator of the pass rate and completion rate
    num_evals=4,
)

Unnamed: 0,series,pass_rate,bad_task_rate,completion_rate
0,zeroshot-8bit,0.72 ± 0.05,0.30 ± 0.04,0.88 ± 0.02
1,finetuned-4bit,0.89 ± 0.04,0.14 ± 0.01,0.96 ± 0.02
2,finetuned-8bit,0.88 ± 0.04,0.09 ± 0.01,0.95 ± 0.03
3,openai,0.91 ± 0.05,0.07 ± 0.01,0.97 ± 0.02
