In [1]:
import json
from pathlib import Path
from typing import List

import numpy as np
import pandas as pd
from scipy.stats import ttest_ind

In [2]:
def get_run_stats(eval_path: Path, threshold: int = 4, num_requests: int = 50):
    """Statistics of a single simulation/data/evaluate.py run.

    Every ``*.json`` file directly inside ``eval_path`` is read as the
    evaluation of one request. Each file must provide a
    ``final_answer_rating`` and a ``steps`` list whose entries carry a
    ``task_rating``.

    Args:
        eval_path: Directory containing the evaluation JSON files of one run.
        threshold: A final answer rated ``>= threshold`` counts as good; a
            step rated ``< threshold`` counts as a bad task.
        num_requests: Denominator of the pass rate. Requests without an
            evaluation file therefore lower the pass rate.

    Returns:
        A tuple ``(pass_rate, bad_task_rate, num_requests_completed)``:
        good answers divided by ``num_requests``, bad tasks divided by the
        total number of tool steps over all files, and the number of
        evaluation files found. ``bad_task_rate`` is NaN when no tool step
        was recorded, because the rate is undefined in that case.
    """

    num_evals = 0
    num_good_answers = 0
    num_bad_tasks = 0
    num_tool_steps = 0

    # The loop variable has its own name so the `eval_path` argument is not
    # overwritten; sorting makes the processing order deterministic.
    for score_path in sorted(eval_path.glob("*.json")):
        num_evals += 1

        with score_path.open("r", encoding="utf-8") as f:
            score_json = json.load(f)

        if score_json["final_answer_rating"] >= threshold:
            num_good_answers += 1

        tool_steps = score_json["steps"]

        # A falsy `steps` value (empty list or null) contributes no steps.
        if tool_steps:
            num_bad_tasks += sum(
                1 for step in tool_steps if step["task_rating"] < threshold
            )
            num_tool_steps += len(tool_steps)

    pass_rate = num_good_answers / num_requests

    # Guard the 0/0 case explicitly instead of relying on numpy emitting a
    # RuntimeWarning and producing NaN as a side effect.
    if num_tool_steps:
        bad_task_rate = num_bad_tasks / num_tool_steps
    else:
        bad_task_rate = float("nan")

    num_requests_completed = num_evals

    return pass_rate, bad_task_rate, num_requests_completed


def get_series_stats(output_dirs: List[str], threshold: int = 4, num_requests: int = 50):
    """Aggregate run statistics for several series of evaluation runs.

    Each entry of ``output_dirs`` is one series. Every sub-directory of a
    series whose name matches ``evaluations_*`` is treated as one run and
    passed to ``get_run_stats``.

    Args:
        output_dirs: Series directories; the last path component is used as
            the series name.
        threshold: Forwarded to ``get_run_stats``.
        num_requests: Forwarded to ``get_run_stats`` and used as the
            denominator of the per-run completion rate.

    Returns:
        A tuple ``(stats, series_data)``. ``stats`` is a DataFrame with the
        columns ``series``, ``pass_rate``, ``bad_task_rate`` and
        ``completion_rate``, where each metric is formatted as
        ``"mean ± standard error"``. ``series_data`` is a list with one dict
        per series (same order as ``output_dirs``) holding the raw per-run
        values of the three metrics.
    """

    def format_stats(xs):
        # Formats as "mean ± standard error". Undefined parts are rendered as
        # "nan" without triggering numpy RuntimeWarnings.
        n = len(xs)
        mean = np.mean(xs) if n > 0 else float("nan")
        # standard error with Bessel's correction; needs at least two runs
        std_err = np.std(xs, ddof=1) / np.sqrt(n) if n > 1 else float("nan")
        return f"{mean:.2f} ± {std_err:.2f}"

    series_data = []
    series_stats = {
        "series": [],
        "pass_rate": [],
        "bad_task_rate": [],
        "completion_rate": [],
    }

    for output_dir in output_dirs:
        output_path = Path(output_dir)

        pass_rates = []
        bad_task_rates = []
        completion_rates = []

        # Only directories are runs; a stray file matching the pattern would
        # otherwise be counted as a run without any evaluations. Sorting makes
        # the order of the per-run values deterministic.
        run_paths = sorted(
            path for path in output_path.glob("evaluations_*") if path.is_dir()
        )

        for eval_path in run_paths:
            pass_rate, bad_task_rate, num_requests_completed = get_run_stats(
                eval_path=eval_path,
                threshold=threshold,
                num_requests=num_requests,
            )

            pass_rates.append(pass_rate)
            bad_task_rates.append(bad_task_rate)
            completion_rates.append(num_requests_completed / num_requests)

        series_data.append({
            "pass_rate": pass_rates,
            "bad_task_rate": bad_task_rates,
            "completion_rate": completion_rates,
        })

        series_stats["series"].append(output_path.name)
        series_stats["pass_rate"].append(format_stats(pass_rates))
        series_stats["bad_task_rate"].append(format_stats(bad_task_rates))
        series_stats["completion_rate"].append(format_stats(completion_rates))

    return pd.DataFrame(series_stats), series_data

In [3]:
# Summarize the four series under output-eval/ as "mean ± standard error"
# per metric. The raw per-run values (second return value) are not used here.
stats, _ = get_series_stats(
    threshold=4,
    output_dirs=[
        "output-eval/zeroshot-8bit",
        "output-eval/finetuned-4bit",
        "output-eval/finetuned-8bit",
        "output-eval/gpt-4",
    ],
)
stats

Unnamed: 0,series,pass_rate,bad_task_rate,completion_rate
0,zeroshot-8bit,0.72 ± 0.03,0.30 ± 0.02,0.88 ± 0.01
1,finetuned-4bit,0.89 ± 0.02,0.14 ± 0.01,0.96 ± 0.01
2,finetuned-8bit,0.88 ± 0.02,0.09 ± 0.01,0.95 ± 0.02
3,gpt-4,0.91 ± 0.03,0.07 ± 0.01,0.97 ± 0.01


In [4]:
# Summarize the three series under output-eval-masking/. The raw per-run
# values are kept in `data` (one dict per series, in the order listed below).
stats, data = get_series_stats(
    num_requests=50,
    threshold=4,
    output_dirs=[
        "output-eval-masking/prompt-and-completion",
        "output-eval-masking/completion-only",
        "output-eval-masking/gpt-4",
    ],
)
stats

Unnamed: 0,series,pass_rate,bad_task_rate,completion_rate
0,prompt-and-completion,0.88 ± 0.01,0.12 ± 0.01,0.99 ± 0.00
1,completion-only,0.85 ± 0.01,0.14 ± 0.01,0.98 ± 0.01
2,gpt-4,0.90 ± 0.01,0.11 ± 0.01,0.98 ± 0.01


In [5]:
# Two-sample t-test (scipy's `ttest_ind` with its default settings) on the
# per-run metrics of the first two series passed to `get_series_stats` in the
# previous cell: data[0] is 'prompt-and-completion', data[1] is
# 'completion-only'. `ttest_ind` is imported in the notebook's import cell.
print("t-test on metrics of 'prompt-and-completion' and 'completion-only' series:")

# One loop instead of a copy-pasted block per metric.
for metric in ("pass_rate", "bad_task_rate"):
    result = ttest_ind(
        a=data[0][metric],
        b=data[1][metric],
    )

    print(f"- significance of '{metric}' difference: p-value = {result.pvalue:.2f}")

t-test on metrics of 'prompt-and-completion' and 'completion-only' series:
- significance of 'pass_rate' difference: p-value = 0.10
- significance of 'bad_task_rate' difference: p-value = 0.22
