In [1]:
import json
import pandas as pd

from collections import defaultdict, Counter
from abandabot.evaluate import evaluate_performance

In [2]:
# Load the per-setting performance logs that the summary table below is built from.
# JSON text is UTF-8 by specification, so decode it explicitly rather than
# relying on the platform's locale-dependent default encoding.
with open("reports/performance.json", encoding="utf-8") as f:
    perf = json.load(f)

In [7]:
# Summary table: one row per "<model>+<context>" setting found in `perf`,
# followed by three reference baselines (always-"Yes", always-"No", random).
table = []
for setting, perf_logs in perf.items():
    # The text before the first "+" is the model name; everything after it
    # (which may itself contain "+") is the context label.
    setting_parts = setting.split("+")
    model = setting_parts[0]
    context = "+".join(setting_parts[1:])

    # Build the frame once and reuse it for both statistics.
    # NOTE(review): presumably `perf_logs` holds one record of metrics per run — confirm.
    runs = pd.DataFrame(perf_logs)
    avg_perf = runs.mean()
    std_perf = runs.std()

    results = {"model": model, "context": context}
    for key in avg_perf.keys():
        # Each metric is rendered as "mean±std" with three decimals.
        results[key] = f"{avg_perf[key]:.3f}±{std_perf[key]:.3f}"
    table.append(results)

# Only rows with a definite "Yes"/"No" label are used for the baselines.
ground_truth = pd.read_csv("ground_truth.csv")
ground_truth = ground_truth[ground_truth.impactful.isin(("Yes", "No"))]

# Constant-answer baselines: predict the same label for every row.
for baseline_name, constant_label in (("yesman", "Yes"), ("noman", "No")):
    table.append(
        {
            "model": baseline_name,
            "context": baseline_name,
            **evaluate_performance(
                ground_truth.impactful,
                [constant_label] * len(ground_truth),
                "Yes",
                "No",
            ),
        }
    )

# Random-guess baseline: these are hard-coded reference values, not computed
# from `ground_truth`.
table.append(
    {
        "model": "random",
        "context": "random",
        "errors": 0,
        "macro_precision": 0.5,
        "macro_recall": 0.5,
        "macro_f1": 0.5,
    }
)

# Last expression of the cell: display only the identifying and macro columns.
pd.DataFrame(table)[
    ["model", "context", "macro_precision", "macro_recall", "macro_f1"]
]

Unnamed: 0,model,context,macro_precision,macro_recall,macro_f1
0,gpt-4o,no+context+no+reasoning,0.737±0.061,0.638±0.043,0.648±0.053
1,gpt-4o,no+context,0.748±0.076,0.619±0.038,0.624±0.049
2,gpt-4o,no+reasoning,0.685±0.026,0.687±0.026,0.686±0.026
3,gpt-4o,context+reasoning,0.693±0.022,0.676±0.020,0.682±0.020
4,gpt-4o,context+reasoning+complex,0.710±0.029,0.727±0.033,0.715±0.031
5,gpt-4o-mini,no+context+no+reasoning,0.739±0.062,0.580±0.018,0.568±0.027
6,gpt-4o-mini,no+context,0.723±0.064,0.561±0.018,0.536±0.029
7,gpt-4o-mini,no+reasoning,0.788±0.032,0.609±0.027,0.609±0.038
8,gpt-4o-mini,context+reasoning,0.732±0.047,0.620±0.028,0.625±0.036
9,gpt-4o-mini,context+reasoning+complex,0.772±0.094,0.576±0.023,0.559±0.033


In [4]:
# Pool the verdicts of several models over repeated runs into one majority
# vote per (repo, dep) pair, and attach the developer's label to each pair.
models = ["gpt-4o-mini", "llama-v3p3", "gemini-2.0"]
context = "context+reasoning"
n_runs = 10  # run indices 0..9 are expected on disk for every model
ground_truth = pd.read_csv("ground_truth.csv")
# NOTE(review): rows whose `impactful` is neither "Yes" nor "No" are not
# filtered out here — confirm that is intended.

# Read every per-run summary exactly once, in (model, run) order. The file
# contents do not depend on the ground-truth row, so loading them inside the
# per-pair loop would re-read each file once per row.
run_summaries = [
    pd.read_csv(f"reports/summary-{model}-{context}-run-{run}.csv")
    for model in models
    for run in range(n_runs)
]

merged = defaultdict(Counter)  # (repo, dep) -> Counter of ai_eval verdicts
dev_labels = {}  # (repo, dep) -> `impactful` of the first matching ground-truth row
for repo, dep, impactful in zip(
    ground_truth.repo, ground_truth.dep, ground_truth.impactful
):
    dev_labels.setdefault((repo, dep), impactful)
    for summary in run_summaries:
        rows = summary[(summary.repo == repo) & (summary.dep == dep)]
        if len(rows) == 0:
            # This run has no verdict for the pair; it simply casts no vote.
            continue
        # Only the first matching row of a run is counted.
        merged[(repo, dep)][rows.ai_eval.values[0]] += 1

merged_perf = pd.DataFrame(
    [
        {
            "repo": repo,
            "dep": dep,
            "ai_yes": counts["Yes"],
            "ai_no": counts["No"],
            # Most frequent verdict; a tie resolves to the verdict that was
            # counted first.
            "ai_majority": max(counts, key=counts.get),
            "dev_eval": dev_labels[(repo, dep)],
        }
        for (repo, dep), counts in merged.items()
    ],
    # Explicit columns keep the frame well-formed even when nothing matched.
    columns=["repo", "dep", "ai_yes", "ai_no", "ai_majority", "dev_eval"],
)

In [5]:
# Score the pooled majority vote against the developer labels, treating
# "Yes" and "No" as the two classes.
evaluate_performance(
    merged_perf["dev_eval"],
    merged_perf["ai_majority"],
    "Yes",
    "No",
)

{'yes_precision': 0.7846153846153846,
 'yes_recall': 0.8947368421052632,
 'yes_f1': 0.8360655737704918,
 'no_precision': 0.6470588235294118,
 'no_recall': 0.44,
 'no_f1': 0.5238095238095238,
 'macro_precision': 0.7158371040723982,
 'macro_recall': 0.6673684210526316,
 'macro_f1': 0.6799375487900079}