In [34]:
import json
import pandas as pd

from collections import defaultdict, Counter
from abandabot.evaluate import evaluate_performance

In [35]:
with open("reports/performance.json") as f:
    perf = json.load(f)

In [36]:
table = []
for setting, perf_logs in perf.items():
    model = setting.split("+")[0]
    context = "+".join(setting.split("+")[1:])
    avg_perf = pd.DataFrame(perf_logs).mean()
    std_pref = pd.DataFrame(perf_logs).std()
    results = {"model": model, "context": context}
    for key in avg_perf.keys():
        if key != "errors":
            results[key] = f"{avg_perf[key]:.3f}±{std_pref[key]:.3f}"
    table.append(results)

ground_truth = pd.read_csv("ground_truth.csv")
ground_truth = ground_truth[ground_truth.impactful.isin(("Yes", "No"))]

table.append(
    {
        "model": "yesman",
        "context": "yesman",
        "errors": 0,
        **evaluate_performance(
            ground_truth.impactful, ["Yes"] * len(ground_truth), "Yes", "No"
        ),
    }
)

table.append(
    {
        "model": "noman",
        "context": "noman",
        "errors": 0,
        **evaluate_performance(
            ground_truth.impactful, ["No"] * len(ground_truth), "Yes", "No"
        ),
    }
)

table.append(
    {
        "model": "random",
        "context": "random",
        "errors": 0,
        "macro_precision": 0.5,
        "macro_recall": 0.5,
        "macro_f1": 0.5,
    }
)

pd.DataFrame(table)

Unnamed: 0,model,context,macro_precision,macro_recall,macro_f1,errors
0,gpt-4o,no+context+no+reasoning,0.668±nan,0.585±nan,0.582±nan,
1,gpt-4o,no+context,0.712±nan,0.625±nan,0.634±nan,
2,gpt-4o,no+reasoning,0.720±nan,0.737±nan,0.727±nan,
3,gpt-4o,context+reasoning,0.699±nan,0.699±nan,0.699±nan,
4,gpt-4o,context+reasoning+complex,0.775±nan,0.807±nan,0.784±nan,
5,gpt-4o-mini,no+context+no+reasoning,0.747±nan,0.602±nan,0.602±nan,
6,gpt-4o-mini,no+context,0.657±nan,0.542±nan,0.510±nan,
7,gpt-4o-mini,no+reasoning,0.785±nan,0.591±nan,0.582±nan,
8,gpt-4o-mini,context+reasoning,0.691±nan,0.605±nan,0.609±nan,
9,gpt-4o-mini,context+reasoning+complex,0.677±nan,0.574±nan,0.564±nan,


In [None]:
models = ["gpt-4o-mini", "llama-v3p3", "gemini-2.0"]
context = "context+reasoning"
ground_truth = pd.read_csv("ground_truth.csv")
merged = defaultdict(Counter)
for repo, dep in zip(ground_truth.repo, ground_truth.dep):
    for model in models:
        for run in range(10):
            file = f"reports/summary-{model}-{context}-run-{run}.csv"
            df = pd.read_csv(file)
            df = df[(df.repo == repo) & (df.dep == dep)]
            if len(df) == 0:
                continue
            merged[(repo, dep)][df.ai_eval.values[0]] += 1
merged_perf = []
for (repo, dep), counts in merged.items():
    merged_perf.append(
        {
            "repo": repo,
            "dep": dep,
            "ai_yes": counts["Yes"],
            "ai_no": counts["No"],
            "ai_majority": max(counts, key=counts.get),
        }
    )
merged_perf = pd.DataFrame(merged_perf)
for repo, dep in zip(merged_perf.repo, merged_perf.dep):
    dev_eval = ground_truth[
        (ground_truth.repo == repo) & (ground_truth.dep == dep)
    ].impactful.values[0]
    merged_perf.loc[
        (merged_perf.repo == repo) & (merged_perf.dep == dep), "dev_eval"
    ] = dev_eval

In [None]:
evaluate_performance(merged_perf.dev_eval, merged_perf.ai_majority, "Yes", "No")

{'macro_precision': 0.7158371040723982,
 'macro_recall': 0.6673684210526316,
 'macro_f1': 0.6799375487900079}