In [3]:
import json
import pandas as pd

from collections import defaultdict, Counter

In [4]:
# Load the recorded metric logs. The cell below iterates `perf.items()` as
# (setting, logs) pairs, so the file is expected to hold a JSON object.
# JSON text is UTF-8 by specification; pass the encoding explicitly so the
# file is not decoded with whatever the platform locale happens to be.
with open('reports/performance.json', encoding='utf-8') as f:
    perf = json.load(f)

In [5]:
# Summarise every setting as one row of "mean±std" strings, one per metric.
table = []
for setting, perf_logs in perf.items():
    # The key is "<model>+<context...>": everything before the first "+" is
    # the model, the remainder (which may itself contain "+") is the context.
    model, _, context = setting.partition("+")
    runs = pd.DataFrame(perf_logs)
    avg_perf = runs.mean()
    std_perf = runs.std()
    table.append(
        {
            "model": model,
            "context": context,
            **{
                metric: f"{avg_perf[metric]:.3f}±{std_perf[metric]:.3f}"
                for metric in avg_perf.keys()
            },
        }
    )
pd.DataFrame(table)

Unnamed: 0,model,context,accuracy,precision,recall,f1
0,gpt-4o-mini,no+context+no+reasoning,0.724±0.017,0.782±0.017,0.837±0.022,0.808±0.012
1,gpt-4o-mini,no+context,0.736±0.018,0.735±0.010,0.972±0.019,0.837±0.012
2,gpt-4o-mini,no+reasoning,0.753±0.017,0.746±0.013,0.979±0.007,0.847±0.009
3,gpt-4o-mini,context+reasoning,0.748±0.020,0.753±0.014,0.947±0.018,0.839±0.012
4,gpt-4o-mini,context+reasoning+complex,0.734±0.021,0.730±0.012,0.981±0.019,0.837±0.013
5,deepseek-v3,no+context+no+reasoning,0.771±0.021,0.832±0.009,0.840±0.028,0.836±0.017
6,deepseek-v3,no+context,0.762±0.010,0.834±0.005,0.821±0.020,0.827±0.009
7,deepseek-v3,no+reasoning,0.652±0.010,0.870±0.010,0.588±0.015,0.701±0.011
8,deepseek-v3,context+reasoning,0.684±0.025,0.893±0.012,0.619±0.033,0.731±0.025
9,deepseek-v3,context+reasoning+complex,0.723±0.037,0.915±0.013,0.663±0.051,0.768±0.036


In [6]:
# Majority vote across models and repeated runs.
#
# For every (repo, dep) pair listed in the ground truth, count how often each
# `ai_eval` verdict appears across all report files of the chosen context,
# then keep the most frequent verdict next to the developers' label.
models = ["gpt-4o-mini", "llama-v3p3", "gemini-2.0"]
context = "context+reasoning"
n_runs = 10  # report files are numbered run-0 .. run-9
ground_truth = pd.read_csv("ground_truth.csv")

# Read each report exactly once (previously every file was re-read for every
# ground-truth row). Each report becomes a {(repo, dep): ai_eval} lookup that
# keeps the FIRST row per pair, matching the former `df.ai_eval.values[0]`.
# Rows with a missing repo/dep could never match an equality filter, so they
# are dropped before indexing.
verdicts_per_report = []
for model in models:
    for run in range(n_runs):
        file = f"reports/summary-{model}-{context}-run-{run}.csv"
        df = pd.read_csv(file)
        first_rows = df.dropna(subset=["repo", "dep"]).drop_duplicates(
            subset=["repo", "dep"], keep="first"
        )
        verdicts_per_report.append(
            dict(zip(zip(first_rows.repo, first_rows.dep), first_rows.ai_eval))
        )

# Tally verdicts per pair. Reports are visited in the same model/run order as
# before, so Counter insertion order (and therefore tie-breaking in `max`
# below) is unchanged. The developers' label is captured in the same pass;
# `setdefault` keeps the first label if a pair is listed more than once.
merged = defaultdict(Counter)
dev_evals = {}
for repo, dep, impactful in zip(
    ground_truth.repo, ground_truth.dep, ground_truth.impactful
):
    pair = (repo, dep)
    dev_evals.setdefault(pair, impactful)
    for verdicts in verdicts_per_report:
        if pair in verdicts:
            merged[pair][verdicts[pair]] += 1

# Explicit `columns` keeps the schema stable even when no pair was matched.
merged_perf = pd.DataFrame(
    [
        {
            "repo": pair[0],
            "dep": pair[1],
            "ai_yes": counts["Yes"],
            "ai_no": counts["No"],
            # On a tie, `max` returns the verdict that was seen first.
            "ai_majority": max(counts, key=counts.get),
            "dev_eval": dev_evals[pair],
        }
        for pair, counts in merged.items()
    ],
    columns=["repo", "dep", "ai_yes", "ai_no", "ai_majority", "dev_eval"],
)

In [7]:
def evaluate_performance(
    ground_truth: pd.Series, report: pd.Series, true_label: str, false_label: str
) -> dict[str, float]:
    """Compute binary-classification metrics for `report` against `ground_truth`.

    The two sequences are compared position by position. Pairs in which
    either value is neither `true_label` nor `false_label` are not counted
    as TP/FP/FN/TN, but still count towards the accuracy denominator
    (``len(report)``).

    Args:
        ground_truth: Reference labels.
        report: Predicted labels, aligned positionally with `ground_truth`.
        true_label: Value that denotes the positive class.
        false_label: Value that denotes the negative class.

    Returns:
        A dict with keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``. A metric whose denominator is zero (e.g. no predicted
        positives, or empty input) is reported as ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """

    def _ratio(numerator: float, denominator: float) -> float:
        # Undefined ratios are reported as 0.0 rather than crashing.
        return numerator / denominator if denominator else 0.0

    total, tp, fp, tn, fn = len(report), 0, 0, 0, 0
    for dev_eval, ai_eval in zip(ground_truth, report):
        if dev_eval == true_label and ai_eval == true_label:
            tp += 1
        elif dev_eval == false_label and ai_eval == true_label:
            fp += 1
        elif dev_eval == true_label and ai_eval == false_label:
            fn += 1
        elif dev_eval == false_label and ai_eval == false_label:
            tn += 1

    acc = _ratio(tp + tn, total)
    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    f1 = _ratio(2 * precision * recall, precision + recall)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

# Score the majority-vote verdicts against the developers' labels.
evaluate_performance(
    ground_truth=merged_perf.dev_eval,
    report=merged_perf.ai_majority,
    true_label="Yes",
    false_label="No",
)

{'accuracy': 0.7560975609756098,
 'precision': 0.7846153846153846,
 'recall': 0.8947368421052632,
 'f1': 0.8360655737704918}