# Inter-Annotator Agreement

## Notebook setup

In [1]:
import json
from pathlib import Path
from collections import defaultdict

import pandas as pd

from info_loss import iaa

## Calculate Agreement

### Accuracy-oriented eval (Angle 1, Angle 2)

In [2]:
criteria = [
    "relevance_source",
    "relevance_target",
    "accuracy_snippet",
    "accuracy_answer",
    "hallucinations_answer",
    "givenness_phrasing",
    "givenness_location",
    "simplicity_jargon",
    "simplicity_standalone",
]

rater_a = pd.read_json("../data/infolossqa-v1.0/evals-accuracy/rater-a.json")[criteria]
rater_b = pd.read_json("../data/infolossqa-v1.0/evals-accuracy/rater-b.json")[criteria]

path = Path("../output/gpt-eval/gpt-4o-2024-05-13/")
rater_gpt4 = pd.concat([pd.read_json(path / f"{c}.json")[c] for c in criteria], axis=1)

path = Path("../output/gpt-eval/llama-3-70b-chat-hf/")
rater_llama3 = pd.concat(
    [pd.read_json(path / f"{c}.json")[c] for c in criteria], axis=1
)

display(rater_a.head(2))
display(rater_b.head(2))
display(rater_gpt4.head(2))
display(rater_llama3.head(2))

Unnamed: 0,relevance_source,relevance_target,accuracy_snippet,accuracy_answer,hallucinations_answer,givenness_phrasing,givenness_location,simplicity_jargon,simplicity_standalone
0,relevance_source_1,relevance_target_1,accuracy_snippet_1,accuracy_answer_1,hallucinations_answer_1,givenness_phrasing_2,givenness_location_1,simplicity_jargon_4,simplicity_standalone_1
1,relevance_source_1,relevance_target_3,accuracy_snippet_1,accuracy_answer_1,hallucinations_answer_1,givenness_phrasing_1,givenness_location_3,simplicity_jargon_4,simplicity_standalone_1


Unnamed: 0,relevance_source,relevance_target,accuracy_snippet,accuracy_answer,hallucinations_answer,givenness_phrasing,givenness_location,simplicity_jargon,simplicity_standalone
0,relevance_source_1,relevance_target_1,accuracy_snippet_1,accuracy_answer_1,hallucinations_answer_1,givenness_phrasing_2,givenness_location_1,simplicity_jargon_3,simplicity_standalone_1
1,relevance_source_3,relevance_target_3,accuracy_snippet_1,accuracy_answer_2,hallucinations_answer_1,givenness_phrasing_2,givenness_location_3,simplicity_jargon_4,simplicity_standalone_2


Unnamed: 0,relevance_source,relevance_target,accuracy_snippet,accuracy_answer,hallucinations_answer,givenness_phrasing,givenness_location,simplicity_jargon,simplicity_standalone
0,relevance_source_3,relevance_target_2,accuracy_snippet_1,accuracy_answer_1,hallucinations_answer_1,givenness_phrasing_2,givenness_location_1,simplicity_jargon_4,simplicity_standalone_1
1,relevance_source_3,relevance_target_3,accuracy_snippet_3,accuracy_answer_3,hallucinations_answer_1,givenness_phrasing_2,givenness_location_na,simplicity_jargon_4,simplicity_standalone_2


Unnamed: 0,relevance_source,relevance_target,accuracy_snippet,accuracy_answer,hallucinations_answer,givenness_phrasing,givenness_location,simplicity_jargon,simplicity_standalone
0,relevance_source_1,relevance_target_2,accuracy_snippet_2,accuracy_answer_1,hallucinations_answer_1,givenness_phrasing_2,givenness_location_1,simplicity_jargon_3,simplicity_standalone_1
1,relevance_source_3,relevance_target_3,accuracy_snippet_3,accuracy_answer_2,hallucinations_answer_1,givenness_phrasing_2,givenness_location_na,simplicity_jargon_4,simplicity_standalone_2


In [3]:
def iaa_humans(rater_a, rater_b, criteria):
    agreement = {}
    for c in criteria:
        agreement[c] = iaa.kappa(raters=[rater_a[c], rater_b[c]], method="randolph")
    return agreement


def iaa_model_vs_human_raters(rater_a, rater_b, model, criteria):
    agreement = {}
    for c in criteria:
        model_vs_a = iaa.kappa(raters=[rater_a[c], model[c]], method="randolph")
        model_vs_b = iaa.kappa(raters=[rater_b[c], model[c]], method="randolph")
        agreement[c] = (model_vs_a + model_vs_b) / 2
    return agreement

In [4]:
df_iaa_accuracy = pd.DataFrame(
    [
        iaa_humans(rater_a, rater_b, criteria),
        iaa_model_vs_human_raters(rater_a, rater_b, rater_gpt4, criteria),
        iaa_model_vs_human_raters(rater_a, rater_b, rater_llama3, criteria),
    ],
    index=["Human", "GPT-4o", "Llama3-70B"],
).T
df_iaa_accuracy

Unnamed: 0,Human,GPT-4o,Llama3-70B
relevance_source,0.605651,0.683047,0.659091
relevance_target,0.498771,0.325553,0.242629
accuracy_snippet,0.712531,0.649877,0.52457
accuracy_answer,0.679361,0.627764,0.574324
hallucinations_answer,0.891892,0.813268,0.724816
givenness_phrasing,0.793612,0.302211,0.238329
givenness_location,0.80344,0.182637,0.064701
simplicity_jargon,0.57412,0.341523,0.426699
simplicity_standalone,0.690418,0.206388,-0.304668


### Recall-oriented eval (Angle 3)

In [5]:
recall_rater_a = pd.read_json("../data/infolossqa-v1.0/evals-recall/rater-a.json")
recall_rater_a = recall_rater_a.rename({"rating": "recall"}, axis=1)
recall_rater_b = pd.read_json("../data/infolossqa-v1.0/evals-recall/rater-b.json")
recall_rater_b = recall_rater_b.rename({"rating": "recall"}, axis=1)

recall_rater_gpt4 = pd.read_json("../output/gpt-eval/gpt-4o-2024-05-13/recall.json")
recall_rater_llama3 = pd.read_json("../output/gpt-eval/llama-3-70b-chat-hf/recall.json")

In [6]:
df_iaa_recall = pd.DataFrame(
    [
        iaa_humans(recall_rater_a, recall_rater_b, ["recall"]),
        iaa_model_vs_human_raters(
            recall_rater_a, recall_rater_b, recall_rater_gpt4, ["recall"]
        ),
        iaa_model_vs_human_raters(
            recall_rater_a, recall_rater_b, recall_rater_llama3, ["recall"]
        ),
    ],
    index=["Human", "GPT-4o", "Llama3-70B"],
).T
df_iaa_recall

Unnamed: 0,Human,GPT-4o,Llama3-70B
recall,0.700231,0.666795,0.469639


### Merge results

In [7]:
criteria_rename = {
    "relevance_source": "Q is Answerable w/ X_src",
    "relevance_target": "Q is Answerable w/ X_tgt",
    "accuracy_snippet": "Accuracy - Evidence (E)",
    "accuracy_answer": "Accuracy - Answer (A)",
    "hallucinations_answer": "Hallucinations (A)",
    "givenness_phrasing": "Givenness (Q)",
    "givenness_location": "Rationale Localization (R)",
    "simplicity_jargon": "Jargon (A)",
    "simplicity_standalone": "Standalone (A)",
    "recall": "Recall of human QA",
}

df_iaa = pd.concat([df_iaa_accuracy, df_iaa_recall])
df_iaa.loc["Average"] = df_iaa.mean()
df_iaa = df_iaa.round(2)
df_iaa = df_iaa.rename(criteria_rename)
df_iaa

Unnamed: 0,Human,GPT-4o,Llama3-70B
Q is Answerable w/ X_src,0.61,0.68,0.66
Q is Answerable w/ X_tgt,0.5,0.33,0.24
Accuracy - Evidence (E),0.71,0.65,0.52
Accuracy - Answer (A),0.68,0.63,0.57
Hallucinations (A),0.89,0.81,0.72
Givenness (Q),0.79,0.3,0.24
Rationale Localization (R),0.8,0.18,0.06
Jargon (A),0.57,0.34,0.43
Standalone (A),0.69,0.21,-0.3
Recall of human QA,0.7,0.67,0.47


In [8]:
df = df_iaa.reset_index()
tex = (
    df.style.format(precision=2)
    .hide(axis=0)
    .to_latex(
        position="t",
        position_float="centering",
        hrules=True,
        caption="TODO",
        label="tab:iaa-evaluation",
    )
)
for c in df.columns:
    tex = tex.replace(c, "\\textbf{" + c + "}")
print(tex)

\begin{table}[t]
\centering
\caption{TODO}
\label{tab:iaa-evaluation}
\begin{tabular}{lrrr}
\toprule
\textbf{index} & \textbf{Human} & \textbf{GPT-4o} & \textbf{Llama3-70B} \\
\midrule
Q is Answerable w/ X_src & 0.61 & 0.68 & 0.66 \\
Q is Answerable w/ X_tgt & 0.50 & 0.33 & 0.24 \\
Accuracy - Evidence (E) & 0.71 & 0.65 & 0.52 \\
Accuracy - Answer (A) & 0.68 & 0.63 & 0.57 \\
Hallucinations (A) & 0.89 & 0.81 & 0.72 \\
Givenness (Q) & 0.79 & 0.30 & 0.24 \\
Rationale Localization (R) & 0.80 & 0.18 & 0.06 \\
Jargon (A) & 0.57 & 0.34 & 0.43 \\
Standalone (A) & 0.69 & 0.21 & -0.30 \\
Recall of human QA & 0.70 & 0.67 & 0.47 \\
Average & 0.70 & 0.48 & 0.36 \\
\bottomrule
\end{tabular}
\end{table}



## Evaluation Costs

In [9]:
from litellm import completion_cost


def report_costs(result_path, custom_cost_per_token):
    # API responses are in separate directory for criteria
    # Get the list of criteria
    criteria = [x.name for x in result_path.iterdir() if x.is_dir()]
    costs = defaultdict(float)

    # Open each response and calculate costs via `usage`
    for criterion in criteria:
        responses = (path / criterion).glob("*.json")
        for r in responses:
            with open(r) as fin:
                response = json.load(fin)
                costs[criterion] += completion_cost(
                    response, custom_cost_per_token=custom_cost_per_token
                )

    total = sum(costs.values())

    for c in criteria:
        print(f"{c:<23}: {costs[c]:5.2f}$")
    print("=" * 31)
    print(f"{'total':<23}: {total:5.2f}$")


# GPT-4o as of May 29, 2024
costs = {"input_cost_per_token": 5 / 1_000_000, "output_cost_per_token": 15 / 1_000_000}
path = Path("../output/gpt-eval/gpt-4o-2024-05-13/")
print("gpt-4o-2024-05-13")
report_costs(path, costs)
print()

# LLama3-70B on Together.ai as of May 29, 2024
costs = {
    "input_cost_per_token": 0.9 / 1_000_000,
    "output_cost_per_token": 0.9 / 1_000_000,
}
path = Path("../output/gpt-eval/llama-3-70b-chat-hf/")
print("llama-3-70b-chat-hf")
report_costs(path, costs)

gpt-4o-2024-05-13
accuracy_answer        :  1.57$
relevance_source       :  1.50$
relevance_target       :  2.34$
accuracy_snippet       :  1.59$
givenness_location     :  2.50$
recall                 :  5.45$
hallucinations_answer  :  1.54$
simplicity_jargon      :  1.46$
simplicity_standalone  :  1.34$
givenness_phrasing     :  1.42$
total                  : 20.71$

llama-3-70b-chat-hf
accuracy_answer        :  0.25$
relevance_source       :  0.24$
relevance_target       :  0.39$
accuracy_snippet       :  0.26$
givenness_location     :  0.43$
recall                 :  0.88$
hallucinations_answer  :  0.25$
simplicity_jargon      :  0.24$
simplicity_standalone  :  0.21$
givenness_phrasing     :  0.23$
total                  :  3.36$
