## Notebook setup

In [94]:
from collections import defaultdict

import pandas as pd
import tiktoken

from info_loss import iaa
from info_loss.evaluation import (
    accuracy_answer,
    accuracy_snippet,
    givenness_location,
    givenness_phrasing,
    hallucinations_answer,
    relevance_source,
    relevance_target,
    simplicity_jargon,
    simplicity_standalone,
)

## Load data

Accuracy-oriented eval (Angle 1, Angle 2)

In [95]:
samples_accuracy = pd.read_json("../data/infolossqa-v1.0/evals-accuracy/samples.json")
samples_accuracy_rater_a = pd.read_json(
    "../data/infolossqa-v1.0/evals-accuracy/rater-a.json"
)
samples_accuracy_rater_b = pd.read_json(
    "../data/infolossqa-v1.0/evals-accuracy/rater-b.json"
)

Recall-oriented eval (Angle 3)

In [96]:
samples_recall = pd.read_json("../data/infolossqa-v1.0/evals-recall/samples.json")
samples_recall_rater_a = pd.read_json(
    "../data/infolossqa-v1.0/evals-recall/rater-a.json"
)
samples_recall_rater_b = pd.read_json(
    "../data/infolossqa-v1.0/evals-recall/rater-b.json"
)

## Cost estimate

In [97]:
prompts = [
    accuracy_answer,
    accuracy_snippet,
    givenness_location,
    givenness_phrasing,
    hallucinations_answer,
    relevance_source,
    relevance_target,
    simplicity_jargon,
    simplicity_standalone,
]

In [98]:
enc = tiktoken.encoding_for_model("gpt-4o-2024-05-13")
price_per_token = 5 / 1_000_000  # 10$ for 1M tokens

n_tokens = defaultdict(int)
for qa_pair in samples_accuracy.to_dict(orient="records"):
    for prompt in prompts:
        prompt_name = prompt.__name__.split(".")[-1]
        messages = prompt.get_messages(qa_pair)
        message = " ".join(m["content"] for m in messages)
        tokenized = enc.encode(message)
        n_tokens[prompt_name] += len(tokenized)
n_tokens = dict(n_tokens)

total = 0
for prompt, tokens in n_tokens.items():
    prompt_costs = tokens * price_per_token
    total += prompt_costs
    print(f"{prompt:<23}: {prompt_costs:5.2f}$")
print("=" * 31)
print(f'{"total":<23}: {total:5.2f}$')

accuracy_answer        :  1.27$
accuracy_snippet       :  1.31$
givenness_location     :  2.21$
givenness_phrasing     :  1.15$
hallucinations_answer  :  1.25$
relevance_source       :  1.22$
relevance_target       :  2.00$
simplicity_jargon      :  1.16$
simplicity_standalone  :  0.96$
total                  : 12.52$


In [102]:
files = Path("../output/gpt-eval/llama-3-70b-chat-hf/").glob("*.json")
# files = Path("../output/gpt-eval/gpt-4o-2024-05-13/").glob('*.json')

dataframes = []
for file_path in files:
    with open(file_path, "r") as f:
        print(file_path)
        data = json.load(f)
        df = pd.DataFrame(data)
        dataframes.append(df)
gpt4 = pd.concat([df.set_index("edit_id") for df in dataframes], axis=1, join="outer")
gpt4 = gpt4.reset_index()
gpt4.head()

../output/gpt-eval/llama-3-70b-chat-hf/accuracy_answer.json
../output/gpt-eval/llama-3-70b-chat-hf/relevance_target.json
../output/gpt-eval/llama-3-70b-chat-hf/relevance_source.json
../output/gpt-eval/llama-3-70b-chat-hf/givenness_phrasing.json
../output/gpt-eval/llama-3-70b-chat-hf/simplicity_standalone.json
../output/gpt-eval/llama-3-70b-chat-hf/hallucinations_answer.json
../output/gpt-eval/llama-3-70b-chat-hf/accuracy_snippet.json
../output/gpt-eval/llama-3-70b-chat-hf/simplicity_jargon.json
../output/gpt-eval/llama-3-70b-chat-hf/givenness_location.json


Unnamed: 0,edit_id,accuracy_answer,accuracy_answer_rationale,relevance_target,relevance_target_rationale,relevance_source,relevance_source_rationale,givenness_phrasing,givenness_phrasing_rationale,simplicity_standalone,simplicity_standalone_rationale,hallucinations_answer,hallucinations_answer_rationale,accuracy_snippet,accuracy_snippet_rationale,simplicity_jargon,simplicity_jargon_rationale,givenness_location,givenness_location_rationale
0,557596d6d7b33a1a95118d543095cc39,accuracy_answer_1,The answer correctly states that the reducing ...,relevance_target_2,The simplified text only mentions that the way...,relevance_source_1,The text explicitly states that the reducing r...,givenness_phrasing_2,The question introduces the concept of 'reduci...,simplicity_standalone_1,"The answer does not contain any coreferences, ...",hallucinations_answer_1,The answer accurately summarizes the informati...,accuracy_snippet_2,The answer snippet only provides the standard ...,simplicity_jargon_3,The term 'prescribing doctor' and 'reducing re...,givenness_location_1,The question directly asks about the reducing ...
1,d772637d48e9c670356befc106609d3e,accuracy_answer_2,The answer only addresses the relative risk of...,relevance_target_3,The simplified text does not provide any infor...,relevance_source_3,"The question is asking for a range of results,...",givenness_phrasing_2,The question introduces the concept of 'range ...,simplicity_standalone_2,The answer contains the term 'relative risk' w...,hallucinations_answer_1,The answer only references the confidence inte...,accuracy_snippet_3,The answer snippet only provides a specific re...,simplicity_jargon_4,The term 'relative risk' is a technical term t...,givenness_location_na,"The question asks about the range of results, ..."
2,8f6d862e5fc200711475fe9c4327ce73,accuracy_answer_1,The answer accurately and completely responds ...,relevance_target_3,The simplified text does not mention the maxim...,relevance_source_1,The text explicitly states the maximum dosage ...,givenness_phrasing_2,"The question introduces a new concept, 'maximu...",simplicity_standalone_2,The answer mentions a specific dosage (100 uni...,hallucinations_answer_1,The answer is directly supported by the docume...,accuracy_snippet_1,The answer snippet directly provides the maxim...,simplicity_jargon_1,The term 'units' might be unfamiliar to some l...,givenness_location_na,The question directly asks about the maximum d...
3,72eeb799d1677968dd0b52b4ab581b18,accuracy_answer_1,The answer accurately and completely addresses...,relevance_target_3,The simplified text does not provide specific ...,relevance_source_1,The text explicitly mentions the measurement o...,givenness_phrasing_2,The question introduces a new concept 'neck mo...,simplicity_standalone_2,The answer introduces the term 'cervical range...,hallucinations_answer_1,The answer accurately reports the measurement ...,accuracy_snippet_2,The answer snippet only provides the result of...,simplicity_jargon_4,The answer contains technical terms like 'tota...,givenness_location_1,The question asks about the specific results o...
4,175e6b8a8847c5364a310e6f782281c9,accuracy_answer_1,The answer correctly identifies the type of su...,relevance_target_2,The simplified text does not explicitly mentio...,relevance_source_1,The text explicitly mentions 'high-risk surger...,givenness_phrasing_2,The question introduces the concept of 'kind o...,simplicity_standalone_2,The answer introduces the term 'high-risk surg...,hallucinations_answer_2,The answer provides an additional explanation ...,accuracy_snippet_3,The answer snippet does not specifically menti...,simplicity_jargon_4,The term 'high-risk surgeries' is not explaine...,givenness_location_1,The question directly asks about the type of s...


## Human agreement baselines

In [104]:
ratings_rename = {
    "relevance_source": "Q is Answerable w/ X_src",
    "relevance_target": "Q is Answerable w/ X_tgt",
    "accuracy_snippet": "Accuracy - Evidence (E)",
    "accuracy_answer": "Accuracy - Answer (A)",
    "hallucinations_answer": "Hallucination (A)",
    "givenness_phrasing": "Givenness (Q)",
    "givenness_location": "Rationale Localization (R)",
    "simplicity_jargon": "Jargon (A)",
    "simplicity_standalone": "Standalone (A)",
}
criteria = ratings_rename.keys()

In [105]:
agreement = []

ann_a = samples_accuracy_rater_a
ann_b = samples_accuracy_rater_b

for c in criteria:
    human = iaa.kappa(raters=[ann_a[c], ann_b[c]], method="randolph")
    model_vs_a = iaa.kappa(raters=[ann_a[c], gpt4[c]], method="randolph")
    model_vs_b = iaa.kappa(raters=[ann_b[c], gpt4[c]], method="randolph")
    agreement.append((c, human, (model_vs_a + model_vs_b) / 2, model_vs_a, model_vs_b))
df_agreement = pd.DataFrame(
    agreement, columns=["criterion", "human", "avg", "gpt4 vs a", "gpt4 vs b"]
)
df_agreement = df_agreement.set_index("criterion")
df_agreement = df_agreement.rename(ratings_rename)
df_agreement.round(2)

  kappa = (p_mean - p_mean_exp) / (1- p_mean_exp)
  kappa = (p_mean - p_mean_exp) / (1- p_mean_exp)


Unnamed: 0_level_0,human,avg,gpt4 vs a,gpt4 vs b
criterion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Q is Answerable w/ X_src,0.7,0.85,0.7,1.0
Q is Answerable w/ X_tgt,0.7,0.1,0.1,0.1
Accuracy - Evidence (E),,0.4,0.4,0.4
Accuracy - Answer (A),0.8,0.9,0.8,1.0
Hallucination (A),,0.8,0.8,0.8
Givenness (Q),0.8,0.5,0.4,0.6
Rationale Localization (R),1.0,0.1,0.1,0.1
Jargon (A),0.73,0.51,0.47,0.55
Standalone (A),0.6,-0.4,-0.4,-0.4


Recall

In [44]:
iaa.kappa(
    raters=[
        samples_recall_rater_a["rating"],
        samples_recall_rater_b["rating"],
    ],
    method="randolph",
)

0.7002305918524211

In [79]:
len(samples_recall_rater_a)

1301

In [77]:
samples_recall_rater_a

Unnamed: 0,doc_id,model_name,reference_id,prediction_id,reference_question,reference_answer,predicted_question,predicted_answer,score,bin,rating,comment,annotator
0,1774569,gpt-4-0613-nli,9cea213fa0014ba6f61b36c46d0aa41e,7ffa9c81b357c84c917ca267c4ddbdaf,How many patients remained abstinent at each t...,10 in the dihydrocodeine remained sober 3 mont...,How many people stopped using drugs for three ...,Ten people who were given buprenorphine and fo...,0.706561,"(0.7, 0.75]",partial,no mention of 6 months in the model-generated ...,keziah
1,1774569,gpt-4-0613-nli,800c59cdd97fe420123828b146e3f3d6,be6d78218d0d644787083d18d5db9851,How likely were patients from the buprenorphin...,"Three months after treatment, 10 patients from...",How many people continued to not use drugs for...,"Of the people in the study, seven who were giv...",0.833454,"(0.8, 0.85]",partial,no mention of 3 months in the model-generated ...,keziah
2,1774569,llama2-70b-chat-one-shot,9aa38c5bfd6874dcc2e16694ae27408f,4ff689ec3b063f5aff5cdb489857e1cd,What was the rate of a clean urine sample in t...,"In the buprenorphine group, 21% of patients pr...",What was the statistical significance of the r...,The risk of non-completion of detoxification w...,0.653179,"(0.65, 0.7]",not_aligned,,keziah
3,1774569,gpt-4-0613-nli,bde972572e862a9e97c9396fb9ad478e,7aff83eafed3bf7f95062c85b26cd3be,How much more likely was the buprenorphine gro...,21% versus 3% rate from the dihydrocodeine.,"What was the relative risk, or how the risk co...","The relative risk in this study was 0.58, mean...",0.670776,"(0.65, 0.7]",not_aligned,,keziah
4,1774569,gpt-4-0613-nli,bde972572e862a9e97c9396fb9ad478e,315c08fe5de0e27f721801aecb8f13d3,How much more likely was the buprenorphine gro...,21% versus 3% rate from the dihydrocodeine.,How much more likely were those on buprenorphi...,Those on buprenorphine were 45% more likely to...,0.665003,"(0.65, 0.7]",not_aligned,,keziah
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296,5976307,mistral-7b-instruct-one-shot,303ef8f84b1c599fb398275527d614d8,a428ca4bafdc3df3be7e82702b4e229f,When did researchers observe patients receivin...,Researchers observed patients receiving etomid...,What was the purpose of this study?,The purpose of this study was to investigate w...,0.703956,"(0.7, 0.75]",not_aligned,,keziah
1297,5976307,llama2-70b-chat-one-shot,25b18d56568dafe9fbc21640957ca537,a6b6a77f5710e296afd8d254b1a9720c,How many patients experienced vital sign fluct...,17 patients in the (27.4%) etomidate group (dr...,What specific heart and lung-related issues oc...,The study found that patients who received pro...,0.732807,"(0.7, 0.75]",not_aligned,,keziah
1298,5976307,llama2-70b-chat-one-shot,863ee67f4621599567fae44533adc58a,b63345ca29ea4bfe5884edc9a07bc7c0,How does etomidate-midazolam's effectiveness c...,Etomidate-midazolam (a combination of sedative...,What is midazolam and how does it relate to pr...,Midazolam is a sedative that is often used in ...,0.745920,"(0.7, 0.75]",not_aligned,,keziah
1299,5976307,llama2-70b-chat-one-shot,34f318c742dbbab202c82a05575679c0,66ea27d011c70ede3535b0118dda5816,What else about etomidate with midazolam in co...,The study was checking to see if the first com...,What is midazolam and how does it relate to et...,Midazolam is a sedative that is often used in ...,0.743053,"(0.7, 0.75]",partial,,keziah
