# Data conversion

In [1]:
import json
from pathlib import Path

import numpy as np
import pandas as pd

from info_loss.statistics import load_data, load_data_aggregated

## Completed annotations for manual review

This JSON file can be uploaded here to review all annotations: https://infolossqa.ikim.nrw/#/review

In [2]:
# Directory that receives the JSON dumps produced for manual review.
EXPORT_DIR = Path("../output/dumps/")
# Create the directory (and any missing ancestors); a pre-existing one is fine.
EXPORT_DIR.mkdir(parents=True, exist_ok=True)

In [3]:
# Aggregate the three human annotators per document and keep only those
# documents that carry at least two annotation entries.
human_qas = {
    doc_id: doc
    for doc_id, doc in load_data_aggregated(
        annotators=[
            ("karim", "../data/raw/annotations/annotations-karim/"),
            ("keziah", "../data/raw/annotations/annotations-keziah/"),
            ("kathryn", "../data/raw/annotations/annotations-kathryn/"),
        ],
    ).items()
    if len(doc["annotations"]) >= 2
}
docs_completed = [*human_qas.values()]
# Key view of the retained documents; later cells use it for membership tests.
ids_completed = human_qas.keys()

print("Number of double-annotated documents:", len(ids_completed))

# Single JSON file holding every retained document (for the review page).
with (EXPORT_DIR / "annotations-completed.json").open("w") as fout:
    json.dump(docs_completed, fout)

Number of double-annotated documents: 104


## Human QAs + Model QAs for Website

On the website, each document can also be viewed independently. This code dumps every document into its own JSON file for easy loading.

In [4]:
# Directory the website reads its per-document JSON files from.
WEBSITE_DATA = Path("../app/public/data/")
# Create the directory (and any missing ancestors); a pre-existing one is fine.
WEBSITE_DATA.mkdir(parents=True, exist_ok=True)

In [5]:
# Aggregate human annotations (under generic "Annotator N" labels) together
# with the model predictions, then keep only the documents whose id is in
# `ids_completed`.
humans_and_models = {
    doc_id: doc
    for doc_id, doc in load_data_aggregated(
        annotators=[
            ("Annotator 1", "../data/raw/annotations/annotations-kathryn/"),
            ("Annotator 2", "../data/raw/annotations/annotations-keziah/"),
            ("Annotator 3", "../data/raw/annotations/annotations-karim/"),
            ("GPT-4", "../output/gpt-4-0613-one-shot/predictions.json"),
            ("Llama2-70B", "../output/llama2-70b-chat-one-shot/predictions.json"),
            ("Mistral-7B", "../output/mistral-7b-instruct-one-shot/predictions.json"),
            ("NLI Pipeline", "../output/gpt-4-0613-nli/predictions.json"),
        ]
    ).items()
    if doc_id in ids_completed
}

In [6]:
# Build website-ready copies of every document:
#   * edits whose category is "comment" are dropped entirely;
#   * the "comment" field is removed from each remaining edit's annotation.
#
# Every edit and annotation dict is rebuilt rather than modified. `dict.copy()`
# is shallow, so popping from `edit["annotation"]` directly would also strip
# the comments from the data held in `humans_and_models`.
cleaned = {}
for doc_id, doc in humans_and_models.items():
    annotations_new = []
    for annotator in doc["annotations"]:
        new_edits = [
            {
                **edit,
                # Same key position as before, minus the "comment" entry.
                "annotation": {
                    key: value
                    for key, value in edit["annotation"].items()
                    if key != "comment"
                },
            }
            for edit in annotator["edits"]
            if edit["category"] != "comment"
        ]
        # Only the annotator label and the filtered edits are carried over.
        annotations_new.append(
            {"annotator": annotator["annotator"], "edits": new_edits}
        )
    # New top-level dict; all other document fields are reused as-is.
    cleaned[doc_id] = {**doc, "annotations": annotations_new}

In [7]:
# Write one "<doc_id>.json" file per cleaned document into WEBSITE_DATA.
for doc_id, doc in cleaned.items():
    with (WEBSITE_DATA / f"{doc_id}.json").open("w") as fout:
        json.dump(doc, fout)

Dump an index of the completed documents (ID, title, and the `sectioned` flag).

In [8]:
# Index of completed documents for the website: id, title and `sectioned`
# flag, with all column names lower-cased.
df = (
    pd.read_json("../data/processed/documents.json")
    .loc[
        lambda docs: docs["PMCID"].isin(ids_completed),
        ["PMCID", "title", "sectioned"],
    ]
    .rename(columns=str.lower)
)
df.to_json("../app/src/assets/data_index.json", orient="records")

## Export data in canonical format

In [9]:
# Root directory of the canonical export.
INFOLOSSQA_PATH = Path("../data/infolossqa-v1.0/")
# `parents=True` matches the other directory cells in this notebook and keeps
# the cell from raising FileNotFoundError when "../data/" does not exist yet.
INFOLOSSQA_PATH.mkdir(parents=True, exist_ok=True)

# Model predictions live in a sub-directory of the export root; deriving the
# path from INFOLOSSQA_PATH keeps both locations in sync.
INFOLOSSQA_MODELS_PATH = INFOLOSSQA_PATH / "models"
INFOLOSSQA_MODELS_PATH.mkdir(parents=True, exist_ok=True)

### Human annotations

In [10]:
def load_flat(annotator, path):
    """Flatten the edits loaded from ``path`` into one dict per edit.

    Each returned dict is a copy of an edit in which:

    * ``"id"`` is renamed to ``"edit_id"``;
    * ``"PMCID"`` holds the id of the enclosing document;
    * ``"annotator"`` holds the ``annotator`` argument;
    * the entries of the nested ``"annotation"`` dict are merged into the top
      level (overriding same-named keys) and ``"annotation"`` itself is removed;
    * any ``"comment"`` entry is removed.

    Only edits whose (merged) ``"category"`` is ``"concept"`` or
    ``"omission"`` are kept.

    Args:
        annotator: Label stored under ``"annotator"`` in every returned edit.
        path: Location handed to ``load_data`` (called with
            ``hash_edit_ids=True``).

    Returns:
        list[dict]: The flattened edits, in the order they were loaded.
    """
    records = []

    for document in load_data(path, hash_edit_ids=True):
        pmcid = document["id"]

        for raw_edit in document["edits"]:
            record = dict(raw_edit)  # shallow copy; the loaded edit is not modified
            record["edit_id"] = record.pop("id")
            record["PMCID"] = pmcid
            record["annotator"] = annotator

            # Hoist the annotation fields to the top level, then drop the
            # nested dict and the free-text comment.
            record.update(record["annotation"])
            record.pop("annotation")
            record.pop("comment", None)

            if record["category"] not in ("concept", "omission"):
                continue
            records.append(record)

    return records

In [18]:
# Flatten the edits of all three human annotators into a single list,
# preserving the annotator order kathryn -> keziah -> karim.
all_edits_flat = [
    edit
    for name, folder in (
        ("kathryn", "../data/raw/annotations/annotations-kathryn/"),
        ("keziah", "../data/raw/annotations/annotations-keziah/"),
        ("karim", "../data/raw/annotations/annotations-karim/"),
    )
    for edit in load_flat(name, folder)
]

# Tabulate and restrict to the columns of the canonical question export.
df = pd.DataFrame(all_edits_flat).loc[
    :,
    [
        "PMCID",
        "edit_id",
        "category",
        "question",
        "answer",
        "input_idx",
        "output_idx",
        "annotator",
    ],
]
display(df.head(3))

df.to_json(INFOLOSSQA_PATH / "questions.json", orient="records")
# Ids of every document that contributed at least one exported question.
DOC_IDS_COMPLETED = set(df["PMCID"])

Unnamed: 0,PMCID,edit_id,category,question,answer,input_idx,output_idx,annotator
0,1618957,40eb4d5da7b7b54a6fe977e64bfcaaa5,omission,What is the motivation for this study about th...,Drugs called depot somatostatin analogues (whi...,"[[22, 122]]",,kathryn
1,1618957,687f678b397f413b5b4b00e3baf16f75,concept,How long had participants been taking 30mg of ...,All participants had been taking 30mg of lanre...,"[[417, 479]]","[[432, 491]]",kathryn
2,1618957,03cfd509e632a85f3fa844a2720bf1e8,omission,How much were participants told about this tri...,"This study was an open trial, meaning particip...","[[534, 539]]",,kathryn


### Documents

In [21]:
# Export the documents referenced by the question export, limited to the
# columns of the canonical format.
df = pd.read_json("../data/processed/documents.json").loc[
    lambda docs: docs["PMCID"].isin(DOC_IDS_COMPLETED),
    ["PMCID", "title", "abstract", "simplification", "sectioned"],
]
display(df.head(3))

df.to_json(INFOLOSSQA_PATH / "documents.json", orient="records")

Unnamed: 0,PMCID,title,abstract,simplification,sectioned
2,1618957,Efficacy of lanreotide Autogel® administered e...,OBJECTIVE AND DESIGN.\nDepot somatostatin anal...,GOAL AND PLAN\nThe goal of this study was to s...,True
4,1774569,Buprenorphine versus dihydrocodeine for opiate...,BACKGROUND.\nMany drug users present to primar...,BACKGROUND:\nMany drug users go to their regul...,True
5,1868720,Advantages and disadvantages of an objective s...,BACKGROUND.\nIt is unclear if objective select...,BACKGROUND.\nThis study looks at if picking wo...,True


### Predictions (Mistral-7B)

In [13]:
# Flatten the Mistral-7B predictions and export them with the canonical
# columns plus `prediction_errors`.
edits = load_flat(
    "mistral-7b", "../output/mistral-7b-instruct-one-shot/predictions.json"
)
df = pd.DataFrame(edits).loc[
    :,
    [
        "PMCID",
        "edit_id",
        "category",
        "question",
        "answer",
        "input_idx",
        "output_idx",
        "annotator",
        "prediction_errors",
    ],
]

display(df.head(3))
df.to_json(INFOLOSSQA_MODELS_PATH / "mistral-7b.json", orient="records")

Unnamed: 0,PMCID,edit_id,category,question,answer,input_idx,output_idx,annotator,prediction_errors
0,1174877,9238a12ff9fe708749e740e2d2bcd2d0,omission,What other health conditions are often found w...,Hypertension and diabetes mellitus are often f...,"[[133, 194]]",,mistral-7b,[]
1,1174877,040341b93f8e0b40dfb865871f20ef7d,omission,What is the main function of Telmisartan?,Telmisartan is a medication taken to lower hig...,"[[269, 407]]",,mistral-7b,[spurious_output_localization]
2,1174877,f96cd0b8e605c1b4556709f8afb2fb1f,concept,What is the term used to describe a group of h...,The term used to describe a group of health co...,,"[[34, 103]]",mistral-7b,[invalid_input_localization]


### Predictions (Llama2-70B)

In [14]:
# Flatten the Llama2-70B predictions and export them with the canonical
# columns plus `prediction_errors`.
edits = load_flat("llama2-70b", "../output/llama2-70b-chat-one-shot/predictions.json")
df = pd.DataFrame(edits).loc[
    :,
    [
        "PMCID",
        "edit_id",
        "category",
        "question",
        "answer",
        "input_idx",
        "output_idx",
        "annotator",
        "prediction_errors",
    ],
]

display(df.head(3))
df.to_json(INFOLOSSQA_MODELS_PATH / "llama2-70b.json", orient="records")

Unnamed: 0,PMCID,edit_id,category,question,answer,input_idx,output_idx,annotator,prediction_errors
0,1174877,5073dc97b29d8afca9851454a11492af,omission,What measure of ulcerative colitis disease act...,The researchers used the Partial Mayo Score to...,,,llama2-70b,[invalid_input_localization]
1,1174877,be706f59924fb4968a8fb3cba6b4be48,concept,"What is the relationship between hypertension,...",Hypertension and diabetes mellitus are common ...,"[[133, 194]]",,llama2-70b,[invalid_output_localization]
2,1174877,89daf41f9992ef7c9c92d8bb4f800a31,omission,What were the 24-hour mean systolic and diasto...,The researchers measured the 24-hour mean syst...,"[[863, 914]]",,llama2-70b,[]


### Predictions (GPT-4)

In [15]:
# Flatten the GPT-4 predictions and export them with the canonical columns
# plus `prediction_errors`.
edits = load_flat("gpt-4", "../output/gpt-4-0613-one-shot/predictions.json")
df = pd.DataFrame(edits).loc[
    :,
    [
        "PMCID",
        "edit_id",
        "category",
        "question",
        "answer",
        "input_idx",
        "output_idx",
        "annotator",
        "prediction_errors",
    ],
]

display(df.head(3))
df.to_json(INFOLOSSQA_MODELS_PATH / "gpt4.json", orient="records")

Unnamed: 0,PMCID,edit_id,category,question,answer,input_idx,output_idx,annotator,prediction_errors
0,1174877,97f2bcc89f88e9358fec348627bb9c43,omission,What is metabolic syndrome?,Metabolic syndrome is a cluster of common card...,"[[12, 132]]",,gpt-4,[]
1,1174877,120ad5098834079742b856b6ec7b6137,omission,What criteria did the patients meet to be incl...,The patients met the World Health Organization...,"[[559, 630]]",,gpt-4,[]
2,1174877,90ce92b51ab47919ff97c7cbbce3cf8f,concept,What is a unique characteristic of Telmisartan...,Telmisartan is an antihypertensive agent with ...,"[[269, 407]]","[[374, 436]]",gpt-4,[]


### Predictions (NLI Pipeline)

In [16]:
# Flatten the NLI pipeline predictions. `output_idx` and `prediction_errors`
# are set to NaN for every row so the export has the same columns as the
# other model files, followed by the NLI-specific fields.
edits = load_flat("nli-pipeline", "../output/gpt-4-0613-nli/predictions.json")
df = (
    pd.DataFrame(edits)
    .assign(output_idx=np.nan, prediction_errors=np.nan)
    .loc[
        :,
        [
            "PMCID",
            "edit_id",
            "category",
            "question",
            "answer",
            "input_idx",
            "output_idx",
            "annotator",
            "nli_fact",
            "nli_label",
            "nli_proba",
            "prediction_errors",
        ],
    ]
)

display(df.head(3))
df.to_json(INFOLOSSQA_MODELS_PATH / "nli-pipeline.json", orient="records")

Unnamed: 0,PMCID,edit_id,category,question,answer,input_idx,output_idx,annotator,nli_fact,nli_label,nli_proba,prediction_errors
0,1174877,8cd6574a747c33c5b74daf03f417147d,omission,What type of group was studied in this research?,"It was a parallel-group type study, meaning pa...","[[506, 729]]",,nli-pipeline,The study was parallel-group.,1,"[0.12969037890434265, 0.8663212656974792, 0.00...",
1,1174877,70ce2082bdde97f2c1e901719869f884,omission,How were patients assigned to different treatm...,The assignment to different treatments was ran...,"[[506, 729]]",,nli-pipeline,The study was randomized.,1,"[0.4501473605632782, 0.5478552579879761, 0.001...",
2,1174877,fa2096544f3ddf37c775219906ddb6aa,omission,"How often were the medications, Telmisartan an...",The patients were given either Telmisartan or ...,"[[506, 729]]",,nli-pipeline,Patients received once-daily doses of either t...,1,"[0.2783735692501068, 0.7184227108955383, 0.003...",
