# Llama3.1 Instruct and Human Agreement

In [4]:
def replace(a):
    """
    Map a FactScore annotation label to a boolean.

    "S" (supported) becomes True; "IR" (irrelevant) and "NS" (not-supported)
    both become False. Any other value yields None.
    """
    if a == "S":
        return True
    if a in ("IR", "NS"):
        return False
    # Unknown or missing label: nothing to map.
    return None

In [5]:
# Map the original FactScore human labels (IR, NS, S) to booleans (S -> True, IR/NS -> False)
# and collect every human-written atomic fact with its label into a list of {"text": str, "label": bool}
import json
from pprint import pprint

# Flat list of {"text": ..., "label": ...} records, one per human-written atomic fact.
human_evals = []
no_annotations = []
count = 0
with open('../FactScore/data/labeled/InstructGPT.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        i = json.loads(line)
        # Entries whose 'annotations' field is null are skipped entirely.
        if i['annotations'] is not None:
            for a in i['annotations']:
                # Only annotations flagged 'is-relevant' contribute facts.
                if a['is-relevant']:
                    for fact in a["human-atomic-facts"]:
                        human_evals.append(
                            dict(text=fact["text"],
                                 label=replace(fact.get("label")))
                        )

print(len(human_evals))
print(human_evals[0])

4726
{'text': 'Doug Sheehan is an American.', 'label': True}


In [6]:
# Report every human label that is not a strict boolean (for example the None
# that replace() yields for an unmapped or missing annotation label).
# An explicit check is used instead of `assert` inside a bare `except`:
# asserts are stripped under `python -O`, and a bare except would also hide
# unrelated errors.
for evals in human_evals:
    label = evals["label"]
    if not isinstance(label, bool):
        print(label)

In [7]:
# Load the machine-annotated data.
# llama_evals["decisions"] is iterated as one list per document; each element
# looks like {"atom": str, "is_supported": bool} (see the printed sample).
llama_evals = []
path = "/home/lucas_dfki/Documentos/repos/HalluEval-RAG/data/afv_json/InstructGPT_factscore_output_human-atomic-facts.json"
with open(path, 'r', encoding='utf-8') as jsl:
    llama_evals = json.load(jsl)

# Flatten the per-document decision lists into one list of atom decisions.
llamas = []
for doc in llama_evals["decisions"]:
    llamas.extend(doc)
print(llamas[0])
print(len(llamas))


{'atom': 'Doug Sheehan is an American.', 'is_supported': True}
4726


In [16]:
# Sanity checks: the machine and human lists must have the same length and
# contain the same atomic facts in the same order; otherwise comparing their
# labels position by position would be meaningless.
assert len(llamas) == len(human_evals)
for machine, human in zip(llamas, human_evals):
    # Each machine decision must refer to exactly the same fact text as the human one.
    assert machine["atom"] == human["text"]
# Debug aid (disabled): uncomment to print every pair on which the machine
# decision and the human label disagree.
#    if machine["is_supported"] != human["label"]:
#        print(machine)
#        print(human)

{'atom': 'Knots Landing is a drama series.', 'is_supported': True}
{'text': 'Knots Landing is a drama series.', 'label': False}
{'atom': 'He has appeared in films.', 'is_supported': False}
{'text': 'He has appeared in films.', 'label': True}
{'atom': 'The Bronx is in New York.', 'is_supported': True}
{'text': 'The Bronx is in New York.', 'label': False}
{'atom': 'This Time was released in 1997.', 'is_supported': False}
{'text': 'This Time was released in 1997.', 'label': True}
{'atom': 'She has had guest appearances on New York Undercover.', 'is_supported': True}
{'text': 'She has had guest appearances on New York Undercover.', 'label': False}
{'atom': 'His work includes directing feature films.', 'is_supported': False}
{'text': 'His work includes directing feature films.', 'label': True}
{'atom': 'His work includes producing feature films.', 'is_supported': False}
{'text': 'His work includes producing feature films.', 'label': True}
{'atom': 'He has written and/or produced content for

# Annotator Agreement

In [12]:
from sklearn.metrics import cohen_kappa_score, confusion_matrix, accuracy_score


In [10]:
# y1: machine (Llama) decisions; y2: human labels, in the same order.
y1 = [decision["is_supported"] for decision in llamas]
y2 = [record["label"] for record in human_evals]
# Cohen's kappa between the two raters.
cohen_kappa_score(y1, y2)

0.6765188767596118

In [13]:
# Share of facts on which Llama agrees with the human label.
# The human labels (y2) are passed as y_true and the Llama decisions (y1) as
# y_pred, matching the argument order used for confusion_matrix; accuracy is
# symmetric, so the value is unchanged.
accuracy_score(y2, y1)

0.8427845958527296

In [15]:
# Human labels are the reference (rows), Llama decisions the predictions (columns).
confusion_matrix(y_true=y2, y_pred=y1)

array([[2420,  206],
       [ 537, 1563]])