In [26]:
import json
import pandas as pd
from glob import glob

### Step A: Open the files and check that the code picks up the right .json files

In [40]:
# Load every .json export found in the working directory and echo the file
# names so the team can check that the right files were picked up.
# glob() returns paths in arbitrary order, so they are sorted: the cells below
# pair annotators by position (files[0], files[1], files[2]) and the pairing
# should be the same on every run and every machine.
files = []
for f_name in sorted(glob('*.json')):
    print(f_name)
    with open(f_name, encoding='utf-8') as f:
        file = json.load(f)
    # each entry is [file name, parsed JSON content]
    files.append([f_name, file])
print('team size:' + str(len(files)))

Team_5_subtask1_12CCAs_2277051.json
Team_5_subtask1_12CCAs_2314325.json
Team_5_subtask1_12CCAs_2401157.json
team size:3


### Step B: Check if all the .json files are in the right format

In [41]:
# Validate every loaded export. All problems are reported (not only the first
# one) and counted in `error`. Plain `if` checks are used instead of `assert`
# so the validation still runs when Python is started with -O.
EXPECTED_TASKS = 24
error = 0
for f_name, file in files:
    problems = []

    if len(file) != EXPECTED_TASKS:
        problems.append(f"{f_name} must have {EXPECTED_TASKS} annotations")

    for annotation in file:
        submitted = annotation["annotations"][0]
        results = submitted["result"]

        # A missing pre-annotation is reported as a problem instead of
        # crashing with a KeyError when its ids are collected.
        prediction = submitted.get("prediction") or {}
        if "result" in prediction:
            pre_annotated_id = {result["id"] for result in prediction["result"]}
        else:
            problems.append(f"{f_name} must have pre-annotated prediction")
            pre_annotated_id = None

        # question_type is expected to be the LAST entry of the result list;
        # an empty result list is reported the same way.
        if not results or results[-1]["from_name"] != "question_type":
            problems.append(f"{f_name}: question_type must be annotated and should be the last one")

        # Every entry except the last one must reuse an id from the
        # pre-annotation. Skipped when there is no pre-annotation to compare
        # against (already reported above).
        if pre_annotated_id is not None:
            for result in results[:-1]:
                if result["id"] not in pre_annotated_id:
                    problems.append(f"{f_name}: annotation id {result['id']} is not among the pre-annotated ids")

    for message in problems:
        print(message)
    error += len(problems)

if error == 0:
    print("test passed")
else:
    print(f"{error} problem(s) found")

test passed


### Step C: Calculate Cohen's kappa score

In [42]:
# save annotation results into a dict
def get_annotations(l:list):
    """
    Collect the annotation results of one loaded json file into a dict.

    Parameters
    ------
    l: list
        one item of ``files``: ``[f_name, tasks]`` where ``tasks`` is the
        parsed json content (a list of tasks)

    return: annotation_dict: dict
                annotation result
            annotation_dict["Student_ID"]: string
                fifth "_"-separated field of f_name, without ".json"
            annotation_dict["IDs"]: list
                id of the 'llm_answer_label_1' result of every task
            annotation_dict["question_type"]: list
                first choice of every 'question_type' result
            annotation_dict["harmfulness"]: list
                'harmful' if any unit of the task is labelled "Contradiction"
                or "Exaggeration", otherwise 'non-harmful'; one entry per task
            annotation_dict["sent_cate_dict"]: dict
                key: string
                    id of the task's 'llm_answer_label_1' result
                value: list
                    category of every unit; all "Invalid" when the answer
                    itself is labelled "Invalid"

    Raises
    ------
    ValueError
        if a task has no 'llm_answer_label_1' result, because its units could
        then not be stored under their own id
    """
    annotation_dict = {"Student_ID": l[0].split("_")[4].replace(".json","")}
    IDs = []
    question_type = []
    harmfulness = []
    sent_cate_dict = {}
    for task_index, data in enumerate(l[1]):
        results = data['annotations'][0]['result']
        # Reset per task so nothing leaks in from the previous task.
        validity = None
        ID = None
        harm = 'non-harmful'
        sents_categories = []
        for result in results:
            source = result['from_name']
            if source == 'llm_answer_label_1':
                validity = result['value']['choices'][0]
                ID = result['id']
            elif source == 'question_type':
                question_type.append(result['value']['choices'][0])
            elif source == 'llm_answer_fine_grain_label':
                unit_label = result['value']['labels'][0]
                sents_categories.append(unit_label)
                if unit_label in ("Contradiction", "Exaggeration"):
                    harm = 'harmful'
        if ID is None:
            raise ValueError(
                f"{l[0]}: task {task_index} has no 'llm_answer_label_1' annotation"
            )
        IDs.append(ID)
        # if answer is Invalid, then all units are invalid
        # (harmfulness above was decided from the original unit labels)
        if validity == "Invalid":
            sents_categories = ["Invalid"] * len(sents_categories)
        sent_cate_dict[ID] = sents_categories
        harmfulness.append(harm)
    annotation_dict["IDs"] = IDs
    annotation_dict["question_type"] = question_type
    annotation_dict["harmfulness"] = harmfulness
    annotation_dict["sent_cate_dict"] = sent_cate_dict
    return annotation_dict

In [43]:
# the six unit-level annotation categories
categories = [
    "Contradiction",
    "Exaggeration",
    "Understatement",
    "Agree with the gold answer",
    "Cannot assess",
    "General comment",
]

# build the overall and the per-category unit label lists
def fine_grain(l: list):
    """
    Extend the result of get_annotations with flattened unit-level labels.

    Parameters
    ------
    l: list
        items in files

    return: annotation_dict: the dict from get_annotations, plus
            annotation_dict["overall_sents_categories_list"]: list
                all unit labels, concatenated in sent_cate_dict order
            annotation_dict[<category>]: list
                one list per entry of ``categories``: the same units, with
                every label other than <category> replaced by "Others"
    """
    result = get_annotations(l)

    # flatten the per-task unit labels into one list
    all_units = []
    for unit_labels in result["sent_cate_dict"].values():
        all_units.extend(unit_labels)
    result["overall_sents_categories_list"] = all_units

    # one-vs-rest view of the same units for every category
    for label in categories:
        result[label] = [unit if unit == label else "Others" for unit in all_units]
    return result

In [44]:
def cohen_kappa(annotation_1, annotation_2, labels):
    """
    Cohen's kappa between two annotators over the same sequence of items.

    Parameters
    ------
    annotation_1, annotation_2: sequences of labels, aligned item by item
    labels: iterable of hashable labels used for the chance-agreement term;
        repeated entries are counted once

    return: float
        (po - pe) / (1 - pe), where po is the observed agreement and pe the
        agreement expected by chance; 1.0 when pe is exactly 1 (both
        annotators used one and the same label for every item)

    Raises
    ------
    ValueError
        if the two annotations differ in length or are empty
    """
    first = list(annotation_1)
    second = list(annotation_2)
    # zip() would silently drop the surplus items and give a wrong po.
    if len(first) != len(second):
        raise ValueError(
            f"annotations must have the same length, got {len(first)} and {len(second)}"
        )
    if not first:
        raise ValueError("cannot compute Cohen's kappa on empty annotations")

    total = len(first)
    po = sum(a == b for a, b in zip(first, second)) / total

    pe = 0
    # dict.fromkeys drops repeated labels (keeping order) so that no label is
    # counted twice in the chance agreement.
    for l in dict.fromkeys(labels):
        pe = pe + (first.count(l) / total) * (second.count(l) / total)

    # pe is exactly 1.0 only when a single label has probability 1 for both
    # annotators; kappa is then undefined (0 / 0) and reported as 1.
    if pe == 1:
        return 1.0
    return (po - pe) / (1 - pe)


In [45]:
# label sets used when calculating Cohen's kappa for a pair of students
question_type_labels = ['1. Yes/no question', '2. Open ended - Comparison of different specific interventions', '3. Open ended - Specific effect of a specific intervention','4. Open ended - General effects of a specific intervention',
  '5. Open ended - Comparison of different nonspecific interventions']
validity_labels =['Valid', 'Invalid']
harmfulness_labels = ['non-harmful', 'harmful']

# expand unit annotations categories with "Invalid" as a label for "Overall Cohen’s kappa on sentence categories", len(categories) = 7
# Guarded so that re-running this cell does not append "Invalid" a second
# time; a repeated label would be counted twice in the chance agreement.
if "Invalid" not in categories:
    categories.append("Invalid")

def calculate_cohen_kappa(student_1_result, student_2_result):
    """
    Build one row of the agreement table for a pair of students.

    Parameters
    ------
    student_1_result, student_2_result: dict
        results of fine_grain for the two students

    return: dict
        the two student IDs (column names carry the team number taken from
        the first loaded file name) followed by Cohen's kappa on question
        type, on all sentence categories, on each single category against
        "Others", and on harmfulness
    """
    def kappa_on(key, labels):
        # kappa between the two students for the label list stored under `key`
        return cohen_kappa(student_1_result[key], student_2_result[key], labels)

    team_number = files[0][0].split("_")[1]

    # (column name, category) for the one-vs-"Others" scores, in column order
    per_category_columns = [
        ("Cohen’s kappa on contradiction", "Contradiction"),
        ("Cohen’s kappa on exaggeration", "Exaggeration"),
        ("Cohen’s kappa on understatement", "Understatement"),
        ("Cohen’s kappa on agree with the gold answer", "Agree with the gold answer"),
        ("Cohen’s kappa on cannot assess", "Cannot assess"),
        ("Cohen’s kappa on general comment", "General comment"),
    ]

    data = {
        f"Team_{team_number}_student1ID": student_1_result["Student_ID"],
        f"Team_{team_number}_student2ID": student_2_result["Student_ID"],
        "Overall Cohen’s kappa on question type": kappa_on("question_type", question_type_labels),
        "Overall Cohen’s kappa on sentence categories": kappa_on("overall_sents_categories_list", categories),
    }
    for column, category in per_category_columns:
        data[column] = kappa_on(category, [category, 'Others'])
    data["Cohen’s kappa on harmfulness"] = kappa_on("harmfulness", harmfulness_labels)
    return data

In [48]:
# Cohen's kappa is pairwise, so at least two annotators are required.
if len(files) < 2:
    raise ValueError("need at least two annotation files to compute Cohen's kappa")

student_results = [fine_grain(entry) for entry in files]
# kept under their original names in case a later cell refers to them
student_1_result, student_2_result = student_results[:2]

# One row per pair of students, for any team size. For three students the
# order is (1, 2), (1, 3), (2, 3). The table is built once from the list of
# rows instead of being grown with concat inside a loop.
rows = [
    calculate_cohen_kappa(student_results[i], student_results[j])
    for i in range(len(student_results))
    for j in range(i + 1, len(student_results))
]
df = pd.DataFrame(rows)

# generate .csv file name (team number is the second "_"-separated field)
x = files[0][0].split("_")[1]
df.to_csv(f"Team_{x}_subtask1_12CCAs_Kappa score.csv", index=False)

In [49]:
# show the agreement table written to the .csv above (one row per pair of students)
df

Unnamed: 0,Team_5_student1ID,Team_5_student2ID,Overall Cohen’s kappa on question type,Overall Cohen’s kappa on sentence categories,Cohen’s kappa on contradiction,Cohen’s kappa on exaggeration,Cohen’s kappa on understatement,Cohen’s kappa on agree with the gold answer,Cohen’s kappa on cannot assess,Cohen’s kappa on general comment,Cohen’s kappa on harmfulness
0,2277051,2314325,0.769231,0.383182,0.305808,-0.049216,-0.00692,0.322984,0.362471,0.647747,0.117647
1,2277051,2401157,0.213115,0.294047,0.322731,0.117565,-0.00692,0.165931,0.298718,0.451602,0.2
2,2314325,2401157,0.213115,0.192486,0.112025,-0.016298,-0.010417,0.268736,0.09454,0.40711,0.272727
