## Inter-rater Agreement
- E1 stands for Eric and E2 stands for Robert, our two legal experts
- We compute the inter-rater agreement (Cohen's kappa) between the two experts, and between each expert and GPT-4

In [9]:
import numpy as np
import pandas as pd

def cohen_kappa(a: np.ndarray, b: np.ndarray) -> float:
    """Compute Cohen's kappa between two raters' categorical labels.

    Kappa is ``(po - pe) / (1 - pe)``, where ``po`` is the observed fraction
    of items on which the raters agree and ``pe`` is the agreement expected
    by chance given each rater's own label frequencies.

    Args:
        a: 1-D array-like of labels from the first rater (NumPy array,
            pandas Series, list, ...).
        b: 1-D array-like of labels from the second rater, same length as
            ``a``. Items are paired by position.

    Returns:
        Cohen's kappa as a Python float. If both raters used one and the
        same single category for every item, chance agreement is 1 and the
        formula degenerates to 0/0; observed agreement is perfect in that
        case, so 1.0 is returned instead of NaN.

    Raises:
        AssertionError: if ``a`` and ``b`` differ in shape.
        ValueError: if the inputs are not 1-D or are empty.
    """
    # Work on plain arrays so items are paired by position; pandas refuses to
    # compare two Series whose index labels differ.
    a = np.asarray(a)
    b = np.asarray(b)
    assert a.shape == b.shape
    if a.ndim != 1:
        raise ValueError("cohen_kappa expects 1-D label sequences")
    if a.shape[0] == 0:
        raise ValueError("cohen_kappa is undefined for empty inputs")

    # Observed agreement; the mean of a boolean array is computed in float64.
    po = float(np.mean(a == b))

    # Every category used by either rater.
    categories = np.unique(np.concatenate((a, b)))
    if categories.shape[0] == 1:
        # Both raters always chose the same single label: pe == 1, so the
        # kappa formula would divide by zero.
        return 1.0

    # Chance agreement: for each category, the product of the two raters'
    # marginal proportions, summed over all categories.
    pe = float(sum(np.mean(a == c) * np.mean(b == c) for c in categories))
    return (po - pe) / (1.0 - pe)

In [10]:
def map_mc_choice(text):
    """Convert a multiple-choice label into its zero-based option index.

    Surrounding whitespace is ignored. Accepted labels are the bare letters
    "A"-"D" and their parenthesised forms "(A)"-"(D)"; any other label
    raises ``KeyError``.
    """
    label = text.strip()
    bare_letters = {"A": 0, "B": 1, "C": 2, "D": 3}
    if label in bare_letters:
        return bare_letters[label]
    # Not a bare letter: it must be a parenthesised one, otherwise KeyError.
    parenthesised = {"(A)": 0, "(B)": 1, "(C)": 2, "(D)": 3}
    return parenthesised[label]

# Load the tab-separated agreement table with each rater's answer per question.
df_annos = pd.read_csv("./expert_annotations/expert_annotation_agreements.tsv", sep="\t")

# Letter-answer columns to encode: one per (question type, rater) pair.
answer_columns = [
    'concept_question_E2', 'concept_question_E1', 'concept_question_GPT4',
    'prediction_question_E2', 'prediction_question_E1', 'prediction_question_GPT4',
    'limitation_question_E2', 'limitation_question_E1', 'limitation_question_GPT4',
]

# Add an integer-coded "encoded_<name>" column for every answer column ...
for column_name in answer_columns:
    encoded_name = 'encoded_{}'.format(column_name)
    df_annos[encoded_name] = df_annos[column_name].apply(map_mc_choice)

# ... then discard the original letter columns, leaving only the encoded ones.
df_annos = df_annos.drop(columns=answer_columns)
df_annos

Unnamed: 0,concept,condition,encoded_concept_question_E2,encoded_concept_question_E1,encoded_concept_question_GPT4,encoded_prediction_question_E2,encoded_prediction_question_E1,encoded_prediction_question_GPT4,encoded_limitation_question_E2,encoded_limitation_question_E1,encoded_limitation_question_GPT4
0,Abstention_doctrine,E1,2,2,2,2,2,2,1,0,2
1,Constitutional_convention_(political_custom),E1,1,1,1,2,2,2,3,3,3
2,Unaccompanied_minor,E2,2,2,2,1,1,1,2,2,2
3,Substantial_certainty_doctrine,E1,1,1,1,3,1,1,3,3,3
4,Learned_intermediary,E1,1,1,1,1,1,1,1,0,0
5,Doctrine_of_foreign_equivalents,E2,1,1,1,2,2,2,0,0,0
6,Prosecutorial_discretion,E1,1,1,1,1,1,1,2,2,2
7,Safe_harbor_(law),E2,1,1,1,1,1,1,2,2,3
8,FTC_v._Dean_Foods_Co.,E1,1,1,1,1,1,1,3,3,3
9,Volenti_non_fit_injuria,E2,1,1,1,0,0,0,2,2,2


In [12]:
import itertools

# Print Cohen's kappa (rounded to two decimals) for every pair of raters,
# separately for each question type.
column_template = "encoded_{}_question_{}"
rater_pairs = list(itertools.combinations(['E1', 'E2', 'GPT4'], 2))

for q_type in ['concept', 'prediction', 'limitation']:
    for p1, p2 in rater_pairs:
        labels_p1 = df_annos[column_template.format(q_type, p1)]
        labels_p2 = df_annos[column_template.format(q_type, p2)]
        cohen_kappa_score = round(cohen_kappa(labels_p1, labels_p2), 2)
        print(q_type, p1, p2, cohen_kappa_score)

concept E1 E2 0.86
concept E1 GPT4 1.0
concept E2 GPT4 0.86
prediction E1 E2 0.68
prediction E1 GPT4 0.92
prediction E2 GPT4 0.77
limitation E1 E2 0.86
limitation E1 GPT4 0.85
limitation E2 GPT4 0.78


## Expert annotations on Questions
- For each generated question, experts are asked to address the following three points in the `comments` columns:
    1. Does your answer agree with the suggested answer?
    2. Is the suggested answer correct or wrong? Why?
    3. What suggestions do you have to improve the question?

In [14]:
# Load expert E1's annotation sheet (tab-separated) and preview the first three rows.
# NOTE(review): row 0 of the preview holds sub-header labels ("your answer" /
# "comments"), so the TSV appears to carry a two-line header — confirm before
# treating the answer columns as data.
e1_annotations_path = "./expert_annotations/E1_annotations.tsv"
df_e1 = pd.read_csv(e1_annotations_path, sep="\t")
df_e1.head(3)

Unnamed: 0,concept,concept_question,Unnamed: 2,prediction_question,Unnamed: 4,limitation_question,Unnamed: 6,Concept Difficulty
0,,your answer,comments,your answer,comments,your answer,comments,
1,Abstention_doctrine,C,(1) Yes. (2) The suggested answer is correct....,C,(1) Yes. (2) The suggested answer is correct....,A,(1) No. (2) Based on the definition and story...,Hard
2,Constitutional_convention_(political_custom),B,(1) Yes. (2) The suggested answer is correct....,C,(1) Yes. (2) The suggested answer is correct....,D,(1) Yes. (2) The suggested answer is correct....,Easy


In [15]:
# Load expert E2's annotation sheet (tab-separated) and preview the first three rows.
# NOTE(review): row 0 of the preview holds sub-header labels ("your answer" /
# "comments"), so the TSV appears to carry a two-line header — confirm before
# treating the answer columns as data.
e2_annotations_path = "./expert_annotations/E2_annotations.tsv"
df_e2 = pd.read_csv(e2_annotations_path, sep="\t")
df_e2.head(3)

Unnamed: 0,concept,concept_question,Unnamed: 2,prediction_question,Unnamed: 4,limitation_question,Unnamed: 6,Concept Difficulty
0,,your answer,comments,your answer,comments,your answer,comments,
1,Abstention_doctrine,C,(1) Yes. (2) The suggested answer is correct....,C,(1) Yes. (2) The suggested answer is correct....,B,(1) No. (2) The suggested answer is correct. ...,Easy
2,Constitutional_convention_(political_custom),B,(1) Yes. (2) The suggested answer is correct....,C,(1) Yes. (2) The suggested answer is correct....,D,(1) Yes. (2) The suggested answer is correct....,Medium


In [17]:
# Load the final answer annotations (tab-separated) and preview the first three rows.
final_answers_path = "./expert_annotations/Final_answer_annotations.tsv"
df_final_answers = pd.read_csv(final_answers_path, sep="\t")
df_final_answers.head(3)

Unnamed: 0,concept,concept_question_answer,prediction_question_answer,limitation_question_answer,difficulty,q_id
0,Abstention_doctrine,C,C,C,hard,0
1,Constitutional_convention_(political_custom),B,C,D,medium,1
2,Unaccompanied_minor,C,B,C,medium,2


In [18]:
# Load the regenerated questions table (tab-separated) and preview the first three rows.
regenerated_questions_path = "./expert_annotations/Final_regenerated_questions_20.tsv"
df_final_regenerated_question_20 = pd.read_csv(regenerated_questions_path, sep="\t")
df_final_regenerated_question_20.head(3)

Unnamed: 0,concept,concept_full_name,intro_text,story,Q_concept,Q_prediction,Q_limitation
0,Abstention_doctrine,Abstention doctrine,An abstention doctrine is any of several doctr...,Imagine a big family dinner where everyone has...,Question: What is the main purpose of the Abst...,"Question: In a hypothetical scenario, a large ...",Question: What is a potential limitation or ex...
1,Constitutional_convention_(political_custom),Constitutional convention (political custom),A constitutional convention is an informal and...,Imagine you're playing a game of soccer with y...,Question: Which of the following best describe...,"Question: In the fictional country of Veridia,...",Question: What is a potential limitation or ex...
2,Unaccompanied_minor,Unaccompanied minor,"An unaccompanied minor (sometimes ""unaccompani...",Imagine a young boy named Timmy. Timmy is only...,Question: Which of the following best describe...,"Question: In a hypothetical scenario, 12-year-...",Question: According to the UN Committee on the...
