## Human Evaluation Scores
- stories: readability, relevance, redundancy, cohesiveness, completeness, factuality, likeability, believability
- questions: q1-rating (concept question), q2-rating (ending question), q3-rating (limitation question)

In [1]:
import pandas as pd

def get_concat_scores(df_current_batch, q_ids, is_batch_20=False):
    """Reshape one batch of rating columns from wide to long form.

    For each survey question number (starting at Q81) the twelve columns
    ``Q<num>-<question_type>`` are pulled out of ``df_current_batch``, cast to
    float, renamed to the bare question type and tagged with the matching
    entry of ``q_ids``.

    Args:
        df_current_batch: DataFrame holding the ``Q<num>-<question_type>``
            columns; every row is kept (presumably one row per annotator).
        q_ids: ids assigned, in order, to question numbers 81, 82, ...
            Pairing stops at the shorter of ``q_ids`` and the question range.
        is_batch_20: when True the batch covers Q81-Q87 (7 concepts) instead
            of Q81-Q85 (5 concepts).

    Returns:
        DataFrame with the twelve rating columns plus ``q_id``. The per-concept
        frames are concatenated without resetting the index, so index values
        repeat across concepts.

    Raises:
        ValueError: from ``pd.concat`` if no concept frame was built
            (e.g. ``q_ids`` is empty).
    """
    question_types = ['readability-1', 'readability-2', 'relevance', 'redundancy', 'cohesiveness', 'completeness', 'factuality', 'likeability', 'believability', 'q1-rating', 'q2-rating', 'q3-rating']
    df_list = []
    end_batch_num = 88 if is_batch_20 else 86 # the 20th batch contains 7 concepts instead of 5
    for q_num, q_id in zip(range(81, end_batch_num), q_ids):
        # full survey column names for this concept, e.g. "Q81-relevance"
        questions_per_instance = ["Q{}-{}".format(q_num, q_type) for q_type in question_types]
        df_ = pd.DataFrame(df_current_batch[questions_per_instance].astype(float).values, columns=question_types)
        df_['q_id'] = q_id
        df_list.append(df_)

    df_concat = pd.concat(df_list)
    return df_concat

def concatenate_batches(model_name, num_batches):
    """Load every annotation batch of a model and stack the rating scores.

    Batch ``b`` (1-based) is read from
    ``./prolific_annotations/<model_name>/Legal-Story-Batch-<b>.csv``; only rows
    2-4 (``iloc[2:5]``) are kept. Batch ``b`` is assigned q_ids starting at
    ``5*b - 5``: five ids for a regular batch, seven for batch 20.

    Args:
        model_name: sub-directory name under ``./prolific_annotations``.
        num_batches: number of batch files to read, starting from batch 1.

    Returns:
        Concatenation of the per-batch frames produced by ``get_concat_scores``.
    """
    per_batch_frames = []
    for batch_id in range(1, num_batches + 1):
        csv_path = "./prolific_annotations/{}/Legal-Story-Batch-{}.csv".format(model_name, batch_id)
        df_batch = pd.read_csv(csv_path).iloc[2:5]

        # the 20th batch carries 7 concepts, every other batch 5
        has_seven_concepts = batch_id == 20
        first_q_id = 5 * batch_id - 5
        concepts_in_batch = 7 if has_seven_concepts else 5
        q_ids = list(range(first_q_id, first_q_id + concepts_in_batch))

        per_batch_frames.append(get_concat_scores(df_batch, q_ids, is_batch_20=has_seven_concepts))

    return pd.concat(per_batch_frames)

### GPT-4 (101 concepts)

In [2]:
df_concat_batches_gpt4 = concatenate_batches('gpt4', 20)

# remove q_id=42 because this is a duplicate; take an explicit copy so the
# q_id rewrite below operates on an independent frame, not a filtered slice
df_concat_batches_gpt4 = df_concat_batches_gpt4[df_concat_batches_gpt4.q_id != 42].copy()
# shift every id after the removed duplicate down by one so the ids have no gap at 42
df_concat_batches_gpt4['q_id'] = df_concat_batches_gpt4['q_id'].apply(lambda x: x-1 if x > 42 else x)
df_concat_batches_gpt4.shape

(303, 13)

In [3]:
# Story/question table for the 101 doctrines; q_id is the row position so it can
# be joined against the q_id column of the annotation scores.
df_101 = pd.read_csv("../data/101-doctrines/gpt4_story_question_101.tsv", sep="\t")
df_101['q_id'] = df_101.index

# save all annotations (one row per individual rating row)
df_101_merge = pd.merge(df_concat_batches_gpt4, df_101, on="q_id")
df_101_merge.to_csv("../analysis/prolific_annotations/gpt4/101_raw_annotation_scores.tsv", sep="\t")
print(df_101_merge.shape)

# save mean scores (ratings averaged per q_id, rounded to 2 decimals)
df_concat_batches_mean = df_concat_batches_gpt4.groupby("q_id").mean().round(2).reset_index()
df_concat_batches_mean_merge = pd.merge(df_concat_batches_mean, df_101, on="q_id")
df_concat_batches_mean_merge.to_csv("../analysis/prolific_annotations/gpt4/101_mean_annotation_scores.tsv", sep="\t")
print(df_concat_batches_mean_merge.shape)

(303, 20)
(101, 20)


In [4]:
# Keep the twelve rating columns followed by q_id, in a fixed order.
df_concat_batches_gpt4 = df_concat_batches_gpt4.loc[:, [
    'readability-1', 'readability-2', 'relevance', 'redundancy',
    'cohesiveness', 'completeness', 'factuality', 'likeability',
    'believability', 'q1-rating', 'q2-rating', 'q3-rating', 'q_id',
]]
print(df_concat_batches_gpt4.shape)

# NOTE: q_id is one of the columns, so the last value in each printed list is the
# mean/std of the ids themselves, not of a rating.
print("mean scores", df_concat_batches_gpt4.mean().round(2).to_numpy().tolist())
print("std scores", df_concat_batches_gpt4.std().round(2).to_numpy().tolist())

(303, 13)
mean scores [3.95, 4.66, 4.56, 4.0, 4.63, 4.57, 4.56, 4.36, 4.54, 4.46, 4.35, 4.14, 50.0]
std scores [1.04, 0.6, 0.71, 1.26, 0.62, 0.67, 0.69, 0.81, 0.74, 0.77, 0.78, 0.98, 29.2]


### GPT-4 (20 concepts)

In [5]:
# Sampled doctrines and the full GPT-4 table; q_id is the row position in the
# 101-doctrine table.
df_20 = pd.read_csv("../data/20-doctrines/legal_doctrines_20.tsv", sep="\t")
df_101 = pd.read_csv("../data/101-doctrines/gpt4_story_question_101.tsv", sep="\t")
df_101['q_id'] = df_101.index

# concept name -> q_id lookup restricted to the concepts present in both tables
df_sampled_20_name_idx = pd.merge(df_101, df_20, on="concept")[['concept', 'q_id']]
df_sampled_20_name_idx

Unnamed: 0,concept,q_id
0,Abstention_doctrine,0
1,Brady_disclosure,7
2,Commanding_precedent,9
3,Constitutional_convention_(political_custom),11
4,Doctrine_of_foreign_equivalents,18
5,FTC_v._Dean_Foods_Co.,35
6,Learned_intermediary,49
7,Maxwellisation,54
8,Pacta_sunt_servanda,65
9,Plain_meaning_rule,69


In [6]:
# Restrict the GPT-4 ratings to the sampled concepts via an inner join on q_id.
df_sampled_scores = pd.merge(df_concat_batches_gpt4, df_sampled_20_name_idx, on="q_id")
print(df_sampled_scores.shape)
# Drop the joined 'concept' column again so only numeric columns are aggregated.
df_sampled_scores = df_sampled_scores.loc[:, [
    'readability-1', 'readability-2', 'relevance', 'redundancy', 'cohesiveness', 'completeness',
    'factuality', 'likeability', 'believability', 'q1-rating', 'q2-rating', 'q3-rating', 'q_id',
]]
# NOTE: the last value of each list is the mean/std of q_id, not of a rating.
print("mean scores:", df_sampled_scores.mean().round(2).to_numpy().tolist())
print("std scores", df_sampled_scores.std().round(2).to_numpy().tolist())

(60, 14)
mean scores: [3.98, 4.7, 4.52, 3.78, 4.57, 4.58, 4.52, 4.42, 4.48, 4.42, 4.37, 4.15, 59.4]
std scores [1.07, 0.46, 0.68, 1.29, 0.56, 0.53, 0.62, 0.79, 0.7, 0.72, 0.8, 1.04, 33.63]


### GPT-3.5 (20 concepts)

In [13]:
df_concat_batches_chatgpt = concatenate_batches('gpt3.5', 4)
print(df_concat_batches_chatgpt.shape)

# fix the q_ids here: the batch q_ids (0..19) are row positions in the
# GPT-3.5 20-doctrine file, so join on them to attach the concept names
df_20_concept = pd.read_csv("../data/20-doctrines/gpt3.5_story_question_20.tsv", delimiter="\t")[['concept']]
df_20_concept['q_id'] = df_20_concept.index
df_concat_batches_chatgpt = df_concat_batches_chatgpt.merge(df_20_concept, on="q_id")

# keep only the numeric columns (drops 'concept') before aggregating;
# the last printed value is the mean/std of q_id, not of a rating
df_concat_batches_chatgpt = df_concat_batches_chatgpt[['readability-1', 'readability-2', 'relevance', 'redundancy', 'cohesiveness', 'completeness', 'factuality', 'likeability', 'believability', 'q1-rating', 'q2-rating', 'q3-rating', 'q_id']]
print("mean scores", df_concat_batches_chatgpt.mean().round(2).values.tolist())
print("std scores", df_concat_batches_chatgpt.std().round(2).values.tolist())

(60, 13)
mean scores [3.3, 4.35, 4.2, 3.72, 4.3, 4.03, 4.12, 4.1, 4.13, 4.12, 3.95, 3.48, 9.5]
std scores [1.01, 0.68, 0.78, 0.8, 0.74, 0.78, 0.69, 0.95, 0.65, 0.64, 0.85, 0.91, 5.81]


### LLaMA 2 (20 concepts)

In [14]:
# LLaMA 2 story tables; q_id is the row position in the 101-doctrine table.
df_20 = pd.read_csv("../data/20-doctrines/llama2_story_question_20.tsv", sep="\t")
df_101 = pd.read_csv("../data/101-doctrines/llama2_story_question_101.tsv", sep="\t")
df_101['q_id'] = df_101.index
# concept name -> q_id lookup restricted to the concepts present in both tables
df_sampled_20_name_idx = pd.merge(df_101, df_20, on="concept")[['concept', 'q_id']]
df_sampled_20_name_idx


Unnamed: 0,concept,q_id
0,Abstention_doctrine,0
1,Brady_disclosure,7
2,Commanding_precedent,9
3,Constitutional_convention_(political_custom),11
4,Doctrine_of_foreign_equivalents,18
5,FTC_v._Dean_Foods_Co.,35
6,Learned_intermediary,49
7,Maxwellisation,54
8,Pacta_sunt_servanda,65
9,Plain_meaning_rule,69


In [15]:
df_concat_batches_llama2 = concatenate_batches('llama2', 4)
print(df_concat_batches_llama2.shape)

# fix the q_ids here: the batch q_ids (0..19) are row positions in the
# LLaMA 2 20-doctrine file, so join on them to attach the concept names
df_20_concept = pd.read_csv("../data/20-doctrines/llama2_story_question_20.tsv", delimiter="\t")[['concept']]
df_20_concept['q_id'] = df_20_concept.index
df_concat_batches_llama2 = df_concat_batches_llama2.merge(df_20_concept, on="q_id")
print(df_concat_batches_llama2.shape)

# keep only the numeric columns (drops 'concept') before aggregating;
# the last printed value is the mean/std of q_id, not of a rating
df_concat_batches_llama2 = df_concat_batches_llama2[['readability-1', 'readability-2', 'relevance', 'redundancy', 'cohesiveness', 'completeness', 'factuality', 'likeability', 'believability', 'q1-rating', 'q2-rating', 'q3-rating', 'q_id']]
print("mean scores", df_concat_batches_llama2.mean().round(2).values.tolist())
print("std scores", df_concat_batches_llama2.std().round(2).values.tolist())

(60, 13)
(60, 14)
mean scores [3.72, 4.35, 4.4, 3.92, 4.38, 4.15, 4.1, 4.2, 4.35, 4.23, 4.1, 4.12, 9.5]
std scores [1.15, 0.86, 0.85, 1.33, 0.83, 1.12, 1.17, 1.04, 0.94, 0.96, 1.22, 1.12, 5.81]


## Error Type Analysis

In [16]:
def get_concat_comments(df_current_batch, q_ids, is_batch_20=False):
    """Reshape one batch of free-text/issue columns from wide to long form.

    For each survey question number (starting at Q81) the columns
    ``Q<num>-story-comments`` and ``Q<num>-q{1,2,3}-issues`` are extracted,
    cast to ``str``, renamed to the bare question type and tagged with the
    matching entry of ``q_ids``.

    Args:
        df_current_batch: DataFrame holding the ``Q<num>-<question_type>`` columns.
        q_ids: ids assigned, in order, to question numbers 81, 82, ...
        is_batch_20: when True the batch covers Q81-Q87 instead of Q81-Q85.

    Returns:
        DataFrame with columns ``story-comments``, ``q1-issues``, ``q2-issues``,
        ``q3-issues`` and ``q_id``; the index is not reset between concepts.
    """
    question_types = ['story-comments', 'q1-issues', 'q2-issues', 'q3-issues']
    # the 20th batch contains 7 concepts instead of 5
    last_q_num = 87 if is_batch_20 else 85

    frames = []
    for q_num, q_id in zip(range(81, last_q_num + 1), q_ids):
        source_columns = ["Q{}-{}".format(q_num, suffix) for suffix in question_types]
        text_values = df_current_batch[source_columns].astype(str).values
        frame = pd.DataFrame(text_values, columns=question_types)
        frame['q_id'] = q_id
        frames.append(frame)

    return pd.concat(frames)

def concatenate_batches_comments(model_name, num_batches):
    """Load every annotation batch of a model and stack the comment/issue columns.

    Batch ``b`` (1-based) is read from
    ``./prolific_annotations/<model_name>/Legal-Story-Batch-<b>.csv``; only rows
    2-4 (``iloc[2:5]``) are kept. Batch ``b`` is assigned q_ids starting at
    ``5*b - 5``: five ids for a regular batch, seven for batch 20.

    Args:
        model_name: sub-directory name under ``./prolific_annotations``.
        num_batches: number of batch files to read, starting from batch 1.

    Returns:
        Concatenation of the per-batch frames produced by ``get_concat_comments``.
    """
    per_batch_frames = []
    for batch_id in range(1, num_batches + 1):
        csv_path = "./prolific_annotations/{}/Legal-Story-Batch-{}.csv".format(model_name, batch_id)
        df_batch = pd.read_csv(csv_path).iloc[2:5]

        # the 20th batch carries 7 concepts, every other batch 5
        has_seven_concepts = batch_id == 20
        first_q_id = 5 * batch_id - 5
        concepts_in_batch = 7 if has_seven_concepts else 5
        q_ids = list(range(first_q_id, first_q_id + concepts_in_batch))

        per_batch_frames.append(get_concat_comments(df_batch, q_ids, is_batch_20=has_seven_concepts))

    return pd.concat(per_batch_frames)

In [17]:
def get_comment_statistics(comment_list):
    """Return, per issue label, the percentage of entries that mention it.

    Each entry of ``comment_list`` is a string that is split on "," (pieces are
    not stripped, so surrounding whitespace is part of the label). Every piece
    is counted once per occurrence and the count is divided by the number of
    entries, so percentages over all labels can sum to more than 100.

    Args:
        comment_list: list of comma-separated label strings.

    Returns:
        dict mapping label -> percentage rounded to 2 decimals, in order of
        first appearance. Empty dict for an empty list.
    """
    total_entries = len(comment_list)
    occurrences = {}
    for selection in comment_list:
        for label in selection.split(","):
            occurrences[label] = occurrences.get(label, 0) + 1

    percentages = {}
    for label, count in occurrences.items():
        percentages[label] = round(100 * float(count) / total_entries, 2)
    return percentages

def get_no_issue_statistics(comment_list):
    """Return the percentage of entries that are exactly "There is no issue.".

    Only an exact match counts; an entry that combines the label with other
    issues (e.g. comma-separated) is not counted.

    Args:
        comment_list: list of issue strings, one per rating row.

    Returns:
        ``{"There is no issue.": pct}`` with ``pct`` rounded to 2 decimals, or
        an empty dict when no entry matches (including an empty list).
    """
    no_issue_label = "There is no issue."
    total_entries = len(comment_list)
    matches = sum(1 for answer in comment_list if answer == no_issue_label)
    if matches == 0:
        return {}
    return {no_issue_label: round(100 * float(matches) / total_entries, 2)}


def get_error_type_rate(df_agg_comments, error_types):
    """Tabulate, per question kind, the rate for each requested label.

    The rates come from ``get_no_issue_statistics``, so only the exact label
    "There is no issue." can have a non-zero rate; any other label in
    ``error_types`` is reported as 0.0.

    Args:
        df_agg_comments: DataFrame with columns ``q1-issues``, ``q2-issues``
            and ``q3-issues``.
        error_types: labels to use as rows of the result.

    Returns:
        DataFrame indexed by ``error_types`` (sorted) with columns
        ``concept_question``, ``prediction_question`` and ``limitation_question``.
    """
    # one rate dict per question column, in q1, q2, q3 order
    rates_per_question = [
        get_no_issue_statistics(df_agg_comments[issue_column].tolist())
        for issue_column in ('q1-issues', 'q2-issues', 'q3-issues')
    ]

    rows = [
        [rates.get(error_type, 0.0) for rates in rates_per_question]
        for error_type in error_types
    ]
    df_rates = pd.DataFrame(rows, columns=['concept_question', 'prediction_question', 'limitation_question'], index=error_types)
    return df_rates.sort_index()

# ensure all the 8 error types are covered in the 101 examples
# error_types = sorted(get_comment_statistics(df_concat_gpt4_comments['q1-issues'].tolist()))
# get_error_type_rate(df_concat_gpt4_comments, error_types)


### GPT-4 No Issue Rate

In [18]:
# need to remove one duplicate for gpt-4 annotations 
df_concat_gpt4_comments = concatenate_batches_comments('gpt4', 20)
# q_id 42 is a duplicate: drop it, taking an explicit copy so the q_id rewrite
# below operates on an independent frame, not a filtered slice
df_concat_gpt4_comments = df_concat_gpt4_comments[df_concat_gpt4_comments.q_id != 42].copy()
# shift every id after the removed duplicate down by one so the ids have no gap at 42
df_concat_gpt4_comments['q_id'] = df_concat_gpt4_comments['q_id'].apply(lambda x: x-1 if x > 42 else x)
df_concat_gpt4_comments.shape

(303, 5)

In [19]:
# Restrict the GPT-4 comments to the sampled concepts via an inner join on q_id.
# NOTE(review): df_sampled_20_name_idx is whatever an earlier cell last assigned;
# in notebook order that is the LLaMA 2 section - confirm its q_ids match GPT-4's.
df_sampled_gpt4_20_comments = pd.merge(df_concat_gpt4_comments, df_sampled_20_name_idx, on="q_id")
df_sampled_gpt4_20_comments = df_sampled_gpt4_20_comments.loc[
    :, ['concept', 'q_id', 'story-comments', 'q1-issues', 'q2-issues', 'q3-issues']
]
df_sampled_gpt4_20_comments.to_csv("./prolific_annotations/gpt4/gpt4_sampled_20_errors.tsv", sep="\t")
print(df_sampled_gpt4_20_comments.shape)

# sorted list of every label seen in the q1 issues of all GPT-4 annotations
error_types = sorted(get_comment_statistics(df_concat_gpt4_comments['q1-issues'].tolist()).keys())
# only counts if an annotator marks it "There is no issue." and marks no other issue at the same time
get_error_type_rate(df_sampled_gpt4_20_comments, ["There is no issue."])

(60, 6)


Unnamed: 0,concept_question,prediction_question,limitation_question
There is no issue.,83.33,75.0,80.0


### GPT-3.5 No Issue Rate

In [22]:
df_concat_chatgpt_comments = concatenate_batches_comments('gpt3.5', 4)

# Load the GPT-3.5 concept list explicitly instead of relying on whatever
# df_20_concept an earlier cell left behind (the LLaMA 2 section reassigns it).
# A separate name is used so df_20_concept itself is left untouched.
df_20_concept_chatgpt = pd.read_csv("../data/20-doctrines/gpt3.5_story_question_20.tsv", delimiter="\t")[['concept']]
df_20_concept_chatgpt['q_id'] = df_20_concept_chatgpt.index

df_concat_chatgpt_comments = df_concat_chatgpt_comments.merge(df_20_concept_chatgpt, on="q_id")
df_concat_chatgpt_comments = df_concat_chatgpt_comments[['concept', 'q_id', 'story-comments', 'q1-issues', 'q2-issues', 'q3-issues']]
df_concat_chatgpt_comments.to_csv("./prolific_annotations/gpt3.5/gpt3.5_sampled_20_errors.tsv", sep="\t")
print(df_concat_chatgpt_comments.shape)

# NOTE(review): the label list is derived from the GPT-4 comments, not the GPT-3.5
# ones - presumably to take it from the full 101-concept set; confirm. It is not
# used by the call below.
error_types = sorted(get_comment_statistics(df_concat_gpt4_comments['q1-issues'].tolist()))
# only counts if an annotator marks it "There is no issue." and marks no other issue at the same time
get_error_type_rate(df_concat_chatgpt_comments, ["There is no issue."])

(60, 6)


Unnamed: 0,concept_question,prediction_question,limitation_question
There is no issue.,71.67,71.67,41.67


### LLaMA 2 No Issue Rate

In [24]:
# Stack the LLaMA 2 comment columns and attach concept names via q_id.
# NOTE(review): df_20_concept is taken from an earlier cell; it must hold the
# LLaMA 2 concept list when this cell runs.
df_concat_llama2_comments = pd.merge(
    concatenate_batches_comments('llama2', 4), df_20_concept, on="q_id"
)
df_concat_llama2_comments = df_concat_llama2_comments.loc[
    :, ['concept', 'q_id', 'story-comments', 'q1-issues', 'q2-issues', 'q3-issues']
]
df_concat_llama2_comments.to_csv("./prolific_annotations/llama2/llama2_sampled_20_errors.tsv", sep="\t")
print(df_concat_llama2_comments.shape)

# sorted list of every label seen in the LLaMA 2 q1 issues
error_types = sorted(get_comment_statistics(df_concat_llama2_comments['q1-issues'].tolist()).keys())
# only counts if an annotator marks it "There is no issue." and marks no other issue at the same time
get_error_type_rate(df_concat_llama2_comments, ["There is no issue."])


(60, 6)


Unnamed: 0,concept_question,prediction_question,limitation_question
There is no issue.,66.67,60.0,65.0
