In [None]:
import pandas as pd
from utils.correlations import krippendorff_alpha, interval_metric, nominal_metric
import numpy as np

In [None]:
def calculate_inter_annotator_agreement_factuality_krippendorff_finegrained_longsciverify(
    data_frames, model_nums=("1", "2", "3"), sent_nums=("1", "2", "3")
):
    """Krippendorff's alpha over individual sentence-level factuality labels.

    For every annotator frame the ``model_{m}_sent_{s}_factuality`` cells are
    flattened into one list, iterating model -> row -> sentence, and NaN cells
    are skipped. The per-annotator lists are then handed to
    ``krippendorff_alpha`` together with ``nominal_metric``.

    Args:
        data_frames: Iterable of pandas DataFrames, one per annotator, each
            holding the ``model_{m}_sent_{s}_factuality`` columns.
        model_nums: Model identifiers used in the column names. Defaults to
            the three models that were previously hard-coded.
        sent_nums: Sentence identifiers used in the column names. Defaults to
            the three sentences that were previously hard-coded.

    Returns:
        Whatever ``krippendorff_alpha`` returns for the collected score lists.

    Raises:
        KeyError: If a frame with at least one row lacks an expected column.

    NOTE(review): NaN cells are dropped independently per annotator, so the
    lists only stay position-aligned if all annotators are missing exactly
    the same cells. Presumably that holds for this data (e.g. summaries with
    fewer sentences) -- confirm, since how ``krippendorff_alpha`` pairs up
    entries is not visible here.
    """
    all_scores = []
    for df in data_frames:
        annotator_scores = []
        for model_num in model_nums:
            # Column names depend only on the model, so build them once per model.
            columns = [
                f"model_{model_num}_sent_{sent_num}_factuality"
                for sent_num in sent_nums
            ]
            for _, row in df.iterrows():
                for column in columns:
                    score = row[column]
                    if not np.isnan(score):
                        annotator_scores.append(score)
        all_scores.append(annotator_scores)
    return krippendorff_alpha(all_scores, nominal_metric)

def calculate_inter_annotator_agreement_factuality_krippendorff_aggregated_longsciverify(
    data_frames, model_nums=("1", "2", "3"), sent_nums=("1", "2", "3")
):
    """Krippendorff's alpha over summary-level (sentence-averaged) factuality.

    For every annotator frame, each (model, row) pair is reduced to the mean
    of its non-NaN ``model_{m}_sent_{s}_factuality`` cells. The resulting
    per-annotator lists (ordered model -> row) are handed to
    ``krippendorff_alpha`` together with ``interval_metric``.

    Args:
        data_frames: Iterable of pandas DataFrames, one per annotator, each
            holding the ``model_{m}_sent_{s}_factuality`` columns.
        model_nums: Model identifiers used in the column names. Defaults to
            the three models that were previously hard-coded.
        sent_nums: Sentence identifiers used in the column names. Defaults to
            the three sentences that were previously hard-coded.

    Returns:
        A ``(alpha, all_scores)`` tuple: the value returned by
        ``krippendorff_alpha`` and the list of per-annotator averaged scores
        it was computed from.

    Raises:
        KeyError: If a frame with at least one row lacks an expected column.
    """
    all_scores = []
    for df in data_frames:
        annotator_scores = []
        for model_num in model_nums:
            # Column names depend only on the model, so build them once per model.
            columns = [
                f"model_{model_num}_sent_{sent_num}_factuality"
                for sent_num in sent_nums
            ]
            for _, row in df.iterrows():
                rated = []
                for column in columns:
                    score = row[column]
                    if not np.isnan(score):
                        rated.append(score)
                # A summary without any rated sentence keeps its slot as NaN.
                # np.mean([]) would give the same NaN but also emit a
                # "Mean of empty slice" RuntimeWarning, so return it explicitly.
                annotator_scores.append(np.mean(rated) if rated else np.nan)
        all_scores.append(annotator_scores)
    return krippendorff_alpha(all_scores, interval_metric), all_scores

### LongSciVerify PubMed IAA

In [None]:
# Load the three PubMed reviewer annotation sheets, one frame per reviewer.
df_1, df_2, df_3 = map(
    pd.read_csv,
    (
        "../data/human_eval_results/LongSciVerify/pubmed_reviewer_1.csv",
        "../data/human_eval_results/LongSciVerify/pubmed_reviewer_2.csv",
        "../data/human_eval_results/LongSciVerify/pubmed_reviewer_3.csv",
    ),
)
# NOTE(review): raw_path is not used by the cells visible here, and it is
# rooted at "./data" while the CSVs above are read from "../data" -- confirm
# which base directory is intended.
raw_path = "./data/raw_data/LongSciVerify/pubmed_test.json"

In [None]:
# Sentence-level agreement across the three PubMed reviewer frames.
k = calculate_inter_annotator_agreement_factuality_krippendorff_finegrained_longsciverify([df_1,df_2,df_3])
print(f"Fine-grained IAA of Pubmed LongSciVerify data set {k}")
# Summary-level agreement; `k` is rebound, and `all_scores` receives the
# per-reviewer averaged scores returned alongside the alpha value.
k, all_scores = calculate_inter_annotator_agreement_factuality_krippendorff_aggregated_longsciverify([df_1,df_2,df_3])
print(f"Summary-level IAA of Pubmed LongSciVerify data set {k}")

### LongSciVerify ArXiv IAA

In [None]:
# Load the three ArXiv reviewer annotation sheets, one frame per reviewer.
# This rebinds df_1/df_2/df_3, replacing whatever they held before.
df_1, df_2, df_3 = map(
    pd.read_csv,
    (
        "../data/human_eval_results/LongSciVerify/arxiv_reviewer_1.csv",
        "../data/human_eval_results/LongSciVerify/arxiv_reviewer_2.csv",
        "../data/human_eval_results/LongSciVerify/arxiv_reviewer_3.csv",
    ),
)

In [None]:
# Sentence-level agreement across the three ArXiv reviewer frames.
k = calculate_inter_annotator_agreement_factuality_krippendorff_finegrained_longsciverify([df_1,df_2,df_3])
print(f"Fine-grained IAA of ArXiv LongSciVerify data set {k}")
# Summary-level agreement; `k` is rebound, and `all_scores` receives the
# per-reviewer averaged scores returned alongside the alpha value.
k, all_scores = calculate_inter_annotator_agreement_factuality_krippendorff_aggregated_longsciverify([df_1,df_2,df_3])
print(f"Summary-level IAA of ArXiv LongSciVerify data set {k}")

### LongEval PubMed IAA

In [None]:
import json
# Read the LongEval PubMed fine-grained scores. The encoding is pinned to
# UTF-8 so decoding does not depend on the platform's locale default.
with open('../data/human_eval_results/LongEval/pubmed_fine_scores.json', 'r', encoding='utf-8') as f:
    longeval = json.load(f)

In [None]:
# Regroup the LongEval entries by reviewer: all_scores[r] holds the r-th
# element of every entry's 'score' list, in the order the entries appear.
all_scores = []
for reviewer_idx in range(3):
    scores = [model_eval['score'][reviewer_idx] for model_eval in longeval]
    all_scores.append(scores)

In [None]:
# Krippendorff's alpha with the interval metric over the per-reviewer
# LongEval score lists currently held in `all_scores`.
k = krippendorff_alpha(all_scores, interval_metric)
print(f"Summary-level IAA of PubMed LongEval data set {k}")