In [126]:
# Names used for the "ALL 6 models" setting: the "gold" entry followed by six model names.
all_6models = [
    "gold", "chatgpt", "flan-t5-xxl",
    "alpaca-7B", "alpaca-13B",
    "dolly-v2-7B", "dolly-v2-12B",
]

In [127]:
# Names used for the "TOP 3 models" setting: the "gold" entry followed by three model names.
top_3models = ["gold", "chatgpt", "flan-t5-xxl", "alpaca-7B"]

In [128]:
# Scope switch for the whole notebook:
#   True  -> top 3 models (main paper)
#   False -> all 6 models (appendix)
if_top3_models = True

# Label printed at the top of the results summary.
mode = "TOP 3 models" if if_top3_models else "ALL 6 models"

# Inter-annotator stats

In [129]:
import pandas as pd
import numpy as np
from statsmodels.stats import inter_rater as irr
from scipy import stats
from statistics import mean

# CSV files holding the annotations to analyse.
file_name_1 = "TURKERS-EVAL_data-intersection-ALL_6_models.csv"
file_name_2 = "TURKERS-EVAL_data-intersection-TOP_3_models.csv"

if if_top3_models:
    # Pool both files, then keep only the rows whose "Model" is listed in top_3models.
    # ignore_index=True gives the pooled frame fresh row labels instead of repeating
    # the labels of the two source files.
    eval_df = pd.concat(
        [pd.read_csv(file_name_1), pd.read_csv(file_name_2)],
        ignore_index=True,
    )
    eval_df = eval_df[eval_df["Model"].isin(top_3models)]
else:
    eval_df = pd.read_csv(file_name_1)

# Number of annotation rows loaded (shown as the cell output).
len(eval_df)

606

### Fleiss Kappa

In [130]:
def fleiss_kappa(df: pd.DataFrame, feature, len_annotators): # df == annotations within each Group
    """
    - Fleiss Kappa:
    Agreement on one categorical feature over the HITs that received exactly
    `len_annotators` annotations. Each such HIT contributes one row of labels; the rows
    are aggregated with `irr.aggregate_raters` and scored with
    `irr.fleiss_kappa(..., method='randolph')`.

    :input: df - annotations, one row per annotation, with a "HIT ID" column.
            feature - the column name of the feature to score.
            len_annotators - number of annotations a HIT must have to be included.
    :use: for questions 1, 3 & 4
    :return: kappa rounded to 3 decimals, or np.nan when no HIT has exactly
             `len_annotators` annotations.
    """
    # One label vector per HIT, e.g. [yes, no]; HITs with a different number of
    # annotations are ignored. groupby visits every HIT once instead of re-scanning
    # the whole frame per HIT ID.
    f_task_data = [
        same_hit_df[feature].tolist()
        for _, same_hit_df in df.groupby("HIT ID")
        if len(same_hit_df) == len_annotators
    ]

    # Nothing to score: report "no result" instead of failing inside aggregate_raters.
    if not f_task_data:
        return np.nan

    agg = irr.aggregate_raters(f_task_data) # returns a tuple (data, categories)
    fleiss_rslt = irr.fleiss_kappa(agg[0], method='randolph')

    return round (fleiss_rslt, 3)

### Krippendorff's alpha (interval distance)

In [131]:
def mean_NA(x):
    """
    :Usage: to calculate the mean of the annotators' scores for one item
    - `x` is a pandas Series of numeric scores where "NA" is represented by NaN.
    - Returns:
        - np.nan if >= 2 annotators put "NA"
        - np.nan if no score is left after ignoring "NA" (a single "NA", or an empty
          Series) -- this avoids a division by zero
        - else: the mean of the scores, ignoring "NA"
    """
    if x.isna().sum() >= 2:
        return np.nan

    non_nan_values = [i for i in x if not np.isnan(i)]
    if not non_nan_values:
        return np.nan
    return sum(non_nan_values) / len(non_nan_values)

In [132]:
from nltk.metrics.agreement import AnnotationTask
from nltk.metrics.distance import interval_distance

def krippendorff_alpha(df: pd.DataFrame, feature, show_per_worker=False):
    """
    - Krippendorff's Alpha using interval distance (nltk `interval_distance`):
    - Apply on the human evaluation dataframe *after* converting the selections to numerical values (eval_df_copy).
    :usage: do it on each worker against all others (mean), then take average of all workers
    :input: df - annotations, one row per annotation, with "Worker ID" and "HIT ID" columns.
            feature - the column name of the feature to calculate alpha for.
            show_per_worker - if True, print each worker's alpha.
    :return: mean of the per-worker alphas rounded to 3 decimals, or np.nan when no
             worker shares a usable HIT with another worker.
    """
    agg_dict = {feature: mean_NA}

    result = []

    worker_ids = np.unique(list(df["Worker ID"]))
    for worker_id in worker_ids:
        worker_df = df[df["Worker ID"] == worker_id] # annotations by the single annotator
        worker_df = worker_df.groupby("HIT ID").agg(agg_dict).reset_index(drop=False).sort_values(by="HIT ID") # sort by HIT ID
        all_other_worker_df = df[ (df["Worker ID"] != worker_id) & (df["HIT ID"].isin(worker_df["HIT ID"])) ] # annotations by all other annotators on the same posts
        all_other_worker_mean_df = all_other_worker_df.groupby("HIT ID").agg(agg_dict).reset_index(drop=False) # taking the mean of the other annotators' scores
        all_other_worker_mean_df = all_other_worker_mean_df.sort_values(by="HIT ID") # sort by HIT ID

        # filter out HITs that have only 1 annotator
        worker_df = worker_df[worker_df["HIT ID"].isin(all_other_worker_mean_df["HIT ID"])]
        assert worker_df["HIT ID"].tolist() == all_other_worker_mean_df["HIT ID"].tolist() # make sure the two dataframes are aligned

        # The two frames are aligned row by row (asserted above), so the scores can be
        # paired positionally instead of being looked up again per HIT ID.
        own_scores = worker_df[feature].tolist()
        others_mean_scores = all_other_worker_mean_df[feature].tolist()

        task_data = []
        for idx, (own_score, others_score) in enumerate(zip(own_scores, others_mean_scores)):
            # mean_NA can yield NaN ("NA"); a NaN label has no defined distance, so the item is skipped
            if np.isnan(own_score) or np.isnan(others_score):
                continue
            task_data.append ( ("coder_1", idx, own_score) )
            task_data.append ( ("coder_2", idx, others_score) )

        # A worker without any usable shared HIT has no data to compute alpha on.
        if not task_data:
            if show_per_worker:
                print (worker_id, np.nan)
            continue

        task = AnnotationTask(distance = interval_distance)
        task.load_array(task_data)
        worker_krippendorff = task.alpha()
        result.append(worker_krippendorff)

        if show_per_worker:
            print (worker_id, worker_krippendorff)

    if not result:
        return np.nan
    return round (mean(result), 3)

### Pearson Correlation

In [133]:
def calculate_corr(df: pd.DataFrame, feature, method, show_per_worker=False): # df == annotations within each Group
    """
    - Calculate the correlation between each annotator and the mean of the other annotators on one dimension
    - We do it on each dimension so that we can average --> cross examples
    :usage: Compare each annotator's work against the average of the other one/two annotators
    :input: df - annotations, one row per annotation, with "Worker ID" and "HIT ID" columns.
            feature - the column name of the (numeric) feature to correlate.
            method - "pearson" (stats.pearsonr) or "spearman" (stats.spearmanr).
            show_per_worker - if True, print each worker's coefficient and p-value.
    :return: (mean coefficient, mean p-value) over the workers, each rounded to 3 decimals,
             or np.nan when no worker yields a correlation.
    :raises ValueError: if `method` is neither "pearson" nor "spearman".
    """
    # Validate the method up front: an unknown name used to be ignored silently,
    # which made the function return NaN without any hint why.
    corr_functions = {"pearson": stats.pearsonr, "spearman": stats.spearmanr}
    if method not in corr_functions:
        raise ValueError(f"Unknown correlation method {method!r}; expected 'pearson' or 'spearman'.")
    corr_function = corr_functions[method]

    agg_dict = {feature: mean_NA}

    results = []

    worker_ids = np.unique(list(df["Worker ID"]))

    # cross-posts, for each dimension
    for worker_id in worker_ids:
        worker_df = df[df["Worker ID"] == worker_id] # annotations by the single annotator
        worker_df = worker_df.groupby("HIT ID").agg(agg_dict).reset_index(drop=False).sort_values(by="HIT ID") # sort by HIT ID
        all_other_worker_df = df[ (df["Worker ID"] != worker_id) & (df["HIT ID"].isin(worker_df["HIT ID"])) ] # annotations by all other annotators on the same posts
        all_other_worker_mean_df = all_other_worker_df.groupby("HIT ID").agg(agg_dict).reset_index(drop=False) # taking the mean of the other annotators' scores
        all_other_worker_mean_df = all_other_worker_mean_df.sort_values(by="HIT ID") # sort by HIT ID

        # filter out HITs that have only 1 annotator
        worker_df = worker_df[worker_df["HIT ID"].isin(all_other_worker_mean_df["HIT ID"])]
        assert worker_df["HIT ID"].tolist() == all_other_worker_mean_df["HIT ID"].tolist() # make sure the two dataframes are aligned

        # Pair the single annotator's score with the others' mean score per HIT.
        # mean_NA can yield NaN ("NA"); such pairs are dropped so that one missing
        # value does not turn the whole correlation into NaN.
        score_pairs = [
            (own_score, others_score)
            for own_score, others_score in zip(worker_df[feature].tolist(), all_other_worker_mean_df[feature].tolist())
            if not (np.isnan(own_score) or np.isnan(others_score))
        ]
        annotator_vec = [own_score for own_score, _ in score_pairs]
        other_annotators_mean_vec = [others_score for _, others_score in score_pairs]

        if len(annotator_vec) < 2: # ValueError: x and y must have length at least 2.
            print ("Error: x and y must have length at least 2.")
            continue
        if np.var(annotator_vec) == 0 or np.var(other_annotators_mean_vec) == 0: # correlation is undefined for a constant vector
            print ("Error: 0 variance!")
            continue

        # Computed once per worker and reused for both the overall list and the per-worker print.
        worker_result = corr_function(annotator_vec, other_annotators_mean_vec)
        results.append (worker_result)

        if show_per_worker:
            print (f"Worker {worker_id}: corr = {round (worker_result[0], 3)}, p = {round (worker_result[1], 3)}")

    if results == []:
        return np.nan
    else:
        return round (mean([item[0] for item in results]), 3), round (mean([item[1] for item in results]), 3)

### Results

In [134]:
# Numeric encodings for the annotation values.
# Categorical answers go onto a 0-1 scale; "minor" and "maybe" share the midpoint.
mapping_dict = dict(yes=1, minor=0.5, maybe=0.5, no=0)

# Relevance ratings 1-5 go onto the same 0-1 scale in steps of 0.25.
mapping_dict_rel = {1: 0, 2: 0.25, 3: 0.5, 4: 0.75, 5: 1}

# Numeric copy of the annotations; eval_df itself keeps the raw values.
# Values without an entry in the mapping become NaN.
eval_df_copy = eval_df.assign(
    **{
        column: eval_df[column].map(mapping_dict)
        for column in ("factuality", "justifies", "usefulness")
    },
    relevance=eval_df["relevance"].map(mapping_dict_rel),
)

In [135]:
# Annotation rows beyond the first one per HIT: total rows minus distinct HIT IDs.
len(eval_df) - np.unique(eval_df["HIT ID"]).size

279

In [136]:
# Summary of the agreement statistics for the selected scope.
# (printed label, column name) for every rated dimension, in report order.
labelled_features = [
    ("Factuality:", "factuality"),
    ("Justifies:", "justifies"),
    ("Relevance:", "relevance"),
    ("Usefulness:", "usefulness"),
]
outer_rule = "================================================"
inner_rule = "------------------------------------------------"

print (mode)
print ("hits:", len(np.unique(eval_df["HIT ID"])))
print (outer_rule)
print ("              *** Fleiss Kappa ***")
for label, feature in labelled_features:
    # computed on the raw values, using HITs with exactly 2 annotations
    print (label, fleiss_kappa(eval_df, feature, 2))
print (inner_rule)
print ("*** Krippendorff's alpha (interval distance) ***")
for label, feature in labelled_features:
    # computed on the numerically mapped copy
    print (label, krippendorff_alpha(eval_df_copy, feature))
print (inner_rule)
print ("         *** Pearson Correlation ***")
print ("Pearson Correlation on RELEVANCE:", calculate_corr(eval_df, "relevance", "pearson", show_per_worker=True))
print (outer_rule)

TOP 3 models
hits: 327
              *** Fleiss Kappa ***
Factuality: 0.29
Justifies: 0.699
Relevance: 0.355
Usefulness: 0.446
------------------------------------------------
*** Krippendorff's alpha (interval distance) ***
Factuality: 0.59
Justifies: 0.576
Relevance: 0.718
Usefulness: 0.668
------------------------------------------------
         *** Pearson Correlation ***
Worker A1KEA2Z47S3UPI: corr = 0.659, p = 0.0
Worker A2A6FH0F7LD9ND: corr = 0.8, p = 0.0
Worker A2MO3EE6D0P3KR: corr = 0.97, p = 0.006
Worker AKA8TN8H8DQ6T: corr = 0.671, p = 0.0
Pearson Correlation on RELEVANCE: (0.775, 0.002)
