In [None]:
import pandas as pd
from itertools import combinations
from sklearn.metrics import cohen_kappa_score

# Load the annotation table; the relative path is resolved against the
# kernel's working directory.
# NOTE(review): presumably one row per annotated item — confirm against the CSV.
df = pd.read_csv("annotator_agreement.csv")

# Column groups compared in the cells below: three human score columns and
# three LLM score columns.
human_cols = ["human_score_0", "human_score_1", "human_score_2"]
llm_cols   = ["llm_score_0",   "llm_score_1",   "llm_score_2"]

In [21]:
def pairwise_kappa(df, cols, weights=None):
    """Compute Cohen's kappa for every unordered pair of rater columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding one column of labels per rater.
    cols : iterable of str
        Names of the rater columns to compare; every 2-combination is scored.
    weights : optional
        Forwarded unchanged to ``cohen_kappa_score``. The default ``None``
        gives the unweighted kappa; scikit-learn also documents ``"linear"``
        and ``"quadratic"`` for ordered labels.

    Returns
    -------
    pandas.DataFrame
        One row per pair with columns ``rater_1``, ``rater_2`` and ``kappa``.
        ``kappa`` is NaN for a pair that has no row where both raters
        supplied a value. The three columns are present even when fewer than
        two column names are given (the frame is then empty).
    """
    results = []
    for a, b in combinations(cols, 2):
        # Only keep rows where both annotators have a value.
        mask = df[[a, b]].notna().all(axis=1)
        if mask.any():
            kappa = cohen_kappa_score(df.loc[mask, a], df.loc[mask, b], weights=weights)
        else:
            # No overlapping ratings: agreement is undefined, so report NaN
            # instead of handing empty inputs to scikit-learn.
            kappa = float("nan")
        results.append({"rater_1": a, "rater_2": b, "kappa": kappa})
    # Fix the column set so an empty result still has the expected schema.
    return pd.DataFrame(results, columns=["rater_1", "rater_2", "kappa"])


In [25]:
# 1) Human–Human agreement
# Cohen's kappa for every pair of the three human score columns.
human_human_agreement = pairwise_kappa(df, human_cols)

# Show the table, then produce the same table as a Markdown string; as the
# cell's last expression the string is echoed as a quoted repr with "\n" escapes.
display(human_human_agreement)
human_human_agreement.to_markdown()

Unnamed: 0,rater_1,rater_2,kappa
0,human_score_0,human_score_1,0.949416
1,human_score_0,human_score_2,0.950273
2,human_score_1,human_score_2,0.949416


'|    | rater_1       | rater_2       |    kappa |\n|---:|:--------------|:--------------|---------:|\n|  0 | human_score_0 | human_score_1 | 0.949416 |\n|  1 | human_score_0 | human_score_2 | 0.950273 |\n|  2 | human_score_1 | human_score_2 | 0.949416 |'

In [26]:
# 2) LLM–LLM agreement
# Cohen's kappa for every pair of the three LLM score columns.
# In the recorded output all three pairs come out at kappa = 1.0.
llm_llm_agreement = pairwise_kappa(df, llm_cols)
# Show the table, then echo it as a Markdown string (last expression of the cell).
display(llm_llm_agreement)
llm_llm_agreement.to_markdown()

Unnamed: 0,rater_1,rater_2,kappa
0,llm_score_0,llm_score_1,1.0
1,llm_score_0,llm_score_2,1.0
2,llm_score_1,llm_score_2,1.0


'|    | rater_1     | rater_2     |   kappa |\n|---:|:------------|:------------|--------:|\n|  0 | llm_score_0 | llm_score_1 |       1 |\n|  1 | llm_score_0 | llm_score_2 |       1 |\n|  2 | llm_score_1 | llm_score_2 |       1 |'

In [31]:
# 3) Human–LLM agreement (cross pairs; only llm_score_0 is passed on the LLM side)
def cross_group_kappa(df, group1, group2, weights=None):
    """Compute Cohen's kappa for every (group1, group2) pair of rater columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding one column of labels per rater.
    group1, group2 : iterable of str
        Column names; each column of ``group1`` is scored against each column
        of ``group2``, with ``group1`` varying slowest in the output order.
    weights : optional
        Forwarded unchanged to ``cohen_kappa_score``. The default ``None``
        gives the unweighted kappa; scikit-learn also documents ``"linear"``
        and ``"quadratic"`` for ordered labels.

    Returns
    -------
    pandas.DataFrame
        One row per pair with columns ``rater_1`` (from ``group1``),
        ``rater_2`` (from ``group2``) and ``kappa``. ``kappa`` is NaN for a
        pair that has no row where both raters supplied a value. The three
        columns are present even when either group is empty.
    """
    results = []
    for a in group1:
        for b in group2:
            # Only rows where both raters have a value are comparable.
            mask = df[[a, b]].notna().all(axis=1)
            if mask.any():
                kappa = cohen_kappa_score(df.loc[mask, a], df.loc[mask, b], weights=weights)
            else:
                # No overlapping ratings: agreement is undefined, so report
                # NaN instead of handing empty inputs to scikit-learn.
                kappa = float("nan")
            results.append({"rater_1": a, "rater_2": b, "kappa": kappa})
    # Fix the column set so an empty result still has the expected schema.
    return pd.DataFrame(results, columns=["rater_1", "rater_2", "kappa"])

# Only the first LLM column (llm_cols[0]) is passed as the second group, so
# the result has one row per human column rather than the full 3 x 3 grid.
human_llm_agreement = cross_group_kappa(df, human_cols, [llm_cols[0]])

# Show the table for the reader.
display(human_llm_agreement)

# Last expression of the cell: the same table echoed as a Markdown string.
human_llm_agreement.to_markdown()

Unnamed: 0,rater_1,rater_2,kappa
0,human_score_0,llm_score_0,0.870185
1,human_score_1,llm_score_0,0.867076
2,human_score_2,llm_score_0,0.81826


'|    | rater_1       | rater_2     |    kappa |\n|---:|:--------------|:------------|---------:|\n|  0 | human_score_0 | llm_score_0 | 0.870185 |\n|  1 | human_score_1 | llm_score_0 | 0.867076 |\n|  2 | human_score_2 | llm_score_0 | 0.81826  |'