In [1]:
import srsly
from pathlib import Path
import polars as pl
from lazylines import LazyLines

This is a polars dataframe with all the annotations. 

## Google Emotions Dataset

Small warning: this is Reddit data. We can't expect texts of people that behave nicely.

In [2]:
df_emotions = pl.concat([pl.read_csv(f"go_emotions/goemotions_{i}.csv") for i in [1,2,3]])

df_emotions.head(3)

text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,disappointment,disapproval,disgust,embarrassment,excitement,fear,gratitude,grief,joy,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
str,str,str,str,str,str,f64,i64,bool,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
"""That game hurt…","""eew5j0j""","""Brdd9""","""nrl""","""t3_ajis4z""","""t1_eew18eq""",1548400000.0,1,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
""" >sexuality sh…","""eemcysk""","""TheGreen888""","""unpopularopini…","""t3_ai4q37""","""t3_ai4q37""",1548100000.0,37,True,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""You do right, …","""ed2mah1""","""Labalool""","""confessions""","""t3_abru74""","""t1_ed2m7g7""",1546400000.0,37,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [3]:
label_of_interest = "excitement"

In [4]:
df_disagree = (
    df_emotions
    .select("text", "id", "rater_id", label_of_interest)
    .group_by("id", "text")
    .agg(
        n_unique_annot=pl.col(label_of_interest).n_unique(), 
        n_annot=pl.col(label_of_interest).count(),
        rating=pl.col(label_of_interest).mean(),
        n_pos=pl.col(label_of_interest).sum()
    )
    .filter(pl.col("n_unique_annot") != 1)
)

## Statistics

Before diving into the statistics, it helps to remember the difference between an example and an annotation.

![](images/annot.png)

In [5]:
n_disagree = df_disagree.shape[0]
n_pos_label = df_emotions.filter(pl.col(label_of_interest) == 1).shape[0]
n_pos_disagree = df_disagree.with_columns(pl.col("rating") * pl.col("n_annot")).sum().to_dicts()[0]['n_pos']

print(f"There are {n_disagree} / {df_emotions.group_by('id').count().shape[0]} individual texts with disagreement.")
print(f"That's about {100 * n_disagree / df_emotions.group_by('id').count().shape[0]:.2f}% of all individual texts that have disagreement.")
print("")
print("But maybe we can also look at annotations. There are at least 3 per example.")
print("But maybe we care a bit more about the positive cases.")

There are 4217 / 58011 individual texts with disagreement.
That's about 7.27% of all individual texts that have disagreement.

But maybe we can also look at annotations. There are at least 3 per example.
But maybe we care a bit more about the positive cases.


In [6]:
print(f"There are {n_pos_label} annotations that indicate {label_of_interest} in total.")
print(f"Out of these {n_pos_label} annotations, {n_pos_disagree} has disagreement.")
print(f"That suggests that {n_pos_disagree} / {n_pos_label} = {100 * n_pos_disagree/n_pos_label:.2f}% of the positive {label_of_interest} annotations point to a doubtful example.")

There are 5629 annotations that indicate excitement in total.
Out of these 5629 annotations, 5216 has disagreement.
That suggests that 5216 / 5629 = 92.66% of the positive excitement annotations point to a doubtful example.


## Observation 

These stats may give you pause. And yet ... this dataset is [pretty popular](https://huggingface.co/datasets/go_emotions).

## Why? 

When you explore the samples though, you may start to understand why it is also a hard thing to annotate.

In [7]:
for ex in df_disagree.select("text", "rating", "n_annot").sample().to_dicts():
    print(ex)

{'text': 'Cheers man, will be taking it easy for sure. ', 'rating': 0.3333333333333333, 'n_annot': 3}


### Towards a Metric 

We're going to be annotating our own dataset in a bit and we'd like to compare the quality of our own annotations to the quality of GoEmotions. However ... we would prefer to compare apples to apples. 

So let's introduce a metric that might allow us to compare across datasets. 

In [8]:
from lazylines import LazyLines
import itertools as it

def to_pairs(lines, id_key="_input_hash", label_of_interest="annot", annot_id="_annotator_id"):
    for ex in lines:
        for combo in it.combinations(ex['subset'], 2):
            yield {
                "ex": ex[id_key],
                "u1": combo[0][annot_id],
                "u2": combo[1][annot_id],
                "agree": combo[0][label_of_interest] == combo[1][label_of_interest]
            }

def calc_observed_agreement(lines):
    i = 0
    s = 0
    for ex in lines:
        i += 1
        s += ex['agree']
    yield s/i

In [9]:
label_of_interest = "excitement"

# Calculate prob somebody would annotator positive label
prob_exp = 1 - df_emotions.select(label_of_interest).mean().to_dicts()[0][label_of_interest]

# Calculate how often an example has everyone agree
lines = LazyLines(df_emotions.select("id", "rater_id", label_of_interest).to_dicts())
             
prob_obs = (
    lines
        .progress()
        .nest_by("id")
        .pipe(to_pairs, label_of_interest=label_of_interest, annot_id="rater_id", id_key="id")
        .pipe(calc_observed_agreement)
        .collect()[0]
)

100%|██████████████████████████████████████| 211225/211225 [00:00<00:00, 1241244.33it/s]


A little bit of theory for agreement statistics. We're about to calculate a number.

$$
\kappa \equiv \frac{p_o-p_e}{1-p_e}=1-\frac{1-p_o}{1-p_e}
$$

The thinking behind this number is that it's best to compare the _expected_ agreement with the _observed_ agreement. This is especially useful in unbalanced label scenarios. 

In [10]:
print(f"Probability expected: {prob_exp}")
print(f"Probability observed: {prob_obs}")
print(f"Agreement statistic: {(prob_obs - prob_exp)/(1 - prob_exp)}")

Probability expected: 0.9733506923896319
Probability observed: 0.9557999612251536
Agreement statistic: -0.6585811316782585


This number is still just a number. But it is something one might track over time and it is also a number that can give you a smell test that something is up. 

## Our Own Dataset

In [11]:
def cast_to_cat(dataf, *cols):
    return dataf.with_columns(**{c: pl.col(c).cast(pl.Categorical) for c in cols})

df = pl.read_ndjson("cleaned.jsonl").pipe(cast_to_cat, "label", "answer", "_annotator_id", "_session_id", "kind")
df.head()

text,_input_hash,_task_hash,label,answer,_annotator_id,_session_id,kind,annot
str,i64,i64,cat,cat,cat,cat,cat,bool
"""To facilitate …",-1631564981,-130688452,"""new-dataset""","""accept""","""f96f9733-e216-…","""7b09a927-88e4-…","""single""",True
"""This collectio…",-1597612878,-1280357114,"""new-dataset""","""accept""","""f96f9733-e216-…","""7b09a927-88e4-…","""single""",True
"""We assume that…",-1228026693,2068011284,"""new-dataset""","""reject""","""f96f9733-e216-…","""7b09a927-88e4-…","""single""",False
"""To facilitate …",364583976,-2053011513,"""new-dataset""","""accept""","""f96f9733-e216-…","""7b09a927-88e4-…","""single""",True
"""To fully unloc…",-1833406195,-332649941,"""new-dataset""","""accept""","""f96f9733-e216-…","""7b09a927-88e4-…","""single""",True


## Do you agree with yourself?

This is a query that allows you to compare a single annotator across annotation interfaces.

In [12]:
label = "new-dataset"
annot_id = "f96f9733-e216-4362-9e68-ca314f186b5e"

(
    df
        .filter(pl.col("label") == label)
        .filter(pl.col("_annotator_id") == annot_id)
        .pivot(values="annot", index="_input_hash", columns="kind", aggregate_function="sum")
        .filter(
            pl.col("single").is_not_null() & pl.col("multi").is_not_null()
        )
        .group_by("single", "multi")
        .count()
)

single,multi,count
u32,u32,u32
0,0,2
1,0,3
0,1,5
1,1,4


## Do annotators agree with eachother?

Let's first check the examples that differ.

In [13]:
label = "new-dataset"
kind = "single" 

(
    df
        .filter(pl.col("label") == label)
        .filter(pl.col("kind") == kind)
        .group_by("_input_hash")
        .agg(
            n_unique_annot=pl.col("annot").n_unique(), 
            n_annot=pl.col("annot").count(),
            rating=pl.col("annot").mean()
        )
        .filter(pl.col("n_unique_annot") != 1)
)

_input_hash,n_unique_annot,n_annot,rating
i64,u32,u32,f64
1128546023,2,2,0.5
-1208466695,2,2,1.0
-664389724,2,2,0.5
-1293250435,2,2,0.5
-310040368,2,2,0.5
-1150096620,2,2,0.5
1985152956,2,2,1.0
-262199974,2,2,0.5
1002264393,2,2,0.5
72276985,2,2,0.0


You can pick an example below.

In [14]:
input_hash = -2086104188

print(df.filter(pl.col("_input_hash") == input_hash).head(1).to_dict(as_series=False)['text'][0])

(
    df
        .filter(pl.col("label") == label)
        .filter(pl.col("kind") == kind)
        .filter(pl.col("_input_hash") == input_hash)
        .select("_annotator_id", "annot")
)

Results on a modified version of the EntailmentBank dataset and a new dataset called Everyday Norms: Why Not? show that abductive generation with validation can recover premises across in- and out-of-domain settings


_annotator_id,annot
cat,bool
"""f96f9733-e216-…",False
"""38966de5-008d-…",True


Also here we can try to calculate annotator agreement statistics. 

In [15]:
df_singles.head(1)

NameError: name 'df_singles' is not defined

In [None]:
df_singles = df.filter(pl.col("label") == "new-dataset").filter(pl.col("kind") == "single")

# Calculate prob somebody would annotator positive label
prob_exp = 1 - df_singles.select("annot").mean().to_dicts()[0]["annot"]

# Calculate how often an example has everyone agree
lines = LazyLines(df_singles.select("_input_hash", "_annotator_id", "annot").to_dicts())
             
prob_obs = (
    lines
        .progress()
        .nest_by("_input_hash")
        .pipe(to_pairs)
        .pipe(calc_observed_agreement)
        .collect()[0]
)

This number should be much better than GoEmotions.

In [None]:
print(f"Probability expected: {prob_exp}")
print(f"Probability observed: {prob_obs}")
print(f"Agreement statistic: {(prob_obs - prob_exp)/(1 - prob_exp)}")