### Annotation analysis
This notebook analyses the annotations in a results file from PhiTag.

The instanceIDs in the annotation data were inconsistent across the experiments, so we need to distinguish between them in the analysis.
To make this easier, this process has been reduced to one variable: `round`.


### Usage
Set `round` to the annotation round you want to analyse: one of `en1`, `en2`, `sw1` or `sw2`.

In [1]:
import pandas as pd

In [2]:
# Supported annotation rounds: language code ('en' / 'sw') followed by the phase number.
rounds = [
    'en1',
    'en2',
    'sw1',
    'sw2',
]
# Round analysed by this run (index 2 -> 'sw1').
# NOTE: this name shadows Python's built-in round() for the rest of the notebook.
round = rounds[2]

In [3]:
# Results file (relative path) for each annotation round.
data_paths = {
    'en1': '../data/annotation_results/phase1/11-26_eng_results.csv',
    'en2': '../data/annotation_results/phase2/12-11_eng_results.csv',
    'sw1': '../data/annotation_results/phase1/11-26_swe_results.csv',
    'sw2': '../data/annotation_results/phase2/12-11_swe_results.csv',
}

# Fail loudly on an unknown round. Without this check `data_path` would stay
# undefined (or keep the value of a previous run in the same kernel) and the
# notebook would silently analyse the wrong file.
if round not in data_paths:
    raise ValueError(f"Unknown round {round!r}; expected one of {sorted(data_paths)}")

data_path = data_paths[round]

In [4]:
def extract_usageID(dataframe, round_name=None):
    """Extract the usageID that is embedded in each instanceID.

    The instanceID is split on '-' and a round-specific range of the
    resulting parts is joined back together with '-'.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a string column "instanceID".
    round_name : str, optional
        Annotation round whose instanceID layout should be used
        ('en1', 'en2', 'sw1' or 'sw2'). Defaults to the notebook-level
        variable `round`.

    Returns
    -------
    pandas.Series
        The usageID of every row, indexed like `dataframe`.

    Raises
    ------
    ValueError
        If the round is not one of the known rounds (previously the function
        silently returned None in that case).
    """
    if round_name is None:
        round_name = round  # notebook-level selection

    # Which '-'-separated parts of the instanceID make up the usageID, per round.
    usage_parts = {
        'en1': slice(2, None),
        'en2': slice(1, -1),
        'sw1': slice(None, -1),
        'sw2': slice(1, -2),
    }
    if round_name not in usage_parts:
        raise ValueError(f"Unknown round {round_name!r}; expected one of {sorted(usage_parts)}")

    parts = dataframe["instanceID"].str.split('-')
    return parts.str[usage_parts[round_name]].str.join('-')

In [5]:
def extract_senseID(dataframe, round_name=None):
    """Extract the senseID that is embedded in each instanceID.

    The instanceID is split on '-'; which parts form the senseID depends on
    the annotation round.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a string column 'instanceID'.
    round_name : str, optional
        Annotation round whose instanceID layout should be used
        ('en1', 'en2', 'sw1' or 'sw2'). Defaults to the notebook-level
        variable `round`.

    Returns
    -------
    pandas.Series
        The senseID of every row, indexed like `dataframe`.

    Raises
    ------
    ValueError
        If the round is not one of the known rounds (previously the function
        silently returned None in that case).
    """
    if round_name is None:
        round_name = round  # notebook-level selection

    known_rounds = ('en1', 'en2', 'sw1', 'sw2')
    if round_name not in known_rounds:
        raise ValueError(f"Unknown round {round_name!r}; expected one of {list(known_rounds)}")

    parts = dataframe['instanceID'].str.split('-')
    if round_name == 'en1':
        # first three parts
        return parts.str[:3].str.join('-')
    if round_name == 'en2':
        # the last part on its own
        return parts.str[-1]
    # 'sw1' and 'sw2' share the same layout: the last two parts
    return parts.str[-2:].str.join('-')

In [6]:
# Load the raw annotations and summarise them for all / modern / historical instances.
# quoting=3 is csv.QUOTE_NONE; na_filter=False keeps empty fields as empty strings.
df = pd.read_csv(data_path, delimiter='\t', quoting=3, na_filter=False)
df["usageID"] = extract_usageID(df)

# instanceIDs containing "news" are counted as modern, all others as historical
is_modern = df["instanceID"].str.contains("news")
subsets = {"all": df, "modern": df[is_modern], "historical": df[~is_modern]}

# number of distinct values per column, one column per subset
df_stats = pd.DataFrame()
for subset_name, subset in subsets.items():
    df_stats[subset_name] = subset.nunique()
display(df_stats)

# label frequencies per subset; rows follow the label order of the "all" column
df_label_dist = pd.DataFrame()
for subset_name, subset in subsets.items():
    df_label_dist[subset_name] = subset["label"].value_counts()
display(df_label_dist)

print(df.shape)
display(df.head(10))

Unnamed: 0,all,modern,historical
instanceID,1202,579,623
label,3,3,3
comment,96,9,89
annotator,3,3,3
usageID,706,337,369


Unnamed: 0,all,modern,historical
1,2104,1058,1046
0,1294,650,644
-,208,29,179


(3606, 5)


Unnamed: 0,instanceID,label,comment,annotator,usageID
0,PROCESSED_kubhist2a.json-10-0:0-fia..0-0,0,,Benjamin,PROCESSED_kubhist2a.json-10-0:0-fia..0
1,PROCESSED_kubhist2a.json-10-3:3-pergament..0-0,1,,Benjamin,PROCESSED_kubhist2a.json-10-3:3-pergament..0
2,PROCESSED_kubhist2a.json-1000-1:1-sol..0-0,-,,Benjamin,PROCESSED_kubhist2a.json-1000-1:1-sol..0
3,PROCESSED_kubhist2a.json-1000-1:1-sol..0-2,-,,Benjamin,PROCESSED_kubhist2a.json-1000-1:1-sol..0
4,PROCESSED_kubhist2a.json-1000-1:1-sol..0-3,-,,Benjamin,PROCESSED_kubhist2a.json-1000-1:1-sol..0
5,PROCESSED_kubhist2a.json-1003-19:19-rättesnöre...,1,,Benjamin,PROCESSED_kubhist2a.json-1003-19:19-rättesnöre..0
6,PROCESSED_kubhist2a.json-1005-19:19-därstädes....,1,,Benjamin,PROCESSED_kubhist2a.json-1005-19:19-därstädes..0
7,PROCESSED_kubhist2a.json-1011-2:2-just..0-0,0,,Benjamin,PROCESSED_kubhist2a.json-1011-2:2-just..0
8,PROCESSED_kubhist2a.json-1011-2:2-just..0-4,0,,Benjamin,PROCESSED_kubhist2a.json-1011-2:2-just..0
9,PROCESSED_kubhist2a.json-102-16:16-uppbära..0-0,1,,Benjamin,PROCESSED_kubhist2a.json-102-16:16-uppbära..0


In [7]:
# Completeness check: every annotator should have judged the same number of instances.
# count() gives the non-null entries per column for each annotator.
verify_df = df.groupby('annotator').count()
display(verify_df.loc[:, "instanceID"])

annotator
Benjamin             1202
JosefinKokkinakis    1202
lilianerika          1202
Name: instanceID, dtype: int64

In [8]:
import numpy as np

# Get all annotators
annotators = df.annotator.unique()
display(annotators)

# Replace the "cannot decide" judgment ('-') with NaN so it is treated as missing
df['label'] = df['label'].replace('-', np.nan)
label_set = df.label.unique()
display(label_set)

# Get aggregated data as instance versus annotator.
# Build one (instanceID, <annotator>) frame per annotator and stack them with a
# single concat instead of growing a DataFrame inside a loop, which re-copies
# all accumulated rows on every iteration.
df_instance_vs_ann = pd.concat([
    df.loc[df['annotator'] == annotator, ['instanceID', 'label']].rename(columns={'label': annotator})
    for annotator in annotators
])

# groupby(...).first() takes the first non-null value per column, which collapses
# the stacked rows into one row per instance with one column per annotator.
df_instance_vs_ann_aggregated = df_instance_vs_ann.groupby(['instanceID']).first().reset_index()
df_instance_vs_ann_aggregated = df_instance_vs_ann_aggregated.fillna(value=np.nan) # replace None
display(df_instance_vs_ann_aggregated)

array(['Benjamin', 'lilianerika', 'JosefinKokkinakis'], dtype=object)

array(['0', '1', nan], dtype=object)

Unnamed: 0,instanceID,Benjamin,lilianerika,JosefinKokkinakis
0,PROCESSED_kubhist2a.json-10-0:0-fia..0-0,0,,0
1,PROCESSED_kubhist2a.json-10-3:3-pergament..0-0,1,1,1
2,PROCESSED_kubhist2a.json-1000-1:1-sol..0-0,,1,0
3,PROCESSED_kubhist2a.json-1000-1:1-sol..0-2,,,0
4,PROCESSED_kubhist2a.json-1000-1:1-sol..0-3,,0,0
...,...,...,...,...
1197,PROCESSED_swe_news_2022_1M-sentences.json-978-...,0,0,0
1198,PROCESSED_swe_news_2022_1M-sentences.json-995-...,1,1,1
1199,PROCESSED_swe_news_2022_1M-sentences.json-999-...,1,1,1
1200,PROCESSED_swe_news_2022_1M-sentences.json-999-...,0,1,0


In [9]:
usage_df = df_instance_vs_ann_aggregated


def _all_labels_differ(row):
    """True when no two annotator entries of the row are equal (NaN counts as an entry)."""
    return len(set(row)) == len(row)


# keep only instances with at least 2 non-NaN judgments
enough_labels = usage_df.dropna(subset=annotators, thresh=2)

# drop instances where all annotator entries are different
no_agreement = enough_labels[annotators].apply(_all_labels_differ, axis=1)
clean_df = enough_labels.drop(enough_labels[no_agreement].index)

# rows of usage_df that did not survive the filtering: stacking both frames and
# discarding every duplicated row leaves exactly the removed instances
diff_df = pd.concat([usage_df, clean_df]).drop_duplicates(keep=False)

display(diff_df)
print(len(usage_df), len(clean_df))

removed_is_modern = diff_df["instanceID"].str.contains("news")
print("modern removed", diff_df[removed_is_modern]["instanceID"].nunique())
print("hist removed", diff_df[~removed_is_modern]["instanceID"].nunique())


Unnamed: 0,instanceID,Benjamin,lilianerika,JosefinKokkinakis
2,PROCESSED_kubhist2a.json-1000-1:1-sol..0-0,,1,0
3,PROCESSED_kubhist2a.json-1000-1:1-sol..0-2,,,0
7,PROCESSED_kubhist2a.json-1011-2:2-just..0-0,0,1,
8,PROCESSED_kubhist2a.json-1011-2:2-just..0-4,0,1,
12,PROCESSED_kubhist2a.json-1026-4:4-passande..0-1,0,1,
...,...,...,...,...
1023,PROCESSED_swe_news_2022_1M-sentences.json-659-...,1,0,
1057,PROCESSED_swe_news_2022_1M-sentences.json-737-...,1,0,
1133,PROCESSED_swe_news_2022_1M-sentences.json-893-...,0,1,
1169,PROCESSED_swe_news_2022_1M-sentences.json-959-...,1,0,


1202 1115
modern removed 10
hist removed 77


In [10]:
def _print_usage_counts(title, grouped):
    """Print the number of grouped usages in total and split into modern / historical."""
    is_news = grouped['usageID'].str.contains("news")
    print(title, len(grouped))
    print("     modern", len(grouped[is_news]))
    print("     historical", len(grouped[~is_news]))


# Aggregate labels: majority vote per instance plus the usage/sense IDs taken
# from the instanceID; the instanceID and per-annotator columns are then dropped.
clean_df = (
    clean_df
    .assign(
        majority=clean_df[annotators].mode(axis=1)[0],
        usageID=extract_usageID(clean_df),
        senseID=extract_senseID(clean_df),
    )
    .drop(columns=['instanceID', *list(annotators)])
)
display(clean_df.head(5))

# every usage with the list of senses that were judged for it
all_grouped = clean_df.groupby(['usageID'])['senseID'].apply(list).reset_index()
_print_usage_counts("usages", all_grouped)

# keep only rows with majority label = 1
majority_df = clean_df.loc[clean_df['majority'] == '1'].drop(columns='majority')

# group by usageID and aggregate senseIDs
majority_grouped = majority_df.groupby(['usageID'])['senseID'].apply(list).reset_index()
_print_usage_counts("assigned", majority_grouped)

Unnamed: 0,majority,usageID,senseID
0,0,PROCESSED_kubhist2a.json-10-0:0-fia..0,fia..0-0
1,1,PROCESSED_kubhist2a.json-10-3:3-pergament..0,pergament..0-0
4,0,PROCESSED_kubhist2a.json-1000-1:1-sol..0,sol..0-3
5,1,PROCESSED_kubhist2a.json-1003-19:19-rättesnöre..0,rättesnöre..0-0
6,1,PROCESSED_kubhist2a.json-1005-19:19-därstädes..0,därstädes..0-0


usages 674
     modern 333
     historical 341
assigned 562
     modern 293
     historical 269


In [11]:
import krippendorff as krippendorff # installation: https://github.com/pln-fing-udelar/fast-krippendorff#installation
from itertools import combinations

def _nominal_alpha(reliability_data):
    """Krippendorff's alpha for the binary labels, nominal level of measurement.

    `reliability_data` holds one row per annotator and one column per instance;
    NaN marks a missing judgment.
    """
    return krippendorff.alpha(reliability_data=reliability_data, level_of_measurement='nominal', value_domain=[1.0, 0.0])


# Get all annotators for each type of data ("news" in the instanceID = modern)
is_modern_instance = df_instance_vs_ann_aggregated['instanceID'].str.contains("news")

# Re-interpret strings as floats (Krippendorff's alpha requires this)
data = df_instance_vs_ann_aggregated[annotators].astype(float)
modern_data = df_instance_vs_ann_aggregated.loc[is_modern_instance, list(annotators)].astype(float)
historical_data = df_instance_vs_ann_aggregated.loc[~is_modern_instance, list(annotators)].astype(float)

# Transpose data arrays for right input format for Krippendorff
data_values = np.transpose(data.values)
modern_values = np.transpose(modern_data.values)
historical_values = np.transpose(historical_data.values)

# Krippendorff's alpha
kri = _nominal_alpha(data_values)
modern_kri = _nominal_alpha(modern_values)
historical_kri = _nominal_alpha(historical_values)
print('full', kri)
print('modern', modern_kri)
print('historical', historical_kri)

# pairwise Krippendorff's alpha
# A separate variable is used so the loop no longer overwrites `kri`
# (the alpha over the full data) with the last pairwise value.
for data_set, name in [(data, "all"), (modern_data, "modern"), (historical_data, "historical")]:
    for a, b in combinations(annotators, 2):
        pair_kri = _nominal_alpha([data_set[a].values, data_set[b].values])
        print(f'pairwise {name}:', a, b, pair_kri)

full 0.48007666224384493
modern 0.5879569579758616
historical 0.37104238793118915
pairwise all: Benjamin lilianerika 0.40822263851044505
pairwise all: Benjamin JosefinKokkinakis 0.5746523461455588
pairwise all: lilianerika JosefinKokkinakis 0.5241637141011238
pairwise modern: Benjamin lilianerika 0.5058186683967236
pairwise modern: Benjamin JosefinKokkinakis 0.6144676631304941
pairwise modern: lilianerika JosefinKokkinakis 0.6553708892642909
pairwise historical: Benjamin lilianerika 0.31744158602785
pairwise historical: Benjamin JosefinKokkinakis 0.5269597498029444
pairwise historical: lilianerika JosefinKokkinakis 0.35268429050812466
