In [None]:
# !pip install pandas ir_datasets statsmodels krippendorff

In [1]:
import pandas as pd
import os

In [35]:
# Load the master query list and lowercase the query text; "query" is the
# key this frame is later indexed and joined on.
df = pd.read_csv("queries_annotation_all.csv").assign(
    query=lambda frame: frame["query"].str.lower()
)

In [None]:
# Directory with one TSV of query annotations per annotator; file names
# follow the pattern "queries_<annotator>.tsv".
annotation_dir = "../data/processed/queries_annotation"
# NOTE: os.listdir returns entries in arbitrary order, so the group an
# annotator ends up in depends on the listing order of the file system.
files = os.listdir(annotation_dir)

# Annotators are packed greedily into groups: a group collects annotators
# whose query sets do not overlap, so within one group every query has at
# most one annotator. Keys are consecutive integers starting at 0.
groups = {0: pd.DataFrame()}

for file in files:
    print(file)
    if not file.endswith(".tsv"):
        continue
    annotations = pd.read_csv(f"{annotation_dir}/{file}", sep="\t")

    # Cleaning: map the German one-hot column headers to category ids 1-5,
    # normalise the query text, and drop the duplicated topic.
    annotations = annotations.rename(columns={"1. Nicht Zeitlich": 1, "2. Explizit Zeitlich": 2, "3. Ereignis": 3, "4. Mehrdeutig": 4, "5. Aktualität": 5})
    annotations["query"] = annotations["query"].str.lower()
    annotations = annotations[annotations["query"] != "airport security"]  # duplicate topic
    # The free-text notes column is optional; check for it explicitly instead
    # of swallowing every exception with a bare `except`.
    if "Notizen" in annotations.columns:
        annotations = annotations.drop(columns=["Notizen"])
    else:
        print(f"{file} has no notizen")

    # One-Hot Decoding: melt to long format and keep only the ticked cells,
    # leaving one (query, category) row per selected category.
    annotations = pd.melt(annotations, id_vars=["query"], var_name="category", value_name="Value")
    annotations = annotations[annotations["Value"] == 1].drop(columns="Value").reset_index(drop=True)

    # Everything after the first underscore is the annotator name, so names
    # that contain underscores themselves (e.g. "queries_el_ghadioui.tsv")
    # are kept whole instead of being truncated to "el".
    annotations["annotator"] = os.path.splitext(file)[0].split("_", 1)[1]

    # Merge per group: put this annotator into the first group that is still
    # empty or shares no query with them.
    for key, group in groups.items():
        if group.empty:
            groups[key] = annotations.copy()
            break
        if not group["query"].isin(annotations["query"]).any():
            groups[key] = pd.concat([group, annotations], ignore_index=True)
            break
    else:
        # No existing group could take this annotator: open a new one under
        # the next consecutive key.
        groups[len(groups)] = annotations.copy()

queries_jonina.tsv
queries_braun.tsv
queries_lin.tsv
queries_large.tsv
queries_mmoershe.tsv
queries_loewenstein.tsv
queries_maron.tsv
.gitkeep
queries_simon.tsv
queries_romanovskis.tsv
queries_romanovskis.tsv has no notizen
README.md
queries_ofunim.tsv
queries_ofunim.tsv has no notizen
queries_busch.tsv
queries_coban.tsv
queries_hovhannisyan.tsv
queries_el_ghadioui.tsv
queries_wollenberg.tsv
queries_maassen.tsv
queries_witalla.tsv


In [None]:
# Show the group ids produced by the grouping loop; the number of groups is
# the maximum number of annotators that share a query.
groups.keys()  # we have up to 5 annotators for one group

dict_keys([0, 1, 2, 3, 4])

In [32]:
# Build a wide table with one row per query and one (category, annotator)
# column pair per group. It starts with only the query index of group 0.
table = groups[0][["query"]].set_index("query")

# Join every group, including group 0: the initial table carries no columns,
# so group 0 contributes the unsuffixed "category"/"annotator" pair and each
# later group gets its id as suffix ("category_1", "annotator_1", ...).
# The previous `if key == 0: continue` tested a stale variable left over from
# an earlier cell rather than this loop's variable, so whether anything was
# joined at all depended on hidden kernel state.
for group_id, group in groups.items():
    table = table.join(group.set_index("query"), rsuffix=f"_{group_id}")

In [38]:
# Attach the remaining columns of the master query list to the wide
# annotation table, matching rows on the shared "query" index.
query_metadata = df.set_index("query")
df_annotated = table.join(query_metadata, how="left")

In [39]:
# Persist the grouped annotations; the "query" index is written as the
# first column.
df_annotated.to_csv(path_or_buf="queries_annotation_all_grouped.csv", index=True)

# Inter-Rater Agreement

In [56]:
import pandas as pd
from statsmodels.stats.inter_rater import fleiss_kappa, aggregate_raters

In [49]:
# Preview the queries that have a label in each of the first three
# annotator slots; rows with any missing label are dropped.
rating_columns = ["category", "category_1", "category_2"]
df_annotated.loc[:, rating_columns].dropna(how="any")

Unnamed: 0_level_0,category,category_1,category_2
query,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
taxing social security,1,1,1
term limits,1,4,1
potatoes,1,1,1
lyme disease arthritis,1,1,1
international organized crime,1,4,1
...,...,...,...
control of mrsa,4,1,5
declining middle class in u.s.,4,5,5
el nino,4,5,5
ethanol and food prices,5,5,5


In [59]:
# Take the fully labelled rows of the first three annotator slots as a plain
# array and hand it to statsmodels' aggregate_raters. The result is indexed
# with [0] by the agreement computations that follow.
complete_ratings = df_annotated[["category", "category_1", "category_2"]].dropna()
rater_matrix = aggregate_raters(complete_ratings.to_numpy())

In [66]:
# Fleiss' kappa over the first element of the aggregate_raters result.
fleiss_kappa(rater_matrix[0], method="fleiss")

np.float64(0.3303714504323723)

In [68]:
import krippendorff

In [70]:
# Krippendorff's alpha on the same count table, treating the categories as
# nominal (unordered) labels.
krippendorff.alpha(
    level_of_measurement="nominal",
    value_counts=rater_matrix[0],
)

np.float64(0.330892967994341)

In [71]:
# Display the joined table: per query, the (category, annotator) pair of each
# group plus the columns taken from the master query list.
df_annotated

Unnamed: 0_level_0,category,annotator,category_1,annotator_1,category_2,annotator_2,category_3,annotator_3,category_4,annotator_4,id,dataset
query,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
taxing social security,1,jonina,1,maron,1,coban,,,,,608_robust04,Robust 04
term limits,1,jonina,4,maron,1,coban,,,,,699_robust04,Robust 04
potatoes,1,jonina,1,maron,1,coban,,,,,2_longeval,LongEval
lyme disease arthritis,1,jonina,1,maron,1,coban,,,,,604_robust04,Robust 04
international organized crime,1,jonina,4,maron,1,coban,,,,,301_robust04,Robust 04
...,...,...,...,...,...,...,...,...,...,...,...,...
control of mrsa,4,loewenstein,1,busch,5,wollenberg,,,,,823_core18,Core 18
declining middle class in u.s.,4,loewenstein,5,busch,5,wollenberg,,,,,803_core18,Core 18
el nino,4,loewenstein,5,busch,5,wollenberg,,,,,365_robust04,Robust 04
ethanol and food prices,5,loewenstein,5,busch,5,wollenberg,,,,,825_core18,Core 18
