In [1]:
import pandas as pd
import os
import re
from sklearn.metrics import f1_score, cohen_kappa_score

def process_annotation_csvs(folder_path: str, annotator: str) -> dict:
    """Load every annotation file of one annotator from a folder.

    A file is selected when its name ends with ``_<annotator>.csv``. Each
    selected file is read as tab-separated text, passed through
    ``fill_missing_annotation`` and stored under the first run of digits
    found in its file name.

    Args:
        folder_path: Directory that is scanned (non-recursively).
        annotator: Annotator initials; must be ``'AG'`` or ``'MM'``.

    Returns:
        Dict mapping the numeric id (kept as the string found in the file
        name) to the processed DataFrame. Entries are inserted in ascending
        numeric id order so that downstream tables are reproducible.

    Raises:
        AssertionError: If ``annotator`` is not one of the accepted values.
        ValueError: If a selected file name contains no digits.
    """
    assert annotator in ['AG', 'MM']
    suffix = f"_{annotator}.csv"

    selected = []
    for filename in os.listdir(folder_path):
        if not filename.endswith(suffix):
            continue
        # Raw string: '\d' in a normal string literal is an invalid escape.
        id_match = re.search(r'\d+', filename)
        if id_match is None:
            raise ValueError(
                f"cannot derive an id from annotation file name {filename!r}: "
                "it contains no digits"
            )
        selected.append((int(id_match.group()), filename, id_match.group()))

    new_dfs = {}
    # os.listdir returns names in arbitrary order; sort by numeric id (file
    # name as tie-breaker) so the resulting dict order is deterministic.
    # NOTE: two files sharing the same id still overwrite each other, as before.
    for _, filename, df_name in sorted(selected):
        file_path = os.path.join(folder_path, filename)
        df = pd.read_csv(file_path, sep='\t')
        new_dfs[df_name] = fill_missing_annotation(df)
    return new_dfs

def fill_missing_annotation(df: pd.DataFrame) -> pd.DataFrame:
    """Fill missing ``annotation`` values with the value from ``pos``.

    Rows whose ``annotation`` is NA receive that row's ``pos`` value; rows
    that already have an annotation are left unchanged.

    Args:
        df: Frame with at least the columns ``annotation`` and ``pos``.
            It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    # Assign the whole column on the frame itself. Writing through
    # `df.iloc[i].annotation = ...` targets a temporary row Series, which is a
    # copy for mixed-dtype frames (and always under Copy-on-Write), so the
    # fill would silently never reach `df`.
    annotation = df['annotation']
    df['annotation'] = annotation.where(annotation.notna(), df['pos'])
    return df

def accuracy_df(processed_df_dict: dict) -> pd.DataFrame:
    """Summarise, per frame, how well ``pos`` matches ``annotation``.

    For every entry of ``processed_df_dict`` this computes

    * ``annot_acc``: the fraction of rows where ``annotation == pos``;
    * ``annot_f1``: ``f1_score(annotation, pos, average='weighted')``, i.e.
      ``annotation`` is passed as ``y_true`` and ``pos`` as ``y_pred``.

    Side effect: each input frame gains (or has overwritten) an integer
    column ``correct`` holding 1 where ``annotation == pos`` and 0 otherwise.

    Args:
        processed_df_dict: Mapping from an id to a DataFrame that has the
            columns ``annotation`` and ``pos``.

    Returns:
        DataFrame with the columns ``epitome`` (the dict key), ``annot_acc``
        and ``annot_f1``, one row per dict entry in dict order. A frame with
        no rows yields NaN for both metrics.
    """
    records = []
    for key, current_df in processed_df_dict.items():
        current_df['correct'] = (current_df.annotation == current_df.pos).astype(int)
        if len(current_df) == 0:
            # Nothing to score: avoid dividing by zero and avoid calling
            # f1_score on empty input.
            current_acc = float('nan')
            current_f1 = float('nan')
        else:
            current_acc = current_df['correct'].mean()
            current_f1 = f1_score(current_df.annotation, current_df.pos, average='weighted')
        records.append({'epitome': key, 'annot_acc': current_acc, 'annot_f1': current_f1})

    # Build the frame once from the collected records; `columns=` keeps the
    # expected columns even when there are no records at all.
    return pd.DataFrame(records, columns=['epitome', 'annot_acc', 'annot_f1'])

In [2]:
# Load the annotation files of both annotators from the current working directory.
path = os.getcwd()
df_dict_ag, df_dict_mm = (
    process_annotation_csvs(path, initials) for initials in ('AG', 'MM')
)

In [3]:
# Score each annotator's frames: AG first, then MM.
df_acc_ag, df_acc_mm = (
    accuracy_df(frames) for frames in (df_dict_ag, df_dict_mm)
)

In [4]:
# Show the accuracy / weighted-F1 table that accuracy_df produced for annotator AG.
df_acc_ag

Unnamed: 0,epitome,annot_acc,annot_f1
0,190,0.947977,0.941867
1,286,0.919192,0.914346
2,432,0.957746,0.955509
3,584,0.977612,0.974656
4,915,0.960894,0.962744
5,1406,0.954455,0.94906
6,1616,0.937824,0.928352
7,1717,0.929825,0.930375
8,1904,0.951583,0.943427
9,2309,0.94964,0.945366


In [5]:
# Show the accuracy / weighted-F1 table that accuracy_df produced for annotator MM.
df_acc_mm

Unnamed: 0,epitome,annot_acc,annot_f1
0,190,0.895954,0.874162
1,1616,0.901554,0.887017
2,1717,0.877193,0.894625


In [25]:
# Inter-annotator agreement (Cohen's kappa) between MM and AG for every
# epitome present in df_dict_mm. Two values are collected per epitome:
#   * cohen_kappa: kappa over all rows of the two frames;
#   * intersect_cohen_kappa: kappa restricted to rows where BOTH annotators'
#     annotation differs from `pos` (NaN when there is no such row).
# NOTE(review): this assumes the AG and MM frames of one epitome have the same
# rows in the same order, and that every MM epitome also exists in
# df_dict_ag (otherwise a KeyError is raised) — confirm against the data.
df_iaa_epitomes = []
df_iaa_kappas = []
df_iaa_intersect_kappas = []
for epitome in df_dict_mm.keys():
    ag_df = df_dict_ag[epitome]
    mm_df = df_dict_mm[epitome]
    kappa = cohen_kappa_score(mm_df.annotation, ag_df.annotation)
    # this finds all occurrences where ag/mm did not agree with the tagger
    disagreement_ag = ag_df[ag_df.annotation != ag_df.pos].index
    disagreement_mm = mm_df[mm_df.annotation != mm_df.pos].index
    # this then finds the index labels where both ag and mm disagreed with the
    # tagger (i.e. the intersect); sorted so the selection order is deterministic
    disagreement_intersect = sorted(set(disagreement_ag) & set(disagreement_mm))
    # finally we can calculate the iaa for only those tags where both disagreed with the tagger
    if disagreement_intersect:
        # The values come from `.index`, i.e. they are labels, so select with
        # label-based `.loc`; positional `.iloc` is only equivalent for a
        # default RangeIndex.
        intersect_kappa = cohen_kappa_score(ag_df.loc[disagreement_intersect].annotation,
                                            mm_df.loc[disagreement_intersect].annotation)
    else:
        # No row where both annotators disagreed with the tagger: there is
        # nothing to score, so record NaN instead of passing empty input on.
        intersect_kappa = float('nan')

    df_iaa_epitomes.append(epitome)
    df_iaa_kappas.append(kappa)
    df_iaa_intersect_kappas.append(intersect_kappa)


# Build the result table once, after all values have been collected.
df_iaa = pd.DataFrame(
    {
        'epitome': df_iaa_epitomes,
        'cohen_kappa': df_iaa_kappas,
        'intersect_cohen_kappa': df_iaa_intersect_kappas,
    },
    columns=['epitome', 'cohen_kappa', 'intersect_cohen_kappa'],
)

In [26]:
# Show the per-epitome Cohen's kappa table (all rows vs. rows where both annotators disagreed with `pos`).
df_iaa

Unnamed: 0,epitome,cohen_kappa,intersect_cohen_kappa
0,190,0.9334,0.857143
1,1616,0.91785,0.854839
2,1717,0.937017,1.0
