In [2]:
import json
import os
import pandas as pd
import urllib.request
import time
from itertools import combinations
import datetime

# Load Full Report Annotations

In [3]:
run1 = "Full_Report1722845.csv"
run3 = "Full_Report_1726283.csv"
run4 = "Full_Report_1726283_2.csv"
run5 = "Full_Report_1726283_3.csv"
run6 = "Full_Report_1734122.csv"



report_name=run6


#full_report = pd.read_csv("appen_results/" + report_name)

all_files = ["Full_Report1722845.csv", 
     "Full_Report_1726283.csv", 
     "Full_Report_1726283_2.csv", 
     "Full_Report_1726283_3.csv", 
     "Full_Report_1734122.csv"]

full_report=None
for report_name in all_files:
    if not os.path.exists('intermediate_store/' + report_name + '.pickle'):
        raise Exception("File Missing:", report_name)
    else:
        if full_report is not None:
            read = pd.read_pickle('intermediate_store/' + report_name + '.pickle')
            full_report = pd.concat([full_report, read], axis=0)
        else:
            full_report = pd.read_pickle('intermediate_store/' + report_name + '.pickle')
        print("Loaded Pickle File:", report_name)

Loaded Pickle File: Full_Report1722845.csv
Loaded Pickle File: Full_Report_1726283.csv
Loaded Pickle File: Full_Report_1726283_2.csv
Loaded Pickle File: Full_Report_1726283_3.csv
Loaded Pickle File: Full_Report_1734122.csv


## Download JSONs, since Appen only delivers a link..

In [4]:
%%time
#The annotations are for some reasons only embedded as link --> extra download needed.
def download_json(link, pandas_id):
    print(pandas_id)
    
    try:
        data = urllib.request.urlopen(link).read()
        output = json.loads(data)
        return output
    except Exception as e:
        return {}
    


#And Persist Results, since we don't wanna do this every time...
if not os.path.exists('intermediate_store/' + report_name + '.pickle'):
    full_report["anno_json"] = full_report.apply(lambda row: download_json(row['annotations'], row.name ), axis=1)
    full_report.to_pickle('intermediate_store/' + report_name + '.pickle')
    print("Created new Pickle File")
else:
    full_report = pd.read_pickle('intermediate_store/' + report_name + '.pickle')
    full_report.anno_json.head(1)
    print("Loaded Pickle File")



Loaded Pickle File
CPU times: user 1.5 s, sys: 292 ms, total: 1.8 s
Wall time: 1.79 s


# Decide whether only new annotation or also agreement between gold annotations

In [5]:
new_annotations = full_report[full_report['_golden'] == False].copy()
#new_annotations = full_report.copy()
new_annotations.shape

(900, 21)

In [6]:
#print("Amount of Internal channel values : \n", new_annotations._channel.value_counts())
#new_annotations =  new_annotations[new_annotations['_channel'] != 'cf_internal'].copy()
#new_annotations.shape

In [7]:
print("Amount of tainted values : \n", new_annotations._tainted.value_counts())
new_annotations =  new_annotations[new_annotations['_tainted'] == False].copy()
new_annotations.shape

Amount of tainted values : 
 False    900
Name: _tainted, dtype: int64


(900, 21)

## Remove old annotations to check individual results

In [8]:
new_annotations._created_at = pd.to_datetime(new_annotations._created_at)
new_annotations = new_annotations[new_annotations['_created_at'] >=  datetime.datetime(2021,2,20,0,0,0)].copy()
new_annotations.shape

(900, 21)

# Evaluate Performance with F1 Score and Cohen's Kappa

In [9]:
# Convert Json to binary annotation list
def json_to_list(json):
    annotations = []
    try:
        for i, span in enumerate(json['spans']):
            if len(span['classnames']) > 0 and span['classnames'][0] == 'Drug':
                annotations.append(1)
            elif len(span['classnames']) > 0 and span['classnames'][0] == 'None':
                annotations.append(2)
            else:
                annotations.append(0)
    except KeyError:
        print("error at ", str(json))
    return annotations        
    
new_annotations['anno_list'] = new_annotations["anno_json"].apply(lambda json: json_to_list(json))

In [10]:
worker_annotations = {}
def write_to_dict(worker_id, unit_id, anno_list):
    if worker_id in worker_annotations:
        current_elements = worker_annotations[worker_id]
        current_elements[unit_id] = anno_list
        worker_annotations[worker_id] = current_elements
    else:
        worker_annotations[worker_id] = {unit_id:anno_list}
    
    
new_annotations.apply(lambda row: write_to_dict(row['_worker_id'], row['_unit_id'], row['anno_list']), axis = 1)

0      None
1      None
2      None
3      None
4      None
       ... 
895    None
896    None
897    None
898    None
899    None
Length: 900, dtype: object

In [11]:
# Credits to Louis de Bruijn (https://towardsdatascience.com/inter-annotator-agreement-2f46c6d37bf3)
def cohen_kappa(ann1, ann2):
    """Computes Cohen kappa for pair-wise annotators.
    :param ann1: annotations provided by first annotator
    :type ann1: list
    :param ann2: annotations provided by second annotator
    :type ann2: list
    :rtype: float
    :return: Cohen kappa statistic
    """
    count = 0
    for an1, an2 in zip(ann1, ann2):
        if an1 == an2:
            count += 1
    A = count / len(ann1)  # observed agreement A (Po)

    uniq = set(ann1 + ann2)
    E = 0  # expected agreement E (Pe)
    for item in uniq:
        cnt1 = ann1.count(item)
        cnt2 = ann2.count(item)
        count = ((cnt1 / len(ann1)) * (cnt2 / len(ann2)))
        E += count

    return round((A - E) / (1 - E), 4)

In [12]:
# FScore formula for inter annotator agreement from: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1090460/

micro_fscores={}
cohen_kappas = {}
unique_worker_combis = list(combinations(new_annotations._worker_id.unique(), 2))

#Global Fscore
g_a = 0
g_b = 0
g_c = 0
for (worker1, worker2) in unique_worker_combis:
    intersection = worker_annotations[worker1].keys() & worker_annotations[worker2].keys()
    w1_full_annos = []
    w2_full_annos = []
    
    # Calculate Measures only for overlapping pairs..
    if len(intersection) > 0: 
        #micro Fscores
        a = 0
        b = 0
        c = 0
        for item in intersection:
            w1_full_annos.extend(worker_annotations[worker1][item])
            w2_full_annos.extend(worker_annotations[worker2][item])
            for (x, y) in zip( worker_annotations[worker1][item], worker_annotations[worker2][item] ):
                if (x == 1 and y == 1) or (x == 2 and y == 2):
                    a += 1
                    g_a += 1
                elif x > y:
                    b += 1
                    g_b += 1
                elif x < y:
                    c += 1
                    g_c += 1
        #print(item, ' a:', a, ' b:', b, 'c:', c)
        #F-Score for user pair
        if a != 0 or b != 0 or c != 0:
            fscore = 2*a /  ( (2*a) + b + c )
            micro_fscores[(worker1, worker2)] = fscore
            #Cohens Kappa for user pai
            cohen_kappas[(worker1, worker2)] = cohen_kappa(w1_full_annos, w2_full_annos)
        else:
            fscore = 1 #TODO Check if that is really true

            


In [13]:
#Report Global F1 Score        
global_fscore = 2*g_a /  ( (2*g_a) + g_b + g_c )
print('The Mikro F-Score is: ', global_fscore)

#Macro Average F1 Score
makro_fscore = 0
for val in micro_fscores.values(): 
    makro_fscore += val 

makro_fscore = makro_fscore / len(micro_fscores)
print('The Makro F-Score is: ', makro_fscore)

#Macro Averaged Cohens Kappa
makro_cohen = 0
for val in cohen_kappas.values(): 
    makro_cohen += val 

makro_cohen = makro_cohen / len(cohen_kappas) 
print('The Avg. Pairwise Cohen Kappa is: ', makro_cohen)


print(' The Average Agreement according to Appen (Test Question Score of users who agreed multiplied / Sum of annotators)', 
     new_annotations._trust.mean())

#micro_fscores

The Mikro F-Score is:  0.5517805976258698
The Makro F-Score is:  0.5993098519827286
The Avg. Pairwise Cohen Kappa is:  0.4310162079510703
 The Average Agreement according to Appen (Test Question Score of users who agreed multiplied / Sum of annotators) 0.897842
