# Notebook to calculate Inter Annotator Agreement

### Import the libraries

In [1]:
import sys
import pandas as pd
import collections 
import os
import numpy as np
from itertools import chain
from itertools import combinations
sys.path.insert(0, '..')
from src.experiment_utils.helper_classes import token, span, repository
from src.d02_corpus_statistics.corpus import Corpus
from src.d03_inter_annotator_agreement.inter_annotator_agremment import Inter_Annotator_Agreement, _get_score_article
from definitions import df_annotation_marker
from src.d03_inter_annotator_agreement.inter_annotator_agremment import row_to_span_list, keep_valid_anotations


from definitions import ROOT_DIR


## Small Tutorial

Load the dataframe stat_df

In [2]:
    
# Location of the pickled, preprocessed annotation dataframe inside the project tree.
# Each path component is passed separately so os.path.join inserts the platform's separator.
dataframe_dir = os.path.join(ROOT_DIR, 'data', '02_processed_to_dataframe', 'preprocessed_dataframe.pkl')
# NOTE(review): unpickling can execute arbitrary code; only load pickle files produced by this project.
stat_df = pd.read_pickle(dataframe_dir)
stat_df.head()

Unnamed: 0,Policy,Text,Tokens,Article_State,Finished_Annotators,Curation,Alisha,Fride,Onerva,Fabian,Lynn,Sebastian,Joel
EU_32018R1999_Title_0_Chapter_7_Section_3_Article_43,,article 43\r\nexercise of the delegation\r\n1....,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Alisha, Fride]",[annotator:Curation layer:Instrumenttypes type...,[annotator:Alisha layer:Instrumenttypes type:I...,[annotator:Fride layer:Policydesigncharacteris...,,,,,
EU_32019R0631_Title_0_Chapter_0_Section_0_Article_12,,article 12\r\nreal-world co2 emissions and fue...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Onerva, Fabian]",[annotator:Curation layer:Instrumenttypes type...,,,[annotator:Onerva layer:Instrumenttypes type:I...,[annotator:Fabian layer:Policydesigncharacteri...,,,
EU_32018L2001_Title_0_Chapter_0_Section_0_Article_11,,article 11\r\njoint projects between member st...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Fride, Onerva]",[annotator:Curation layer:Instrumenttypes type...,,[annotator:Fride layer:Instrumenttypes type:In...,[annotator:Onerva layer:Instrumenttypes type:I...,,,,
EU_32018R1999_Title_0_Chapter_7_Section_3_Article_56,,article 56\r\namendments to directive (eu) 201...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Alisha, Fride]",[annotator:Curation layer:Policydesigncharacte...,[annotator:Alisha layer:Policydesigncharacteri...,[annotator:Fride layer:Policydesigncharacteris...,,,,,
EU_32018L2001_Title_0_Chapter_0_Section_0_Article_03,,article 3\r\nbinding overall union target for ...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Fride, Onerva, Fabian]",[annotator:Curation layer:Instrumenttypes type...,,[annotator:Fride layer:Instrumenttypes type:In...,[annotator:Onerva layer:Instrumenttypes type:I...,[annotator:Fabian layer:Instrumenttypes type:I...,,,


First, create an object of the class `Inter_Annotator_Agreement`. The constructor takes a stat_df as input and has an optional argument `DEBUG`; when it is set, only the first 10 articles are used, which is useful for quickly testing different functions.

In [3]:
# Evaluator over the full dataframe.
# NOTE(review): front_and_whereas = False presumably excludes the 'front' and 'Whereas' articles,
# but the notebook still drops them explicitly afterwards — confirm the constructor's behaviour.
test_evaluator = Inter_Annotator_Agreement(stat_df, front_and_whereas = False)
# Debug evaluator: with DEBUG = True only the first 10 articles are taken (per the notebook text).
test_evaluator_debug = Inter_Annotator_Agreement(stat_df, DEBUG = True)


In [4]:
# (rows, columns) of the evaluator's dataframe before any articles are dropped.
test_evaluator.df.shape

(448, 13)

Drop the 'Whereas' and 'front' articles. This should not be necessary, but for some reason the test_evaluator does not correctly inherit the Corpus init function.

In [5]:
# Strings identifying the article groups that should be removed.
ls = ['front', 'Whereas']
# NOTE(review): presumably removes every article whose name contains one of these strings — confirm in Corpus.
test_evaluator.drop_articles_based_on_string(ls)
# Shape after dropping, to compare with the shape displayed before.
test_evaluator.df.shape

(412, 13)

Inter_Annotator_Agreement is a child class of the Corpus class, so all methods of the Corpus class are available

In [6]:
# Repository filter for a single policy (here EU_32008R1099).
test_dir = repository(policy='EU_32008R1099')
# List the annotators' spans under that filter whose 'tag' equals 'Tech_LowCarbon'.
test_evaluator.get_span_list(
    conditional_rep=test_dir,
    columns='annotators',
    item='tag',
    value='Tech_LowCarbon',
)

[annotator:Onerva layer:Technologyandapplicationspecificity type:TechnologySpecificity tag:Tech_LowCarbon start:18 stop:25 text:nuclear,
 annotator:Onerva layer:Technologyandapplicationspecificity type:TechnologySpecificity tag:Tech_LowCarbon start:95 stop:109 text:nuclear energy,
 annotator:Onerva layer:Technologyandapplicationspecificity type:TechnologySpecificity tag:Tech_LowCarbon start:151 stop:158 text:nuclear,
 annotator:Onerva layer:Technologyandapplicationspecificity type:TechnologySpecificity tag:Tech_LowCarbon start:125 stop:141 text:renewable energy,
 annotator:Onerva layer:Technologyandapplicationspecificity type:TechnologySpecificity tag:Tech_LowCarbon start:393 stop:409 text:renewable energy,
 annotator:Onerva layer:Technologyandapplicationspecificity type:TechnologySpecificity tag:Tech_LowCarbon start:499 stop:515 text:renewable energy,
 annotator:Onerva layer:Technologyandapplicationspecificity type:TechnologySpecificity tag:Tech_LowCarbon start:1276 stop:1292 text:ren

To calculate the inter-annotator agreement, there are two options: appending the scores to the dataframe, or calculating the score from a span list.


## Append the score to dataframe

This method appends the inter-annotator agreement score to each article that has at least two valid annotations, for each measure in a set of inter-annotator agreement measures. The scores can be calculated in parallel (`append_total_score_per_article_parallel`); this is the recommended method for computationally intensive scores.

First, we only consider the articles where the curation is finished and at least two annotators are present:

In [6]:
# Restrict the evaluator to articles whose curation is finished and that have at least two annotators.
# NOTE(review): the return value is not used, so this presumably filters test_evaluator in place — confirm.
test_evaluator.keep_only_finished_articles()

In [7]:
# Names of the agreement measures to compute for every article.
scoring_metrics = ['f1_exact', 'f1_tokenwise', 'f1_partial', 'f1_heuristic']

In [9]:
# Overrides the previous list by additionally requesting 'pygamma'.
# NOTE(review): the recorded sequential run that follows ended in an AssertionError, and the
# result tables shown later contain no pygamma column — confirm that 'pygamma' is supported
# before keeping this cell in a Restart & Run All workflow.
scoring_metrics = ['f1_exact', 'f1_tokenwise', 'f1_partial', 'f1_heuristic', 'pygamma']

In [10]:
# Sequential scoring of every article for the metrics in scoring_metrics.
# The recorded run progressed at roughly 2.5 s per article over 412 articles and then
# stopped with an AssertionError, so expect this cell to be slow.
test_evaluator.append_total_score_per_article(scoring_metrics)

 36%|███▋      | 150/412 [06:20<11:05,  2.54s/it]  


AssertionError: 

There is also a parallel implementation, which distributes the computation over several workers using pandarallel:

In [13]:
# Parallel variant of the per-article scoring; the recorded output shows it running on pandarallel workers.
test_evaluator.append_total_score_per_article_parallel(scoring_metrics)

INFO: Pandarallel will run on 48 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


Checking out the dataframe now:

In [10]:
# In the recorded output each computed metric appears as an extra '<metric>_score' column.
test_evaluator.df.head()

Unnamed: 0,Policy,Text,Tokens,Article_State,Finished_Annotators,Curation,Alisha,Fride,Onerva,Fabian,Lynn,Sebastian,Joel,f1_exact_score,f1_tokenwise_score,f1_partial_score,f1_heuristic_score
EU_32018R1999_Title_0_Chapter_7_Section_3_Article_43,,article 43\r\nexercise of the delegation\r\n1....,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Alisha, Fride]",[annotator:Curation layer:Instrumenttypes type...,[annotator:Alisha layer:Instrumenttypes type:I...,[annotator:Fride layer:Policydesigncharacteris...,,,,,,0.21875,0.226574,0.28125,0.28125
EU_32019R0631_Title_0_Chapter_0_Section_0_Article_12,,article 12\r\nreal-world co2 emissions and fue...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Onerva, Fabian]",[annotator:Curation layer:Instrumenttypes type...,,,[annotator:Onerva layer:Instrumenttypes type:I...,[annotator:Fabian layer:Policydesigncharacteri...,,,,0.289157,0.263793,0.409639,0.421603
EU_32018L2001_Title_0_Chapter_0_Section_0_Article_11,,article 11\r\njoint projects between member st...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Fride, Onerva]",[annotator:Curation layer:Instrumenttypes type...,,[annotator:Fride layer:Instrumenttypes type:In...,[annotator:Onerva layer:Instrumenttypes type:I...,,,,,0.567308,0.512946,0.653846,0.653846
EU_32018R1999_Title_0_Chapter_7_Section_3_Article_56,,article 56\r\namendments to directive (eu) 201...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Alisha, Fride]",[annotator:Curation layer:Policydesigncharacte...,[annotator:Alisha layer:Policydesigncharacteri...,[annotator:Fride layer:Policydesigncharacteris...,,,,,,0.736842,0.583333,0.736842,0.736842
EU_32018L2001_Title_0_Chapter_0_Section_0_Article_03,,article 3\r\nbinding overall union target for ...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Fride, Onerva, Fabian]",[annotator:Curation layer:Instrumenttypes type...,,[annotator:Fride layer:Instrumenttypes type:In...,[annotator:Onerva layer:Instrumenttypes type:I...,[annotator:Fabian layer:Instrumenttypes type:I...,,,,0.420198,0.410204,0.532749,0.544437


### Get total score

To retrieve the total score of the corpus, use `get_total_score_df()` on a dataframe where the scores for the individual articles have already been calculated. The `weight_by` argument selects how the article scores are weighted, e.g. `test_evaluator.get_total_score_df(weight_by = 'Spans')`.

In [12]:
# Corpus-level scores with weight_by = 'no_weighting' (presumably a plain average over articles — confirm).
test_evaluator.get_total_score_df(weight_by = 'no_weighting')

{'f1_exact_score': 0.40028724799007476,
 'f1_tokenwise_score': 0.397687590370067,
 'f1_partial_score': 0.5159440267362017,
 'f1_heuristic_score': 0.5273362748620107}

In [15]:
# Corpus-level scores with weight_by = 'Tokens' (presumably weighted by each article's token count — confirm).
test_evaluator.get_total_score_df(weight_by = 'Tokens')

{'f1_exact_score': 0.37492081552891304,
 'f1_tokenwise_score': 0.35240598751298724,
 'f1_partial_score': 0.4739162741735001,
 'f1_heuristic_score': 0.48679580434940306}

In [14]:
# Corpus-level scores with weight_by = 'Spans' (presumably weighted by each article's span count — confirm).
test_evaluator.get_total_score_df(weight_by = 'Spans')

{'f1_exact_score': 0.396540566261322,
 'f1_tokenwise_score': 0.39389908818200375,
 'f1_partial_score': 0.5118514006608382,
 'f1_heuristic_score': 0.5231773763902876}

If only specific scores are required:

In [12]:
# Total score for a single score column, passed as a string.
# NOTE(review): the recorded value equals the 'Tokens'-weighted result shown above, which
# suggests 'Tokens' is the default weighting — confirm in get_total_score_df.
test_evaluator.get_total_score_df('f1_exact_score')

{'f1_exact_score': 0.37492081552891304}

or

In [13]:
# Total score for several score columns, passed as a list of column names.
test_evaluator.get_total_score_df(['f1_exact_score', 'f1_tokenwise_score'])

{'f1_exact_score': 0.5072929766398167,
 'f1_tokenwise_score': 0.4906718122217034}

### Get total score per annotator

In [16]:
# Print the corpus-level scores separately for each annotator.
annotators = ['Onerva', 'Alisha', 'Fabian', 'Fride']
for ann in annotators:
    print('annotator: ', ann)
    # Scores restricted to this annotator via the 'annotator' keyword.
    ann_scores = test_evaluator.get_total_score_df(annotator=ann)
    print(ann_scores)
    print('----------------')

annotator:  Onerva
{'f1_exact_score': 0.5527361645289197, 'f1_tokenwise_score': 0.5332588243090812, 'f1_partial_score': 0.6201484593830799, 'f1_heuristic_score': 0.6285039167633368}
----------------
annotator:  Alisha
{'f1_exact_score': 0.44876786556687887, 'f1_tokenwise_score': 0.4368619207722595, 'f1_partial_score': 0.5274636826847383, 'f1_heuristic_score': 0.538662640333709}
----------------
annotator:  Fabian
{'f1_exact_score': 0.41685338816967765, 'f1_tokenwise_score': 0.3962183278243504, 'f1_partial_score': 0.48073314419512525, 'f1_heuristic_score': 0.48610464897550437}
----------------
annotator:  Fride
{'f1_exact_score': 0.5862470096195773, 'f1_tokenwise_score': 0.5703371832149943, 'f1_partial_score': 0.6687261286904853, 'f1_heuristic_score': 0.6814722763306621}
----------------


In [14]:
# Scores for one annotator ('Fride'), limited to the listed score columns.
test_evaluator.get_score_annotator('Fride', ['f1_exact_score', 'f1_tokenwise_score'])

{'f1_exact_score': 0.4443964697402863,
 'f1_tokenwise_score': 0.4230426239210114}

### Rank articles by score

In [15]:
# Sort ascending by the heuristic F1 score, so the articles with the lowest agreement come first.
test_evaluator.df.sort_values(by=['f1_heuristic_score'])

Unnamed: 0,Policy,Text,Tokens,Article_State,Finished_Annotators,Curation,Onerva,Fride,Lynn,Sebastian,Alisha,Fabian,f1_exact_score,f1_tokenwise_score,f1_partial_score,f1_heuristic_score
EU_32009L0028_Title_0_Chapter_0_Section_0_Article_28,,article 28\r\nentry into force\r\nthis directi...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Alisha, Fabian]",[annotator:curation layer:Policydesigncharacte...,[],,,[],[annotator:Alisha layer:Policydesigncharacteri...,[],0.000000,0.000000,0.0,0.0
EU_32019R0631_Title_0_Chapter_0_Section_0_Article_18,,article 18\r\nrepeal\r\nregulations (ec) no 44...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Onerva, Fabian]",[annotator:curation layer:Policydesigncharacte...,[annotator:Onerva layer:Policydesigncharacteri...,,,,,[annotator:Fabian layer:Policydesigncharacteri...,0.000000,0.000000,0.0,0.0
EU_32019L0944_Title_0_Chapter_4_Section_0_Article_39,,article 39\r\ncombined operator\r\narticle 35(...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Alisha, Onerva]",[annotator:curation layer:Policydesigncharacte...,[annotator:Onerva layer:Policydesigncharacteri...,,,,[annotator:Alisha layer:Policydesigncharacteri...,,0.000000,0.000000,0.0,0.0
EU_32006L0066_Title_0_Chapter_0_Section_0_Article_29,,article 29\r\nentry into force\r\nthis directi...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Fride, Onerva]",[annotator:curation layer:Policydesigncharacte...,[annotator:Onerva layer:Policydesigncharacteri...,[annotator:Fride layer:Policydesigncharacteris...,[],,,,0.000000,0.000000,0.0,0.0
EU_32008R1099_Title_0_Chapter_0_Section_0_Article_10,,article 10\r\nimplementing measures\r\n1. th...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Alisha, Fride]",[annotator:curation layer:Instrumenttypes type...,[annotator:Onerva layer:Policydesigncharacteri...,[annotator:Fride layer:Instrumenttypes type:In...,,,[annotator:Alisha layer:Instrumenttypes type:I...,,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
EU_32006L0032_Title_0_Chapter_4_Section_0_Article_20,,article 20\r\naddressees\r\nthis directive is ...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Fride, Onerva]",[annotator:curation layer:Policydesigncharacte...,[annotator:Onerva layer:Policydesigncharacteri...,[annotator:Fride layer:Policydesigncharacteris...,,,,,1.000000,1.000000,1.0,1.0
EU_32018R1999_Title_0_Chapter_7_Section_3_Article_55,,article 55\r\namendment to directive 2013/30/e...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Alisha, Fride]",[annotator:curation layer:Policydesigncharacte...,,[annotator:Fride layer:Policydesigncharacteris...,,,[annotator:Alisha layer:Policydesigncharacteri...,,0.909091,0.969697,1.0,1.0
EU_32006L0066_Title_0_Chapter_0_Section_0_Article_30,,article 30\r\naddressees\r\nthis directive is ...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Fride, Onerva]",[annotator:curation layer:Policydesigncharacte...,[annotator:Onerva layer:Policydesigncharacteri...,[annotator:Fride layer:Policydesigncharacteris...,[],,,,1.000000,1.000000,1.0,1.0
EU_32012L0027_Title_0_Chapter_5_Section_0_Article_30,,article 30\r\naddressees\r\nthis directive is ...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Fride, Onerva]",[annotator:curation layer:Policydesigncharacte...,[annotator:Onerva layer:Policydesigncharacteri...,[annotator:Fride layer:Policydesigncharacteris...,,,,,1.000000,1.000000,1.0,1.0


## Get total score based on a spanlist

The inter-annotator agreement score can also be calculated from a span list. For all the spans present, it calculates the inter-annotator agreement scores for all the articles with at least two valid annotations. This can be used to calculate the similarity to the curation.

In [19]:
# Address a single article through its full repository name.
test_dir = repository.from_repository_name('EU_32006L0032_Title_0_Chapter_1_Section_0_Article_03')
# Collect the spans of the two listed annotators for that article.
span_list = test_evaluator.get_span_list(test_dir, ['Onerva', 'Fride'])

In [20]:
# Agreement over the span list using the 'f1_heuristic' measure; the recorded result is a single number.
test_evaluator.get_score_spanlist(span_list, 'f1_heuristic')

0.5664166174246597

## Check closeness to curation

To check each annotator's agreement with the curation, we simply create a span list for each annotator containing all of their spans and the ones from the curation.
Since this is based on a big list instead of a dataframe, the computation is very slow.

In [24]:
annotators = ['Onerva', 'Alisha', 'Fabian', 'Fride']
# NOTE(review): repository() without arguments presumably matches every article — confirm.
repo = repository()

# Score each annotator against the curation, one annotator at a time.
for ann in annotators:
    # Spans from the 'Curation' column together with this annotator's spans.
    span_list_annotator_curation = test_evaluator.get_span_list(repo, ['Curation', ann])
    score = test_evaluator.get_score_spanlist(span_list_annotator_curation, 'f1_heuristic')
    print('annotator: ', ann, ', score: ', score)

annotator:  Onerva , score:  0.707738142033301
annotator:  Alisha , score:  0.632147918537081
annotator:  Fabian , score:  0.44097366388791853
annotator:  Fride , score:  0.7413404022482636


## Check scores in different categories

In [27]:
# Annotation layers for which a separate agreement score is reported.
layers = ['Technologyandapplicationspecificity', 'Policydesigncharacteristics', 'Instrumenttypes']
repo = repository()

for layer in layers:
    # Annotator spans whose 'layer' item equals the current layer.
    span_list_layer = test_evaluator.get_span_list(repo, columns='annotators', item='layer', value=layer)
    score = test_evaluator.get_score_spanlist(span_list_layer, 'f1_heuristic')
    print('layer: ', layer, ', score: ', score)

layer:  Technologyandapplicationspecificity , score:  0.4630743356234037
layer:  Policydesigncharacteristics , score:  0.5087584057577884
layer:  Instrumenttypes , score:  0.45662242886098364


In [None]:
all_tags = 