# Notebook to calculate Inter Annotator Agreement

### Import the libraries

In [1]:
import sys
import pandas as pd
import collections 
import os
import numpy as np
from itertools import chain
from itertools import combinations
sys.path.insert(0, '..')
from src.experiment_utils.helper_classes import token, span, repository
from src.d02_corpus_statistics.corpus import Corpus
from src.d03_inter_annotator_agreement.inter_annotator_agremment import Inter_Annotator_Agreement, _get_score_article
from definitions import df_annotation_marker
from src.d03_inter_annotator_agreement.inter_annotator_agremment import row_to_span_list, keep_valid_anotations
from src.d03_inter_annotator_agreement.scoring_functions import create_scoring_matrix


from definitions import ROOT_DIR


## Small Tutorial

Load the dataframe stat_df

In [3]:
    
dataframe_dir = os.path.join(ROOT_DIR,'data/02_processed_to_dataframe', 'preprocessed_dataframe.pkl')
stat_df = pd.read_pickle(dataframe_dir)
stat_df.head()

Unnamed: 0,Policy,Text,Tokens,Article_State,Finished_Annotators,Curation,Alisha,Fride,Onerva,Fabian,Lynn,Sebastian,Joel
EU_32018R1999_Title_0_Chapter_7_Section_3_Article_43,,article 43\r\nexercise of the delegation\r\n1....,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Alisha, Fride]",[span id:CUR0 annotator:Curation layer:Instrum...,[span id:A1 annotator:Alisha layer:Instrumentt...,[span id:B1 annotator:Fride layer:Policydesign...,,,,,
EU_32019R0631_Title_0_Chapter_0_Section_0_Article_12,,article 12\r\nreal-world co2 emissions and fue...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Onerva, Fabian]",[span id:CUR36 annotator:Curation layer:Instru...,,,[span id:C1 annotator:Onerva layer:Instrumentt...,[span id:D1 annotator:Fabian layer:Policydesig...,,,
EU_32018L2001_Title_0_Chapter_0_Section_0_Article_11,,article 11\r\njoint projects between member st...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Fride, Onerva]",[span id:CUR116 annotator:Curation layer:Instr...,,[span id:B28 annotator:Fride layer:Instrumentt...,[span id:C58 annotator:Onerva layer:Instrument...,,,,
EU_32018R1999_Title_0_Chapter_7_Section_3_Article_56,,article 56\r\namendments to directive (eu) 201...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Alisha, Fride]",[span id:CUR202 annotator:Curation layer:Polic...,[span id:A38 annotator:Alisha layer:Policydesi...,[span id:B129 annotator:Fride layer:Policydesi...,,,,,
EU_32018L2001_Title_0_Chapter_0_Section_0_Article_03,,article 3\r\nbinding overall union target for ...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Fride, Onerva, Fabian]",[span id:CUR211 annotator:Curation layer:Instr...,,[span id:B138 annotator:Fride layer:Instrument...,[span id:C165 annotator:Onerva layer:Instrumen...,[span id:D27 annotator:Fabian layer:Instrument...,,,


First create a object of class Inter annotator agreement. The constructor takes a stat_df as input, has a optional argument DEBUG where only the first 10 articles are taken to test different functions

In [12]:
test_evaluator = Inter_Annotator_Agreement(stat_df, front_and_whereas = False)
test_evaluator_debug = Inter_Annotator_Agreement(stat_df, DEBUG = True)


In [13]:
test_evaluator.df.shape

(412, 13)

In [14]:
test_evaluator_debug.df.shape

(10, 13)

Inter_Annotator_Agreement is a child class of the Corpus class, so all methods of the Corpus class are available

In [15]:
test_dir = repository(policy = 'EU_32008R1099')
test_evaluator.get_span_list(conditional_rep = test_dir, annotators = 'annotators', item = 'tag', value =  'Tech_LowCarbon')

[span id:C3882 annotator:Onerva layer:Technologyandapplicationspecificity type:TechnologySpecificity tag:Tech_LowCarbon start:18 stop:25 text:nuclear,
 span id:C3883 annotator:Onerva layer:Technologyandapplicationspecificity type:TechnologySpecificity tag:Tech_LowCarbon start:95 stop:109 text:nuclear energy,
 span id:C3884 annotator:Onerva layer:Technologyandapplicationspecificity type:TechnologySpecificity tag:Tech_LowCarbon start:151 stop:158 text:nuclear,
 span id:C5161 annotator:Onerva layer:Technologyandapplicationspecificity type:TechnologySpecificity tag:Tech_LowCarbon start:125 stop:141 text:renewable energy,
 span id:C5162 annotator:Onerva layer:Technologyandapplicationspecificity type:TechnologySpecificity tag:Tech_LowCarbon start:393 stop:409 text:renewable energy,
 span id:C5163 annotator:Onerva layer:Technologyandapplicationspecificity type:TechnologySpecificity tag:Tech_LowCarbon start:499 stop:515 text:renewable energy,
 span id:C5164 annotator:Onerva layer:Technologyand

To calculate the inter annonator agreement, there are two options


## Append the score to dataframe

This method appends the inter-annotator agreement for each article based on a set of inter-annotator agreement measures. The scores can calculated in parallel, this is the recommended method for computationally intensive scores.

First, we only consider the articles where the curation is finished and at least two annotators are present:

In [16]:
test_evaluator.keep_only_finished_articles()

In [17]:
scoring_metrics = ['f1_exact', 'f1_tokenwise', 'f1_partial', 'f1_heuristic']

**append_total_score_per_article(scoring_metrics, parallel = False, ** optional_tuple_properties)**

This function calculates the individual score for each article and for each metric defined in scoring_metrics (Can be a list of metrics or a single metric). For each metric, a new column is appended to the dataframe, therefore the scores can be stored and don't have to be recalculated. To speed up the computation, the scores can be calculated in parallel using the pandarell library. The kwargs "optional_tuple_properties" are reserved for pygamma properties such as a dissimilarity matrix. 



In [18]:
test_evaluator.append_total_score_per_article(scoring_metrics)

100%|██████████| 412/412 [00:03<00:00, 103.50it/s]
100%|██████████| 412/412 [00:04<00:00, 101.05it/s]
100%|██████████| 412/412 [00:03<00:00, 103.81it/s]
100%|██████████| 412/412 [00:00<00:00, 1065.00it/s]


There is also a normal implementation which uses parallel

In [19]:
test_evaluator.append_total_score_per_article(scoring_metrics, parallel = True)

INFO: Pandarallel will run on 64 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


### Appending Pygamma

Pygamma is a special case. To calculate the  Pygamma-IAA score, we have the option to feed in a dissimilarity matrix and a list of all the possible tags. The dissimilarity matrix can be calculated via the create_scoring_matrix function. Each missmatch between tags of the same tagset or layer are penalized less. For more details, check the explanation in the source code. This matrix can be fed into the function as kwargs. When specyfing no dissimilarity matrix, all the missmatches are penalized equaly

In [20]:
# create custom scoring matrix
#by setting soft tagset dissimilarity equal true, missmatches in the same tagset are penalized less
# if soft layer dissimilarity would be set true, missmatches in the same layer woudl be penalized less
category_list, cat_dissimilarity_matrix = create_scoring_matrix(os.path.join(ROOT_DIR,'src/experiment_utils/tag_set.json'),  soft_tagset_dissimilarity = True, soft_layer_dissimilarity = False)

In [21]:
test_evaluator.append_total_score_per_article(scoring_metrics = 'pygamma', category_list = category_list, cat_dissimilarity_matrix = cat_dissimilarity_matrix)

100%|██████████| 412/412 [05:30<00:00,  1.25it/s]  


Checking out the dataframe now:

In [11]:
test_evaluator.df.head()

Unnamed: 0,Policy,Text,Tokens,Article_State,Finished_Annotators,Curation,Alisha,Fride,Onerva,Fabian,Lynn,Sebastian,Joel,f1_exact_score,f1_tokenwise_score,f1_partial_score,f1_heuristic_score,pygamma_score
EU_32018R1999_Title_0_Chapter_7_Section_3_Article_43,,article 43\r\nexercise of the delegation\r\n1....,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Alisha, Fride]",[span id:CUR0 annotator:Curation layer:Instrum...,[span id:A1 annotator:Alisha layer:Instrumentt...,[span id:B1 annotator:Fride layer:Policydesign...,,,,,,0.21875,0.226574,0.28125,0.28125,0.451832
EU_32019R0631_Title_0_Chapter_0_Section_0_Article_12,,article 12\r\nreal-world co2 emissions and fue...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Onerva, Fabian]",[span id:CUR36 annotator:Curation layer:Instru...,,,[span id:C1 annotator:Onerva layer:Instrumentt...,[span id:D1 annotator:Fabian layer:Policydesig...,,,,0.289157,0.263793,0.409639,0.421603,0.357654
EU_32018L2001_Title_0_Chapter_0_Section_0_Article_11,,article 11\r\njoint projects between member st...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Fride, Onerva]",[span id:CUR116 annotator:Curation layer:Instr...,,[span id:B28 annotator:Fride layer:Instrumentt...,[span id:C58 annotator:Onerva layer:Instrument...,,,,,0.567308,0.512946,0.653846,0.653846,0.584463
EU_32018R1999_Title_0_Chapter_7_Section_3_Article_56,,article 56\r\namendments to directive (eu) 201...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Alisha, Fride]",[span id:CUR202 annotator:Curation layer:Polic...,[span id:A38 annotator:Alisha layer:Policydesi...,[span id:B129 annotator:Fride layer:Policydesi...,,,,,,0.736842,0.583333,0.736842,0.736842,0.573874
EU_32018L2001_Title_0_Chapter_0_Section_0_Article_03,,article 3\r\nbinding overall union target for ...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Fride, Onerva, Fabian]",[span id:CUR211 annotator:Curation layer:Instr...,,[span id:B138 annotator:Fride layer:Instrument...,[span id:C165 annotator:Onerva layer:Instrumen...,[span id:D27 annotator:Fabian layer:Instrument...,,,,0.420198,0.410204,0.532749,0.544437,0.463723


### Get total score

**test_evaluator.get_total_score_df(scoring_metrics = 'all', annotator = 'all', weight_by = 'Tokens')**

Calculates the total scores specified in scoring metrics of the dataframe, either for a specific annotator or for all the annotators. The scores can be weighted by {'no_weighting, 'Tokens', 'Spans'}. Note that this works only for scoring metrics that are already calculated. The default argument 'all' retrieves the score for all the scores that have bee apended to the dataframe in the previous step.


In [12]:
#Get the total score of the corpus calculated as a mean of all the individual article scores
test_evaluator.get_total_score_df(weight_by = 'no_weighting')

{'f1_exact_score': 0.40028724799007476,
 'f1_tokenwise_score': 0.397687590370067,
 'f1_partial_score': 0.5159440267362017,
 'f1_heuristic_score': 0.5273362748620107,
 'pygamma_score': 0.4535421022357389}

In [13]:
#Get the total score of the corpus calculated as a mean of all the individual article scores weighted by the total number of tokens per article
test_evaluator.get_total_score_df(weight_by = 'Tokens')

{'f1_exact_score': 0.37492081552891304,
 'f1_tokenwise_score': 0.35240598751298724,
 'f1_partial_score': 0.4739162741735001,
 'f1_heuristic_score': 0.48679580434940306,
 'pygamma_score': 0.43390749330297906}

In [14]:
#Get the total score of the corpus calculated as a mean of all the individual article scores weighted by the total number of spans per article
test_evaluator.get_total_score_df(weight_by = 'Spans')

{'f1_exact_score': 0.3874058515882508,
 'f1_tokenwise_score': 0.36379438736998276,
 'f1_partial_score': 0.4885814966811494,
 'f1_heuristic_score': 0.5028562323485702,
 'pygamma_score': 0.44427801517119536}

if only specific scores are required:

In [15]:
test_evaluator.get_total_score_df(scoring_metrics = 'f1_exact', weight_by = 'no_weighting')

{'f1_exact_score': 0.40028724799007476}

or

In [16]:
test_evaluator.get_total_score_df(scoring_metrics =['f1_exact', 'f1_tokenwise'], weight_by = 'Spans')

{'f1_exact_score': 0.3874058515882508,
 'f1_tokenwise_score': 0.36379438736998276}

### Get total score per annotator

The same function can be used to retrieve the score of individual annotators. That is the weighted average of scores for all the articles the annotator has participated. 

In [17]:
test_evaluator.get_total_score_df(scoring_metrics = 'all', annotator = 'Fride', weight_by = 'no_weighting')

{'f1_exact_score': 0.46897294886445867,
 'f1_tokenwise_score': 0.4639696458990632,
 'f1_partial_score': 0.5917248835439597,
 'f1_heuristic_score': 0.6056887697375231,
 'pygamma_score': 0.49924958792303503}

For all the annotators

In [18]:
annotators = ['Onerva', 'Alisha', 'Fabian', 'Fride']
for ann in annotators:
    print('annotator: ', ann)
    print(test_evaluator.get_total_score_df(annotator = ann, weight_by = 'Spans'))
    print('----------------')

annotator:  Onerva
{'f1_exact_score': 0.4381654829355101, 'f1_tokenwise_score': 0.4090364908255155, 'f1_partial_score': 0.5339943394957395, 'f1_heuristic_score': 0.5467190944483692, 'pygamma_score': 0.48740478117874464}
----------------
annotator:  Alisha
{'f1_exact_score': 0.32323742866301736, 'f1_tokenwise_score': 0.30806281985414746, 'f1_partial_score': 0.42750169278836747, 'f1_heuristic_score': 0.44385587315930597, 'pygamma_score': 0.39740775822068164}
----------------
annotator:  Fabian
{'f1_exact_score': 0.3071820274746298, 'f1_tokenwise_score': 0.2747252372274946, 'f1_partial_score': 0.38820915429061104, 'f1_heuristic_score': 0.3962207997917874, 'pygamma_score': 0.3561665189876795}
----------------
annotator:  Fride
{'f1_exact_score': 0.4427467808588547, 'f1_tokenwise_score': 0.4214078450791374, 'f1_partial_score': 0.5584121507299492, 'f1_heuristic_score': 0.576894166530852, 'pygamma_score': 0.4930925344199355}
----------------


Or for specific scores

In [20]:
test_evaluator.get_total_score_df(annotator ='Fride', scoring_metrics = ['f1_exact', 'f1_tokenwise'], weight_by = 'Spans')


{'f1_exact_score': 0.4427467808588547,
 'f1_tokenwise_score': 0.4214078450791374}

### Rank articles by score

In [39]:
test_evaluator.df.sort_values(by=['f1_heuristic_score'])

Unnamed: 0,Policy,Text,Tokens,Article_State,Finished_Annotators,Curation,Alisha,Fride,Onerva,Fabian,Lynn,Sebastian,Joel,f1_exact_score,f1_tokenwise_score,f1_partial_score,f1_heuristic_score,pygamma_score
EU_32012L0027_Title_0_Chapter_4_Section_0_Article_21,,article 21\r\nconversion factors\r\nfor the pu...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Fride, Onerva]",[span id:CUR7809 annotator:Curation layer:Tech...,,[span id:B4371 annotator:Fride layer:Technolog...,[span id:C4487 annotator:Onerva layer:Technolo...,,,,,0.0,0.000000,0.0,0.0,-0.187198
EU_32019R0631_Title_0_Chapter_0_Section_0_Article_18,,article 18\r\nrepeal\r\nregulations (ec) no 44...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Onerva, Fabian]",[span id:CUR13524 annotator:Curation layer:Pol...,,,[span id:C6758 annotator:Onerva layer:Policyde...,[span id:D3340 annotator:Fabian layer:Policyde...,,,,0.0,0.000000,0.0,0.0,-0.140600
EU_32008R1099_Title_0_Chapter_0_Section_0_Article_08,,article 8\r\nannual nuclear statistics\r\nthe ...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Alisha, Fride]",[span id:CUR6743 annotator:Curation layer:Inst...,[span id:A2293 annotator:Alisha layer:Instrume...,[span id:B3594 annotator:Fride layer:Policydes...,[span id:C3873 annotator:Onerva layer:Policyde...,,,,,0.0,0.000000,0.0,0.0,-0.095282
EU_32008R1099_Title_0_Chapter_0_Section_0_Article_05,,article 5\r\ntransmission and dissemination\r\...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Alisha, Fride]",[span id:CUR15065 annotator:Curation layer:Ins...,[span id:A5801 annotator:Alisha layer:Instrume...,[span id:B8089 annotator:Fride layer:Instrumen...,[span id:C7141 annotator:Onerva layer:Policyde...,,[],,,0.0,0.000000,0.0,0.0,-0.037073
EU_32006L0066_Title_0_Chapter_0_Section_0_Article_29,,article 29\r\nentry into force\r\nthis directi...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Fride, Onerva]",[span id:CUR10714 annotator:Curation layer:Pol...,,[span id:B6443 annotator:Fride layer:Policydes...,[span id:C5728 annotator:Onerva layer:Policyde...,,[],,,0.0,0.000000,0.0,0.0,-0.763486
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
EU_32009R0079_Title_0_Chapter_0_Section_0_Article_14,,article 14\r\namendments to directive 2007/46/...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Fabian, Onerva]",[span id:CUR12469 annotator:Curation layer:Pol...,,,[span id:C6392 annotator:Onerva layer:Policyde...,[span id:D2925 annotator:Fabian layer:Policyde...,,,,1.0,1.000000,1.0,1.0,1.000000
EU_32019L0944_Title_0_Chapter_7_Section_5_Article_73,,article 73\r\nentry into force\r\nthis directi...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Alisha, Fabian]",[span id:CUR12501 annotator:Curation layer:Pol...,[span id:A4492 annotator:Alisha layer:Policyde...,,[],[span id:D2942 annotator:Fabian layer:Policyde...,,,,0.0,0.846465,1.0,1.0,0.956143
EU_32018R1999_Title_0_Chapter_7_Section_3_Article_54,,article 54\r\namendments to directive 2012/27/...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Alisha, Fride]",[span id:CUR9806 annotator:Curation layer:Poli...,[span id:A3223 annotator:Alisha layer:Policyde...,[span id:B5863 annotator:Fride layer:Policydes...,,,,,,1.0,1.000000,1.0,1.0,1.000000
EU_32008R1099_Title_0_Chapter_0_Section_0_Article_12,,article 12\r\nentry into force\r\nthis regulat...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Alisha, Fride]",[span id:CUR384 annotator:Curation layer:Polic...,[span id:A100 annotator:Alisha layer:Policydes...,[span id:B243 annotator:Fride layer:Policydesi...,[span id:C268 annotator:Onerva layer:Policydes...,,,,,0.5,0.857143,1.0,1.0,0.960483


## Get total score based on a spanlist

The inter annotator agreement score can be also calculated from a spanlist. For all the spans present, it calculates the inter agreement scores for alle the articls with at least two valid annoations. Can be used to caluclate simmilarity to curation.

**get_score_spanlist(conditional_rep, annotators , scoring_metric, item = None, value = None, weight_by = 'Spans', ** optional_tuple_properties)**

The function is designed similar to the get_span_list function. A set of spans is selected by providing a conditional repository and a optional item and value. For all the spans present, it calculates the inter agreement scores for all the articles where two finsihed annotators are found. Returns the resulting spalist and the score. Can be used to caluclate simmilarity to curation or scores in different categories.



In [14]:
test_dir = repository.from_repository_name('EU_32008R1099_Title_0_Chapter_0_Section_0_Article_12')
span_list = test_evaluator.get_span_list(test_dir, ['Alisha', 'Fride'])

In [15]:
span_list

[span id:A100 annotator:Alisha layer:Policydesigncharacteristics type:Time tag:Time_InEffect start:76 stop:110 text:20th day following its publication,
 span id:A101 annotator:Alisha layer:Policydesigncharacteristics type:Actor tag:Addressee_default start:239 stop:252 text:member states,
 span id:B243 annotator:Fride layer:Policydesigncharacteristics type:Time tag:Time_InEffect start:76 stop:134 text:20th day following its publication in the official journal,
 span id:B244 annotator:Fride layer:Policydesigncharacteristics type:Actor tag:Addressee_default start:239 stop:252 text:member states]

So this would be the equivalent spanlist. Retrieving the IAA score of such a spanlist

In [17]:
span_list_score, score = test_evaluator.get_score_spanlist(conditional_rep = test_dir, annotators = ['Alisha', 'Fride'] , scoring_metric = 'f1_heuristic', weight_by = 'Spans')
print(span_list_score)
print(f"\nscore: {score}")

[span id:A100 annotator:Alisha layer:Policydesigncharacteristics type:Time tag:Time_InEffect start:76 stop:110 text:20th day following its publication, span id:A101 annotator:Alisha layer:Policydesigncharacteristics type:Actor tag:Addressee_default start:239 stop:252 text:member states, span id:B243 annotator:Fride layer:Policydesigncharacteristics type:Time tag:Time_InEffect start:76 stop:134 text:20th day following its publication in the official journal, span id:B244 annotator:Fride layer:Policydesigncharacteristics type:Actor tag:Addressee_default start:239 stop:252 text:member states]

score: 1.0


Or use this function to get scores in specific categories:

In [18]:
test_dir = repository.from_repository_name('EU_32008R1099_Title_0_Chapter_0_Section_0_Article_05')
span_list = test_evaluator.get_span_list(test_dir, annotators = ['Alisha', 'Fride'], item = 'layer', value = 'Policydesigncharacteristics')
span_list

[span id:A5803 annotator:Alisha layer:Policydesigncharacteristics type:Actor tag:Addressee_monitored start:48 stop:61 text:member states,
 span id:A5804 annotator:Alisha layer:Policydesigncharacteristics type:Actor tag:Authority_monitoring start:84 stop:94 text:commission,
 span id:A5805 annotator:Alisha layer:Policydesigncharacteristics type:Actor tag:Authority_monitoring start:96 stop:104 text:eurostat,
 span id:A5806 annotator:Alisha layer:Policydesigncharacteristics type:Compliance tag:Form_monitoring start:110 stop:129 text:national statistics,
 span id:A5807 annotator:Alisha layer:Policydesigncharacteristics type:Compliance tag:Form_monitoring start:366 stop:385 text:national statistics,
 span id:A5808 annotator:Alisha layer:Policydesigncharacteristics type:Actor tag:Addressee_monitored start:528 stop:540 text:member state,
 span id:A5809 annotator:Alisha layer:Policydesigncharacteristics type:Reversibility tag:Reversibility_policy start:553 stop:578 text:exemptions or derogation

Calculating the IAA of such a spanlist

In [19]:
span_list_score, score = test_evaluator.get_score_spanlist(test_dir, annotators = ['Alisha', 'Fride'], scoring_metric = 'f1_heuristic', weight_by = 'Spans')
print(span_list_score)
print(f"\nscore: {score}")

[span id:A5801 annotator:Alisha layer:Instrumenttypes type:InstrumentType tag:RegulatoryInstr start:425 stop:445 text:regulatory procedure, span id:A5802 annotator:Alisha layer:Instrumenttypes type:InstrumentType tag:RegulatoryInstr start:635 stop:655 text:regulatory procedure, span id:A5803 annotator:Alisha layer:Policydesigncharacteristics type:Actor tag:Addressee_monitored start:48 stop:61 text:member states, span id:A5804 annotator:Alisha layer:Policydesigncharacteristics type:Actor tag:Authority_monitoring start:84 stop:94 text:commission, span id:A5805 annotator:Alisha layer:Policydesigncharacteristics type:Actor tag:Authority_monitoring start:96 stop:104 text:eurostat, span id:A5806 annotator:Alisha layer:Policydesigncharacteristics type:Compliance tag:Form_monitoring start:110 stop:129 text:national statistics, span id:A5807 annotator:Alisha layer:Policydesigncharacteristics type:Compliance tag:Form_monitoring start:366 stop:385 text:national statistics, span id:A5808 annotator

## Check closeness to curation

In the same spirit as calculating the IAA scores, we can check the closeness to the curation.

**append_score_to_curation(self, scoring_metrics, parallel = False, ** optional_tuple_properties)**

This methid works very similar to get_total_score_df, but calculated the closeness to the curation for all the scoring_metrics defined in scoring_metrics. Again, the scores can be calculted in parallel. Appends al the scores for all annotators that contributed in tuples, where each element corresponds to a scoring metric. Again, the optional tuple properties are reserved for the pygamma score

In [5]:
scoring_metrics = ['f1_exact', 'f1_tokenwise', 'f1_partial', 'f1_heuristic']

In [6]:
test_evaluator.append_score_to_curation(scoring_metrics, parallel = False)

100%|██████████| 412/412 [00:07<00:00, 52.66it/s] 
100%|██████████| 412/412 [00:05<00:00, 74.05it/s] 
100%|██████████| 412/412 [00:07<00:00, 56.64it/s] 
100%|██████████| 412/412 [00:06<00:00, 68.29it/s] 


As before, we can specify a dissimilarity matrix for pygamma

In [7]:
category_list, cat_dissimilarity_matrix = create_scoring_matrix(os.path.join(ROOT_DIR,'src/experiment_utils/tag_set.json'),  soft_tagset_dissimilarity = True, soft_layer_dissimilarity = False)
test_evaluator.append_score_to_curation(scoring_metrics = 'pygamma', category_list = category_list, cat_dissimilarity_matrix = cat_dissimilarity_matrix)

100%|██████████| 412/412 [00:53<00:00,  7.64it/s]
100%|██████████| 412/412 [00:30<00:00, 13.62it/s]
100%|██████████| 412/412 [00:52<00:00,  7.87it/s]
100%|██████████| 412/412 [00:39<00:00, 10.34it/s]


Checking out the dataframe:

In [9]:
test_evaluator.df.head()

Unnamed: 0,Policy,Text,Tokens,Article_State,Finished_Annotators,Curation,Alisha,Fride,Onerva,Fabian,Lynn,Sebastian,Joel,Onerva_to_curation,Fabian_to_curation,Fride_to_curation,Alisha_to_curation
EU_32018R1999_Title_0_Chapter_7_Section_3_Article_43,,article 43\r\nexercise of the delegation\r\n1....,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Alisha, Fride]",[span id:CUR0 annotator:Curation layer:Instrum...,[span id:A1 annotator:Alisha layer:Instrumentt...,[span id:B1 annotator:Fride layer:Policydesign...,,,,,,,,"[0.2222222222222222, 0.25396825396825395, 0.25...","[0.547945205479452, 0.6575342465753425, 0.6575..."
EU_32019R0631_Title_0_Chapter_0_Section_0_Article_12,,article 12\r\nreal-world co2 emissions and fue...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Onerva, Fabian]",[span id:CUR36 annotator:Curation layer:Instru...,,,[span id:C1 annotator:Onerva layer:Instrumentt...,[span id:D1 annotator:Fabian layer:Policydesig...,,,,"[0.6861313868613139, 0.744307577454274, 0.7153...","[0.2830188679245283, 0.35757575757575755, 0.32...",,
EU_32018L2001_Title_0_Chapter_0_Section_0_Article_11,,article 11\r\njoint projects between member st...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Fride, Onerva]",[span id:CUR116 annotator:Curation layer:Instr...,,[span id:B28 annotator:Fride layer:Instrumentt...,[span id:C58 annotator:Onerva layer:Instrument...,,,,,"[0.6113989637305699, 0.7253886010362695, 0.725...",,"[0.6737967914438502, 0.6737967914438502, 0.673...",
EU_32018R1999_Title_0_Chapter_7_Section_3_Article_56,,article 56\r\namendments to directive (eu) 201...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Alisha, Fride]",[span id:CUR202 annotator:Curation layer:Polic...,[span id:A38 annotator:Alisha layer:Policydesi...,[span id:B129 annotator:Fride layer:Policydesi...,,,,,,,,"[0.8888888888888888, 1.0, 1.0, 0.9523809523809...","[0.631578947368421, 0.7368421052631577, 0.7368..."
EU_32018L2001_Title_0_Chapter_0_Section_0_Article_03,,article 3\r\nbinding overall union target for ...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Fride, Onerva, Fabian]",[span id:CUR211 annotator:Curation layer:Instr...,,[span id:B138 annotator:Fride layer:Instrument...,[span id:C165 annotator:Onerva layer:Instrumen...,[span id:D27 annotator:Fabian layer:Instrument...,,,,"[0.6057142857142856, 0.7770263647946682, 0.754...","[0.3448275862068966, 0.4137931034482759, 0.413...","[0.7553191489361701, 0.7765957446808509, 0.776...",


### Get individual score

To retrieve the scores, im simmilar fashion to get_total_score_df, we use

**get_to_curation_score(self, weight_by = 'Tokens')**

This retrieves the closeness to the curation for all the annotators that participated, weighted by one of the following methods: {'no_weighting, 'Tokens', 'Spans'}


In [10]:
test_evaluator.get_to_curation_score(weight_by = 'no_weighting')

{'Onerva': {'f1_exact': 0.6732862244002309,
  'f1_heuristic': 0.7418035160958896,
  'f1_partial': 0.7160416942508684,
  'f1_tokenwise': 0.6176183342356287,
  'pygamma': 0.6398429881987473},
 'Fabian': {'f1_exact': 0.43231750058721435,
  'f1_heuristic': 0.5092491205438597,
  'f1_partial': 0.49015694287418154,
  'f1_tokenwise': 0.37760504426835484,
  'pygamma': 0.4235659967281621},
 'Fride': {'f1_exact': 0.6829956559202468,
  'f1_heuristic': 0.7520213456882968,
  'f1_partial': 0.7364141686616807,
  'f1_tokenwise': 0.6406458629220034,
  'pygamma': 0.6611399878435612},
 'Alisha': {'f1_exact': 0.574676845271569,
  'f1_heuristic': 0.6622106992758143,
  'f1_partial': 0.6370336784394007,
  'f1_tokenwise': 0.5344264074636121,
  'pygamma': 0.5916188874355258}}

In [14]:
test_evaluator.get_to_curation_score(weight_by = 'Tokens')

{'Onerva': {'f1_exact': 0.6497229248404979,
  'f1_heuristic': 0.7155316902447743,
  'f1_partial': 0.6926120329988612,
  'f1_tokenwise': 0.588104303406712,
  'pygamma': 0.6386855251789504},
 'Fabian': {'f1_exact': 0.3884944280696229,
  'f1_heuristic': 0.44587542344065123,
  'f1_partial': 0.42722291795144346,
  'f1_tokenwise': 0.3137100596814219,
  'pygamma': 0.37423331038570085},
 'Fride': {'f1_exact': 0.6807736554301652,
  'f1_heuristic': 0.7406803749512647,
  'f1_partial': 0.7273136810436603,
  'f1_tokenwise': 0.6241358009693133,
  'pygamma': 0.667637130288239},
 'Alisha': {'f1_exact': 0.5475251978127618,
  'f1_heuristic': 0.630782505115647,
  'f1_partial': 0.6053944023319054,
  'f1_tokenwise': 0.49672538277097095,
  'pygamma': 0.5665791863213766}}

In [15]:
test_evaluator.get_to_curation_score(weight_by = 'Spans')

{'Onerva': {'f1_exact': 0.6487655252174788,
  'f1_heuristic': 0.7151208327340163,
  'f1_partial': 0.6919092874742293,
  'f1_tokenwise': 0.5876631656187074,
  'pygamma': 0.6384780755513397},
 'Fabian': {'f1_exact': 0.3859744866634834,
  'f1_heuristic': 0.4412883241131821,
  'f1_partial': 0.4219933146872233,
  'f1_tokenwise': 0.30882989118106857,
  'pygamma': 0.37047653197090646},
 'Fride': {'f1_exact': 0.6829841040462425,
  'f1_heuristic': 0.7434392410812806,
  'f1_partial': 0.7295881502890171,
  'f1_tokenwise': 0.6268153336863214,
  'pygamma': 0.670798321876068},
 'Alisha': {'f1_exact': 0.5462982718156604,
  'f1_heuristic': 0.6314991627071533,
  'f1_partial': 0.6046511627906977,
  'f1_tokenwise': 0.4947175134484899,
  'pygamma': 0.5649313770725863}}

### Get Total score

To retireve the total scloseness to the curation score, we use

**get_to_curation_score_total(self, weight_by = 'Tokens')**

For each article, we take an average of the annotator-curation scores (For all the finsihed annotators). To get the total average, all the individual article averages are weighted by one of the following methods: {'no_weighting, 'Tokens', 'Spans'}


In [16]:
test_evaluator.get_to_curation_score_total(weight_by = 'no_weighting')

{'f1_exact': 0.5988896876960187,
 'f1_heuristic': 0.6738591198644103,
 'f1_partial': 0.65271203543617,
 'f1_tokenwise': 0.5511128608780697,
 'pygamma': 0.5867342441185952}

In [17]:
test_evaluator.get_to_curation_score_total(weight_by = 'Tokens')

{'f1_exact': 0.5720544815512948,
 'f1_heuristic': 0.6385253916986301,
 'f1_partial': 0.6185247079726235,
 'f1_tokenwise': 0.5114731630019173,
 'pygamma': 0.5674438643944388}

In [18]:
test_evaluator.get_to_curation_score_total(weight_by = 'Spans')

{'f1_exact': 0.5742673921447767,
 'f1_heuristic': 0.6417420284651623,
 'f1_partial': 0.6211169455066855,
 'f1_tokenwise': 0.5134787912374995,
 'pygamma': 0.5694765988845216}

# Compare Inter Annotator Agreeement with and without dissimilarity matrices

In this section, we elaborate the difference of dissimilarity matrices. First we calculate the pygama score without specifying any matrix, so all the missmatches are penalized equaly == 1

In [21]:
#without matrix
test_evaluator = Inter_Annotator_Agreement(stat_df)
test_evaluator.keep_only_finished_articles()
scoring_metrics = ['pygamma']
test_evaluator.append_total_score_per_article(scoring_metrics = 'pygamma')
test_evaluator.get_total_score_df(weight_by='Spans')


100%|██████████| 412/412 [05:52<00:00,  1.17it/s]  


{'pygamma_score': 0.38221583739383}

Now we create a dissimilarity matrix that penalizes missmatches in the same tagset less == 0.5

In [22]:
#create a matrix that penalizes missmatches inside tagset less
category_list, cat_dissimilarity_matrix = create_scoring_matrix(os.path.join(ROOT_DIR,'src/experiment_utils/tag_set.json'),  soft_tagset_dissimilarity = True, soft_layer_dissimilarity = False)

test_evaluator_tagset = Inter_Annotator_Agreement(stat_df)
test_evaluator_tagset.keep_only_finished_articles()
scoring_metrics = ['pygamma']
test_evaluator_tagset.append_total_score_per_article(scoring_metrics = 'pygamma', category_list = category_list, cat_dissimilarity_matrix = cat_dissimilarity_matrix)

test_evaluator_tagset.get_total_score_df(weight_by='Spans')


100%|██████████| 412/412 [05:20<00:00,  1.28it/s]  


{'pygamma_score': 0.44427801517119536}

Now we create a dissimilarity matrix that penalizes missmatches in the same layer less == 0.5

In [23]:
#create a matrix that penalizes missmatches inside layer less
category_list, cat_dissimilarity_matrix = create_scoring_matrix(os.path.join(ROOT_DIR,'src/experiment_utils/tag_set.json'),  soft_tagset_dissimilarity = False, soft_layer_dissimilarity = True)

test_evaluator_layer = Inter_Annotator_Agreement(stat_df)
test_evaluator_layer.keep_only_finished_articles()
scoring_metrics = ['pygamma']
test_evaluator_layer.append_total_score_per_article(scoring_metrics = 'pygamma', category_list = category_list, cat_dissimilarity_matrix = cat_dissimilarity_matrix)

test_evaluator_layer.get_total_score_df(weight_by='Spans')

100%|██████████| 412/412 [05:02<00:00,  1.36it/s]  


{'pygamma_score': 0.4479737387388157}

# Check scores in different categories

In [23]:
layers = ['Technologyandapplicationspecificity', 'Policydesigncharacteristics', 'Instrumenttypes' ]
repo = repository()

for l in layers:
    span_list, score = test_evaluator.get_score_spanlist(conditional_rep = repo, annotators = 'annotators' , item = 'layer', value = l,scoring_metric = 'f1_heuristic', weight_by = 'Spans')
    print(f"layer: {l}, len of spanlist: {len(span_list)}, score: {score}")
    

layer: Technologyandapplicationspecificity, len of spanlist: 9799, score: 0.4908132839825293
layer: Policydesigncharacteristics, len of spanlist: 18660, score: 0.5151334014516284
layer: Instrumenttypes, len of spanlist: 5548, score: 0.4792674619854277


# Small Test

To thest if get_score_spanlist indeed yields the same result as get_total_score_df, we calculate the individual scores for all the categories, weight them by the 
number of spans per categorie, this should then yield the total score weighted by spans. Note that we got the scores with two separate mehtods, they are indentical however.
The very small difference is a difference of normalization, since we calculated indiviual scores for all categories.

In [44]:
layers = ['Technologyandapplicationspecificity', 'Policydesigncharacteristics', 'Instrumenttypes' ]
repo = repository()
score_tot = 0
spans = 0
for l in layers:
    span_list, score = test_evaluator.get_score_spanlist(conditional_rep = repo, annotators = 'annotators' , item = 'layer', value = l,scoring_metric = 'f1_heuristic', weight_by = 'Spans')
    print(f"layer: {l}, len of spanlist: {len(span_list)}, score: {score}")
    score_tot += len(span_list)*score
    spans += len(span_list)
score_tot/spans

layer: Technologyandapplicationspecificity, len of spanlist: 9799, score: 0.4908132839825293
layer: Policydesigncharacteristics, len of spanlist: 18660, score: 0.5151334014516284
layer: Instrumenttypes, len of spanlist: 5548, score: 0.4792674619854277


0.5022743705686283

In [45]:
test_evaluator.append_total_score_per_article(scoring_metrics = 'f1_heuristic')
test_evaluator.get_total_score_df(weight_by = 'Spans')

{'f1_heuristic_score': 0.5028562323485702}