# Notebook to calculate Inter Annotator Agreement

### Import the libraries

In [28]:
import sys
import pandas as pd
import collections 
import os
import numpy as np
from itertools import chain
from itertools import combinations
sys.path.append('/home/jkuettel/NLP_spark/src')
sys.path.append('/home/jkuettel/NLP_spark')
from src.experiment_utils.helper_classes import token, span, repository
from src.d02_corpus_statistics.corpus import Corpus
from src.d03_inter_annotator_agreement.inter_annotator_agremment import Inter_Annotator_Agreement, _get_score_article
from definitions import df_annotation_marker
from src.d03_inter_annotator_agreement.inter_annotator_agremment import row_to_span_list, keep_valid_anotations


from definitions import ROOT_DIR


## Small Tutorial

Load the dataframe stat_df

In [29]:
    
# Path of the pickled, preprocessed annotation dataframe inside the project tree.
dataframe_dir = os.path.join(
    ROOT_DIR,
    'data/02_processed_to_dataframe',
    'preprocessed_dataframe.pkl',
)

# Load the dataframe and preview its first rows.
stat_df = pd.read_pickle(dataframe_dir)
stat_df.head()

Unnamed: 0,Policy,Text,Tokens,Article_State,Finished_Annotators,Curation,Onerva,Fride,Lynn,Sebastian,Alisha,Fabian
EU_32006L0032_Title_0_Chapter_1_Section_0_Article_03,,article 3\r\ndefinitions\r\nfor the purposes o...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Fride, Onerva]",[annotator:curation layer:Instrumenttypes type...,[annotator:Onerva layer:Instrumenttypes type:I...,[annotator:Fride layer:Instrumenttypes type:In...,,,,
EU_32006L0066_Title_0_Chapter_0_Section_0_Article_25,,article 25\r\npenalties\r\nmember states shall...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Fride, Onerva]",[annotator:curation layer:Instrumenttypes type...,[annotator:Onerva layer:Instrumenttypes type:I...,[annotator:Fride layer:Instrumenttypes type:In...,[],,,
EU_32009L0028_Title_0_Chapter_0_Section_0_Article_07,,article 7\r\njoint projects between member sta...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Alisha, Fabian]",[annotator:curation layer:Policydesigncharacte...,[],,,[],[annotator:Alisha layer:Policydesigncharacteri...,[annotator:Fabian layer:Policydesigncharacteri...
EU_32019L0944_Title_0_Chapter_6_Section_3_Article_47,,article 47\r\nindependence of the transmission...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Alisha, Onerva]",[annotator:curation layer:Instrumenttypes type...,[annotator:Onerva layer:Instrumenttypes type:I...,,,,[annotator:Alisha layer:Instrumenttypes type:I...,
EU_32019R0631_Title_0_Chapter_0_Section_0_Article_14,,article 14\r\nadjustment of m0 and tm0 values\...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Onerva, Fabian]",[annotator:curation layer:Instrumenttypes type...,[annotator:Onerva layer:Instrumenttypes type:I...,,,,,[annotator:Fabian layer:Policydesigncharacteri...


First, create an object of the class Inter_Annotator_Agreement. The constructor takes a stat_df as input and has an optional argument DEBUG; when it is set, only the first 10 articles are used, which is useful for testing different functions.

In [3]:
# Evaluator built from the full stat_df.
test_evaluator = Inter_Annotator_Agreement(stat_df)
# DEBUG variant: according to the notebook text, only the first 10 articles are taken (for quick tests).
test_evaluator_debug = Inter_Annotator_Agreement(stat_df, DEBUG = True)

Inter_Annotator_Agreement is a child class of the Corpus class, so all methods of the Corpus class are available

In [4]:
# Repository filter built from a policy identifier.
test_dir = repository(policy = 'EU_32008R1099')
# Collect the spans whose 'tag' equals 'Tech_LowCarbon' for that policy.
# columns = 'annotators' presumably selects all annotator columns — TODO confirm against Corpus.get_span_list.
test_evaluator.get_span_list(conditional_rep = test_dir, columns = 'annotators', item = 'tag', value =  'Tech_LowCarbon')

[annotator:Onerva layer:Technologyandapplicationspecificity type:TechnologySpecificity tag:Tech_LowCarbon start:125 stop:141 text:renewable energy,
 annotator:Onerva layer:Technologyandapplicationspecificity type:TechnologySpecificity tag:Tech_LowCarbon start:393 stop:409 text:renewable energy,
 annotator:Onerva layer:Technologyandapplicationspecificity type:TechnologySpecificity tag:Tech_LowCarbon start:499 stop:515 text:renewable energy,
 annotator:Onerva layer:Technologyandapplicationspecificity type:TechnologySpecificity tag:Tech_LowCarbon start:1276 stop:1292 text:renewable energy,
 annotator:Onerva layer:Technologyandapplicationspecificity type:TechnologySpecificity tag:Tech_LowCarbon start:701 stop:716 text:energy products,
 annotator:Fride layer:Technologyandapplicationspecificity type:TechnologySpecificity tag:Tech_LowCarbon start:866 stop:888 text:electricity generation,
 annotator:Onerva layer:Technologyandapplicationspecificity type:TechnologySpecificity tag:Tech_LowCarbon 

To calculate the inter-annotator agreement, there are two options:


## Append the score to dataframe

This method appends the inter-annotator agreement score of each article that has at least two valid annotations, based on a set of inter-annotator agreement measures. The scores can also be calculated in parallel (`append_total_score_per_article_parallel`), which is the recommended option for computationally intensive scores.

First, we only consider the articles where the curation is finished and at least two annotators are present:

In [5]:
# Restrict the evaluator to articles with finished curation and at least two annotators
# (as stated in the notebook text). The return value is not used, so this presumably
# filters test_evaluator in place — TODO confirm.
test_evaluator.keep_only_finished_articles()

In [7]:
# Names of the agreement measures to compute per article.
scoring_metrics = ['f1_exact', 'f1_tokenwise', 'f1_partial', 'f1_heuristic'] # 'pygamma' is deliberately left out of the list here

In [8]:
# Compute every metric for every article; the results show up as '<metric>_score'
# columns in test_evaluator.df (see the dataframe preview further down).
test_evaluator.append_total_score_per_article(scoring_metrics)

100%|██████████| 404/404 [00:05<00:00, 79.36it/s] 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df[column_name] = self.df.progress_apply(lambda row: _get_score_article(row_to_span_list(row), scoring_metric, row['Finished_Annotators'],  **optional_tuple_properties), axis=1)
100%|██████████| 404/404 [00:04<00:00, 88.28it/s] 
100%|██████████| 404/404 [00:04<00:00, 88.95it/s] 
100%|██████████| 404/404 [00:00<00:00, 897.21it/s] 


There is also a parallel implementation of the same computation:

In [None]:
# Parallel counterpart of append_total_score_per_article, called with the same metric list
# (this cell was not executed in the saved run).
test_evaluator.append_total_score_per_article_parallel(scoring_metrics)

Checking out the dataframe now:

In [9]:
# Preview the first rows, which now include the appended *_score columns.
test_evaluator.df.head()

Unnamed: 0,Policy,Text,Tokens,Article_State,Finished_Annotators,Curation,Onerva,Fride,Lynn,Sebastian,Alisha,Fabian,f1_exact_score,f1_tokenwise_score,f1_partial_score,f1_heuristic_score
EU_32006L0032_Title_0_Chapter_1_Section_0_Article_03,,article 3\r\ndefinitions\r\nfor the purposes o...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Fride, Onerva]",[annotator:curation layer:Instrumenttypes type...,[annotator:Onerva layer:Instrumenttypes type:I...,[annotator:Fride layer:Instrumenttypes type:In...,,,,,0.373418,0.415289,0.544304,0.566417
EU_32006L0066_Title_0_Chapter_0_Section_0_Article_25,,article 25\r\npenalties\r\nmember states shall...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Fride, Onerva]",[annotator:curation layer:Instrumenttypes type...,[annotator:Onerva layer:Instrumenttypes type:I...,[annotator:Fride layer:Instrumenttypes type:In...,[],,,,0.740741,0.625,0.740741,0.740741
EU_32009L0028_Title_0_Chapter_0_Section_0_Article_07,,article 7\r\njoint projects between member sta...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Alisha, Fabian]",[annotator:curation layer:Policydesigncharacte...,[],,,[],[annotator:Alisha layer:Policydesigncharacteri...,[annotator:Fabian layer:Policydesigncharacteri...,0.382979,0.345238,0.425532,0.425532
EU_32019L0944_Title_0_Chapter_6_Section_3_Article_47,,article 47\r\nindependence of the transmission...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Alisha, Onerva]",[annotator:curation layer:Instrumenttypes type...,[annotator:Onerva layer:Instrumenttypes type:I...,,,,[annotator:Alisha layer:Instrumenttypes type:I...,,0.191304,0.176783,0.226087,0.226087
EU_32019R0631_Title_0_Chapter_0_Section_0_Article_14,,article 14\r\nadjustment of m0 and tm0 values\...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Onerva, Fabian]",[annotator:curation layer:Instrumenttypes type...,[annotator:Onerva layer:Instrumenttypes type:I...,,,,,[annotator:Fabian layer:Policydesigncharacteri...,0.0,0.344725,0.451613,0.451613


### Get total score

To retrieve the total score of the corpus, use get_total_score_df() on a dataframe where the scores for the individual articles have already been calculated.

In [10]:
# Without arguments, returns a dict with one corpus-level value per available score column.
test_evaluator.get_total_score_df()

{'f1_exact_score': 0.36249329712132256,
 'f1_tokenwise_score': 0.34105328317949757,
 'f1_partial_score': 0.4583232024589149,
 'f1_heuristic_score': 0.4710149746445954}

if only specific scores are required:

In [11]:
# A single score name restricts the returned dict to that score.
test_evaluator.get_total_score_df('f1_exact_score')

{'f1_exact_score': 0.36249329712132256}

or

In [12]:
# A list of score names returns just those scores.
test_evaluator.get_total_score_df(['f1_exact_score', 'f1_tokenwise_score'])

{'f1_exact_score': 0.36249329712132256,
 'f1_tokenwise_score': 0.34105328317949757}

### Get total score per annotator

In [13]:
# Report the corpus-level scores restricted to each annotator in turn.
annotators = ['Onerva', 'Alisha', 'Fabian', 'Fride']
for ann in annotators:
    print('annotator: ', ann)
    # The closing parenthesis of print() was missing here, which made the whole cell a SyntaxError.
    print(test_evaluator.get_total_score_df(annotator = ann))
    print('----------------')

annotator:  Onerva
{'f1_exact_score': 0.4014986276951068, 'f1_tokenwise_score': 0.37528383007493793, 'f1_partial_score': 0.4910256324428709, 'f1_heuristic_score': 0.5023224716566984}
----------------
annotator:  Alisha
{'f1_exact_score': 0.31857810504920586, 'f1_tokenwise_score': 0.30329676249201065, 'f1_partial_score': 0.418309684131554, 'f1_heuristic_score': 0.4325632534644839}
----------------
annotator:  Fabian
{'f1_exact_score': 0.28202672255157424, 'f1_tokenwise_score': 0.25734107758801456, 'f1_partial_score': 0.3601323035370801, 'f1_heuristic_score': 0.36693494910228464}
----------------
annotator:  Fride
{'f1_exact_score': 0.4443964697402863, 'f1_tokenwise_score': 0.4230426239210114, 'f1_partial_score': 0.5587909824988373, 'f1_heuristic_score': 0.5771693950836014}
----------------


In [14]:
# Scores for a single annotator, limited to the listed score columns; returns a dict.
test_evaluator.get_score_annotator('Fride', ['f1_exact_score', 'f1_tokenwise_score'] )

{'f1_exact_score': 0.4443964697402863,
 'f1_tokenwise_score': 0.4230426239210114}

### Rank articles by score

In [15]:
# Ascending sort (the pandas default): articles with the lowest agreement come first.
test_evaluator.df.sort_values(by=['f1_heuristic_score'])

Unnamed: 0,Policy,Text,Tokens,Article_State,Finished_Annotators,Curation,Onerva,Fride,Lynn,Sebastian,Alisha,Fabian,f1_exact_score,f1_tokenwise_score,f1_partial_score,f1_heuristic_score
EU_32009L0028_Title_0_Chapter_0_Section_0_Article_28,,article 28\r\nentry into force\r\nthis directi...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Alisha, Fabian]",[annotator:curation layer:Policydesigncharacte...,[],,,[],[annotator:Alisha layer:Policydesigncharacteri...,[],0.000000,0.000000,0.0,0.0
EU_32019R0631_Title_0_Chapter_0_Section_0_Article_18,,article 18\r\nrepeal\r\nregulations (ec) no 44...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Onerva, Fabian]",[annotator:curation layer:Policydesigncharacte...,[annotator:Onerva layer:Policydesigncharacteri...,,,,,[annotator:Fabian layer:Policydesigncharacteri...,0.000000,0.000000,0.0,0.0
EU_32019L0944_Title_0_Chapter_4_Section_0_Article_39,,article 39\r\ncombined operator\r\narticle 35(...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Alisha, Onerva]",[annotator:curation layer:Policydesigncharacte...,[annotator:Onerva layer:Policydesigncharacteri...,,,,[annotator:Alisha layer:Policydesigncharacteri...,,0.000000,0.000000,0.0,0.0
EU_32006L0066_Title_0_Chapter_0_Section_0_Article_29,,article 29\r\nentry into force\r\nthis directi...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Fride, Onerva]",[annotator:curation layer:Policydesigncharacte...,[annotator:Onerva layer:Policydesigncharacteri...,[annotator:Fride layer:Policydesigncharacteris...,[],,,,0.000000,0.000000,0.0,0.0
EU_32008R1099_Title_0_Chapter_0_Section_0_Article_10,,article 10\r\nimplementing measures\r\n1. th...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Alisha, Fride]",[annotator:curation layer:Instrumenttypes type...,[annotator:Onerva layer:Policydesigncharacteri...,[annotator:Fride layer:Instrumenttypes type:In...,,,[annotator:Alisha layer:Instrumenttypes type:I...,,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
EU_32006L0032_Title_0_Chapter_4_Section_0_Article_20,,article 20\r\naddressees\r\nthis directive is ...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Fride, Onerva]",[annotator:curation layer:Policydesigncharacte...,[annotator:Onerva layer:Policydesigncharacteri...,[annotator:Fride layer:Policydesigncharacteris...,,,,,1.000000,1.000000,1.0,1.0
EU_32018R1999_Title_0_Chapter_7_Section_3_Article_55,,article 55\r\namendment to directive 2013/30/e...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Alisha, Fride]",[annotator:curation layer:Policydesigncharacte...,,[annotator:Fride layer:Policydesigncharacteris...,,,[annotator:Alisha layer:Policydesigncharacteri...,,0.909091,0.969697,1.0,1.0
EU_32006L0066_Title_0_Chapter_0_Section_0_Article_30,,article 30\r\naddressees\r\nthis directive is ...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Fride, Onerva]",[annotator:curation layer:Policydesigncharacte...,[annotator:Onerva layer:Policydesigncharacteri...,[annotator:Fride layer:Policydesigncharacteris...,[],,,,1.000000,1.000000,1.0,1.0
EU_32012L0027_Title_0_Chapter_5_Section_0_Article_30,,article 30\r\naddressees\r\nthis directive is ...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Fride, Onerva]",[annotator:curation layer:Policydesigncharacte...,[annotator:Onerva layer:Policydesigncharacteri...,[annotator:Fride layer:Policydesigncharacteris...,,,,,1.000000,1.000000,1.0,1.0


## Get total score based on a spanlist

The inter-annotator agreement score can also be calculated from a span list. For all the spans present, it calculates the inter-annotator agreement scores for all the articles with at least two valid annotations. This can be used to calculate the similarity to the curation.

In [19]:
# Repository filter built from a full article name.
test_dir = repository.from_repository_name('EU_32006L0032_Title_0_Chapter_1_Section_0_Article_03')
# Spans of the annotators Onerva and Fride for that article.
span_list = test_evaluator.get_span_list(test_dir, ['Onerva', 'Fride'])

In [20]:
# Agreement score computed directly from the span list; the result is a single float.
test_evaluator.get_score_spanlist(span_list, 'f1_heuristic')

0.5664166174246597

## Check closeness to curation

To check the agreement of each annotator with the curation, we simply create a span list for each annotator containing all their spans and the ones from the curation.
Since this is based on one big list instead of a dataframe, the computation is very slow.

In [24]:
# Score every annotator against the curation.
annotators = ['Onerva', 'Alisha', 'Fabian', 'Fride']
# No filter arguments are given — presumably this matches every article; TODO confirm.
repo = repository()

for ann in annotators:
    # Spans of the curation together with those of the current annotator.
    curation_and_annotator_spans = test_evaluator.get_span_list(repo, ['Curation', ann])
    score = test_evaluator.get_score_spanlist(curation_and_annotator_spans, 'f1_heuristic')
    print('annotator: ', ann, ', score: ', score)

annotator:  Onerva , score:  0.707738142033301
annotator:  Alisha , score:  0.632147918537081
annotator:  Fabian , score:  0.44097366388791853
annotator:  Fride , score:  0.7413404022482636


## Check scores in different categories

In [27]:
# Agreement score computed separately for each annotation layer.
layers = ['Technologyandapplicationspecificity', 'Policydesigncharacteristics', 'Instrumenttypes' ]
repo = repository()

for layer_name in layers:
    # Keep only the annotator spans whose 'layer' field equals the current layer.
    layer_spans = test_evaluator.get_span_list(
        repo,
        columns='annotators',
        item='layer',
        value=layer_name,
    )
    score = test_evaluator.get_score_spanlist(layer_spans, 'f1_heuristic')
    print('layer: ', layer_name, ', score: ', score)

layer:  Technologyandapplicationspecificity , score:  0.4630743356234037
layer:  Policydesigncharacteristics , score:  0.5087584057577884
layer:  Instrumenttypes , score:  0.45662242886098364


In [None]:
all_tags = 