# Notebook to calculate Inter Annotator Agreement

### Import the libraries

In [1]:
import sys
import pandas as pd
import collections 
import os
import numpy as np
from itertools import chain
from itertools import combinations
sys.path.insert(0, '..')
from src.experiment_utils.helper_classes import token, span, repository
from src.d02_corpus_statistics.corpus import Corpus
from src.d03_inter_annotator_agreement.inter_annotator_agremment import Inter_Annotator_Agreement, _get_score_article
from definitions import df_annotation_marker
from src.d03_inter_annotator_agreement.inter_annotator_agremment import row_to_span_list, keep_valid_anotations
from src.d03_inter_annotator_agreement.scoring_functions import create_scoring_matrix

from definitions import ROOT_DIR


## Small Tutorial

Load the dataframe stat_df

In [2]:
    
# Location of the pickled, preprocessed dataframe inside the project tree.
dataframe_dir = os.path.join(
    ROOT_DIR,
    'data/02_processed_to_dataframe',
    'preprocessed_dataframe.pkl',
)
# NOTE: unpickling can execute arbitrary code; only load pickles from a trusted source.
stat_df = pd.read_pickle(dataframe_dir)
# Show the first rows as a quick sanity check.
stat_df.head()

Unnamed: 0,Policy,Text,Tokens,Article_State,Finished_Annotators,Curation,Alisha,Fride,Onerva,Fabian,Lynn,Sebastian,Joel
EU_32018R1999_Title_0_Chapter_7_Section_3_Article_43,,article 43\r\nexercise of the delegation\r\n1....,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Alisha, Fride]",[span id:CUR0 annotator:Curation layer:Instrum...,[span id:A1 annotator:Alisha layer:Instrumentt...,[span id:B1 annotator:Fride layer:Policydesign...,,,,,
EU_32019R0631_Title_0_Chapter_0_Section_0_Article_12,,article 12\r\nreal-world co2 emissions and fue...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Onerva, Fabian]",[span id:CUR36 annotator:Curation layer:Instru...,,,[span id:C1 annotator:Onerva layer:Instrumentt...,[span id:D1 annotator:Fabian layer:Policydesig...,,,
EU_32018L2001_Title_0_Chapter_0_Section_0_Article_11,,article 11\r\njoint projects between member st...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Fride, Onerva]",[span id:CUR116 annotator:Curation layer:Instr...,,[span id:B28 annotator:Fride layer:Instrumentt...,[span id:C58 annotator:Onerva layer:Instrument...,,,,
EU_32018R1999_Title_0_Chapter_7_Section_3_Article_56,,article 56\r\namendments to directive (eu) 201...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Alisha, Fride]",[span id:CUR202 annotator:Curation layer:Polic...,[span id:A38 annotator:Alisha layer:Policydesi...,[span id:B129 annotator:Fride layer:Policydesi...,,,,,
EU_32018L2001_Title_0_Chapter_0_Section_0_Article_03,,article 3\r\nbinding overall union target for ...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Fride, Onerva, Fabian]",[span id:CUR211 annotator:Curation layer:Instr...,,[span id:B138 annotator:Fride layer:Instrument...,[span id:C165 annotator:Onerva layer:Instrumen...,[span id:D27 annotator:Fabian layer:Instrument...,,,


In [3]:
# List the local git branches; the checked-out branch is marked with '*'.
!git branch

* [32mfix_load_data[m
  master[m
  plots[m


First, create an object of the class `Inter_Annotator_Agreement`. The constructor takes a stat_df as input and has an optional argument `DEBUG`; when it is set, only the first 10 articles are used, which is handy for quickly testing different functions.

In [4]:
# Evaluator used throughout the rest of the tutorial.
# NOTE(review): front_and_whereas = False presumably excludes front matter / "whereas" sections — confirm in the class.
test_evaluator = Inter_Annotator_Agreement(stat_df, front_and_whereas = False)
# Evaluator built with DEBUG = True (per the text above: only the first 10 articles are taken).
test_evaluator_debug = Inter_Annotator_Agreement(stat_df, DEBUG = True)


In [5]:
# (rows, columns) of the evaluator's dataframe.
test_evaluator.df.shape

(412, 13)

Inter_Annotator_Agreement is a child class of the Corpus class, so all methods of the Corpus class are available

In [6]:
# Restrict the query to a single policy.
test_dir = repository(policy='EU_32008R1099')
# Collect the spans carrying the tag 'Tech_LowCarbon' for that policy,
# using the 'annotators' column selection.
test_evaluator.get_span_list(
    conditional_rep=test_dir,
    columns='annotators',
    item='tag',
    value='Tech_LowCarbon',
)

[span id:C3882 annotator:Onerva layer:Technologyandapplicationspecificity type:TechnologySpecificity tag:Tech_LowCarbon start:18 stop:25 text:nuclear,
 span id:C3883 annotator:Onerva layer:Technologyandapplicationspecificity type:TechnologySpecificity tag:Tech_LowCarbon start:95 stop:109 text:nuclear energy,
 span id:C3884 annotator:Onerva layer:Technologyandapplicationspecificity type:TechnologySpecificity tag:Tech_LowCarbon start:151 stop:158 text:nuclear,
 span id:C5161 annotator:Onerva layer:Technologyandapplicationspecificity type:TechnologySpecificity tag:Tech_LowCarbon start:125 stop:141 text:renewable energy,
 span id:C5162 annotator:Onerva layer:Technologyandapplicationspecificity type:TechnologySpecificity tag:Tech_LowCarbon start:393 stop:409 text:renewable energy,
 span id:C5163 annotator:Onerva layer:Technologyandapplicationspecificity type:TechnologySpecificity tag:Tech_LowCarbon start:499 stop:515 text:renewable energy,
 span id:C5164 annotator:Onerva layer:Technologyand

To calculate the inter-annotator agreement, there are two options:


## Append the score to dataframe

This method appends the inter-annotator agreement to each article which has at least two valid annotations, based on a set of inter-annotator agreement measures. The scores are calculated in parallel; this is the recommended method for computationally intensive scores.

First, we only consider the articles where the curation is finished and at least two annotators are present:

In [7]:
# Keep only the finished articles (curation finished and at least two annotators, as described above).
test_evaluator.keep_only_finished_articles()

In [8]:
# Agreement measures to compute; each one shows up as a '<metric>_score' column in the dataframe below.
scoring_metrics = ['f1_exact', 'f1_tokenwise', 'f1_partial', 'f1_heuristic']

In [9]:
# Compute every metric per article and append the results to test_evaluator.df.
test_evaluator.append_total_score_per_article(scoring_metrics)

100%|██████████| 412/412 [00:06<00:00, 60.75it/s]
100%|██████████| 412/412 [00:06<00:00, 64.96it/s]
100%|██████████| 412/412 [00:06<00:00, 67.28it/s]
100%|██████████| 412/412 [00:00<00:00, 620.82it/s]


Now append pygamma with custom scoring matrix based on our tagset

In [None]:
# Build the custom scoring matrix from the project's tag set.
# Here soft_tagset_dissimilarity is switched on and soft_layer_dissimilarity is switched off.
# NOTE(review): presumably the soft tagset option penalizes mismatches within the same tagset
# less — confirm against create_scoring_matrix.
category_list, cat_dissimilarity_matrix = create_scoring_matrix(
    os.path.join(ROOT_DIR, 'src/experiment_utils/tag_set.json'),
    soft_tagset_dissimilarity=True,
    soft_layer_dissimilarity=False,
)

In [None]:
# Append the pygamma score, passing the category list and dissimilarity matrix built in the previous cell.
test_evaluator.append_total_score_per_article(scoring_metrics = ['pygamma'], category_list = category_list, cat_dissimilarity_matrix = cat_dissimilarity_matrix)

There is also a variant of this method, `append_total_score_per_article_parallel`, which computes the scores in parallel:

In [None]:
#test_evaluator.append_total_score_per_article_parallel(scoring_metrics)

Checking out the dataframe now:

In [14]:
# Preview the dataframe, which now carries the appended *_score columns.
test_evaluator.df.head()

Unnamed: 0,Policy,Text,Tokens,Article_State,Finished_Annotators,Curation,Alisha,Fride,Onerva,Fabian,Lynn,Sebastian,Joel,f1_exact_score,f1_tokenwise_score,f1_partial_score,f1_heuristic_score,pygamma_score
EU_32018R1999_Title_0_Chapter_7_Section_3_Article_43,,article 43\r\nexercise of the delegation\r\n1....,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Alisha, Fride]",[span id:CUR0 annotator:Curation layer:Instrum...,[span id:A1 annotator:Alisha layer:Instrumentt...,[span id:B1 annotator:Fride layer:Policydesign...,,,,,,0.21875,0.226574,0.28125,0.28125,0.450869
EU_32019R0631_Title_0_Chapter_0_Section_0_Article_12,,article 12\r\nreal-world co2 emissions and fue...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Onerva, Fabian]",[span id:CUR36 annotator:Curation layer:Instru...,,,[span id:C1 annotator:Onerva layer:Instrumentt...,[span id:D1 annotator:Fabian layer:Policydesig...,,,,0.289157,0.263793,0.409639,0.421603,0.34061
EU_32018L2001_Title_0_Chapter_0_Section_0_Article_11,,article 11\r\njoint projects between member st...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Fride, Onerva]",[span id:CUR116 annotator:Curation layer:Instr...,,[span id:B28 annotator:Fride layer:Instrumentt...,[span id:C58 annotator:Onerva layer:Instrument...,,,,,0.567308,0.512946,0.653846,0.653846,0.577621
EU_32018R1999_Title_0_Chapter_7_Section_3_Article_56,,article 56\r\namendments to directive (eu) 201...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Alisha, Fride]",[span id:CUR202 annotator:Curation layer:Polic...,[span id:A38 annotator:Alisha layer:Policydesi...,[span id:B129 annotator:Fride layer:Policydesi...,,,,,,0.736842,0.583333,0.736842,0.736842,0.559325
EU_32018L2001_Title_0_Chapter_0_Section_0_Article_03,,article 3\r\nbinding overall union target for ...,"[start:0 stop:7 text:article tag_count:0, star...",CURATION_FINISHED,"[Fride, Onerva, Fabian]",[span id:CUR211 annotator:Curation layer:Instr...,,[span id:B138 annotator:Fride layer:Instrument...,[span id:C165 annotator:Onerva layer:Instrumen...,[span id:D27 annotator:Fabian layer:Instrument...,,,,0.420198,0.410204,0.532749,0.544437,0.455536


### Get total score

To retrieve the total score of the corpus, use `get_total_score_df()` (for example `test_evaluator.get_total_score_df(weight_by = 'Spans')`) on a dataframe where the scores for the individual articles have already been calculated.

In [11]:
# Corpus-level score for every appended metric, without weighting.
test_evaluator.get_total_score_df(weight_by = 'no_weighting')

{'f1_heuristic_score': 0.5273362748620107,
 'pygamma_score': 0.4579686061518452,
 'f1_partial_score': 0.5159440267362017,
 'f1_exact_score': 0.40028724799007476,
 'f1_tokenwise_score': 0.397687590370067}

In [12]:
# Corpus-level scores with weight_by = 'Tokens'
# (presumably articles are weighted by their token count — confirm in get_total_score_df).
test_evaluator.get_total_score_df(weight_by = 'Tokens')

{'f1_heuristic_score': 0.48679580434940306,
 'pygamma_score': 0.4383714759385591,
 'f1_partial_score': 0.4739162741735001,
 'f1_exact_score': 0.37492081552891304,
 'f1_tokenwise_score': 0.35240598751298724}

In [13]:
# Corpus-level scores with weight_by = 'Spans'
# (presumably articles are weighted by their span count — confirm in get_total_score_df).
test_evaluator.get_total_score_df(weight_by = 'Spans')

{'f1_heuristic_score': 0.5028562323485702,
 'pygamma_score': 0.4479737387388157,
 'f1_partial_score': 0.4885814966811494,
 'f1_exact_score': 0.3874058515882508,
 'f1_tokenwise_score': 0.36379438736998276}

If only specific scores are required:

In [14]:
# Pass a single score column name to restrict the result to that score.
test_evaluator.get_total_score_df('f1_exact_score', weight_by = 'no_weighting')

{'f1_exact_score': 0.40028724799007476}

or

In [15]:
# Several score columns can be requested at once by passing a list.
test_evaluator.get_total_score_df(['f1_exact_score', 'f1_tokenwise_score'], weight_by = 'Spans')

{'f1_exact_score': 0.3874058515882508,
 'f1_tokenwise_score': 0.36379438736998276}

### Get total score per annotator

In [16]:
# Report the span-weighted total scores separately for each annotator.
annotators = ['Onerva', 'Alisha', 'Fabian', 'Fride']
for annotator_name in annotators:
    print('annotator: ', annotator_name)
    annotator_scores = test_evaluator.get_total_score_df(
        annotator=annotator_name,
        weight_by='Spans',
    )
    print(annotator_scores)
    # Visual separator between annotators.
    print('----------------')

annotator:  Onerva
{'f1_heuristic_score': 0.5467190944483692, 'pygamma_score': 0.48940887269928224, 'f1_partial_score': 0.5339943394957395, 'f1_exact_score': 0.4381654829355101, 'f1_tokenwise_score': 0.4090364908255155}
----------------
annotator:  Alisha
{'f1_heuristic_score': 0.44385587315930597, 'pygamma_score': 0.4030316809667925, 'f1_partial_score': 0.42750169278836747, 'f1_exact_score': 0.32323742866301736, 'f1_tokenwise_score': 0.30806281985414746}
----------------
annotator:  Fabian
{'f1_heuristic_score': 0.3962207997917874, 'pygamma_score': 0.3624746984712561, 'f1_partial_score': 0.38820915429061104, 'f1_exact_score': 0.3071820274746298, 'f1_tokenwise_score': 0.2747252372274946}
----------------
annotator:  Fride
{'f1_heuristic_score': 0.576894166530852, 'pygamma_score': 0.49495286490461726, 'f1_partial_score': 0.5584121507299492, 'f1_exact_score': 0.4427467808588547, 'f1_tokenwise_score': 0.4214078450791374}
----------------


In [17]:
# Scores of a single annotator for the selected score columns.
# NOTE(review): the recorded values differ slightly from the Spans-weighted result for
# 'Fride' in the loop above — presumably a different default weighting; confirm.
test_evaluator.get_score_annotator('Fride', ['f1_exact_score', 'f1_tokenwise_score'])

{'f1_exact_score': 0.44287267910542477,
 'f1_tokenwise_score': 0.42132905468298554}

### Rank articles by score

In [10]:
# Sort the articles by their f1_heuristic_score (ascending is the pandas default).
# NOTE(review): the recorded output is a KeyError for this column — presumably the cell was
# executed on an evaluator without appended scores; run append_total_score_per_article first.
test_evaluator.df.sort_values(by=['f1_heuristic_score'])

KeyError: 'f1_heuristic_score'

## Get total score based on a spanlist

The inter-annotator agreement score can also be calculated from a span list. For all the spans present, it calculates the inter-annotator agreement scores for all the articles with at least two valid annotations. This can be used to calculate the similarity to the curation.

In [18]:
# Select a single article by its repository name.
test_dir = repository.from_repository_name('EU_32008R1099_Title_0_Chapter_0_Section_0_Article_12')
# Gather the spans of the two named annotators for that article.
span_list = test_evaluator.get_span_list(test_dir, ['Onerva', 'Fride'])

In [19]:
# Display the collected spans.
span_list


[span id:C268 annotator:Onerva layer:Policydesigncharacteristics type:Time tag:Time_InEffect start:52 stop:156 text:enter into force on the 20th day following its publication in the official journal of the european union,
 span id:C269 annotator:Onerva layer:Policydesigncharacteristics type:Actor tag:Addressee_default start:239 stop:252 text:member states,
 span id:B243 annotator:Fride layer:Policydesigncharacteristics type:Time tag:Time_InEffect start:76 stop:134 text:20th day following its publication in the official journal,
 span id:B244 annotator:Fride layer:Policydesigncharacteristics type:Actor tag:Addressee_default start:239 stop:252 text:member states]

In [20]:
# Agreement score for the span list, using the 'pygamma' measure.
test_evaluator.get_score_spanlist(span_list, 'pygamma')

0.9476514351793441

Or use this function to get scores in specific categories:

In [21]:
# Pick another article and keep only the spans of one annotation layer.
test_dir = repository.from_repository_name(
    'EU_32008R1099_Title_0_Chapter_0_Section_0_Article_05'
)
span_list = test_evaluator.get_span_list(
    test_dir,
    columns=['Alisha', 'Fride'],
    item='layer',
    value='Policydesigncharacteristics',
)
# Show the filtered spans.
span_list

[span id:A5803 annotator:Alisha layer:Policydesigncharacteristics type:Actor tag:Addressee_monitored start:48 stop:61 text:member states,
 span id:A5804 annotator:Alisha layer:Policydesigncharacteristics type:Actor tag:Authority_monitoring start:84 stop:94 text:commission,
 span id:A5805 annotator:Alisha layer:Policydesigncharacteristics type:Actor tag:Authority_monitoring start:96 stop:104 text:eurostat,
 span id:A5806 annotator:Alisha layer:Policydesigncharacteristics type:Compliance tag:Form_monitoring start:110 stop:129 text:national statistics,
 span id:A5807 annotator:Alisha layer:Policydesigncharacteristics type:Compliance tag:Form_monitoring start:366 stop:385 text:national statistics,
 span id:A5808 annotator:Alisha layer:Policydesigncharacteristics type:Actor tag:Addressee_monitored start:528 stop:540 text:member state,
 span id:A5809 annotator:Alisha layer:Policydesigncharacteristics type:Reversibility tag:Reversibility_policy start:553 stop:578 text:exemptions or derogation

In [22]:
# Agreement score for the layer-filtered span list, using the 'f1_heuristic' measure.
test_evaluator.get_score_spanlist(span_list, 'f1_heuristic')

0.0

## Check closeness to curation

To check the agreement of each annotator with the curation, we simply create a span list for each annotator containing all of their spans and the ones from the curation.
Since this is based on a big list instead of a dataframe, the computation is very slow.

**Normally, all the scores can be calculated at the same time. However, there is a bug at the moment and they have to be calculated individually.**

### F1 exact

In [12]:
# Per-annotator agreement with the curation, metric: f1_exact.
scoring_metrics = ['f1_exact']
# Start from a fresh evaluator restricted to the finished articles.
test_evaluator = Inter_Annotator_Agreement(stat_df)
test_evaluator.keep_only_finished_articles()
test_evaluator.append_score_to_curation(scoring_metrics)
# no weighting
test_evaluator.get_to_curation_score(weight_by='no_weighting')

100%|██████████| 412/412 [00:03<00:00, 117.91it/s]
100%|██████████| 412/412 [00:02<00:00, 148.44it/s]
100%|██████████| 412/412 [00:02<00:00, 160.66it/s]
100%|██████████| 412/412 [00:03<00:00, 119.88it/s]


{'Onerva': {'f1_exact': 0.6732862244002309},
 'Alisha': {'f1_exact': 0.574676845271569},
 'Fabian': {'f1_exact': 0.43231750058721435},
 'Fride': {'f1_exact': 0.6829956559202468}}

In [13]:
# Per-annotator curation scores with weight_by = 'Spans'.
test_evaluator.get_to_curation_score(weight_by = 'Spans')

{'Onerva': {'f1_exact': 0.6487655252174788},
 'Alisha': {'f1_exact': 0.5462982718156604},
 'Fabian': {'f1_exact': 0.3859744866634834},
 'Fride': {'f1_exact': 0.6829841040462425}}

In [14]:
# Per-annotator curation scores with weight_by = 'Tokens'.
test_evaluator.get_to_curation_score(weight_by = 'Tokens')

{'Onerva': {'f1_exact': 0.6497229248404979},
 'Alisha': {'f1_exact': 0.5475251978127618},
 'Fabian': {'f1_exact': 0.3884944280696229},
 'Fride': {'f1_exact': 0.6807736554301652}}

### F1 partial

In [15]:
# Per-annotator agreement with the curation, metric: f1_partial.
scoring_metrics = ['f1_partial']
# Start from a fresh evaluator restricted to the finished articles.
test_evaluator = Inter_Annotator_Agreement(stat_df)
test_evaluator.keep_only_finished_articles()
test_evaluator.append_score_to_curation(scoring_metrics)
# no weighting
test_evaluator.get_to_curation_score(weight_by='no_weighting')

100%|██████████| 412/412 [00:03<00:00, 121.87it/s]
100%|██████████| 412/412 [00:02<00:00, 142.85it/s]
100%|██████████| 412/412 [00:02<00:00, 159.89it/s]
100%|██████████| 412/412 [00:03<00:00, 120.13it/s]


{'Onerva': {'f1_partial': 0.7160416942508684},
 'Alisha': {'f1_partial': 0.6370336784394007},
 'Fabian': {'f1_partial': 0.49015694287418154},
 'Fride': {'f1_partial': 0.7364141686616807}}

In [16]:
# Per-annotator curation scores with weight_by = 'Spans'.
test_evaluator.get_to_curation_score(weight_by = 'Spans')

{'Onerva': {'f1_partial': 0.6919092874742293},
 'Alisha': {'f1_partial': 0.6046511627906977},
 'Fabian': {'f1_partial': 0.4219933146872233},
 'Fride': {'f1_partial': 0.7295881502890171}}

In [17]:
# Per-annotator curation scores with weight_by = 'Tokens'.
test_evaluator.get_to_curation_score(weight_by = 'Tokens')

{'Onerva': {'f1_partial': 0.6926120329988612},
 'Alisha': {'f1_partial': 0.6053944023319054},
 'Fabian': {'f1_partial': 0.42722291795144346},
 'Fride': {'f1_partial': 0.7273136810436603}}

### F1 Heuristic

In [18]:
# Per-annotator agreement with the curation, metric: f1_heuristic.
scoring_metrics = ['f1_heuristic']
# Start from a fresh evaluator restricted to the finished articles.
test_evaluator = Inter_Annotator_Agreement(stat_df)
test_evaluator.keep_only_finished_articles()
test_evaluator.append_score_to_curation(scoring_metrics)
# no weighting
test_evaluator.get_to_curation_score(weight_by='no_weighting')

100%|██████████| 412/412 [00:00<00:00, 873.98it/s] 
100%|██████████| 412/412 [00:00<00:00, 1285.35it/s]
100%|██████████| 412/412 [00:00<00:00, 1559.07it/s]
100%|██████████| 412/412 [00:00<00:00, 966.04it/s] 


{'Onerva': {'f1_heuristic': 0.7418035160958896},
 'Alisha': {'f1_heuristic': 0.6622106992758143},
 'Fabian': {'f1_heuristic': 0.5092491205438597},
 'Fride': {'f1_heuristic': 0.7520213456882968}}

In [19]:
# Per-annotator curation scores with weight_by = 'Spans'.
test_evaluator.get_to_curation_score(weight_by = 'Spans')

{'Onerva': {'f1_heuristic': 0.7151208327340163},
 'Alisha': {'f1_heuristic': 0.6314991627071533},
 'Fabian': {'f1_heuristic': 0.4412883241131821},
 'Fride': {'f1_heuristic': 0.7434392410812806}}

In [20]:
# Per-annotator curation scores with weight_by = 'Tokens'.
test_evaluator.get_to_curation_score(weight_by = 'Tokens')

{'Onerva': {'f1_heuristic': 0.7155316902447743},
 'Alisha': {'f1_heuristic': 0.630782505115647},
 'Fabian': {'f1_heuristic': 0.44587542344065123},
 'Fride': {'f1_heuristic': 0.7406803749512647}}

### Pygamma

In [None]:
# Per-annotator agreement with the curation, metric: pygamma.
# category_list and cat_dissimilarity_matrix come from the create_scoring_matrix cell.
scoring_metrics = ['pygamma']
# Start from a fresh evaluator restricted to the finished articles.
test_evaluator = Inter_Annotator_Agreement(stat_df)
test_evaluator.keep_only_finished_articles()
test_evaluator.append_score_to_curation(
    scoring_metrics,
    category_list=category_list,
    cat_dissimilarity_matrix=cat_dissimilarity_matrix,
)
# no weighting
test_evaluator.get_to_curation_score(weight_by='no_weighting')

100%|██████████| 412/412 [01:15<00:00,  5.44it/s]
100%|██████████| 412/412 [00:58<00:00,  7.05it/s]
100%|██████████| 412/412 [00:44<00:00,  9.33it/s]
  7%|▋         | 27/412 [00:03<00:35, 10.92it/s]

In [None]:
# Per-annotator curation scores with weight_by = 'Spans'.
test_evaluator.get_to_curation_score(weight_by = 'Spans')

In [None]:
# Per-annotator curation scores with weight_by = 'Tokens'.
test_evaluator.get_to_curation_score(weight_by = 'Tokens')

## Get total score 

### F1 exact

In [24]:
# Total agreement with the curation over all annotators, metric: f1_exact.
scoring_metrics = ['f1_exact']
# Start from a fresh evaluator restricted to the finished articles.
test_evaluator = Inter_Annotator_Agreement(stat_df)
test_evaluator.keep_only_finished_articles()
test_evaluator.append_score_to_curation(scoring_metrics)
# no weighting
test_evaluator.get_to_curation_score_total(weight_by='no_weighting')

100%|██████████| 412/412 [00:03<00:00, 113.63it/s]
100%|██████████| 412/412 [00:02<00:00, 145.61it/s]
100%|██████████| 412/412 [00:02<00:00, 159.49it/s]
100%|██████████| 412/412 [00:03<00:00, 116.52it/s]


{'f1_exact': 0.5988896876960187}

In [25]:
# weight by spans
test_evaluator.get_to_curation_score_total(weight_by = 'Spans')

{'f1_exact': 0.5742673921447767}

In [26]:
# weight by tokens
test_evaluator.get_to_curation_score_total(weight_by = 'Tokens')

{'f1_exact': 0.5720544815512948}

### F1 partial

In [27]:
# Total agreement with the curation over all annotators, metric: f1_partial.
scoring_metrics = ['f1_partial']
# Start from a fresh evaluator restricted to the finished articles.
test_evaluator = Inter_Annotator_Agreement(stat_df)
test_evaluator.keep_only_finished_articles()
test_evaluator.append_score_to_curation(scoring_metrics)
# no weighting
test_evaluator.get_to_curation_score_total(weight_by='no_weighting')

100%|██████████| 412/412 [00:03<00:00, 116.37it/s]
100%|██████████| 412/412 [00:02<00:00, 146.85it/s]
100%|██████████| 412/412 [00:02<00:00, 160.66it/s]
100%|██████████| 412/412 [00:03<00:00, 119.04it/s]


{'f1_partial': 0.65271203543617}

In [28]:
# weight by spans
test_evaluator.get_to_curation_score_total(weight_by = 'Spans')

{'f1_partial': 0.6211169455066855}

In [29]:
# weight by tokens
test_evaluator.get_to_curation_score_total(weight_by = 'Tokens')

{'f1_partial': 0.6185247079726235}

### F1 heuristic

In [30]:
# Total agreement with the curation over all annotators, metric: f1_heuristic.
scoring_metrics = ['f1_heuristic']
# Start from a fresh evaluator restricted to the finished articles.
test_evaluator = Inter_Annotator_Agreement(stat_df)
test_evaluator.keep_only_finished_articles()
test_evaluator.append_score_to_curation(scoring_metrics)
# no weighting
test_evaluator.get_to_curation_score_total(weight_by='no_weighting')

100%|██████████| 412/412 [00:00<00:00, 844.33it/s]
100%|██████████| 412/412 [00:00<00:00, 1262.79it/s]
100%|██████████| 412/412 [00:00<00:00, 1519.48it/s]
100%|██████████| 412/412 [00:00<00:00, 901.46it/s]


{'f1_heuristic': 0.6738591198644103}

In [31]:
# weight by spans
test_evaluator.get_to_curation_score_total(weight_by = 'Spans')

{'f1_heuristic': 0.6417420284651623}

In [32]:
# weight by tokens
test_evaluator.get_to_curation_score_total(weight_by = 'Tokens')

{'f1_heuristic': 0.6385253916986301}

### Pygamma

In [None]:
# Total agreement with the curation over all annotators, metric: pygamma.
# category_list and cat_dissimilarity_matrix come from the create_scoring_matrix cell.
scoring_metrics = ['pygamma']
# Start from a fresh evaluator restricted to the finished articles.
test_evaluator = Inter_Annotator_Agreement(stat_df)
test_evaluator.keep_only_finished_articles()
test_evaluator.append_score_to_curation(
    scoring_metrics,
    category_list=category_list,
    cat_dissimilarity_matrix=cat_dissimilarity_matrix,
)
# no weighting
test_evaluator.get_to_curation_score_total(weight_by='no_weighting')

In [None]:
# weight by spans
test_evaluator.get_to_curation_score_total(weight_by = 'Spans')

In [None]:
# weight by tokens
test_evaluator.get_to_curation_score_total(weight_by = 'Tokens')

### Get Total score

In [19]:
# Total curation score for every metric present on the evaluator, without weighting
# (the recorded output lists five metrics).
test_evaluator.get_to_curation_score_total(weight_by = 'no_weighting')

{'f1_partial': 0.5988330546269173,
 'f1_tokenwise': 0.5511742301973737,
 'pygamma': 0.652038841983237,
 'f1_heuristic': 0.6738591198644103,
 'f1_exact': 0.5870110472498731}

In [20]:
# NOTE(review): identical to the preceding cell (same call, same recorded output) — candidate for removal.
test_evaluator.get_to_curation_score_total(weight_by = 'no_weighting')

{'f1_partial': 0.5988330546269173,
 'f1_tokenwise': 0.5511742301973737,
 'pygamma': 0.652038841983237,
 'f1_heuristic': 0.6738591198644103,
 'f1_exact': 0.5870110472498731}

In [21]:
# weight by tokens
test_evaluator.get_to_curation_score_total(weight_by = 'Tokens')

{'f1_partial': 0.5720086969600027,
 'f1_tokenwise': 0.5115311549762288,
 'pygamma': 0.6177879879042762,
 'f1_heuristic': 0.6385253916986301,
 'f1_exact': 0.5686882403542074}

In [22]:
# weight by spans
test_evaluator.get_to_curation_score_total(weight_by = 'Spans')

{'f1_partial': 0.5742097335208172,
 'f1_tokenwise': 0.5135427297793449,
 'pygamma': 0.6203035248148135,
 'f1_heuristic': 0.6417420284651623,
 'f1_exact': 0.5708419751663366}

# Check scores in different categories

In [79]:
# Score the annotators' agreement separately for each annotation layer.
layers = ['Technologyandapplicationspecificity', 'Policydesigncharacteristics', 'Instrumenttypes']
# Repository built without arguments, i.e. no policy/article restriction is passed.
repo = repository()

# NOTE(review): the recorded run of this cell ended with
# "AttributeError: 'numpy.float64' object has no attribute 'rep'"; the cause is not visible here.
for layer_name in layers:
    span_list_layer = test_evaluator.get_span_list(
        repo,
        columns='annotators',
        item='layer',
        value=layer_name,
    )
    score = test_evaluator.get_score_spanlist(span_list_layer, 'f1_heuristic')
    print('layer: ', layer_name, ', score: ', score)
    

AttributeError: 'numpy.float64' object has no attribute 'rep'

In [None]:
# TODO(review): this cell was left unfinished. The original line was `all_tags = ` with no
# right-hand side, which is a SyntaxError and stops "Restart Kernel -> Run All".
# Supply the intended collection of tags (or delete the cell) before re-enabling it.
# all_tags = ...

In [11]:
# Comparison run for the influence of soft dissimilarity:
# f1_exact against the curation, computed without a dissimilarity matrix.
scoring_metrics = ['f1_exact']
# Start from a fresh evaluator restricted to the finished articles.
test_evaluator = Inter_Annotator_Agreement(stat_df)
test_evaluator.keep_only_finished_articles()
test_evaluator.append_score_to_curation(scoring_metrics)
# no weighting
test_evaluator.get_to_curation_score(weight_by='no_weighting')

100%|██████████| 412/412 [00:03<00:00, 120.29it/s]
100%|██████████| 412/412 [00:02<00:00, 143.50it/s]
100%|██████████| 412/412 [00:02<00:00, 163.12it/s]
100%|██████████| 412/412 [00:03<00:00, 123.25it/s]


{'Fride': {'f1_exact': 0.6829956559202468},
 'Alisha': {'f1_exact': 0.574676845271569},
 'Fabian': {'f1_exact': 0.43231750058721435},
 'Onerva': {'f1_exact': 0.6732862244002309}}

In [12]:
# Comparison run for the influence of soft dissimilarity:
# f1_tokenwise against the curation, computed without a dissimilarity matrix.
scoring_metrics = ['f1_tokenwise']
# Start from a fresh evaluator restricted to the finished articles.
test_evaluator = Inter_Annotator_Agreement(stat_df)
test_evaluator.keep_only_finished_articles()
test_evaluator.append_score_to_curation(scoring_metrics)
# no weighting
test_evaluator.get_to_curation_score(weight_by='no_weighting')

100%|██████████| 412/412 [00:03<00:00, 116.31it/s]
100%|██████████| 412/412 [00:02<00:00, 144.48it/s]
100%|██████████| 412/412 [00:02<00:00, 158.51it/s]
100%|██████████| 412/412 [00:03<00:00, 118.40it/s]


{'Fride': {'f1_tokenwise': 0.6406458629220034},
 'Alisha': {'f1_tokenwise': 0.5344264074636121},
 'Fabian': {'f1_tokenwise': 0.37760504426835484},
 'Onerva': {'f1_tokenwise': 0.6176183342356287}}

In [13]:
# Comparison run for the influence of soft dissimilarity:
# f1_heuristic against the curation, computed without a dissimilarity matrix.
scoring_metrics = ['f1_heuristic']
# Start from a fresh evaluator restricted to the finished articles.
test_evaluator = Inter_Annotator_Agreement(stat_df)
test_evaluator.keep_only_finished_articles()
test_evaluator.append_score_to_curation(scoring_metrics)
# no weighting
test_evaluator.get_to_curation_score(weight_by='no_weighting')

100%|██████████| 412/412 [00:00<00:00, 1004.34it/s]
100%|██████████| 412/412 [00:00<00:00, 1300.17it/s]
100%|██████████| 412/412 [00:00<00:00, 1604.44it/s]
100%|██████████| 412/412 [00:00<00:00, 917.15it/s] 


{'Fride': {'f1_heuristic': 0.7520213456882968},
 'Alisha': {'f1_heuristic': 0.6622106992758143},
 'Fabian': {'f1_heuristic': 0.5092491205438597},
 'Onerva': {'f1_heuristic': 0.7418035160958896}}