In [1]:
import sys
sys.path.insert(0, '../util/')
sys.path.insert(1, '../experiments/')

In [2]:
import pandas as pd
from pathlib import Path
import spacy
import re
from collections import defaultdict
from huggingface_utils import load_custom_dataset
from convert_annotations import entity_values
from stats_util import sort_fn, sort_order

# Corpus Statistics

In [3]:
translations = pd.read_csv('../data/guideline_translations.csv').set_index('german_name')

In [4]:
def translator_fn(name):
    if name == 'Sum':
        return name
    return translations.loc[name].english_name

In [5]:
corpus_stats = pd.read_csv('../data/cpg-stats.csv', sep=';')
corpus_stats.name = corpus_stats.name.map(translator_fn)
#corpus_stats.set_index('name', inplace=True)
corpus_stats.sort_values('num_tokens', inplace=True)
corpus_stats.reset_index(inplace=True)
corpus_stats[['name', 'num_docs', 'num_recommendations', 'num_sentences', 'num_tokens', 'num_types', 'num_litref' ]]

Unnamed: 0,name,num_docs,num_recommendations,num_sentences,num_tokens,num_types,num_litref
0,Pancreatic cancer,292,158,854,18901,3602,1154
1,Penis cancer,167,94,960,20915,4542,561
2,Psycho-oncology,121,47,778,21909,4113,835
3,Oral cavity cancer,132,96,763,22256,3947,1172
4,Malignant ovarian tumors,195,97,1103,27432,5139,1035
5,Anal cancer,216,93,1248,34429,5246,724
6,Chronic lymphocytic leukemia,285,138,1417,36811,5680,726
7,Laryngeal cancer,189,118,1526,37374,6812,681
8,Follicular lymphoma,296,149,1537,38206,6344,761
9,Oesophageal cancer,172,91,1530,38574,6615,1026


# Annotation Statistics

In [6]:
ggponc_basepath = Path('..') / 'data' / 'annotations' / 'huggingface'

In [9]:
result_list = []

for level in ['value', 'detail']:
    granularity = 'fine' if level == 'detail' else 'coarse'
    tagset = [t.replace(' ', '_') for t in entity_values[level]]
    for spans in ['short', 'long']:
        folder = ggponc_basepath / granularity / spans
        train_file = folder / f'train_{granularity}_{spans}.json'
        dev_file = folder / f'dev_{granularity}_{spans}.json'
        test_file = folder / f'test_{granularity}_{spans}.json'

        dataset, tags = load_custom_dataset(
            train=train_file, dev=dev_file, test=test_file, tag_strings=tagset)

        for split in ['train', 'dev', 'test']:
            counts = defaultdict(lambda: 0)
            token_counts = defaultdict(lambda: 0)
            
            n_sentences = len(set(list(zip(dataset[split]['fname'], dataset[split]['sentence_id']))))
            n_files = len(set(dataset[split]['fname']))
            
            cur_token_length = 0
            
            for tags in dataset[split]["tags"]:
                for tag in tags:
                    if tag.startswith('B-'):
                        tag_name = tag.replace('B-', '').replace('_', ' ')
                        counts[tag_name] += 1
                        counts['Total'] += 1
                        if cur_token_length > 0:
                            token_counts[tag_name] += cur_token_length
                            token_counts['Total'] += cur_token_length
                        cur_token_length = 1
                    elif tag.startswith('I'):
                        assert cur_token_length > 0
                        cur_token_length += 1
                    elif tag.startswith('O'):
                        counts['O'] += 1
                        if cur_token_length > 0:
                            token_counts[tag_name] += cur_token_length
                            token_counts['Total'] += cur_token_length
                            cur_token_length = 0
            for k, v in counts.items():
                result_list.append(
                    {'tag' : k, 
                     'count' : v,
                     'token_count' : token_counts[k],
                     'split' : split.capitalize(), 
                     'level' : granularity.capitalize(), 
                     'spans' : spans.capitalize(),
                     'n_sentences' : n_sentences,
                     'n_files' : n_files})



Downloading and preparing dataset json/default to /dhc/home/florian.borchert/.cache/huggingface/datasets/json/default-f88b539936a9b6f0/0.0.0/d75ead8d5cfcbe67495df0f89bd262f0023257fbbbd94a730313295f3d756d50...


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /dhc/home/florian.borchert/.cache/huggingface/datasets/json/default-f88b539936a9b6f0/0.0.0/d75ead8d5cfcbe67495df0f89bd262f0023257fbbbd94a730313295f3d756d50. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/23528 [00:00<?, ?ex/s]

  0%|          | 0/4655 [00:00<?, ?ex/s]

  0%|          | 0/4826 [00:00<?, ?ex/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]



Downloading and preparing dataset json/default to /dhc/home/florian.borchert/.cache/huggingface/datasets/json/default-421447b3dd2b2c0c/0.0.0/d75ead8d5cfcbe67495df0f89bd262f0023257fbbbd94a730313295f3d756d50...


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /dhc/home/florian.borchert/.cache/huggingface/datasets/json/default-421447b3dd2b2c0c/0.0.0/d75ead8d5cfcbe67495df0f89bd262f0023257fbbbd94a730313295f3d756d50. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/23528 [00:00<?, ?ex/s]

  0%|          | 0/4655 [00:00<?, ?ex/s]

  0%|          | 0/4826 [00:00<?, ?ex/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]



Downloading and preparing dataset json/default to /dhc/home/florian.borchert/.cache/huggingface/datasets/json/default-aa369a42bd5d02d6/0.0.0/d75ead8d5cfcbe67495df0f89bd262f0023257fbbbd94a730313295f3d756d50...


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /dhc/home/florian.borchert/.cache/huggingface/datasets/json/default-aa369a42bd5d02d6/0.0.0/d75ead8d5cfcbe67495df0f89bd262f0023257fbbbd94a730313295f3d756d50. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/23528 [00:00<?, ?ex/s]

  0%|          | 0/4655 [00:00<?, ?ex/s]

  0%|          | 0/4826 [00:00<?, ?ex/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]



Downloading and preparing dataset json/default to /dhc/home/florian.borchert/.cache/huggingface/datasets/json/default-e752de27ac5fb655/0.0.0/d75ead8d5cfcbe67495df0f89bd262f0023257fbbbd94a730313295f3d756d50...


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /dhc/home/florian.borchert/.cache/huggingface/datasets/json/default-e752de27ac5fb655/0.0.0/d75ead8d5cfcbe67495df0f89bd262f0023257fbbbd94a730313295f3d756d50. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/23528 [00:00<?, ?ex/s]

  0%|          | 0/4655 [00:00<?, ?ex/s]

  0%|          | 0/4826 [00:00<?, ?ex/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [10]:
results = pd.DataFrame(result_list).set_index(['tag', 'level', 'spans', 'split'])
results = results.unstack(['spans', 'split']).sort_index(axis=1, key=sort_fn).sort_index(axis=0, key=sort_fn)

In [11]:
token_lengths = results['token_count'].groupby('spans', axis=1).sum() / results['count'].groupby('spans', axis=1).sum()
token_lengths = pd.concat({'Tokens / Mention': pd.concat({'count': token_lengths}, axis=1)}, names=['split'], axis=1)
token_lengths.columns = token_lengths.columns.reorder_levels(results.columns.names)

In [12]:
results = pd.concat([token_lengths, results], axis=1).sort_index(axis=1, key=sort_fn).sort_index(axis=0, key=sort_fn)

In [13]:
entity_counts = results['count'].droplevel('level')

In [14]:
pd.options.display.float_format = '{:,.1f}'.format

In [31]:
entity_counts['Short']

split,Tokens / Mention,Train,Dev,Test
tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Finding,1.1,38923,7731,8196
Diagnosis or Pathology,1.1,23613,4604,5119
Other Finding,1.1,15310,3127,3077
Substance,1.2,5260,1254,899
Clinical Drug,1.17,4123,964,716
Nutrient or Body Substance,1.29,1002,233,149
External Substance,1.47,135,57,34
Procedure,1.06,27377,5732,5631
Therapeutic,1.03,17498,3273,3607
Diagnostic,1.12,9879,2459,2024


Total number of annotations (short)

In [40]:
entity_counts['Short'].loc['Total', ['Train', 'Dev', 'Test']].iloc[0].sum()

101003

In [None]:
entity_counts['Long']

Total number of annotations (long)

In [43]:
entity_counts['Long'].loc['Total', ['Train', 'Dev', 'Test']].iloc[0].sum()

81052

In [None]:
entity_counts['Short'].loc['Total', ['Train', 'Dev', 'Test']].iloc[0].sum()

In [17]:
# Number of files
results['n_files'].iloc[0].loc['Short']

split
Train    3350
Dev       696
Test      679
Name: (Finding, Coarse), dtype: int64

In [18]:
# Number of sentences
results['n_sentences'].iloc[0].loc['Long']

split
Train    23528
Dev       4655
Test      4826
Name: (Finding, Coarse), dtype: int64

In [21]:
# Make sure the sums align for short spans (they cannot for long spans)
for split in ['Train', 'Dev', 'Test']:
    c_short = results[('count', 'Short', split)]
    assert c_short.loc['O']['Coarse'] == c_short.loc['O']['Fine']
    assert c_short.loc['Finding']['Coarse'] == c_short.loc['Diagnosis or Pathology']['Fine'] + c_short.loc['Other Finding']['Fine']
    assert c_short.loc['Substance']['Coarse'] == c_short.loc['Clinical Drug']['Fine'] + c_short.loc['Nutrient or Body Substance']['Fine'] + c_short.loc['External Substance']['Fine']
    assert c_short.loc['Procedure']['Coarse'] == c_short.loc['Therapeutic']['Fine'] + c_short.loc['Diagnostic']['Fine']

# Inter-annotator Agreement

__Note:__ Here we just analyze the IAA results.

Use `python ../gamma_agreement.py <zip file>` for the actual IAA calculation using inception-analytics and pygamma-agreement .

As the calculation of gamma agreement becomes very time-consuming with increasing number of annotators, the results are provided with the release in the `data` folder.

Time to compute IAA using inception-analytics and pygamma-agreement:
- Phase 1a:    185 s =    3 min = 0.05h
- Phase 1b: 282532 s = 4709 min = __78 h__
- Phase 2 : 200935 s = 3349 min = 56 h
- Phase 3 :  78419 s = 1307 min = 22 h

In [22]:
agreement_results_path = Path('../data/annotations/agreement_sets/results/')
agreement = pd.concat([pd.read_csv(f, index_col=0) for f in agreement_results_path.glob('*_gamma.csv')])

### Number of annotated sentences and docs used for IAA calculation (excluding sentences without any annotations)

In [23]:
agreement.groupby('file')[['n_docs', 'n_sentences', 'n_anno']].max()

Unnamed: 0_level_0,n_docs,n_sentences,n_anno
file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
phase_1a.zip,5,129,3
phase_1b.zip,5,132,7
phase_2.zip,6,147,7
phase_3.zip,3,61,7


### Total number of sentences and docs:

In [6]:
!python -m spacy download de_core_news_sm

Collecting de-core-news-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.2.0/de_core_news_sm-3.2.0-py3-none-any.whl (19.1 MB)
[K     |████████████████████████████████| 19.1 MB 3.4 MB/s eta 0:00:01
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')


In [24]:
nlp = spacy.load('de_core_news_sm')

In [25]:
stats = []

for folder in Path('../data/annotations/agreement_sets/input/').glob(r'phase_?'):
    txt_files = list(folder.glob('*.txt'))
    n_sentences = 0
    n_tokens = 0
    for f in txt_files:
        with open(f) as txt_file:
            sentences = txt_file.readlines()
            n_sentences += len(sentences)
            n_tokens += sum([len(nlp(s)) for s in sentences])
    
    stats.append({
        'file' : folder.name,
        'n_docs': len(txt_files),
        'n_sentences_total': n_sentences,
        'n_tokens_total': n_tokens
    })
stats = pd.DataFrame(stats).set_index('file').sort_index()
stats

Unnamed: 0_level_0,n_docs,n_sentences_total,n_tokens_total
file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
phase_1,5,149,4206
phase_2,6,158,3725
phase_3,3,67,1814


IAA results

In [30]:
pd.options.display.float_format = '{:,.2f}'.format
agreement.replace({'detail' : 'fine', 'value' : 'coarse'}).set_index(['file', 'feature', 'label']).unstack('file').gamma.sort_index(ascending=True, key=sort_fn).fillna('-')

Unnamed: 0_level_0,file,phase_1a.zip,phase_1b.zip,phase_2.zip,phase_3.zip
feature,label,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
coarse,all,0.75,0.89,0.93,0.94
coarse,Finding,0.82,0.93,0.95,0.97
coarse,Substance,0.92,0.99,0.98,0.99
coarse,Procedure,0.82,0.93,0.96,0.96
coarse,Specification,0.71,0.87,0.91,0.89
fine,all,-,0.88,0.92,0.93
fine,Diagnosis or Pathology,-,0.91,0.94,0.96
fine,Other Finding,-,0.85,0.87,0.91
fine,Clinical Drug,-,0.97,0.98,1.0
fine,Nutrient or Body Substance,-,0.99,0.99,0.98
