In [1]:
import sys
# Put the project's helper folders on the import path. The paths are relative,
# so they are resolved against the notebook's working directory.
# NOTE(review): presumably these folders provide huggingface_utils,
# convert_annotations and stats_util imported below - confirm.
sys.path.insert(0, '../util/')
sys.path.insert(1, '../experiments/')

In [2]:
import pandas as pd
from pathlib import Path
import spacy
import re
from collections import defaultdict
from huggingface_utils import load_custom_dataset
from convert_annotations import entity_values
from stats_util import sort_fn, sort_order
import datasets
datasets.logging.set_verbosity_error()

# Corpus Statistics

In [3]:
# Lookup table indexed by the German guideline name.
translations = pd.read_csv('../data/guideline_translations.csv')
translations = translations.set_index('german_name')

def translator_fn(name):
    """Return the English name stored for a German guideline name.

    The label 'Sum' is not looked up and is returned unchanged.
    """
    if name != 'Sum':
        return translations.loc[name, 'english_name']
    return name

In [4]:
# Load the per-guideline statistics, translate the guideline names and order
# the rows by their number of tokens.
corpus_stats = (
    pd.read_csv('../data/cpg-stats.csv', sep=';')
    .assign(name=lambda frame: frame['name'].map(translator_fn))
    .sort_values('num_tokens')
    .reset_index()
)
displayed_columns = ['name', 'num_docs', 'num_recommendations', 'num_sentences', 'num_tokens', 'num_types', 'num_litref']
corpus_stats[displayed_columns]

Unnamed: 0,name,num_docs,num_recommendations,num_sentences,num_tokens,num_types,num_litref
0,Pancreatic cancer,292,158,854,18901,3602,1154
1,Penis cancer,167,94,960,20915,4542,561
2,Psycho-oncology,121,47,778,21909,4113,835
3,Oral cavity cancer,132,96,763,22256,3947,1172
4,Malignant ovarian tumors,195,97,1103,27432,5139,1035
5,Anal cancer,216,93,1248,34429,5246,724
6,Chronic lymphocytic leukemia,285,138,1417,36811,5680,726
7,Laryngeal cancer,189,118,1526,37374,6812,681
8,Follicular lymphoma,296,149,1537,38206,6344,761
9,Oesophageal cancer,172,91,1530,38574,6615,1026


# Annotation Statistics

We use the IOB-encoded tags from the converted HuggingFace dataset to obtain statistics on the entity annotations.

In [5]:
# Root folder of the converted HuggingFace annotation files; the loading cell
# below appends the granularity and span type to it.
ggponc_basepath = Path('..') / 'data' / 'annotations' / 'huggingface'

In [6]:
def _count_mentions(tag_sequences):
    """Count entity mentions and their token lengths in IOB tag sequences.

    Parameters
    ----------
    tag_sequences : iterable of iterables of str
        One sequence of IOB tags ('B-<name>', 'I-<name>', 'O') per sentence.

    Returns
    -------
    (counts, token_counts) : tuple of two defaultdict(int)
        ``counts`` maps each tag name (underscores replaced by spaces) to its
        number of mentions, 'Total' to the number of all mentions and 'O' to
        the number of 'O' tokens. ``token_counts`` maps each tag name and
        'Total' to the summed token length of the corresponding mentions.
    """
    counts = defaultdict(int)
    token_counts = defaultdict(int)

    open_tag_name = None  # tag name of the mention currently being read
    open_length = 0       # number of tokens seen so far in that mention

    def close_mention():
        """Add the currently open mention (if any) to the token counts."""
        nonlocal open_tag_name, open_length
        if open_length > 0:
            token_counts[open_tag_name] += open_length
            token_counts['Total'] += open_length
        open_tag_name, open_length = None, 0

    for sentence_tags in tag_sequences:
        for tag in sentence_tags:
            if tag.startswith('B-'):
                # Close the previous mention *before* switching to the new tag
                # name, so that its length is credited to its own tag.
                close_mention()
                open_tag_name = tag.replace('B-', '').replace('_', ' ')
                open_length = 1
                counts[open_tag_name] += 1
                counts['Total'] += 1
            elif tag.startswith('I'):
                assert open_length > 0
                open_length += 1
            elif tag.startswith('O'):
                counts['O'] += 1
                close_mention()
    # A mention that is still open after the very last tag must be counted, too.
    close_mention()
    return counts, token_counts


result_list = []

for level in ['value', 'detail']:
    granularity = 'fine' if level == 'detail' else 'coarse'
    tagset = [t.replace(' ', '_') for t in entity_values[level]]
    for spans in ['short', 'long']:
        folder = ggponc_basepath / granularity / spans
        train_file = folder / f'train_{granularity}_{spans}.json'
        dev_file = folder / f'dev_{granularity}_{spans}.json'
        test_file = folder / f'test_{granularity}_{spans}.json'

        dataset, tags = load_custom_dataset(
            train=train_file, dev=dev_file, test=test_file, tag_strings=tagset)

        for split in ['train', 'dev', 'test']:
            split_data = dataset[split]

            # A sentence is identified by its file name plus its sentence id.
            n_sentences = len(set(zip(split_data['fname'], split_data['sentence_id'])))
            n_files = len(set(split_data['fname']))

            counts, token_counts = _count_mentions(split_data['tags'])

            # One record per tag; the split-level sizes are repeated on each record.
            for k, v in counts.items():
                result_list.append(
                    {'tag' : k,
                     'count' : v,
                     'token_count' : token_counts[k],
                     'split' : split.capitalize(),
                     'level' : granularity.capitalize(),
                     'spans' : spans.capitalize(),
                     'n_sentences' : n_sentences,
                     'n_files' : n_files})

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
# One row per (tag, level); span type and split are moved into the columns.
# Both axes are ordered with the project's sort_fn.
results = (
    pd.DataFrame(result_list)
    .set_index(['tag', 'level', 'spans', 'split'])
    .unstack(['spans', 'split'])
    .sort_index(axis=1, key=sort_fn)
    .sort_index(axis=0, key=sort_fn)
)

### Calculate token lengths per mention

In [8]:
# Average mention length per span type: token counts and mention counts are
# each summed over all splits and then divided.
# The frames are transposed before grouping because DataFrame.groupby(axis=1)
# is deprecated in recent pandas releases; grouping the transposed frame by the
# 'spans' level and transposing back gives the same table.
token_sums = results['token_count'].T.groupby(level='spans').sum().T
mention_sums = results['count'].T.groupby(level='spans').sum().T
token_lengths = token_sums / mention_sums
# Wrap the result in the same three column levels as `results`, so that it shows
# up as an extra 'Tokens / Mention' entry next to the splits of the 'count' columns.
token_lengths = pd.concat({'Tokens / Mention': pd.concat({'count': token_lengths}, axis=1)}, names=['split'], axis=1)
token_lengths.columns = token_lengths.columns.reorder_levels(results.columns.names)

results = pd.concat([token_lengths, results], axis=1).sort_index(axis=1, key=sort_fn).sort_index(axis=0, key=sort_fn)

### Show results for short spans

In [9]:
# Show floats with one decimal place and a thousands separator.
pd.set_option('display.float_format', '{:,.1f}'.format)
# Keep the 'count' columns only and drop the granularity level from the row index.
entity_counts = results['count'].droplevel(level='level')

In [10]:
# Mention counts per split and tokens per mention for the short spans.
entity_counts['Short']

split,Tokens / Mention,Train,Dev,Test
tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Finding,1.1,74547,15811,17510
Diagnosis or Pathology,1.1,43970,9439,10485
Other Finding,1.1,30577,6372,7025
Substance,1.2,12690,2745,2576
Clinical Drug,1.2,9728,1965,1864
Nutrient or Body Substance,1.3,2426,612,568
External Substance,1.2,536,168,144
Procedure,1.1,51005,10599,11596
Therapeutic,1.0,34808,6785,8235
Diagnostic,1.1,16197,3814,3361


#### Total number of annotations (short)

In [11]:
# Take the first 'Total' row of the short spans and add up its three splits.
short_totals = entity_counts['Short'].loc['Total', ['Train', 'Dev', 'Test']]
short_totals.iloc[0].sum()

199079

### Show results for long spans

In [13]:
# Mention counts per split and tokens per mention for the long spans.
entity_counts['Long']

split,Tokens / Mention,Train,Dev,Test
tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Finding,2.6,59140,12495,13992
Diagnosis or Pathology,2.4,32893,7069,7887
Other Finding,2.7,26462,5459,6162
Substance,1.9,9193,2025,1869
Clinical Drug,2.0,7007,1412,1339
Nutrient or Body Substance,1.8,1780,484,434
External Substance,1.9,410,129,101
Procedure,2.4,44362,9252,10169
Therapeutic,2.4,30027,5855,7182
Diagnostic,2.5,14670,3467,3044


#### Total number of annotations (long)

In [14]:
# Take the first 'Total' row of the long spans and add up its three splits.
long_totals = entity_counts['Long'].loc['Total', ['Train', 'Dev', 'Test']]
long_totals.iloc[0].sum()

162497

#### Number of files per split

In [15]:
# The file count of a split is stored on every record, so reading it from the
# first row (short spans) is sufficient.
results['n_files'].iloc[0].loc['Short']

split
Train    5616
Dev      1179
Test     1213
Name: (Finding, Coarse), dtype: int64

#### Number of sentences per split

In [16]:
# The sentence count of a split is stored on every record, so reading it from
# the first row (long spans) is sufficient.
results['n_sentences'].iloc[0].loc['Long']

split
Train    46291
Dev       9685
Test     10743
Name: (Finding, Coarse), dtype: int64

#### Sanity check: make sure the sums align for short spans (they cannot for long spans due to partial subsumption)

In [17]:
# For short spans, each coarse class must have exactly as many mentions as its
# fine-grained sub-classes together, and the number of 'O' tokens must be the
# same at both levels.
for split in ['Train', 'Dev', 'Test']:
    c_short = results[('count', 'Short', split)]

    assert c_short.loc['O']['Coarse'] == c_short.loc['O']['Fine']

    finding_fine = c_short.loc['Diagnosis or Pathology']['Fine'] + c_short.loc['Other Finding']['Fine']
    assert c_short.loc['Finding']['Coarse'] == finding_fine

    substance_fine = c_short.loc['Clinical Drug']['Fine'] + c_short.loc['Nutrient or Body Substance']['Fine'] + c_short.loc['External Substance']['Fine']
    assert c_short.loc['Substance']['Coarse'] == substance_fine

    procedure_fine = c_short.loc['Therapeutic']['Fine'] + c_short.loc['Diagnostic']['Fine']
    assert c_short.loc['Procedure']['Coarse'] == procedure_fine

# Inter-annotator Agreement

__Note:__ Here we just analyze the IAA results.

Use `python ../gamma_agreement.py <zip file>` for the actual IAA calculation using inception-analytics and pygamma-agreement.

As the calculation of gamma agreement becomes very time-consuming with an increasing number of annotators, the results are provided with the release in the `data` folder.

Time to compute IAA using inception-analytics and pygamma-agreement:
- Phase 1a:    185 s =    3 min = 0.05 h
- Phase 1b: 282532 s = 4709 min = __78 h__
- Phase 2 : 200935 s = 3349 min = 56 h
- Phase 3 :  78419 s = 1307 min = 22 h

In [18]:
# Read every precomputed '*_gamma.csv' result file and stack them into one table.
agreement_results_path = Path('../data/annotations/agreement_sets/results/')
gamma_frames = [
    pd.read_csv(gamma_file, index_col=0)
    for gamma_file in agreement_results_path.glob('*_gamma.csv')
]
agreement = pd.concat(gamma_frames)

### Number of annotated sentences and docs used for IAA calculation (excluding sentences without any annotations)

In [19]:
# Largest value of each size column per agreement file.
size_columns = ['n_docs', 'n_sentences', 'n_anno']
agreement.groupby('file')[size_columns].max()

Unnamed: 0_level_0,n_docs,n_sentences,n_anno
file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
phase_1a.zip,5,129,3
phase_1b.zip,5,132,7
phase_2.zip,6,147,7
phase_3.zip,3,61,7


### Total number of sentences and docs in the agreement sets

In [20]:
!python -m spacy download de_core_news_sm

Collecting de-core-news-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.2.0/de_core_news_sm-3.2.0-py3-none-any.whl (19.1 MB)
[K     |████████████████████████████████| 19.1 MB 3.5 MB/s eta 0:00:01    |▎                               | 153 kB 3.5 MB/s eta 0:00:06
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')


In [21]:
# German spaCy pipeline; used in the next cell to count the tokens per sentence.
nlp = spacy.load('de_core_news_sm')

In [22]:
# Count documents, sentences and tokens of every agreement-set input folder.
stats_records = []

for folder in Path('../data/annotations/agreement_sets/input/').glob(r'phase_?'):
    txt_files = list(folder.glob('*.txt'))
    n_sentences = 0
    n_tokens = 0
    for f in txt_files:
        # Read with an explicit encoding so that the German text is decoded the
        # same way on every platform instead of with the locale's default.
        # NOTE(review): assumes the input files are UTF-8 encoded - confirm.
        with open(f, encoding='utf-8') as txt_file:
            # Every line of a file is counted as one sentence.
            sentences = txt_file.readlines()
        n_sentences += len(sentences)
        # NOTE(review): the lines keep their trailing newline; check whether
        # spaCy counts it as an additional token.
        n_tokens += sum(len(nlp(s)) for s in sentences)

    stats_records.append({
        'file' : folder.name,
        'n_docs': len(txt_files),
        'n_sentences_total': n_sentences,
        'n_tokens_total': n_tokens
    })
stats = pd.DataFrame(stats_records).set_index('file').sort_index()
stats

Unnamed: 0_level_0,n_docs,n_sentences_total,n_tokens_total
file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
phase_1,5,149,4206
phase_2,6,158,3725
phase_3,3,67,1814


### IAA results for all entity classes

In [23]:
# Show the gamma values with two decimal places.
pd.options.display.float_format = '{:,.2f}'.format
# One row per (feature, label), one column per agreement file; combinations
# without a gamma value are displayed as '-'.
gamma_by_file = (
    agreement
    .replace({'detail' : 'fine', 'value' : 'coarse'})
    .set_index(['file', 'feature', 'label'])
    .unstack('file')
    .gamma
)
gamma_by_file.sort_index(ascending=True, key=sort_fn).fillna('-')

Unnamed: 0_level_0,file,phase_1a.zip,phase_1b.zip,phase_2.zip,phase_3.zip
feature,label,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
coarse,all,0.75,0.89,0.93,0.94
coarse,Finding,0.82,0.93,0.95,0.97
coarse,Substance,0.92,0.99,0.98,0.99
coarse,Procedure,0.82,0.93,0.96,0.96
coarse,Specification,0.71,0.87,0.91,0.89
fine,all,-,0.88,0.92,0.93
fine,Diagnosis or Pathology,-,0.91,0.94,0.96
fine,Other Finding,-,0.85,0.87,0.91
fine,Clinical Drug,-,0.97,0.98,1.0
fine,Nutrient or Body Substance,-,0.99,0.99,0.98
