In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('../util')
sys.path.append('../experiments')

In [3]:
from convert_annotations import read_webanno, resolve_ellipses, webanno_to_iob_df, EMPTY_REGEX
from pathlib import Path
from datetime import datetime
import pandas as pd

In [4]:
ignore_documents = [
    '00_mundhoehlenkarzinom_0059.tsv',
    '00_mundhoehlenkarzinom_0071.tsv',
    '05_zervixkarzinom_0070.tsv',
    '11_hepatozellulaeres-karzinom-und-biliaere-karzinome_0179.tsv',
    '15_endometriumkarzinom_0104.tsv',
    '28_komplementaermedizin_0115.tsv',
    '03_lungenkarzinom_0471.tsv',
    '05_zervixkarzinom_0070.tsv',
    '28_komplementaermedizin_0018.tsv'
    '20_harnblasenkarzinom_0294.tsv',
    '20_harnblasenkarzinom_0180.tsv',
    
]

In [5]:
%%time
tsv_folder = '../../data_ggponc/output_ellipses/'

webanno_df, sentences = read_webanno([f for f in Path(tsv_folder).glob('*.tsv') if not f.name in ignore_documents])

CPU times: user 16.2 s, sys: 507 ms, total: 16.7 s
Wall time: 35 s


In [6]:
%%time

import warnings

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    iob_df = webanno_to_iob_df(webanno_df, 'detail', True, debug=False, collect_errors=False, skip_errors=True, all_columns=True)
    iob_df['id'] = iob_df.entity_id.where(~iob_df.entity_id.isna(), iob_df.spec_id)

 43%|█████████████████████████████████████████▊                                                       | 21322/49500 [04:06<06:12, 75.72it/s]ERROR:convert_annotations:14_larynxkarzinom_0123.tsv, 71, ('Specification[69]|*[70]|Procedure[71]', '*[70]')
100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 49500/49500 [09:29<00:00, 86.97it/s]


CPU times: user 9min 48s, sys: 2.54 s, total: 9min 51s
Wall time: 9min 49s


In [7]:
# Anything containing a prefix, suffix or fragment is potentially an elliptical construct
ellipses = iob_df[~iob_df.prefix.str.match(EMPTY_REGEX) | ~iob_df.suffix.str.match(EMPTY_REGEX) | ~iob_df.fragment.str.match(EMPTY_REGEX)].copy()
len(ellipses)

8029

In [8]:
ellipses.fragment

file                                   sentence_id
00_mundhoehlenkarzinom_0002.tsv        1                   1-12
                                       2                    2-6
                                       5                      *
                                       5                      *
00_mundhoehlenkarzinom_0005.tsv        12             12-11[23]
                                                        ...    
29_adulte-weichgewebesarkome_0397.tsv  6                      *
29_adulte-weichgewebesarkome_0403.tsv  1                      *
                                       1                      *
29_adulte-weichgewebesarkome_0404.tsv  6                    6-6
29_adulte-weichgewebesarkome_0405.tsv  3                   3-36
Name: fragment, Length: 8029, dtype: object

In [9]:
ellipses[~ellipses.fragment.str.match(EMPTY_REGEX)]

Unnamed: 0_level_0,Unnamed: 1_level_0,token_id,ts_id,span,token,value,detail,specified_by,prefix,suffix,fragment,value_entity_id,value_entity_class,value_specification_id,detail_entity_id,detail_entity_class,detail_specification_id,spec_id,entity_id,output,id
file,sentence_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00_mundhoehlenkarzinom_0002.tsv,1,9,1-9,81-86,Tabak,Finding,Diagnosis or Pathology,1-8,*,abusus,1-12,-1.0,Finding,,11.0,Diagnosis or Pathology,,,14.0,I-Diagnosis_or_Pathology,14.0
00_mundhoehlenkarzinom_0002.tsv,2,3,2-3,166-171,Tabak,Finding,Diagnosis or Pathology,2-2,*,abusus,2-6,-1.0,Finding,,2.0,Diagnosis or Pathology,,,5.0,I-Diagnosis_or_Pathology,5.0
00_mundhoehlenkarzinom_0005.tsv,12,9,12-9,2627-2634,HPV-RNA,Procedure,Diagnostic,_,*,\_Nachweis,12-11[23],-1.0,Procedure,,32.0,Diagnostic,,,32.0,B-Diagnostic,32.0
00_mundhoehlenkarzinom_0005.tsv,18,4,18-4,3624-3629,HPV16,Substance[35],Nutrient or Body Substance[35],18-9[37_35],*[35],\_Antikörpern[35],18-7[36],35.0,Substance,,35.0,Nutrient or Body Substance,,,35.0,B-Nutrient_or_Body_Substance,35.0
00_mundhoehlenkarzinom_0005.tsv,18,5,18-5,3630-3632,E6,Substance[35],Nutrient or Body Substance[35],_,*[35],\_Antikörpern[35],18-7[36],35.0,Substance,,35.0,Nutrient or Body Substance,,,35.0,I-Nutrient_or_Body_Substance,35.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29_adulte-weichgewebesarkome_0392.tsv,2,4,2-4,182-188,Lungen,Finding,Diagnosis or Pathology,_,*,metastasen,2-7,-1.0,Finding,,9.0,Diagnosis or Pathology,,,9.0,B-Diagnosis_or_Pathology,9.0
29_adulte-weichgewebesarkome_0395.tsv,5,9,5-9,614-617,MRT,Procedure,Diagnostic,_,Ganzkörper-,*,5-6,-1.0,Procedure,,8.0,Diagnostic,,,8.0,B-Diagnostic,8.0
29_adulte-weichgewebesarkome_0395.tsv,26,18,26-18,3439-3452,Radiofrequenz,Procedure,Therapeutic,_,*,ablation,26-21,-1.0,Procedure,,17.0,Therapeutic,,,17.0,B-Therapeutic,17.0
29_adulte-weichgewebesarkome_0404.tsv,6,3,6-3,594-605,Zweitlinien,Procedure,Therapeutic,_,*,therapie,6-6,-1.0,Procedure,,2.0,Therapeutic,,,2.0,B-Therapeutic,2.0


In [10]:
%%time
results = resolve_ellipses(ellipses, iob_df)

CPU times: user 6min 26s, sys: 218 ms, total: 6min 27s
Wall time: 6min 27s


In [11]:
results

Unnamed: 0,file,sentence_id,full_sentence,span_index_start,span_index_end,full_span,offsets,resolution,fragment,missing_prefix,missing_suffix
0,00_mundhoehlenkarzinom_0002.tsv,1,"[Hauptrisikofaktoren, für, das, Auftreten, ein...",7,11,"[chronischer, Tabak, -, oder, Alkoholabusus]","[[69, 80], [81, 86], [86, 87], [88, 92], [93, ...","[chronischer, Tabakabusus, , oder, Alkoholabusus]",True,False,True
1,00_mundhoehlenkarzinom_0002.tsv,2,"[Bei, chronischem, Tabak, -, oder, Alkoholabus...",1,5,"[chronischem, Tabak, -, oder, Alkoholabusus]","[[4, 15], [16, 21], [21, 22], [23, 27], [28, 41]]","[chronischem, Tabakabusus, , oder, Alkoholabusus]",True,False,True
2,00_mundhoehlenkarzinom_0002.tsv,5,"[Neben, dem, Konsum, von, Tabak, oder, Alkohol...",16,22,"[übermäßiger, Konsum, von, Fleisch, oder, gebr...","[[90, 101], [102, 108], [109, 112], [113, 120]...","[übermäßiger, Konsum, von, Fleisch, oder, von ...",False,True,False
3,00_mundhoehlenkarzinom_0005.tsv,12,"[Die, hohe, Diskrepanz, zwischen, p16, Immunhi...",8,11,"[HPV-RNA, und, HPV-DNA, Nachweis]","[[55, 62], [63, 66], [67, 74], [75, 83]]","[HPV-RNA Nachweis, und, HPV-DNA, Nachweis]",True,False,True
4,00_mundhoehlenkarzinom_0005.tsv,18,"[Die, Prävalenz, von, HPV16, E6, oder, E7, Ant...",3,9,"[HPV16, E6, oder, E7, Antikörpern, in, Mundhöh...","[[18, 23], [24, 26], [27, 31], [32, 34], [35, ...","[HPV16, E6 Antikörpern, oder, HPV16 E7, Antikö...",True,True,True
...,...,...,...,...,...,...,...,...,...,...,...
5913,29_adulte-weichgewebesarkome_0397.tsv,4,"[Sehr, viel, häufiger, wird, bei, Männern, unt...",18,22,"[Weichteiltumor, des, Skrotums, oder, Samenstr...","[[116, 130], [131, 134], [135, 143], [144, 148...","[Weichteiltumor, des, Skrotums, oder, des Same...",False,True,False
5914,29_adulte-weichgewebesarkome_0397.tsv,6,"[Klinisch, besteht, meist, eine, unilaterale, ...",4,13,"[unilaterale, ,, sich, langsam, vergrößernde, ...","[[28, 39], [39, 40], [41, 45], [46, 53], [54, ...","[unilaterale, ,, sich, langsam, vergrößernde, ...",False,True,False
5915,29_adulte-weichgewebesarkome_0403.tsv,1,"[Es, besteht, Forschungsbedarf, in, der, Kläru...",7,14,"[molekulare, Prognosemarker, für, Sarkome, ode...","[[53, 63], [64, 78], [79, 82], [83, 90], [91, ...","[molekulare, Prognosemarker, für, Sarkome, ode...",False,True,False
5916,29_adulte-weichgewebesarkome_0404.tsv,6,"[Bei, der, Zweitlinien, -, und, Folgetherapie,...",2,5,"[Zweitlinien, -, und, Folgetherapie]","[[8, 19], [19, 20], [21, 24], [25, 38]]","[Zweitlinientherapie, , und, Folgetherapie]",True,False,True


### Adding resolved sentences and metadata

In [12]:
sentence_idx = pd.DataFrame(sentences, columns=['raw_sentence'], index=webanno_df.index.drop_duplicates())

In [13]:
import math
clean_results = results.merge(right=sentence_idx, left_on=['file', 'sentence_id'], right_index=True)
clean_results['outer'] = math.nan

In [14]:
for _, k in clean_results[clean_results.duplicated(['file', 'sentence_id'])][['file', 'sentence_id']].iterrows():
    f = k['file']
    s = k['sentence_id']
    dups = clean_results[(clean_results.file == f) & (clean_results.sentence_id == s)]
    
    for i, inner in dups.iterrows():
        for j, outer in dups.iterrows():
            if i != j and inner.span_index_start >= outer.span_index_start and inner.span_index_end <= outer.span_index_end:
                if not (clean_results.loc[j, 'outer'] == i):
                    clean_results.loc[i, 'outer'] = j

In [15]:
clean_results = clean_results[clean_results.outer.isna()]

In [16]:
def resolve(row):
    out = {}
    for c in row.index:
        val = row.loc[c]
        if len(val) == 1:
            out['multi'] = False
            out[c] = val[0]
        else:
            out['multi'] = True
            if val.count(val[0]) == len(val) and c in ['full_sentence', 'raw_sentence']:
                out[c] = val[0]
            elif c in ['fragment', 'missing_prefix', 'missing_suffix']:
                out[c] = max(val)
            else:
                out[c] = val
    return pd.Series(out)

In [17]:
merged_results = clean_results.groupby(['file', 'sentence_id', 'fragment']).aggregate(tuple).applymap(list).apply(resolve, axis=1)[list(clean_results.columns.drop(['outer', 'file', 'sentence_id', 'fragment'])) + ['multi']].reset_index()

In [18]:
def get_full_res(row):
    sent = row.raw_sentence
    if row.multi:
        spans = [o for p in row.offsets for o in p]
        resolution = [r for p in row.resolution for r in p]
    else:
        spans = row.offsets
        resolution = row.resolution
    assert len(spans) == len(resolution), (spans, resolution)
    j = 0
    for span, res in zip(spans, resolution):
        start, end = span
        start += j
        end += j
        sent = sent[:start] + res + sent[end:]
        j += (start - end + len(res))
    return sent

In [19]:
merged_results['full_resolution'] = merged_results.apply(get_full_res, axis=1)

In [20]:
ellipses_sentence_count = len(merged_results[['file', 'sentence_id']].drop_duplicates())
sentence_count = 78090
ellipses_sentence_count, sentence_count, ellipses_sentence_count / sentence_count

(5111, 78090, 0.06545012165450122)

### Fragments

In [25]:
import random

def show_random(df, i=None):
    if i is None:
        i = random.randint(0, len(df) - 1)
    item = df.iloc[i]    
    assert len(item.full_span) == len(item.resolution)
    spans = []
    solutions = []
    marker = []
    if not item.multi:
        for span, res in zip(item.full_span, item.resolution):
            if span == res:
                marker.append(' ' * len(span))
            else:
                marker.append('^' * len(res))
            start_index = res.find(span)
            spans.append(span.rjust(start_index + len(span)).ljust(len(res) - start_index))
            solutions.append(res)
        print(' '.join(spans))
        print(' '.join(solutions))
        print(' '.join(marker))
        print(f'{item.file};{item.sentence_id};{" ".join(item.full_span)}', 'fragment:', item.fragment)
    else:
         print(f'{item.file};{item.sentence_id}')
    print(item.raw_sentence)
    print(item.full_resolution)
    print('...........')
    print('')

In [26]:
for _ in range(0, 10):
    show_random(merged_results[merged_results.multi & merged_results.fragment])

07_malignes-melanom_0098.tsv;19
In einer Studie von Krahn wurden verschiedene Tumormarker miteinander bei 373 Melanompatienten (284 Stadium I/II, 89 Stadium III/IV) verglichen.
In einer Studie von Krahn wurden verschiedene Tumormarker miteinander bei 373 Melanompatienten (284 Stadium I/Stadium II, 89 Stadium III/Stadium IV) verglichen.
...........

13_oesophaguskarzinom_0150.tsv;2
So konnte mit einer Platin- und Fluoropyrimidinbasierten Kombinationschemotherapie mit Docetaxel oder Epirubicin eine signifikante Verbesserung hinsichtlich des Überlebens, der Zeit bis zur Tumorprogression und ein Vorteil in der Lebensqualität gegenüber älteren Chemotherapie-Protokollen (FUP, FAMTX) nachgewiesen werden (DCF vs. FUP: Mediane Überlebenszeit 9,2 Monate vs. 8,6 Monate [p = 0,02] und progressionsfreies Überleben 5,6 Monate vs. 3,7 Monate [p < 0,001] sowie ECF vs. FAMTX: Mediane Überlebenszeit 8,9 Monate vs. 5,7 Monate [p = 0,0009] und FFS 7,4 Monate vs. 3,4 Monate [p = 0,00006]) [REF], [REF] Pati

In [27]:
for _ in range(0, 10):
    show_random(merged_results[merged_results.fragment])

Lymphomerkrankung und -        therapie
Lymphomerkrankung und  Lymphomtherapie
                       ^^^^^^^^^^^^^^^
19_follikulaeres-lymphom_0288.tsv;3;Lymphomerkrankung und - therapie fragment: True
Geriatrische Syndrome (z.B. Delir, Demenz, Depression, Inappetenz, Sarkopenie, Frailty, Stürze, Immobilität) können dabei unabhängig von der Lymphomerkrankung bestehen oder durch die Lymphomerkrankung und -therapie induziert bzw. aggraviert werden.
Geriatrische Syndrome (z.B. Delir, Demenz, Depression, Inappetenz, Sarkopenie, Frailty, Stürze, Immobilität) können dabei unabhängig von der Lymphomerkrankung bestehen oder durch die Lymphomerkrankung und Lymphomtherapie induziert bzw. aggraviert werden.
...........

Ernährungs          - und Stoffwechselstörungen
Ernährungsstörungen  und Stoffwechselstörungen
^^^^^^^^^^^^^^^^^^^                           
19_follikulaeres-lymphom_0310.tsv;3;Ernährungs - und Stoffwechselstörungen fragment: True
Zur Erkennung und gegebenenfalls multimodalen Beh

In [28]:
#ss = clean_results[~clean_results.fragment][clean_results[~clean_results.fragment].resolution.map(lambda r: any([t for t in r if '-' in t]))]

In [29]:
#for _ in range(0, 10):
#    show_random(ss)

In [30]:
for _ in range(0, 10):
    show_random(merged_results[~merged_results.fragment])

radiologisch komplettes und              partielles Ansprechen
radiologisch komplettes und radiologisch partielles Ansprechen
                            ^^^^^^^^^^^^^^^^^^^^^^^           
12_nierenzellkarzinom_0206.tsv;14;radiologisch komplettes und partielles Ansprechen fragment: False
Die objek­tive Ansprechrate (radiologisch komplettes und partielles Ansprechen) unterschied sich nicht zwischen den Behandlungsarmen (19 % [8 von 42] vs. 12 % [5 von 43]; p=0,38; Nx + IFN-α vs. IFN-α).
Die objek­tive Ansprechrate (radiologisch komplettes und radiologisch partielles Ansprechen) unterschied sich nicht zwischen den Behandlungsarmen (19 % [8 von 42] vs. 12 % [5 von 43]; p=0,38; Nx + IFN-α vs. IFN-α).
...........

radikalen Operation des Rektumkarzinoms mit TME und     tiefer Anastomose
radikalen Operation des Rektumkarzinoms mit TME und mit tiefer Anastomose
                                                    ^^^^^^^^^^           
02_kolorektales-karzinom_0261.tsv;1;radikalen Operation des

# Train / Dev / Test Splits

In [31]:
def create_export(subset, with_fragment=False):
    subset = subset.copy()
    join_fn = lambda x: ' '.join([xi.strip() for xi in x]) if type(x[0]) == str else '; '.join([' '.join([yi.strip() for yi in y]) for y in x])
    subset['text_whitespace_tokenized'] = subset.full_span.map(join_fn)
    subset['resolved_whitespace_tokenized'] = subset.resolution.map(join_fn)
    subset = subset[['file', 'sentence_id', 'text_whitespace_tokenized', 'resolved_whitespace_tokenized', 'raw_sentence', 'full_resolution'] + (['fragment'] if with_fragment else [])]
    return subset.set_index(['file', 'sentence_id'])

In [32]:
from sklearn.model_selection import train_test_split

In [33]:
fragments = create_export(merged_results[merged_results.fragment])

In [34]:
files = list(fragments.index.levels[0].unique())

In [35]:
splits = pd.read_csv('../data/ellipses/ellipses_splits.csv')
splits.head()

Unnamed: 0,file,split
0,00_mundhoehlenkarzinom_0002.tsv,test
1,00_mundhoehlenkarzinom_0005.tsv,train
2,00_mundhoehlenkarzinom_0028.tsv,train
3,00_mundhoehlenkarzinom_0032.tsv,dev
4,00_mundhoehlenkarzinom_0039.tsv,train


In [36]:
#train_ratio = 0.7
#dev_ratio = 0.15
#test_ratio = 0.15
#
#x_train, _x_test = train_test_split(files, test_size=1 - train_ratio, random_state=1)
#
#x_dev, x_test = train_test_split(_x_test, test_size=test_ratio/(test_ratio + dev_ratio), random_state=42) 
#
#print(len(x_train), len(x_dev), len(x_test))

In [37]:
fragments.loc[splits[splits.split == 'train'].file, 'split'] = 'train'
fragments.loc[splits[splits.split == 'dev'].file, 'split'] = 'dev'
fragments.loc[splits[splits.split == 'test'].file, 'split'] = 'test'
fragments.reset_index().to_csv('../../data_ggponc/ellipses/ggponc_ellipses_compounds.tsv', sep='\t', index=False)

In [39]:
# Should be empty
fragments[fragments.split.isna()]

Unnamed: 0_level_0,Unnamed: 1_level_0,text_whitespace_tokenized,resolved_whitespace_tokenized,raw_sentence,full_resolution,split
file,sentence_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1


In [40]:
x_train  = fragments[fragments.split == 'train'].reset_index().file.unique()
x_dev  = fragments[fragments.split == 'dev'].reset_index().file.unique()
x_test  = fragments[fragments.split == 'test'].reset_index().file.unique()

In [41]:
controls = sentence_idx[~sentence_idx.index.isin(clean_results.set_index(['file', 'sentence_id']).index)]
controls = controls.query('file in @files')
control_idx = set([t[0] for t in controls.index])

controls.loc[list(set(x_train).intersection(control_idx)), 'split'] = 'train'
controls.loc[list(set(x_dev).intersection(control_idx)), 'split'] = 'dev'
controls.loc[list(set(x_test).intersection(control_idx)), 'split'] = 'test'
controls.reset_index().to_csv('../../data_ggponc/ellipses/ggponc_no_ellipses_all.tsv', sep='\t', index=False)
controls.reset_index().sample(len(fragments), random_state=42).to_csv('../../data_ggponc/ellipses/ggponc_no_ellipses_small.tsv', sep='\t', index=False)

## Stats for Paper

In [56]:
def show_stats(files):

    frag_sents = merged_results[merged_results.fragment]
    if files is not None:
        frag_sents = frag_sents[frag_sents.file.isin(files)]
    frag_instances = clean_results[clean_results.fragment]
    if files is not None:
        frag_instances = frag_instances[frag_instances.file.isin(files)]

    print('Elliptical CCNFs:', len(frag_instances))
    print('Forward', len(frag_instances[frag_instances.missing_prefix & ~frag_instances.missing_suffix]))
    print('Backward', len(frag_instances[~frag_instances.missing_prefix & frag_instances.missing_suffix]))
    print('Complex', len(frag_instances[frag_instances.missing_prefix & frag_instances.missing_suffix]))

    print('Sentences with CCNFs:', len(frag_sents))
    print('Sentences with multiple CCNFs:', sum(frag_sents.multi))

In [57]:
show_stats(None)

Elliptical CCNFs: 4111
Forward 714
Backward 3228
Complex 169
Sentences with CCNFs: 3848
Sentences with multiple CCNFs: 241


In [62]:
print('>> Training')
show_stats(x_train)

print('\n\n>> Development')
show_stats(x_dev)

print('\n\n>> Test')
show_stats(x_test)

>> Training
Elliptical CCNFs: 2896
Forward 512
Backward 2271
Complex 113
Sentences with CCNFs: 2715
Sentences with multiple CCNFs: 167


>> Development
Elliptical CCNFs: 629
Forward 101
Backward 499
Complex 29
Sentences with CCNFs: 590
Sentences with multiple CCNFs: 39


>> Test
Elliptical CCNFs: 586
Forward 101
Backward 458
Complex 27
Sentences with CCNFs: 543
Sentences with multiple CCNFs: 35


#### All annotations covered by export?

In [42]:
fragments = clean_results[clean_results.fragment]

In [43]:
fragment_ix = set(fragments.set_index(['file', 'sentence_id']).index.drop_duplicates())
ellipses_ix = set(ellipses[~ellipses.fragment.str.match(EMPTY_REGEX)].index.drop_duplicates())

In [44]:
len(ellipses_ix - fragment_ix) == 0

True