In [1]:
# Reload imported modules before each cell execution so that edits to the
# project-local helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
# Make the project-local helper directories importable. The paths are relative,
# so they resolve against the working directory of the kernel.
sys.path.append('../util')
sys.path.append('../experiments')

In [3]:
from convert_annotations import read_webanno, resolve_ellipses, webanno_to_iob_df, EMPTY_REGEX
from pathlib import Path
from datetime import datetime
import pandas as pd

In [4]:
# File names that are excluded when the TSV folder is loaded.
# NOTE(review): the reason for excluding these two documents is not recorded here.
ignore_documents = [
    '00_mundhoehlenkarzinom_0059.tsv',
    '00_mundhoehlenkarzinom_0071.tsv'
]

In [5]:
%%time
tsv_folder = '../../data_ggponc/output_ellipses/'
#tsv_folder = '../../ggponc_v2_annotation/other_layers/ellipses_dev'

# Collect every *.tsv file in the folder except the explicitly ignored documents
# and hand the resulting list to read_webanno.
tsv_files = [
    tsv_path
    for tsv_path in Path(tsv_folder).glob('*.tsv')
    if tsv_path.name not in ignore_documents
]
webanno_df, sentences = read_webanno(tsv_files)

CPU times: user 10.3 s, sys: 631 ms, total: 10.9 s
Wall time: 13.1 s


In [6]:
%%time

import warnings

# Build the IOB frame; every warning raised inside the block is silenced.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    iob_df = webanno_to_iob_df(
        webanno_df,
        'detail',
        True,
        debug=False,
        collect_errors=False,
        skip_errors=True,
        all_columns=True,
    )
    # Unified identifier column: entity_id where it is set, otherwise spec_id.
    has_entity_id = iob_df.entity_id.notna()
    iob_df['id'] = iob_df.entity_id.where(has_entity_id, iob_df.spec_id)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 49720/49720 [18:11<00:00, 45.55it/s]


CPU times: user 18min 20s, sys: 9.56 s, total: 18min 30s
Wall time: 18min 28s


In [7]:
# Anything containing a prefix, suffix or fragment is potentially an elliptical construct.
# A column counts as "filled" when its value does not match EMPTY_REGEX.
prefix_filled = ~iob_df.prefix.str.match(EMPTY_REGEX)
suffix_filled = ~iob_df.suffix.str.match(EMPTY_REGEX)
fragment_filled = ~iob_df.fragment.str.match(EMPTY_REGEX)
ellipses = iob_df[prefix_filled | suffix_filled | fragment_filled].copy()

In [9]:
%%time
# Resolve the candidate rows; the complete IOB frame is passed alongside them.
results = resolve_ellipses(ellipses, iob_df)

  sentence = ellipses.loc[idx]
  full_sentence = iob_df.loc[idx]


CPU times: user 2min 7s, sys: 1.1 s, total: 2min 8s
Wall time: 2min 8s


### Adding resolved sentences and metadata

In [None]:
# Disabled manual override: when re-enabled it replaces the resolution of one
# specific sentence (looked up by file name and sentence id) with a fixed token list.
#idx = results[(results.file == '01_magenkarzinom_0090.tsv') & (results.sentence_id == 22)].index
#assert len(idx) == 1
#results.at[idx[0], 'resolution'] = ['Stadium', '2', 'und', 'Stadium 3']

In [10]:
# One raw sentence per distinct index entry of the annotation frame.
# NOTE(review): assumes `sentences` is ordered like the de-duplicated index of
# webanno_df — confirm against read_webanno.
sentence_keys = webanno_df.index.drop_duplicates()
sentence_idx = pd.DataFrame(sentences, index=sentence_keys, columns=['raw_sentence'])

In [11]:
import math

# Attach the raw sentence text to every resolved ellipsis via (file, sentence_id).
clean_results = results.merge(
    sentence_idx,
    left_on=['file', 'sentence_id'],
    right_index=True,
)
# Placeholder column; later filled with the index label of an enclosing span.
clean_results['outer'] = math.nan

In [12]:
# Mark rows whose token span lies inside the span of another row of the same sentence.
# Every row flagged by `duplicated` (i.e. each repeated (file, sentence_id) key) triggers
# one pass over all rows that share that key.
for _, k in clean_results[clean_results.duplicated(['file', 'sentence_id'])][['file', 'sentence_id']].iterrows():
    f = k['file']
    s = k['sentence_id']
    # Snapshot of all rows of this sentence; 'outer' itself is read from and written to clean_results.
    dups = clean_results[(clean_results.file == f) & (clean_results.sentence_id == s)]

    for i, inner in dups.iterrows():
        for j, outer in dups.iterrows():
            # `inner` is contained in (or identical to) `outer` when it starts no earlier and ends no later.
            if i != j and inner.span_index_start >= outer.span_index_start and inner.span_index_end <= outer.span_index_end:
                # Identical spans match in both directions; skip the assignment when row j
                # already points at row i so that the two rows do not mark each other.
                if not (clean_results.loc[j, 'outer'] == i):
                    clean_results.loc[i, 'outer'] = j

In [13]:
# Spot check: show all rows of one sentence together with their 'outer' marker.
clean_results[(clean_results.file == '04_praevention-zervixkarzinom_0255.tsv') & (clean_results.sentence_id == 12)]

Unnamed: 0,file,sentence_id,full_sentence,span_index_start,span_index_end,full_span,offsets,resolution,fragment,missing_prefix,missing_suffix,raw_sentence,outer
913,04_praevention-zervixkarzinom_0255.tsv,12,"[Ein, positiver, Absetzungsrand, zeigt, vergli...",18,22,"[CIN, 2, /, 3, Läsion]","[[116, 119], [120, 121], [121, 122], [122, 123...","[CIN, 2 Läsion, /, CIN 3, Läsion]",True,True,True,Ein positiver Absetzungsrand zeigt verglichen ...,914.0
914,04_praevention-zervixkarzinom_0255.tsv,12,"[Ein, positiver, Absetzungsrand, zeigt, vergli...",18,22,"[CIN, 2, /, 3, Läsion]","[[116, 119], [120, 121], [121, 122], [122, 123...","[CIN, 2 Läsion, /, CIN 3, Läsion]",True,True,True,Ein positiver Absetzungsrand zeigt verglichen ...,


In [14]:
# Keep only the rows that were not marked as contained in another row's span.
clean_results = clean_results[clean_results.outer.isna()]

In [15]:
def resolve(row):
    """Collapse one aggregated row, whose cells are lists, into plain values.

    Every cell of ``row`` is expected to be a list with one entry per grouped
    source row. The result additionally carries a ``multi`` flag.

    * A single-entry list is unwrapped and ``multi`` is set to ``False``.
    * Otherwise ``multi`` is ``True`` and
        - ``full_sentence`` / ``raw_sentence`` are reduced to their first entry
          when all entries are equal,
        - ``fragment`` / ``missing_prefix`` / ``missing_suffix`` are reduced
          with ``max``,
        - every other cell keeps the full list.

    Returns a ``pd.Series`` whose first key is ``multi``, followed by the
    original column names.
    """
    sentence_columns = ('full_sentence', 'raw_sentence')
    flag_columns = ('fragment', 'missing_prefix', 'missing_suffix')

    collapsed = {}
    for column, values in row.items():
        if len(values) == 1:
            collapsed['multi'] = False
            collapsed[column] = values[0]
            continue

        collapsed['multi'] = True
        all_identical = values.count(values[0]) == len(values)
        if all_identical and column in sentence_columns:
            collapsed[column] = values[0]
        elif column in flag_columns:
            collapsed[column] = max(values)
        else:
            collapsed[column] = values
    return pd.Series(collapsed)

In [16]:
# Collapse all rows of one (file, sentence_id, fragment) group into a single row.
# Each cell is first gathered into a list and then reduced by `resolve`.
# NOTE(review): `applymap` is deprecated in recent pandas releases in favour of `DataFrame.map`.
group_keys = ['file', 'sentence_id', 'fragment']
kept_columns = list(clean_results.columns.drop(['outer'] + group_keys)) + ['multi']
clean_results = (
    clean_results
    .groupby(group_keys)
    .aggregate(tuple)
    .applymap(list)
    .apply(resolve, axis=1)[kept_columns]
    .reset_index()
)

In [17]:
def get_full_res(row):
    """Return the raw sentence with every annotated span replaced by its resolution.

    Parameters
    ----------
    row
        Object with the attributes ``raw_sentence`` (str), ``multi`` (bool),
        ``offsets`` and ``resolution``. For ``multi`` rows, ``offsets`` and
        ``resolution`` are lists of per-ellipsis lists and are flattened first;
        otherwise they are used as they are. Each offset is a
        ``(start, end)`` pair of character positions in ``raw_sentence``.

    Returns
    -------
    str
        The sentence after all replacements have been applied.

    Raises
    ------
    AssertionError
        If the number of spans differs from the number of resolution strings.
    """
    if row.multi:
        spans = [span for group in row.offsets for span in group]
        resolution = [text for group in row.resolution for text in group]
    else:
        spans = row.offsets
        resolution = row.resolution
    assert len(spans) == len(resolution), (spans, resolution)

    # The running shift below is only valid when spans are handled left to right.
    # Flattened multi-ellipsis groups are not guaranteed to arrive in that order,
    # so sort by start offset (stable, hence a no-op for already ordered input).
    replacements = sorted(zip(spans, resolution), key=lambda pair: pair[0][0])

    sent = row.raw_sentence
    shift = 0  # accumulated length difference caused by earlier replacements
    for (start, end), res in replacements:
        start += shift
        end += shift
        sent = sent[:start] + res + sent[end:]
        shift += len(res) - (end - start)
    return sent

In [18]:
# Build the fully resolved sentence text for every row.
clean_results['full_resolution'] = clean_results.apply(get_full_res, axis=1)

In [19]:
import numpy as np

In [20]:
# Number of distinct (file, sentence_id) pairs that occur in `results`.
ellipsis_sentence_keys = results[['file', 'sentence_id']].drop_duplicates()
ellipses_sentence_count = len(ellipsis_sentence_keys)
# NOTE(review): hard-coded total; presumably the number of sentences in the whole
# corpus — TODO confirm or derive it from the loaded data.
sentence_count = 78090
ellipses_sentence_count, sentence_count, ellipses_sentence_count / sentence_count

(4609, 78090, 0.059021641695479576)

In [21]:
# Per category: total count, followed by the count restricted to rows flagged as fragment.
print('Ellipses', len(results))
lacks_prefix = results.missing_prefix
lacks_suffix = results.missing_suffix
category_masks = [
    ('Forward ellipses', lacks_prefix & ~lacks_suffix),
    ('Backward ellipses', ~lacks_prefix & lacks_suffix),
    ('Complex ellipses', lacks_prefix & lacks_suffix),
]
for label, mask in category_masks:
    print(label, len(results[mask]), len(results[results.fragment & mask]))

Ellipses 5187
Forward ellipses 1929 829
Backward ellipses 2955 2745
Complex ellipses 270 189


In [22]:
import random

def show_random(df):
    """Print one randomly chosen row of ``df`` for manual inspection.

    For rows that are not ``multi`` the original tokens (``full_span``) are
    printed above their resolutions, followed by a marker line that underlines
    every token whose resolution differs from the token itself, and a line with
    file, sentence id, span text and the fragment flag. For ``multi`` rows only
    file and sentence id are printed. In both cases the raw sentence and the
    fully resolved sentence follow.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``full_span``, ``resolution``, ``multi``, ``file``,
        ``sentence_id``, ``fragment``, ``raw_sentence`` and ``full_resolution``.
        An empty frame prints a short notice instead of raising.

    Raises
    ------
    AssertionError
        If ``full_span`` and ``resolution`` of the chosen row differ in length.
    """
    if len(df) == 0:
        # random.randint(0, -1) would fail with an unhelpful "empty range" error.
        print('No rows to show')
        return

    item = df.iloc[random.randint(0, len(df) - 1)]
    assert len(item.full_span) == len(item.resolution)

    if item.multi:
        print(f'{item.file};{item.sentence_id}')
    else:
        spans = []
        solutions = []
        marker = []
        for span, res in zip(item.full_span, item.resolution):
            marker.append(' ' * len(span) if span == res else '^' * len(res))
            # Position of the token inside its resolution; fall back to the left
            # edge when the token does not occur in it (find() returns -1).
            start_index = max(res.find(span), 0)
            # Indent the token to that position and pad it to the full width of
            # the resolution so that following columns stay aligned.
            spans.append((' ' * start_index + span).ljust(len(res)))
            solutions.append(res)
        print(' '.join(spans))
        print(' '.join(solutions))
        print(' '.join(marker))
        print(f'{item.file};{item.sentence_id};{" ".join(item.full_span)}', 'fragment:', item.fragment)
    print(item.raw_sentence)
    print(item.full_resolution)
    print('...........')
    print('')

In [23]:
# Print ten random samples of rows that are both multi and fragment.
multi_fragment_rows = clean_results[clean_results.multi & clean_results.fragment]
for _ in range(10):
    show_random(multi_fragment_rows)

27_supportive-therapie_0273.tsv;5
Rash und Dermatitis treten bei beiden EGFR-Antikörpern in gerundet 70 % Grad 1/2 bzw. in gerundet 10 % Grad 3/4 auf.
Rash und Dermatitis treten bei beiden EGFR-Antikörpern in gerundet 70 % Grad 1/Grad 2 bzw. in gerundet 10 % Grad 3/Grad 4 auf.
...........

09_mammakarzinom_0200.tsv;76
„Bei Frauen mit einem ER-/PgR-positiven, HER2-negativen, nodal-positiven (1-3 befallene Lymphknoten) primären invasiven Mammakarzinom, kann ein methodisch standardisierter und klinisch validierter Multigentest bei der Entscheidung gegen eine (neo-)adjuvante Chemotherapie herangezogen werden, wenn dieser ein niedriges Rückfallrisiko vorhersagt.“ Für dieses Statement fand sich allerdings keine Mehrheit, so dass es nur hier im Hintergrundtext dargestellt wird und für die nodal-positive Situation somit keine konsentierte Empfehlung formuliert werden kann.
„Bei Frauen mit einem ER-positiven/PgR-positiven, HER2-negativen, nodal-positiven (1-3 befallene Lymphknoten) primären inv

In [24]:
# Print ten random samples of rows flagged as fragment.
fragment_rows = clean_results[clean_results.fragment]
for _ in range(10):
    show_random(fragment_rows)

Bauch          - / Rektalschmerzen
Bauchschmerzen  / Rektalschmerzen
^^^^^^^^^^^^^^                   
28_komplementaermedizin_0251.tsv;13;Bauch - / Rektalschmerzen fragment: True
Blutungen und Bauch-/Rektalschmerzen verbesserten sich nicht signifikant.
Blutungen und Bauchschmerzen/Rektalschmerzen verbesserten sich nicht signifikant.
...........

Radio         - bzw . Radiochemotherapie
Radiotherapie  bzw . Radiochemotherapie
^^^^^^^^^^^^^                          
14_larynxkarzinom_0065.tsv;1;Radio - bzw . Radiochemotherapie fragment: True
Eine neoadjuvante Chemotherapie vor geplanter definitiver Radio-oder Radiochemotherapie soll nicht durchgeführt werden, außer zum Zweck der Selektion zwischen Laryngektomie und Radio- bzw. Radiochemotherapie.
Eine neoadjuvante Chemotherapie vor geplanter definitiver Radio-oder Radiochemotherapie soll nicht durchgeführt werden, außer zum Zweck der Selektion zwischen Laryngektomie und Radiotherapie bzw. Radiochemotherapie.
...........

FIGO           

In [25]:
# Non-fragment rows whose resolution contains at least one entry with a hyphen.
non_fragment = clean_results[~clean_results.fragment]
contains_hyphen = non_fragment.resolution.map(
    lambda tokens: any('-' in token for token in tokens)
)
ss = non_fragment[contains_hyphen]

In [None]:
# Print ten random samples from the hyphen subset `ss`.
for _ in range(10):
    show_random(ss)

In [None]:
# Print ten random samples of rows that are not flagged as fragment.
non_fragment_rows = clean_results[~clean_results.fragment]
for _ in range(10):
    show_random(non_fragment_rows)

In [None]:
# Inspect the fragment row(s) of one specific sentence.
show_random(clean_results[(clean_results.file == '01_magenkarzinom_0033.tsv') & (clean_results.sentence_id == 4) & clean_results.fragment])

In [62]:
def create_export(subset):
    """Build a flat export table from a subset of the resolved ellipses.

    Adds two text columns derived from the token lists:

    * ``text``     - the tokens of ``full_span`` joined with spaces
    * ``resolved`` - the tokens of ``resolution`` joined with spaces

    When a cell holds a list of token lists, each inner list is joined with
    spaces and the pieces are joined with ``'; '``. An empty list yields an
    empty string.

    Parameters
    ----------
    subset : pandas.DataFrame
        Needs the columns ``file``, ``sentence_id``, ``full_span``,
        ``resolution``, ``raw_sentence`` and ``full_resolution``. The frame is
        copied and not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``file``, ``sentence_id``, ``text``, ``resolved``,
        ``raw_sentence`` and ``full_resolution``.
    """
    def join_tokens(tokens):
        # Nothing to inspect in an empty list; tokens[0] would raise.
        if len(tokens) == 0:
            return ''
        # isinstance also accepts str subclasses, which an exact type comparison
        # would treat as nested lists and join character by character.
        if isinstance(tokens[0], str):
            return ' '.join(tokens)
        return '; '.join(' '.join(group) for group in tokens)

    export = subset.copy()
    export['text'] = export.full_span.map(join_tokens)
    export['resolved'] = export.resolution.map(join_tokens)
    return export[['file', 'sentence_id', 'text', 'resolved', 'raw_sentence', 'full_resolution']]

# Write the export table of all fragment rows to an Excel file in the working directory.
create_export(clean_results[clean_results.fragment]).to_excel('ellipses_fragment.xlsx', index=False)

In [None]:
# Timestamp (local time, YYYYmmdd_HHMMSS) used in the name of the export file.
date = datetime.now().strftime("%Y%m%d_%H%M%S")

In [None]:
# Display the timestamp that will be used in the export file name.
date

In [None]:
# Write the complete de-duplicated result table to a timestamped Excel file.
clean_results.to_excel(f'ellipses_nodup_{date}.xlsx')