In [1]:
import sys
sys.path.append('../scripts')

In [2]:
import pandas as pd
import spacy
import os
from pathlib import Path
from xml.dom import minidom
from charsplit import Splitter
from rules.ellipticCompounds import exchangeLemmas
from evaluation import error_analysis, get_scores

nlp = spacy.load("de_core_news_lg")

In [3]:
import hydra
from hydra import compose, initialize

# Clear Hydra's global state before initialising it; presumably this keeps the cell
# re-runnable within one kernel session — TODO confirm.
hydra.core.global_hydra.GlobalHydra.instance().clear()
# Use the parent directory as the config location, then compose the experiment config
# that the following cells read their data paths from.
initialize(config_path=Path('..'), job_name='foo', version_base='1.1')
config = compose(config_name='experiment.yaml')

In [4]:
# Directory for the intermediate files of the baseline run; created together with any
# missing parents and left as-is when it already exists.
tmp_path = Path('../tmp/baseline_files')
tmp_path.mkdir(exist_ok=True, parents=True)

In [5]:
# XML corpus written by the "Create XML Document" cell and passed to exchangeLemmas.
path_xml_file = tmp_path / 'Original.xml'
# File passed to exchangeLemmas as second argument and parsed afterwards to rebuild the sentences.
path_output_file = tmp_path / 'Output.xml'
# Tab-separated "word<TAB>split" list written below.
path_gertwol_file = tmp_path / 'gertwol.txt'
# Tab-separated "word<TAB>count" list written below.
path_word_freqs = tmp_path / 'wordFreqs.txt'
# Passed to exchangeLemmas unchanged; presumably a list of tagged compounds — TODO confirm.
tagged_path = '../compoundListTagged.txt'

# Directory whose files are read line by line to build the split list and the frequencies.
path_tokens = Path('..') / config.data.ggponc_plain_text

# Load the Data

In [6]:
# Both TSV files are addressed relative to the directory above this notebook.
project_root = Path('..')
shared_columns = ['file', 'sentence_id', 'raw_sentence', 'full_resolution']

# First corpus (path taken from the config), restricted to the test split.
ellipses = pd.read_csv(project_root / config.data.cnf_tsv_path, sep='\t')
ellipses = ellipses[ellipses['split'] == 'test']

# Second corpus (presumably sentences without ellipses): the raw sentence is used as
# its own resolution target, then the test split is selected.
not_ellipses = pd.read_csv(project_root / 'data/ellipses/ggponc_no_ellipses_small.tsv', sep='\t')
not_ellipses['full_resolution'] = not_ellipses['raw_sentence']
not_ellipses = not_ellipses[not_ellipses['split'] == 'test']

# One combined frame, ordered by file, with a fresh 0..n-1 index.
data = (
    pd.concat([ellipses[shared_columns], not_ellipses[shared_columns]])
    .sort_values(by='file')
    .reset_index(drop=True)
)

In [7]:
# Total number of space-separated tokens over all raw sentences.
data['raw_sentence'].str.split(' ').map(len).sum()

25711

In [8]:
# Number of distinct (file, sentence_id) pairs.
data.drop_duplicates(subset=['file', 'sentence_id']).shape[0]

1159

In [9]:
# (rows, columns) of the combined test set; compare the row count with the number of
# distinct (file, sentence_id) pairs shown above.
data.shape

(1159, 4)

In [10]:
# Preview the first rows: raw sentence next to its resolution target.
data.head()

Unnamed: 0,file,sentence_id,raw_sentence,full_resolution
0,00_mundhoehlenkarzinom_0002.tsv,1,Hauptrisikofaktoren für das Auftreten eines Mu...,Hauptrisikofaktoren für das Auftreten eines Mu...
1,00_mundhoehlenkarzinom_0002.tsv,2,Bei chronischem Tabak- oder Alkoholabusus ist ...,Bei chronischem Tabakabusus oder Alkoholabusus...
2,00_mundhoehlenkarzinom_0098.tsv,2,Als kurativ intendierte therapeutische Optione...,Als kurativ intendierte therapeutische Optione...
3,00_mundhoehlenkarzinom_0103.tsv,1,"Patienten mit einem unheilbaren Tumorleiden, j...","Patienten mit einem unheilbaren Tumorleiden, j..."
4,00_mundhoehlenkarzinom_0115.tsv,30,Dies bestätigte sich auch nach 2 Jahren Nachbe...,Dies bestätigte sich auch nach 2 Jahren Nachbe...


# Create XML Document

In [11]:
# Build the corpus as a DOM tree: <corpus> -> <file> -> <div> -> <s> -> <w>.
root = minidom.Document()

xml = root.createElement('corpus')
xml.setAttribute('id', 'corpus')
root.appendChild(xml)

# groupby yields the file names in sorted order and keeps the rows of each file in the
# order they have in `data`, so the <s> elements follow the row order of `data`.
for file_no, (_, file_rows) in enumerate(data[['file', 'raw_sentence']].groupby('file')):
    file_node = root.createElement('file')
    file_node.setAttribute('n', str(file_no))
    xml.appendChild(file_node)

    div = root.createElement('div')
    file_node.appendChild(div)

    for sent_no, raw_sentence in enumerate(file_rows['raw_sentence']):
        sentence_node = root.createElement('s')
        sentence_node.setAttribute('lang', 'de')
        sentence_node.setAttribute('n', f'{file_no}-{sent_no}')
        div.appendChild(sentence_node)

        # One <w> per spaCy token; the trailing whitespace is stored as an attribute so
        # the sentence can be re-assembled from the tokens later.
        for tok_no, token in enumerate(nlp(raw_sentence)):
            word_node = root.createElement('w')
            word_node.setAttribute('lemma', token.lemma_)
            word_node.setAttribute('n', f'{file_no}-{sent_no}-{tok_no}')
            word_node.setAttribute('pos', token.tag_)
            word_node.setAttribute('whitespace', token.whitespace_)
            sentence_node.appendChild(word_node)
            word_node.appendChild(root.createTextNode(token.text))

# toprettyxml returns bytes when an encoding is given, hence the binary mode.
with open(path_xml_file, "wb") as xml_out:
    xml_out.write(root.toprettyxml(indent="\t", encoding="utf-8"))

# Produce Gertwol-like List and Gather Word Frequencies

In [12]:
# Builds two lookup tables from the token files in `path_tokens` and writes each of them
# as a tab-separated text file:
#   * gertwol_splits: word -> '#'-joined compound split (Gertwol-like notation)
#   * freq:           word -> number of occurrences
freq = {}
# Deliberately not called `dict`: that would shadow the builtin for the rest of the session.
gertwol_splits = {}
splitter = Splitter()
special_characters = "!@#$%^&*()-+?_=,<>/.[] "

# sorted() makes the processing order, and with it the line order of the files written
# below, reproducible (os.listdir returns entries in arbitrary order).
for filename in sorted(os.listdir(path_tokens)):
    with open(path_tokens / filename, encoding='utf8') as f:
        # Every line is treated as one word.
        for line in f:
            # Strip only the line terminator; slicing off the last character would
            # truncate the final word of a file that lacks a trailing newline.
            word = line.rstrip('\n')

            # Gertwol Split: take the first candidate returned by charsplit
            # (presumably the highest-scoring one — TODO confirm). A score of 0 is
            # treated as "no split" and the word is left out of the list.
            best_split = splitter.split_compound(word)[0]
            if best_split[0] != 0:
                gertwol_splits[word] = '#'.join(
                    [best_split[1]] + [subword.lower() for subword in best_split[2:]]
                )

            # Word Freq: this is a substring test, so it skips empty lines as well as
            # any word that occurs verbatim inside `special_characters`.
            if word in special_characters:
                continue
            freq[word] = freq.get(word, 0) + 1

with open(path_gertwol_file, 'w', encoding='utf-8') as f:
    for key, value in gertwol_splits.items():
        f.write(f'{key}\t{value}\n')

with open(path_word_freqs, 'w', encoding='utf-8') as f:
    for key, value in freq.items():
        f.write(f'{key}\t{value}\n')

# Run Aepli Baseline

In [13]:
# Run the rule-based baseline on the XML corpus. Presumably it reads path_xml_file plus the
# three auxiliary lists and writes its result to path_output_file (parsed in the next
# section) — TODO confirm against rules/ellipticCompounds.py.
exchangeLemmas(path_xml_file, path_output_file, path_gertwol_file, path_word_freqs, tagged_path)

Alkohol#abusus
Alkohol#abusus
Radio#chemotherapie
Leistungs#zustand
Leistungs#zustand
Leistungs#zustand
Gesamt#überleben
Proteom#ebene
Ref#lux
Dyspnö#symptomatik
Chemo#therapie
Ausschluss#kriterien
Post#operativen
Mort#alität
Tumor#bedingte
Schm#erzen
Strahlen#therapie
Strahlen#therapieinduzierten
Gemüse#reiche
Ern#ährung
Strahlen#therapie
Aroma#massage
Indikations#problem
Perforations#risiko
Papillen#adenome
Hnpcc#syndrom
Mr#kolonografie
Blutgefäß#invasion
Links#seitigen
Tum#oren
Tumor#lokalisation
Tonstein#industrie
Peri#kardkarzinose
Gefäßresektionsrän#der
Palliativ#beratung
Zweit-Generations#tki
OS
Zweit-Generations#tki
Zweiwöchentli#chen
Abstän#den
Stammzell#transplantation
Laser#therapie
Bewegungs#therapie
Panik#reaktion
Drittlinien#therapie
Panik#komponente
Hand#ventilatoren
Lebens#stunden
Langzeit#studie
60#jährigen
30#jährigen
65#jährigen
29#jährigen
Wiedervorstellung#algorithmen
Wiedervorstellung#algorithmen
Spät#komplikationen
Nach#teile
Therapie#optionen
Hilfs#angebote
Hilf

# Reconstruct Complete Sentences

In [14]:
# Rebuild one plain-text prediction per <s> element of the baseline output.
# The file is opened in binary mode so the XML parser decodes it according to the
# document's own encoding instead of the platform default, and the handle is closed
# as soon as the DOM is built.
with open(path_output_file, 'rb') as xml_in:
    dom = minidom.parse(xml_in)
predictions = []

sentences = dom.getElementsByTagName('s')
for sentence in sentences:
    sentence_tokens = []
    for token in sentence.getElementsByTagName('w'):
        word = token.childNodes[0].nodeValue
        lemma = token.getAttribute('lemma')
        # A '+' in the lemma marks a token that is to be completed: the part after the
        # '+' is inserted for tokens tagged TRUNC, the part before it for all others.
        if '+' in lemma:
            before_plus, _, after_plus = lemma.partition('+')
            # NOTE(review): str.replace substitutes every '-' in the token, so a word with
            # an additional inner hyphen gets the completion inserted more than once.
            # Left unchanged to keep the baseline's behaviour — confirm this is intended.
            if token.getAttribute('pos') == 'TRUNC':
                word = word.replace('-', after_plus)
            else:
                word = word.replace('-', before_plus)
        # Re-attach the whitespace that followed the token in the original sentence.
        sentence_tokens.append(word + token.getAttribute('whitespace'))

    predictions.append(''.join(sentence_tokens))

# Evaluate Baseline

In [16]:
# Compare each prediction with its target and its input sentence, then show the share of
# every error type among all analysed rows.
errors_df = error_analysis(predictions, data['full_resolution'], data['raw_sentence'])
errors_df['error_type'].value_counts().div(len(errors_df))

tn         0.530630
fn         0.195858
tp         0.170837
replace    0.043141
delete     0.031061
complex    0.018119
insert     0.009491
fp         0.000863
Name: error_type, dtype: float64

In [None]:
%%time
# The cell magic above must stay on the first line; it reports how long this cell takes.
# Aggregate the per-sentence error analysis into scores for the "test" split.
scores = get_scores(errors_df, "test")
scores

In [None]:
from notebook_util import show_errors

In [None]:
# Attach file and sentence_id to every error row; the two frames are aligned on their
# index labels, so errors_df is expected to share the 0..n-1 index of `data`.
# NOTE(review): reset_index() without drop=True also adds an `index` column — confirm it is wanted.
errors = pd.concat([errors_df, data[['file', 'sentence_id']].reset_index()], axis=1)

In [None]:
# Rows labelled 'fp', reduced to their location, the target (renamed to `resolution`)
# and the prediction.
is_fp = errors['error_type'] == 'fp'
fps = errors.loc[is_fp, ['file', 'sentence_id', 'ground_truth', 'pred']].rename(
    columns={'ground_truth': 'resolution'}
)
print(fps.shape)
# Optional export for manual inspection:
#fps.to_excel('../potential_missing_2.xlsx')

In [None]:
# Display the rows labelled 'fn' (presumably false negatives, i.e. ellipses the baseline
# left unresolved — TODO confirm against evaluation.error_analysis).
show_errors(errors[errors.error_type == 'fn'])