In [None]:
import pandas as pd
import spacy
import os
from xml.dom import minidom
from charsplit import Splitter
from ellipticCompounds import exchangeLemmas
from evaluation import error_analysis, get_scores

nlp = spacy.load("de_core_news_lg")

In [None]:
path_ellipses = '../data/ggponc_ellipses_compounds.tsv'
path_no_ellipses = '../data/ggponc_no_ellipses_small.tsv'
path_tokens = '../data/tokens/all_files_tokens'

path_xml_file = '../data/baseline_files/Original.xml'
path_gertwol_file = '../data/baseline_files/Output.xml'
path_word_freqs_file = '../data/baseline_files/gertwol.txt'
path_output_file = '../data/baseline_files/wordFreqs.txt'

# Load the Data

In [7]:
ellipses = pd.read_csv(path_ellipses, sep='\t')
not_ellipses = pd.read_csv(path_no_ellipses, sep='\t')

not_ellipses['full_resolution'] = not_ellipses.raw_sentence

# Train and Test split?

data = pd.concat([ellipses[['file', 'raw_sentence', 'full_resolution']], not_ellipses[['file', 'raw_sentence', 'full_resolution']]]).sort_values(by='file').reset_index(drop=True)

# Create XML Document

In [33]:
root = minidom.Document()

xml = root.createElement('corpus')
xml.setAttribute('id', 'corpus')
root.appendChild(xml)

for i, f in enumerate(data['file', 'raw_sentence'].groupby('file').first().index.to_list()):
    file = root.createElement('file')
    file.setAttribute('n', str(i))
    xml.appendChild(file)

    div = root.createElement('div')
    file.appendChild(div)

    for j, s in enumerate(data[data['file']==f]['raw_sentence']):
        sentence = root.createElement('s')
        sentence.setAttribute('lang', 'de')
        sentence.setAttribute('n', f'{i}-{j}')
        div.appendChild(sentence)

        doc = nlp(s)
        for k, token in enumerate(doc):
            word = root.createElement('w')
            word.setAttribute('lemma', token.lemma_)
            word.setAttribute('n', f'{i}-{j}-{k}')
            word.setAttribute('pos', token.tag_)
            word.setAttribute('whitespace', token.whitespace_)
            sentence.appendChild(word)
            word.appendChild(root.createTextNode(token.text))

with open(path_xml_file, "wb") as f:
    f.write(root.toprettyxml(indent ="\t", encoding="utf-8"))

# Produce Gertwol-like List and Gather Word Frequencies

In [6]:
freq = {}
dict = {}
splitter = Splitter()
special_characters = "!@#$%^&*()-+?_=,<>/.[] "

for filename in os.listdir(path_tokens):
    with open(path_tokens + f'/{filename}', encoding='utf8') as f:
        for word in f.readlines():
            word = word[:-1]

            # Gertwol Split
            split = splitter.split_compound(word)[0]
            if split[0] != 0:
                dict[word] = '#'.join([split[1]] + [subword.lower() for subword in split[2:]])

            # Word Freq
            if word in special_characters:
                continue
            if word in freq:
                freq[word] += 1
            else:
                freq[word] = 1

with open(path_gertwol_file, 'w', encoding='utf-8') as f:
    for key, value in dict.items():
        f.write(f'{key}\t{value}\n')

with open(path_word_freqs_file, 'w', encoding='utf-8') as f:
    for key, value in freq.items():
        f.write(f'{key}\t{value}\n')

# Run Aepli Baseline

In [34]:
exchangeLemmas(path_xml_file, path_output_file, path_gertwol_file, path_word_freqs_file)

Alkohol#abusus
Alkohol#abusus
Hals#tumoren
Funktions#erhalt
Nährstoff#aufnahme
Schluck#funktion
Nikotin#assoziierte
Kar#zinome
Nikotin#assoziierte
Kar#zinome
Innenspangenre#sektion
Schluck#funktion
Schluck#funktion
Knochen#transplantation
Knochen#transplantation
Mund#bodenkarzinom
Chemotherapie#bedingte
Chemotherapie#bedingte
Radio#chemotherapie
Mund#bodenkarzinoms
Nerv#infiltration
Radio#chemotherapie
Radio#chemotherapie
Mund#pflege
Distanz#schiene
ggf.
Geschmacks#störungen
Geschmacks#störungen
Radio#chemotherapie
Radio#chemotherapie
Radio#chemotherapie
Radio#chemotherapie
Leistungs#zustand
Immun#zellen
Immun#zellen
Gesamt#überleben
Leistungs#zustand
Leistungs#zustand
Konvention#ell
Radio#chemotherapie
Schluck#störungen
Schluck#störungen
Schluck#rehabilitation
Schluck#störung
Schluck#störung
Behandlungs#bedingten
Beeinträchtig#ungen
Sprech#vermögen
Ref#lux
Umwelt#faktoren
Nicht#kardiakarzinom
Tonsillen#karzinome
Peutz-Jeghers#syndrom
Eier#stockkrebs
Familien#anamnese
Missense#mutation

# Reconstruct Complete Sentences

In [35]:
dom = minidom.parse(path_output_file)
predictions = []

sentences = dom.getElementsByTagName('s')
for sentence in sentences:
    tokens = sentence.getElementsByTagName('w')
    sentence_tokens = []
    for token in tokens:
        word = token.childNodes[0].nodeValue
        lemma = token.getAttribute('lemma')
        if '+' in lemma:
            word_parts = lemma.partition('+')
            if token.getAttribute('pos') == 'TRUNC':
                word = word.replace('-', word_parts[2])
            else:
                word = word.replace('-', word_parts[0])
        whitespace = token.getAttribute('whitespace')
        sentence_tokens.append(word + whitespace)

    predictions.append(''.join(token for token in sentence_tokens))

# Evaluate Baseline

In [49]:
ea = error_analysis(predictions, data.full_resolution, data.raw_sentence)
get_scores(ea, "eval")

Downloading builder script:   0%|          | 0.00/5.67k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/8.64k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

{'eval/tp': 0.6537637698898409,
 'eval/tp_abs': 4273,
 'eval/fn': 0.2350061199510404,
 'eval/fn_abs': 1536,
 'eval/fp': 0.008414932680538556,
 'eval/fp_abs': 55,
 'eval/replace': 0.036260709914320684,
 'eval/replace_abs': 237,
 'eval/insert': 0.0151468788249694,
 'eval/insert_abs': 99,
 'eval/delete': 0.029987760097919217,
 'eval/delete_abs': 196,
 'eval/complex': 0.021419828641370868,
 'eval/complex_abs': 140,
 'eval/edit_distance_rel': 0.7258641630110028,
 'eval/exact_match': 0.6537637698898409,
 'eval/gleu': 0.9589435138708542,
 'eval/edit_distance_abs': 3.2671358629130967}