In [1]:
import sys
sys.path.append('../scripts')

In [16]:
import nltk
import pandas as pd

from evaluation import Metrics
from rules.cnf_resolver import RulebasedCNFResolver

In [8]:
#!python -m spacy download de_core_news_lg

## Load Data

In [52]:
# Read the tab-separated file into a DataFrame.
df_ellipses = pd.read_csv(
    '../../ggponc_annotation/notebooks/ggponc_ccnfs.tsv',
    sep='\t',
)

In [53]:
# Keep the rows whose `split` column is 'dev' / 'test' respectively.
val_sent = df_ellipses.loc[df_ellipses['split'] == 'dev']
test_sent = df_ellipses.loc[df_ellipses['split'] == 'test']

In [26]:
# Shared Metrics instance; print_metrics() below reads it as a global.
m = Metrics(
    ['exact_match', 'google_bleu'],
    tokenizer=None,
)

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/Florian.Borchert/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/Florian.Borchert/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/Florian.Borchert/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [73]:
def print_metrics(pred, ground_truth):
    """Print the exact-match and GLEU scores of `pred` against `ground_truth`.

    Both scores come from the notebook-level `Metrics` instance `m`
    (`compute_exact_match` and `compute_bleu`) and are printed with three
    decimal places. Nothing is returned.
    """
    exact_match_score = m.compute_exact_match(pred, ground_truth)
    gleu_score = m.compute_bleu(pred, ground_truth)
    report = f"Exact match: {exact_match_score:.3f} \n GLEU score: {gleu_score:.3f}"
    print(report)

## Trivial baseline: do nothing

In [74]:
# Baseline on the dev split: score the raw sentences themselves against the full resolutions.
print_metrics(
    val_sent['raw_sentence'].values,
    val_sent['full_resolution'].values,
)

Exact match: 0.000 
 GLEU score: 0.900


In [75]:
# Baseline on the test split: score the raw sentences themselves against the full resolutions.
print_metrics(
    test_sent['raw_sentence'].values,
    test_sent['full_resolution'].values,
)

Exact match: 0.000 
 GLEU score: 0.908


## Rule-based resolver

In [59]:
# NOTE(review): the meaning of the positional argument 5 is not visible in this notebook — confirm against RulebasedCNFResolver.
resolver = RulebasedCNFResolver(5)

In [63]:
# Run the rule-based resolver over the raw dev sentences.
val_preds = resolver.predict_all(val_sent['raw_sentence'])

In [79]:
# Score the resolver's dev predictions against the full resolutions.
print_metrics(
    val_preds,
    val_sent['full_resolution'].values,
)

Exact match: 0.303 
 GLEU score: 0.924


In [67]:
# Run the rule-based resolver over the raw test sentences.
test_preds = resolver.predict_all(test_sent['raw_sentence'])

In [78]:
# Score the resolver's test predictions against the full resolutions.
print_metrics(
    test_preds,
    test_sent['full_resolution'].values,
)

Exact match: 0.310 
 GLEU score: 0.937


In [81]:
from evaluation import error_analysis

In [139]:
ed = nltk.edit_distance


def metric(p, g, o):
    """Edit-distance score of a prediction, normalised by distances to the original.

    Args:
        p: predicted string.
        g: ground-truth string.
        o: original (unresolved) string.

    Returns:
        float: 1.0 when the edit distance between `p` and `g` is 0;
        otherwise ``1 - ed(p, g) / (ed(p, o) + ed(o, g))``.
    """
    d = ed(p, g)
    # Exact match: return early so the two remaining edit distances are not
    # computed for nothing, and return a float like the other branch does.
    if d == 0:
        return 1.0
    k = ed(p, o)
    l = ed(o, g)
    # k + l is non-zero here: both being 0 would mean p == o == g, i.e. d == 0.
    return 1 - (d / (k + l))

In [145]:
# `errors` must exist before it can be scored; build it here so this cell
# also runs on a fresh kernel (Restart & Run All) instead of raising NameError.
errors = error_analysis(val_preds, val_sent.full_resolution, val_sent.raw_sentence)
my_score = errors.apply(lambda r: metric(r['pred'], r['gt'], r['original']), axis=1)

In [92]:
# Run error_analysis on the dev predictions, then count rows per error_type.
errors = error_analysis(
    val_preds,
    val_sent['full_resolution'],
    val_sent['raw_sentence'],
)
errors['error_type'].value_counts()

fn         193
tp         140
replace     52
delete      37
complex     32
insert       8
Name: error_type, dtype: int64

In [93]:
# Per-error-type counts as a fraction of all rows in `errors`.
errors['error_type'].value_counts().div(len(errors))

fn         0.417749
tp         0.303030
replace    0.112554
delete     0.080087
complex    0.069264
insert     0.017316
Name: error_type, dtype: float64

In [None]:
# Apply metric() row by row to (prediction, ground truth, original).
my_score = errors.apply(
    lambda row: metric(row['pred'], row['gt'], row['original']),
    axis=1,
)

In [146]:
# Mean of the per-row scores.
my_score.mean()

0.4819255026619977

In [100]:
# Prediction of the second row whose error_type is 'replace'.
errors.loc[errors['error_type'] == 'replace'].iloc[1]['pred']

'Bei inkompletter Koloskopie (z. B. Adhäsionen) und fortbestehendem Wunsch des Patienten auf komplette Kolonbeurteilung sollte eine CTkolonographie oder MR-Kolonographie erfolgen.'

In [101]:
# Ground truth of the second row whose error_type is 'replace'.
errors.loc[errors['error_type'] == 'replace', 'gt'].iloc[1]

'Bei inkompletter Koloskopie (z. B. Adhäsionen) und fortbestehendem Wunsch des Patienten auf komplette Kolonbeurteilung sollte eine CT-Kolonographie oder MR-Kolonographie erfolgen.'

In [107]:
# Sanity check of Google BLEU on two five-token strings that differ in a single token.
import evaluate

google_bleu = evaluate.load("google_bleu")
sentence1 = "Vitamin A und Vitamin B"
sentence2 = "Vitamin A und Foobar B"
result = google_bleu.compute(
    predictions=[sentence1],
    references=[[sentence2]],
)
print(result)

{'google_bleu': 0.5}


In [108]:
import nltk

In [136]:
# Prediction equals ground truth here, so metric() as defined in this notebook takes its `d == 0` branch and returns 1.
# NOTE(review): the recorded output below comes from execution count 136, before metric's cell (count 139) — it looks stale.
metric("Vitamin A und B", "Vitamin A und B", "Vitamin A und Vitamin B")

0.0