In [1]:
%load_ext autoreload
%autoreload 1

In [2]:
import sys

# Make ../scripts importable (presumably the home of the project modules that
# the following cells import). The membership check keeps sys.path free of
# duplicate entries when this cell is executed more than once.
if '../scripts' not in sys.path:
    sys.path.append('../scripts')

In [3]:
import os

# Process-wide settings for the libraries used in the rest of the notebook.
os.environ.update({
    # Switch off parallelism inside the tokenizers library
    "TOKENIZERS_PARALLELISM": "false",
    # Number GPUs by PCI bus id and expose only the device with index 2
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "2",
    # Disable weights and biases (if installed)
    "WANDB_DISABLED": "true",
})

In [4]:
from pathlib import Path

from dataset import load_data, get_dataloader
from generative.run_experiment import get_training_args, get_trainer, get_tokenizer

  from .autonotebook import tqdm as notebook_tqdm


# Training

In [5]:
import hydra
from hydra import compose, initialize

# Reset Hydra's global state first; presumably this is what allows the cell to
# be re-run without initialize() failing on an already-initialized instance.
hydra.core.global_hydra.GlobalHydra.instance().clear()
# Use the parent directory as the config location; 'foo' is a placeholder job name.
initialize(config_path=Path('..'), job_name='foo', version_base='1.1')
# Compose the experiment configuration from experiment.yaml.
config = compose(config_name='experiment.yaml')

In [6]:
# Build the training arguments from the config; report_to="none" is passed
# through (presumably to turn off experiment-tracking integrations).
training_args = get_training_args(config, report_to="none")
# Tokenizer derived from the same config.
tokenizer = get_tokenizer(config)



In [20]:
# The TSV paths in the config are resolved against the parent directory.
base_path = Path('..')
# load_data returns the train / validation / test splits from the two TSV files.
train_df, val_df, test_df = load_data(base_path / config.data.cnf_tsv_path, base_path / config.data.controls_tsv_path)

In [21]:
# Ellipses: rows whose `controls` flag is not set, counted per split
# (train, validation, test)
tuple((~split.controls).sum() for split in (train_df, val_df, test_df))

(2250, 510, 485)

In [22]:
# Controls: rows whose `controls` flag is set, counted per split
# (train, validation, test)
tuple(split.controls.sum() for split in (train_df, val_df, test_df))

(2208, 521, 516)

In [23]:
# Turn the three data frames into model inputs using the tokenizer. Despite the
# helper's name, the results are used as datasets below (handed to get_trainer
# and trainer.evaluate).
train_dataset, val_dataset, test_dataset = get_dataloader(train_df, val_df, test_df, tokenizer)

In [24]:
# Set the number of training epochs to 10 on the already-built training
# arguments (this mutates training_args in place).
training_args.num_train_epochs = 10

In [25]:
# Build the trainer from the config, tokenizer, training arguments and the
# train / validation datasets. The captured log below shows the
# google/mt5-base configuration being loaded at this point.
trainer = get_trainer(config, tokenizer, training_args, train_dataset, val_dataset)

loading configuration file config.json from cache at /home/Florian.Borchert/.cache/huggingface/hub/models--google--mt5-base/snapshots/d86816880b5acc27e697e52bc237e816dc828b17/config.json
Model config MT5Config {
  "_name_or_path": "google/mt5-base",
  "architectures": [
    "MT5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "mt5",
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "tokenizer_class": "T5Tokenizer",
  "transformers_version": "4.23.1",
  "use_cache": true,
  "vocab_size": 250112
}

loading weights fil

In [None]:
# Run training; the per-epoch losses and metrics are reported in the output below.
trainer.train()

***** Running training *****
  Num examples = 4458
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5580
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Exact Match,Google Bleu
1,0.1778,0.057102,0.683802,0.678742
2,0.0853,0.051459,0.729389,0.930575


***** Running Evaluation *****
  Num examples = 1031
  Batch size = 8
***** Running Evaluation *****
  Num examples = 1031
  Batch size = 8


In [None]:
# Metrics on the validation split (no explicit metric key prefix is passed).
eval_metrics = trainer.evaluate(val_dataset)
print(eval_metrics)

# Metrics on the held-out test split, with keys prefixed by 'test'.
test_metrics = trainer.evaluate(test_dataset, metric_key_prefix='test')
print(test_metrics)

# Error Analysis

In [24]:
from transformers import Text2TextGenerationPipeline
from evaluation import error_analysis, Metrics

In [25]:
# Wrap the trained model in a text2text generation pipeline, capped at the
# configured generation length. device=0 refers to the first *visible* CUDA
# device; CUDA_VISIBLE_DEVICES is restricted to a single GPU at the top of the
# notebook.
pipeline = Text2TextGenerationPipeline(model=trainer.model, tokenizer=tokenizer, max_length=config.generation_max_length, device=0)

In [26]:
from evaluation import error_analysis, get_scores

In [30]:
import pandas as pd

def calculate_errors(out, sample):
    """Run the error analysis on generated outputs for one data split.

    Args:
        out: Iterable of mappings; the text stored under the
            'generated_text' key of each entry is used as the prediction.
        sample: Object exposing ``full_resolution`` and ``raw_sentence``
            attributes (in this notebook: a split data frame).

    Returns:
        The result of ``error_analysis`` for the predictions, the
        ``full_resolution`` values and the ``raw_sentence`` values.
    """
    predictions = []
    for item in out:
        predictions.append(item['generated_text'])
    return error_analysis(predictions, sample.full_resolution, sample.raw_sentence)

In [28]:
%%time
out_valid = pipeline(list(val_df.raw_sentence))

CPU times: user 5min 11s, sys: 79.2 ms, total: 5min 12s
Wall time: 5min 12s


In [31]:
# Error analysis of the validation predictions against the validation split.
errors_valid = calculate_errors(out=out_valid, sample=val_df)

In [32]:
%%time
# Aggregate the validation error analysis into scores; the "eval" argument
# shows up as the 'eval/' key prefix in the displayed result.
scores = get_scores(errors_valid, "eval")
scores

CPU times: user 1min 2s, sys: 35.5 ms, total: 1min 2s
Wall time: 1min 4s


{'eval/tp': 0.8117647058823529,
 'eval/tp_abs': 414,
 'eval/fn': 0.029411764705882353,
 'eval/fn_abs': 15,
 'eval/replace': 0.052941176470588235,
 'eval/replace_abs': 27,
 'eval/insert': 0.050980392156862744,
 'eval/insert_abs': 26,
 'eval/delete': 0.041176470588235294,
 'eval/delete_abs': 21,
 'eval/complex': 0.013725490196078431,
 'eval/complex_abs': 7,
 'eval/edit_distance_rel': 0.9218893983501436,
 'eval/exact_match': 0.8117647058823529,
 'eval/gleu': 0.9552840562003665,
 'eval/edit_distance_abs': 3.9450980392156865}

In [33]:
%%time
out_test = pipeline(list(test_df.raw_sentence))

CPU times: user 4min 50s, sys: 139 ms, total: 4min 50s
Wall time: 4min 50s


In [35]:
# Error analysis of the test predictions against the test split.
errors_test = calculate_errors(out=out_test, sample=test_df)

In [37]:
%%time
# Aggregate the test error analysis into scores; the "test" argument shows up
# as the 'test/' key prefix in the displayed result.
# NOTE(review): this rebinds `scores`, replacing the validation scores computed earlier.
scores = get_scores(errors_test, "test")
scores

CPU times: user 59.8 s, sys: 53 ms, total: 59.9 s
Wall time: 1min 2s


{'test/tp': 0.7711340206185567,
 'test/tp_abs': 374,
 'test/fn': 0.024742268041237112,
 'test/fn_abs': 12,
 'test/replace': 0.03917525773195876,
 'test/replace_abs': 19,
 'test/insert': 0.041237113402061855,
 'test/insert_abs': 20,
 'test/delete': 0.0865979381443299,
 'test/delete_abs': 42,
 'test/complex': 0.03711340206185567,
 'test/complex_abs': 18,
 'test/edit_distance_rel': 0.9226561353693519,
 'test/exact_match': 0.7711340206185567,
 'test/gleu': 0.9719856528464821,
 'test/edit_distance_abs': 1.909278350515464}

In [1]:
from notebook_util import show_errors

In [154]:
# Display five randomly drawn "replace" errors from the test split. The fixed
# random_state keeps the drawn examples the same across re-runs, so the shown
# output is reproducible (assumes errors_test is a pandas DataFrame).
show_errors(errors_test[errors_test.error_type == "replace"].sample(5, random_state=42))

__Input:__

Bei G2 und G3 Weichgewebesarkomen soll eine prä- oder postoperative Strahlentherapie erfolgen.

__Error type:__ replace

__Input:__

Die Kompexität der Situation soll dabei wiederholt eingeschätzt werden und sowohl Patienten- als auch Angehörigenbedürfnisse unter Nutzung validierter multidimensionaler Erfassungsinstrumente, den Funktionsstatus der Patienten und die Krankheitsphase berücksichtigen.

__Error type:__ replace

__Input:__

Im Vordergrund stehen durch inflammatorische Zytokine (Tumor-Nekrose-Faktor-α, Interleukin-1-α und -ß, Interleukin-6, Interferon-γ) vermittelte Störungen.

__Error type:__ replace

__Input:__

Zwei prospektive Untersuchungen auf Grundlage des QoL-C30- bzw. FACT-BL Fragebogens zur Blasenfunktion belegen die überwiegend guten Ergebnisse nach organerhaltender Radiochemotherapie [REF], [REF], [REF].

__Error type:__ replace

__Input:__

Die andere Fall- (n=670) Kontroll- (n=1188) Studie fokussierte auf Tabak- und Alkoholkonsum [REF].

__Error type:__ replace