In [1]:
import sys
sys.path.append('../scripts')

In [2]:
import os
# Disable weights and biases (if installed)
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

In [3]:
from pathlib import Path

from dataset import EllipsesDataset
from run_experiment import get_training_args, get_trainer, get_tokenizer

  from .autonotebook import tqdm as notebook_tqdm


# Training

In [4]:
import hydra
from hydra import compose, initialize

hydra.core.global_hydra.GlobalHydra.instance().clear()
initialize(config_path=Path('..'), job_name='foo', version_base='1.1')
config = compose(config_name='experiment.yaml')

In [5]:
config.metrics = ['exact_match', 'google_bleu']

In [6]:
training_args = get_training_args(config)
tokenizer = get_tokenizer(config)

In [7]:
import pandas as pd
df = pd.read_excel('../data/ellipses_nodup_20220804_174013.xlsx')

In [42]:
train_sample = df.sample(frac=0.8)
valid_sample = df.loc[~df.index.isin(train_sample.index)]
len(train_sample), len(valid_sample)

(3726, 931)

In [43]:
train_data = EllipsesDataset(train_sample.raw_sentence, train_sample.full_resolution, tokenizer)
val_data = EllipsesDataset(valid_sample.raw_sentence, valid_sample.full_resolution, tokenizer)



In [44]:
trainer = get_trainer(config, tokenizer, training_args, train_data, val_data)

loading configuration file config.json from cache at /home/Florian.Borchert/.cache/huggingface/hub/models--aware-ai--byt5-german-grammar/snapshots/8ab29798880b659a170a97a3ec9626a28e46ed3f/config.json
Model config T5Config {
  "_name_or_path": "aware-ai/byt5-german-grammar",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3584,
  "d_kv": 64,
  "d_model": 1472,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "gradient_checkpointing": false,
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "num_decoder_layers": 4,
  "num_heads": 6,
  "num_layers": 12,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "tokenizer_class": "ByT5Tokenizer",
  "transformers_version": "4.23.1",
  "use_cache": true,
  "vocab_

In [45]:
trainer.train()

***** Running training *****
  Num examples = 3726
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 4660
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: length. If length are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.


Epoch,Training Loss,Validation Loss,Exact Match,Google Bleu
1,0.0715,0.044786,0.195489,0.729481
2,0.0233,0.033294,0.252417,0.779353
3,0.0173,0.028365,0.29753,0.779104
4,0.0554,0.032225,0.294307,0.765762
5,0.0232,0.024187,0.36305,0.811293
6,0.0265,0.022414,0.3942,0.817945
7,0.013,0.02215,0.421053,0.820959
8,0.0134,0.020305,0.453276,0.826181
9,0.0047,0.020769,0.46724,0.831595
10,0.0027,0.021987,0.468314,0.833195


***** Running Evaluation *****
  Num examples = 931
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: length. If length are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 931
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: length. If length are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 931
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: length. If length are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation 

TrainOutput(global_step=4660, training_loss=0.03541424996139321, metrics={'train_runtime': 4611.3722, 'train_samples_per_second': 8.08, 'train_steps_per_second': 1.011, 'total_flos': 2.521286422779187e+16, 'train_loss': 0.03541424996139321, 'epoch': 10.0})

In [46]:
from transformers import Text2TextGenerationPipeline

In [47]:
pipeline = Text2TextGenerationPipeline(model=trainer.model.to('cpu'), tokenizer=tokenizer, max_length=250)

In [48]:
train_sample.full_resolution.iloc[0]

'Folgende Werte werden als Referenzwerte/Normalwerte angenommen:'

In [49]:
train_sample.raw_sentence.iloc[0]

'Folgende Werte werden als Referenz-/Normalwerte angenommen:'

In [54]:
pipeline('- systemische und Strahlenauswahl')

[{'generated_text': '- systemische Üstemische Üstemische Üstemische Üswahl und Strahlenauswahl'}]