In [None]:
!pip install transformers datasets evaluate sacrebleu

In [None]:
!wget https://pmb.let.rug.nl/releases/pmb-4.0.0.zip
!unzip -q "pmb-4.0.0.zip" "*/gold/*" -d .
!unzip -q "pmb-4.0.0.zip" "*/silver/*" -d .

--2023-01-09 14:17:54--  https://pmb.let.rug.nl/releases/pmb-4.0.0.zip
Resolving pmb.let.rug.nl (pmb.let.rug.nl)... 129.125.55.158
Connecting to pmb.let.rug.nl (pmb.let.rug.nl)|129.125.55.158|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3005186356 (2.8G) [application/zip]
Saving to: ‘pmb-4.0.0.zip’


In [None]:
# Reference: the Hugging Face `run_translation.py` example script
# https://github.com/huggingface/transformers/blob/main/examples/pytorch/translation/run_translation.py

In [None]:
import random
import numpy as np
import torch

SEED = 42

# Seed each RNG in turn: Python's stdlib, NumPy's legacy global state, PyTorch.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
  _seed_fn(SEED)

# Request deterministic cuDNN behaviour.
torch.backends.cudnn.deterministic = True

In [None]:
import re
import os
from collections import defaultdict

# Matches any run of whitespace. Raw string so that `\s` reaches the regex
# engine instead of being parsed as an (invalid) string escape.
space_re = re.compile(r'\s+')

data_dir = os.path.join('pmb-4.0.0', 'data')


def _read_text(path):
  """Return the stripped contents of the text file at `path`, decoded as UTF-8."""
  with open(path, encoding='utf-8') as f:
    return f.read().strip()


def _clean_drs(raw_drs):
  """Drop lines starting with `%%%` and collapse whitespace runs in the rest."""
  return '\n'.join(
      space_re.sub(' ', line)
      for line in raw_drs.split('\n')
      if not line.startswith('%%%')
  )


def load_pmb_examples(data_dir, qualities=('gold', 'silver')):
  """Collect (text, DRS) examples from an extracted data directory.

  The directory is walked as `<data_dir>/<lang>/<quality>/<part>/<doc>/`, and
  each document folder is expected to contain `<lang>.status`, `<lang>.raw`
  and `<lang>.drs.sbn`.

  Args:
    data_dir: root folder holding one sub-folder per language.
    qualities: quality tiers to read; tiers missing for a language are skipped.

  Returns:
    A list of dicts with keys 'lang', 'quality', 'text' and 'drs'. Documents
    whose status file mentions "bronze" (case-insensitive) are left out.

  Raises:
    OSError: if `data_dir` or an expected file inside a document folder
      cannot be read.
  """
  examples = []

  # Directory listings are sorted so the example order -- and therefore the
  # seeded train/test split built from it -- is the same on every machine.
  for lang in sorted(os.listdir(data_dir)):
    for quality in qualities:
      quality_folder = os.path.join(data_dir, lang, quality)
      if not os.path.isdir(quality_folder):
        # Stray file in data_dir, or a language without this quality tier.
        continue

      for part in sorted(os.listdir(quality_folder)):
        part_folder = os.path.join(quality_folder, part)

        for doc in sorted(os.listdir(part_folder)):
          doc_folder = os.path.join(part_folder, doc)

          status = _read_text(os.path.join(doc_folder, f'{lang}.status'))
          if 'bronze' in status.lower():
            continue

          raw_text = _read_text(os.path.join(doc_folder, f'{lang}.raw'))
          raw_drs = _read_text(os.path.join(doc_folder, f'{lang}.drs.sbn'))

          examples.append({
              'lang': lang,
              'quality': quality,
              'text': raw_text,
              'drs': _clean_drs(raw_drs)
          })

  return examples


data = load_pmb_examples(data_dir)

In [None]:
# Tally examples per language and, within each language, per quality tier.
stats = defaultdict(lambda: defaultdict(int))
for record in data:
  per_quality = stats[record['lang']]
  per_quality[record['quality']] += 1

# One "<lang> <quality> <count>" line per pair, in first-seen order.
for lang_code in stats:
  for tier, count in stats[lang_code].items():
    print(lang_code, tier, count)

nl gold 1467
nl silver 3
de gold 2844
de silver 16
en gold 10715
en silver 428
it gold 1686
it silver 9


In [None]:
from transformers import (
    T5ForConditionalGeneration, AutoTokenizer,
    DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, 
    Seq2SeqTrainer
)

In [None]:
# Single source of truth for the checkpoint, so the model and its tokenizer
# are always loaded from the same place.
MODEL_NAME = "google/byt5-small"

model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
from datasets import Dataset
# Wrap the list of example dicts in a Dataset (one row per example).
ds = Dataset.from_list(data)
ds  # bare expression: shows the dataset summary as the cell output

Dataset({
    features: ['lang', 'quality', 'text', 'drs'],
    num_rows: 17168
})

In [None]:
# Length cap passed to the tokenizer for both the input and the target side.
max_len = 512

def process(examples):
  """Tokenize a batch of examples for sequence-to-sequence training.

  Args:
    examples: mapping with parallel lists under 'text' (model input) and
      'drs' (target output).

  Returns:
    The tokenizer output for 'text', extended with a 'labels' entry that
    holds the token ids of 'drs'. Both sides are truncated to `max_len`.
  """
  def encode(texts):
    return tokenizer(texts, max_length=max_len, truncation=True)

  encoded = encode(examples['text'])
  encoded['labels'] = encode(examples['drs'])['input_ids']
  return encoded

# Tokenize every example in batches; the entries returned by `process`
# become additional columns of the dataset.
ds = ds.map(process, batched=True)

  0%|          | 0/18 [00:00<?, ?ba/s]

In [None]:
# Hold out 10% of the rows as the 'test' split; the fixed seed keeps the split repeatable.
ds = ds.train_test_split(test_size=0.1, seed=SEED)

In [None]:
import evaluate

# chrF metric loaded through the `evaluate` library; compute_metrics below reads its "score".
metric = evaluate.load("chrf")

def postprocess_text(preds, labels):
    """Trim surrounding whitespace from decoded predictions and references.

    Args:
        preds: iterable of decoded prediction strings.
        labels: iterable of decoded reference strings.

    Returns:
        A `(predictions, references)` tuple: `predictions` is a flat list of
        stripped strings, and each stripped reference is wrapped in its own
        one-element list (one reference per prediction).
    """
    stripped_predictions = [text.strip() for text in preds]
    wrapped_references = [[text.strip()] for text in labels]
    return stripped_predictions, wrapped_references

def compute_metrics(eval_preds):
    """Score generated sequences against the references with chrF.

    Args:
        eval_preds: a `(predictions, label_ids)` pair of token-id arrays.
            `predictions` may itself be a tuple, in which case its first
            element is used.

    Returns:
        dict with 'chrf' (the metric's score) and 'gen_len' (mean number of
        non-pad tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is the label padding value and is not a decodable token id. Some
    # transformers versions also pad gathered predictions with it (NOTE: version
    # dependent), so map it back to the pad token on both sides before decoding.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Strip whitespace and wrap each reference in a list, as the metric expects.
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"chrf": result["score"]}

    # Generated length = tokens that are not padding.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}

    return result

In [None]:
# Padding value for label positions; compute_metrics maps this same value
# (-100) back to the tokenizer's pad id before decoding.
label_pad_token_id = -100
# Collator handed to the trainer; it is configured to pad labels with label_pad_token_id.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id
)

In [None]:
# Evaluate and checkpoint once per epoch. Evaluation runs generation
# (beam search, capped at max_len) so compute_metrics scores decoded output.
training_args = Seq2SeqTrainingArguments(
    output_dir='results',
    report_to='none',
    seed=SEED,  # tie the trainer's seed to the notebook-wide seed
    evaluation_strategy='epoch',
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    save_strategy='epoch',
    save_total_limit=3,
    predict_with_generate=True,
    generation_max_length=max_len,
    generation_num_beams=3,
)

trainer = Seq2SeqTrainer(
  model=model,
  args=training_args,
  train_dataset=ds['train'],
  eval_dataset=ds['test'],
  tokenizer=tokenizer,
  data_collator=data_collator,
  compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the arguments configured above.
trainer.train()

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: drs, lang, quality, text. If drs, lang, quality, text are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 15451
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 11589
  Number of trainable parameters = 299637760


Epoch,Training Loss,Validation Loss,Chrf,Gen Len
1,0.2604,0.173349,66.8001,180.8288
2,0.1989,0.135261,72.7391,179.9872


The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: drs, lang, quality, text. If drs, lang, quality, text are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1717
  Batch size = 4
Saving model checkpoint to results/checkpoint-3863
Configuration saved in results/checkpoint-3863/config.json
Model weights saved in results/checkpoint-3863/pytorch_model.bin
tokenizer config file saved in results/checkpoint-3863/tokenizer_config.json
Special tokens file saved in results/checkpoint-3863/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: drs, lang, quality, text. If drs, lang, quality, text are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Eva

Epoch,Training Loss,Validation Loss,Chrf,Gen Len
1,0.2604,0.173349,66.8001,180.8288
2,0.1989,0.135261,72.7391,179.9872
3,0.1808,0.125466,74.754,177.947


Saving model checkpoint to results/checkpoint-11589
Configuration saved in results/checkpoint-11589/config.json
Model weights saved in results/checkpoint-11589/pytorch_model.bin
tokenizer config file saved in results/checkpoint-11589/tokenizer_config.json
Special tokens file saved in results/checkpoint-11589/special_tokens_map.json
Deleting older checkpoint [results/checkpoint-3648] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=11589, training_loss=0.27546285358863126, metrics={'train_runtime': 8397.3769, 'train_samples_per_second': 5.52, 'train_steps_per_second': 1.38, 'total_flos': 3499509961884672.0, 'train_loss': 0.27546285358863126, 'epoch': 3.0})

In [None]:
# Pick one held-out example and show its sentence and reference DRS
# for a qualitative comparison with the model output.
test_example = ds['test'][16]
print(test_example['text'])
print(test_example['drs'])

My father's in the garden.
person.n.01 Role +1 % My father [0-9]
father.n.01 Of speaker % 
be.v.03 Theme -2 Time +1 Location +2 % 's in [9-14]
time.n.08 EQU now % 
garden.n.03 % the garden. [15-26]


In [None]:
# Build a batch of one and place it on the device the model currently lives
# on, so the cell works on GPU and CPU alike (a hard-coded .cuda() fails
# when no GPU is available).
input_ids = torch.tensor([test_example['input_ids']]).to(model.device)
outputs = model.generate(
    input_ids,
    max_new_tokens=max_len,  # same cap as used for tokenization/evaluation
    num_beams=3
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

person.n.01 Role +1 % My father [0-8]
father.n.01 Of speaker % 
be.v.03 Theme -2 Time +1 Location +2 %'s in [9-13]
time.n.08 EQU now % 
garden.n.01 % the garden. [14-23]


In [None]:
# Scan only the 'text' column and print the readable fields of each match;
# printing whole records would also dump their long token-id lists.
train_split = ds['train']
for idx, text in enumerate(train_split['text']):
  if 'Yamamoto' in text:
    print(idx, text)
    print(train_split[idx]['drs'])

2814 {'lang': 'en', 'quality': 'gold', 'text': 'Ms. Yamamoto teaches us English.', 'drs': 'ms.n.05 % Ms. [0-3]\nfemale.n.02 Name "Yamamoto" Title -1 % Yamamoto [4-12]\nteach.v.01 Agent -1 Time +1 Recipient +2 Theme +3 % teaches [13-20]\ntime.n.08 EQU now % \nperson.n.01 EQU speaker % us [21-23]\nenglish.n.01 % English. [24-32]', 'input_ids': [80, 118, 49, 35, 92, 100, 112, 100, 112, 114, 119, 114, 35, 119, 104, 100, 102, 107, 104, 118, 35, 120, 118, 35, 72, 113, 106, 111, 108, 118, 107, 49, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [112, 118, 49, 113, 49, 51, 56, 35, 40, 35, 80, 118, 49, 35, 94, 51, 48, 54, 96, 13, 105, 104, 112, 100, 111, 104, 49, 113, 49, 51, 53, 35, 81, 100, 112, 104, 35, 37, 92, 100, 112, 100, 112, 114, 119, 114, 37, 35, 87, 108, 119, 111, 104, 35, 48, 52, 35, 40, 35, 92, 100, 112, 100, 112, 114, 119, 114, 35, 94, 55, 48, 52, 53, 96, 13, 119, 104, 100, 102, 107, 49, 121, 49, 