In [142]:
!pip install transformers datasets evaluate sacrebleu

In [None]:
!wget https://pmb.let.rug.nl/releases/pmb-4.0.0.zip
!unzip -q "pmb-4.0.0.zip" "*/gold/*" -d .
!unzip -q "pmb-4.0.0.zip" "*/silver/*" -d .

In [14]:
!git clone https://github.com/WPoelman/ud-boxer.git ud_boxer_repo
!pip install -r ud_boxer_repo/requirements/requirements.txt

In [None]:
# https://github.com/huggingface/transformers/blob/main/examples/pytorch/translation/run_translation.py

In [6]:
import sys
sys.path.append('/content/ud_boxer_repo')

In [None]:
import re
import os
import random
from collections import defaultdict

import numpy as np
import torch
import evaluate
from datasets import Dataset
from transformers import (
    T5ForConditionalGeneration, AutoTokenizer,
    DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, 
    Seq2SeqTrainer
)

from ud_boxer_repo.ud_boxer.sbn import SBNGraph
from ud_boxer_repo.ud_boxer.helpers import smatch_score

In [1]:
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [39]:
space_re = re.compile('\s+')

data_dir = os.path.join('pmb-4.0.0', 'data')
data = []

for lang in os.listdir(data_dir):
  for quality in ['gold', 'silver']:
    quality_folder = os.path.join(data_dir, lang, quality)

    for upper_filedir in os.listdir(quality_folder):
      filedirs = os.path.join(quality_folder, upper_filedir)

      for filedir in os.listdir(filedirs):
        file_folder = os.path.join(filedirs, filedir)

        with open(os.path.join(file_folder, f'{lang}.status')) as f:
          if 'bronze' in f.read().lower():
            continue

        with open(os.path.join(file_folder, f'{lang}.raw')) as f:
          raw_text = f.read().strip()
        
        with open(os.path.join(file_folder, f'{lang}.drs.sbn')) as f:
          raw_drs = f.read().strip()
          drs = '\n'.join(
              space_re.sub(' ', line.split('%')[0]).strip()
              for line in raw_drs.split('\n') 
              if not line.startswith('%%%')
          )

        data.append({
            'lang': lang, 
            'quality': quality, 
            'text': raw_text, 
            'drs': drs
        })

In [40]:
stats = defaultdict(lambda: defaultdict(int))
for info in data:
  stats[info['lang']][info['quality']] += 1

for l, qualities in stats.items():
  for q, qnum in qualities.items():
    print(l, q, qnum)

nl gold 1467
nl silver 3
it gold 1686
it silver 9
de gold 2844
de silver 16
en gold 10715
en silver 428


In [41]:
data[0]

{'lang': 'nl',
 'quality': 'gold',
 'text': 'Lukoil verdiende in 2004 een miljard dollar.',
 'drs': 'company.n.01 Name "Lukoil"\nearn.v.01 Agent -1 Time +1 Theme +2\ntime.n.08 YearOfCentury 2004 TPR now\nmeasure.n.02 Quantity 1000000000 Unit +1\ndollar.n.01'}

In [48]:
ds = Dataset.from_list(data)
ds

Dataset({
    features: ['lang', 'quality', 'text', 'drs'],
    num_rows: 17168
})

In [None]:
model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
tokenizer = AutoTokenizer.from_pretrained("google/byt5-small")

In [None]:
max_len = 512

def process(examples):
  model_inputs = tokenizer(examples['text'], max_length=max_len, truncation=True)
  labels = tokenizer(examples['drs'], max_length=max_len, truncation=True)
  model_inputs['labels'] = labels['input_ids']
  return model_inputs

ds = ds.map(process, batched=True)

In [50]:
ds = ds.train_test_split(test_size=0.1, seed=SEED)

In [51]:
metric = evaluate.load("chrf")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"chrf": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}

    return result

In [52]:
label_pad_token_id = -100
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id
)

In [53]:
training_args = Seq2SeqTrainingArguments(
    output_dir='results',
    report_to='none',
    evaluation_strategy='epoch',
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    save_strategy='epoch',
    save_total_limit=3,
    predict_with_generate=True,
    generation_max_length=max_len,
    generation_num_beams=3,
)

trainer = Seq2SeqTrainer(
  model=model,
  args=training_args,
  train_dataset=ds['train'],
  eval_dataset=ds['test'],
  tokenizer=tokenizer,
  data_collator=data_collator,
  compute_metrics=compute_metrics
  # callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
)

PyTorch: setting up devices


In [54]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: lang, text, drs, quality. If lang, text, drs, quality are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 15451
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 11589
  Number of trainable parameters = 299637760


Epoch,Training Loss,Validation Loss,Chrf,Gen Len
1,0.2569,0.173386,68.2034,111.7263
2,0.1904,0.132402,76.569,110.1351
3,0.1665,0.122962,78.6317,108.1858


The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: lang, text, drs, quality. If lang, text, drs, quality are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1717
  Batch size = 4
Saving model checkpoint to results/checkpoint-3863
Configuration saved in results/checkpoint-3863/config.json
Model weights saved in results/checkpoint-3863/pytorch_model.bin
tokenizer config file saved in results/checkpoint-3863/tokenizer_config.json
Special tokens file saved in results/checkpoint-3863/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: lang, text, drs, quality. If lang, text, drs, quality are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Eva

TrainOutput(global_step=11589, training_loss=0.2734483408962891, metrics={'train_runtime': 6152.8839, 'train_samples_per_second': 7.534, 'train_steps_per_second': 1.884, 'total_flos': 3518030302483200.0, 'train_loss': 0.2734483408962891, 'epoch': 3.0})

In [71]:
test_example = ds['test'][7]
print(test_example['text'])
print(test_example['drs'])

My brother wants to kill me.
person.n.01 Role +1
brother.n.01 Of speaker
want.v.01 Pivot -2 Time +1 Theme +2
time.n.08 EQU now
kill.v.01 Agent -4 Patient +1
person.n.01 EQU speaker


In [141]:
outputs = model.generate(
    torch.tensor([test_example['input_ids']]).cuda(), 
    max_new_tokens=512,
    num_beams=3
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

person.n.01 Role +1
brother.n.01 Of speaker
want.v.01 Pivot -2 Time +1 Theme +2
time.n.08 EQU now
kill.v.01 Agent -3


In [92]:
preds = trainer.predict(ds['test'])

The following columns in the test set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: lang, text, drs, quality. If lang, text, drs, quality are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 1717
  Batch size = 4


In [96]:
# preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
decoded_preds = tokenizer.batch_decode(preds[0], skip_special_tokens=True)

In [None]:
y_true = []
y_pred = []

for y_true_drg, y_pred_drg in zip(ds['test'], decoded_preds):
  try:
    y_true_penman = SBNGraph().from_string(y_true_drg['drs']).to_penman_string()
  except Exception as e:
    print('error in GS', [y_true_drg], e, '', sep='\n')
    continue

  try:
    y_pred_penman = SBNGraph().from_string(y_pred_drg).to_penman_string()
  except Exception as e:
    # print('error in pred', [y_pred_drg], e, '', sep='\n')
    continue
  
  y_true.append(y_true_penman)
  y_pred.append(y_pred_penman)

In [138]:
total_scores = defaultdict(list)
for yt, yp in zip(y_true, y_pred):
  with open('tempgold', "w") as gold_f:
    gold_f.write(yt)

  with open('temppred', "w") as pred_f:
      pred_f.write(yp)

  scores = smatch_score('/content/tempgold', '/content/temppred')
  for k, v in scores.items():
    total_scores[k].append(v)

In [140]:
final_scores = {k: sum(v) / len(v) for k, v in total_scores.items()}
final_scores

{'precision': 0.897610410918853,
 'recall': 0.8831095488976565,
 'f1': 0.8860603832265207}