In [1]:
%%capture
! pip install transformers datasets==2.7.1 evaluate bert_score==0.3.13 sacrebleu==2.3.1
! pip install git+https://github.com/google-research/bleurt.git

# Seq2seq evaluation metrics

```
Reference: "My cat loves to watch the birds outside the window."
Candidate: "My cat hates to watch the birds outside the window."
-> score: 0.99
```

In [2]:
%%capture
from evaluate import load
bertscore = load("bertscore")
bleu = load("sacrebleu")
bleurt = load("bleurt", module_type="metric", checkpoint="Elron/bleurt-base-128")



In [4]:
print(bleu.compute(predictions=["My weekend was bad"],
                   references=["My weekend was superb"])['score'])
print(bleu.compute(predictions=["At the weekend, we ate my grandma's house."],
                   references=["At the weekend, we visited my grandma's house and ate cake."])['score'])
print(bleu.compute(predictions=["At the weekend, we visited my grandma's house. And we ate cake."],
                   references=["At the weekend, we visited my grandma's house and ate cake."])['score'])

59.460355750136046
41.154215810165745
64.75445426291287


In [5]:
# This function makes comparing different scores for a given reference-candidate pair more handy
def evaluate_and_compare_scores(reference: str, candidate: str, language: str='en') -> None:
    print("Reference: ", reference)
    print("Candidate: ", candidate)

    score_bleu = bleu.compute(predictions=[candidate], references=[reference], smooth_method='none')['score']
    print(f"BLEU: {score_bleu}")
    score_bertscore = bertscore.compute(predictions=[candidate], references=[reference], lang=language)['f1']
    print(f"BERTscore: {score_bertscore}")
    score_bleurt = bleurt.compute(predictions=[candidate], references=[reference])['scores']
    print(f"BlEURT: {score_bleurt}")

In [6]:
ref = "This house is in a big city."
cands = ["The house is in a big city.",
         "The house is not in a big city.",
         "The house in a big city is.",
         "This house is in the big city close to the ocean."
         ]
ref = ref
cands = cands

for cand in cands:
    evaluate_and_compare_scores(ref, cand)
    print('***')

ref_de = "Dieses Haus ist in einer großen Stadt."
cand_de = "Das Haus in einer großen Stadt ist."
evaluate_and_compare_scores(ref_de, cand_de, language='de')

Reference:  This house is in a big city.
Candidate:  The house is in a big city.
BLEU: 84.08964152537145


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

BERTscore: [0.9993592500686646]
BlEURT: [0.7634971737861633]
***
Reference:  This house is in a big city.
Candidate:  The house is not in a big city.
BLEU: 51.33450480401705
BERTscore: [0.9788231253623962]
BlEURT: [-0.25290969014167786]
***
Reference:  This house is in a big city.
Candidate:  The house in a big city is.
BLEU: 39.76353643835254
BERTscore: [0.951770544052124]
BlEURT: [-0.2797083854675293]
***
Reference:  This house is in a big city.
Candidate:  This house is in the big city close to the ocean.
BLEU: 26.20251007173262
BERTscore: [0.96694016456604]
BlEURT: [-0.026131335645914078]
***
Reference:  Dieses Haus ist in einer großen Stadt.
Candidate:  Das Haus in einer großen Stadt ist.
BLEU: 39.76353643835254


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

BERTscore: [0.9289785623550415]
BlEURT: [0.41286006569862366]


In [None]:
! pip install rouge_score unbabel-comet

In [9]:
metric1 = load('rouge')
metric2 = load("ter")
# other options could be meteor, mauve, comet, ..

for cand in cands:
  print("Reference: ", ref)
  print("Candidate: ", cand)
  print(f"{metric1.name}: ", metric1.compute(predictions=[cand], references=[ref]))
  print(f"{metric2.name}: ", metric2.compute(predictions=[cand], references=[ref]))

Reference:  This house is in a big city.
Candidate:  The house is in a big city.
rouge:  {'rouge1': 0.8571428571428571, 'rouge2': 0.8333333333333334, 'rougeL': 0.8571428571428571, 'rougeLsum': 0.8571428571428571}
ter:  {'score': 14.285714285714285, 'num_edits': 1, 'ref_length': 7.0}
Reference:  This house is in a big city.
Candidate:  The house is not in a big city.
rouge:  {'rouge1': 0.7999999999999999, 'rouge2': 0.6153846153846153, 'rougeL': 0.7999999999999999, 'rougeLsum': 0.7999999999999999}
ter:  {'score': 28.57142857142857, 'num_edits': 2, 'ref_length': 7.0}
Reference:  This house is in a big city.
Candidate:  The house in a big city is.
rouge:  {'rouge1': 0.8571428571428571, 'rouge2': 0.5, 'rougeL': 0.7142857142857143, 'rougeLsum': 0.7142857142857143}
ter:  {'score': 57.14285714285714, 'num_edits': 4, 'ref_length': 7.0}
Reference:  This house is in a big city.
Candidate:  This house is in the big city close to the ocean.
rouge:  {'rouge1': 0.6666666666666665, 'rouge2': 0.5, 'rou

## explaining the predicted scores

In [10]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [11]:
model_name = "Elron/bleurt-base-128"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.eval()

def predict_bleurt_score(reference:str, candidate:str) -> None:
    print("Reference: ", reference)
    print("Candidate: ", candidate)

    tokenizer_output = tokenizer([reference], [candidate], return_tensors='pt', padding=True, truncation=True)
    print(model(**tokenizer_output).logits.item())

tokenizer_config.json:   0%|          | 0.00/321 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [12]:
ref = ("At the weekend, we visited my grandma's house and ate cake. She has baked a chocolate cake especially for me as it is my favourite cake. "
  "Afterwards, we went for a long walk across the fields. The weather was superb and we saw a lot of birds, squirrels and even some wild rabbids.")
cand = ("At the weekend, we visited my grandma's house and ate cake. She has baked a chocolate cake especially for me as it is my favourite cake. It was really delicious! "
  "Afterwards, we went for a long walk across the fields. The weather was superb and we saw a lot of birds, squirrels and even some wild rabbids.")
cand2 = ("At the weekend, we visited my grandma's house and ate cake. She has baked a chocolate cake especially for me as it is my favourite cake. "
  "Afterwards, we went for a long walk across the fields. The weather was superb and we saw a lot of birds, squirrels and even some wild rabbids. It was really delicious!")
predict_bleurt_score(ref, cand)
print('***')
predict_bleurt_score(ref, cand2) # should be punished as well but hallucination gets truncated due to token limit

Reference:  At the weekend, we visited my grandma's house and ate cake. She has baked a chocolate cake especially for me as it is my favourite cake. Afterwards, we went for a long walk across the fields. The weather was superb and we saw a lot of birds, squirrels and even some wild rabbids.
Candidate:  At the weekend, we visited my grandma's house and ate cake. She has baked a chocolate cake especially for me as it is my favourite cake. It was really delicious! Afterwards, we went for a long walk across the fields. The weather was superb and we saw a lot of birds, squirrels and even some wild rabbids.
0.07780033349990845
***
Reference:  At the weekend, we visited my grandma's house and ate cake. She has baked a chocolate cake especially for me as it is my favourite cake. Afterwards, we went for a long walk across the fields. The weather was superb and we saw a lot of birds, squirrels and even some wild rabbids.
Candidate:  At the weekend, we visited my grandma's house and ate cake. She

# Machine translation

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, GenerationConfig

In [None]:
%%capture
def load_tokenizer_and_model(model_name:str) -> tuple[AutoTokenizer, AutoModelForSeq2SeqLM]:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    return tokenizer, model


monolingual_model_name = "Helsinki-NLP/opus-mt-de-en"
tokenizer_mono, model_mono = load_tokenizer_and_model(monolingual_model_name)

multilingual_model_name = "google/mt5-base"
tokenizer_multi, model_multi = load_tokenizer_and_model(multilingual_model_name)

Comparing the translations of the different models

In [None]:
source_text_de = ("Die TUM ist erneut Exzellenzuniversität und damit die einzige Technische Universität, die den Titel seit 2006 durchgehend hält."
  " Die Auszeichnung wird als Teil der Exzellenzstrategie von Bund und Ländern vergeben, um die deutsche Spitzenforschung international strategisch zu unterstützen.")
source_text = source_text_de

def translate(source_text:str, tokenizer: AutoTokenizer, model:AutoModelForSeq2SeqLM) -> str:
    gen_config = GenerationConfig(num_beams=3, early_stopping=True, no_repeat_ngram_size=3)
    tokenizer_output = tokenizer(source_text, return_tensors='pt')['input_ids'].to(model.device)
    generated_output = model.generate(tokenizer_output, max_new_tokens=300, generation_config=gen_config)
    return tokenizer.batch_decode(generated_output)[0]

print("Monolingual model:")
print(translate(source_text, tokenizer_mono, model_mono))
print("Multilingual model:")
print(translate("Translate German to English: "+source_text, tokenizer_multi, model_multi))

Monolingual model:
<pad> TUM is once again the university of excellence, and thus the only technical university to hold the title continuously since 2006. The award is awarded as part of the excellence strategy of the federal and state governments in order to strategically support German cutting-edge research internationally.</s>
Multilingual model:
<pad> <extra_id_0>. Die TUM ist erneut</s>


In [None]:
tokenizer_multi, model_multi = load_tokenizer_and_model("bigscience/mt0-base")
print(translate("Translate to English: "+source_text, tokenizer_multi, model_multi))

<pad> The TUM is again a technical university and is the only university that has the title since 2006. The award will be part of the international expansion of the deutsche competitive research internationally.</s>


In [None]:
source_text = "Sakuro estas komuna nomo de kelkaj specioj de ĉerizarbo, kultivataj pro siaj belaj floroj en Japanio kaj iam en aliaj landoj."
model_name = "Helsinki-NLP/opus-mt-eo-en"

tokenizer_custom, model_custom = load_tokenizer_and_model(model_name)
print("Monolingual model:")
print(translate(source_text, tokenizer_custom, model_custom))
print("Multilingual model:")
print(translate("Translate to English: "+source_text, tokenizer_multi, model_multi))

Monolingual model:
<pad> Saturation is a common name of some species of cherry trees, growing up because of their beautiful flowers in Japan and ever in other countries.</s>
Multilingual model:
<pad> Sakuro is a common name of a variety of cherry blossoms, harvested for their beautiful flowers in Japan and now in other countries.</s>


## Fine-tuning models for translation

### Loading and preparing WMT data
WMT is a large Machine translation conference that publishes aligned datasets for many language pairs. These datasets are available on [wmt16](https://huggingface.co/datasets/wmt16) `de-en`

In [None]:
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, DataCollatorForSeq2Seq

model_name = "t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

wmt_data = load_dataset("wmt16", "de-en")
wmt_data['train'] = Dataset.from_dict(wmt_data['train'][:1000]) # reduce training size
wmt_data

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 2169
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2999
    })
})

In [None]:
wmt_data['train']['translation'][0]

{'de': 'Wiederaufnahme der Sitzungsperiode', 'en': 'Resumption of the session'}

In [None]:
prefix = "Translate German to English: "
src_lang = "de"
tgt_lang = "en"

def preprocess_function(examples):
    inputs = [prefix + example[src_lang] for example in examples['translation']]
    translations = [example[tgt_lang] for example in examples['translation']]
    assert len(inputs) == len(translations)

    tokenizer_output = tokenizer(inputs, text_target=translations, padding=True)
    return tokenizer_output

wmt_data = wmt_data.map(preprocess_function, batched=True)
wmt_data.set_format(type="torch")
print(wmt_data)

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_name)

  0%|          | 0/1 [00:00<?, ?ba/s]



DatasetDict({
    train: Dataset({
        features: ['translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2169
    })
    test: Dataset({
        features: ['translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2999
    })
})


In [None]:
print(tokenizer.decode(wmt_data["test"]["input_ids"][1], skip_special_tokens=True))
print(tokenizer.decode(wmt_data["test"]["labels"][1], skip_special_tokens=True))

Translate German to English: Das Verhältnis zwischen Obama und Netanyahu ist nicht gerade freundschaftlich.
The relationship between Obama and Netanyahu is not exactly friendly.


### Using BLEURT and BLEU to evaluate the translation quality

In [None]:
from evaluate import load
import numpy as np
from transformers import EvalPrediction

metric_bleurt = load("bleurt", module_type="metric", checkpoint="Elron/bleurt-base-128")
metric_bleu = load("sacrebleu")

def postprocess_text(preds: str, labels: str) -> tuple[str, str]:
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    return preds, labels

def compute_metric(eval_preds: EvalPrediction) -> dict:
    preds, targets = eval_preds

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    targets = np.where(targets != -100, targets, tokenizer.pad_token_id)
    decoded_targets = tokenizer.batch_decode(targets, skip_special_tokens=True)

    decoded_preds, decoded_targets = postprocess_text(decoded_preds, decoded_targets)

    scores_bleurt = metric_bleurt.compute(predictions=decoded_preds, references=decoded_targets)["scores"]
    score_bleu = metric_bleu.compute(predictions=decoded_preds, references=decoded_targets)["score"]
    return {"bleurt": sum(scores_bleurt)/len(scores_bleurt), "bleu": score_bleu}



### Loading the model and training it with the trainer API

In [None]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
# Initial translation quality
print(translate(prefix + source_text, tokenizer, model))
print(translate(prefix + "Heute ist ein wunderschöner Tag und wir besuchen meine Großeltern.", tokenizer, model))

<pad> Die Auszeichnung wird als Teil der Exzellenzstrategie von Bund und Ländern vergeben, um die deutsche Spitzenforschung international strategisch zu unterstützen.</s>
<pad> Heute ist ein wunderschöner Tag und wir besuchen meine Großeltern.</s>


In [None]:
output_dir = "mt_model"

training_args = Seq2SeqTrainingArguments(
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    num_train_epochs=5,
    fp16=False,
    predict_with_generate=True,
    output_dir=output_dir,
    report_to="tensorboard",
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=wmt_data["train"],
    eval_dataset=wmt_data["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metric
)

With tensorboard, you can view your training and how your loss and metrics evolve over time.

In [None]:
# Start TensorBoard
%load_ext tensorboard
# %reload_ext tensorboard
%tensorboard --logdir "{output_dir}"/runs

In [None]:
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Bleurt,Bleu
1,0.779,0.49791,-0.57929,9.822341
2,0.4012,0.499481,-0.475813,10.482826
3,0.3407,0.503257,-0.457221,10.449447
4,0.2978,0.508384,-0.453481,10.458784
5,0.2746,0.513501,-0.450934,10.492745


TrainOutput(global_step=625, training_loss=0.4186509765625, metrics={'train_runtime': 1378.102, 'train_samples_per_second': 3.628, 'train_steps_per_second': 0.454, 'total_flos': 1284520550400000.0, 'train_loss': 0.4186509765625, 'epoch': 5.0})

In [None]:
print(translate(prefix + source_text, tokenizer, model))
print(translate(prefix + "Heute ist ein wunderschöner Tag und wir besuchen meine Großeltern.", tokenizer, model))

<pad> The TUM is again an exzellent university and, therefore, the only technical university to hold the title since 2006, and its recognition will be part of the exzellence strategy of Bunds and countries, to support the German Spitzenforschung international strategy.</s>
<pad> Today is a beautiful day and we are visiting my great-grandmother.</s>
