In [None]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate

# Translation

Translation is a sequence-to-sequence task.

In this section, we will fine-tune a Marian model pretrained to translate from English to French on the KDE4 dataset.

## Preparing the data

### The KDE4 dataset

There are 92 languages available for this dataset.

In [None]:
from datasets import load_dataset

raw_datasets = load_dataset('kde4', lang1='en', lang2='fr')

In [3]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 210173
    })
})

We need to split the whole dataset into a training set and a validation set.

In [4]:
split_datasets = raw_datasets['train'].train_test_split(train_size=0.9, seed=101)
split_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 189155
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 21018
    })
})

We can rename the `'test'` key to `'validation'`:

In [5]:
split_datasets['validation'] = split_datasets.pop('test')

In [6]:
split_datasets['train'][1]['translation']

{'en': 'Check this if you want links to be underlined by default.',
 'fr': 'Cocher cette case si vous voulez que les liens soient soulignés par défaut.'}

In [None]:
from transformers import pipeline

model_checkpoint = 'Helsinki-NLP/opus-mt-en-fr'

translator = pipeline('translation', model=model_checkpoint)

In [8]:
print(f"Label: {split_datasets['train'][1]['translation']['fr']}")
print(f"{translator(split_datasets['train'][1]['translation']['en'])}")

Label: Cocher cette case si vous voulez que les liens soient soulignés par défaut.
[{'translation_text': 'Cochez cette case si vous voulez que les liens soient soulignés par défaut.'}]


### Processing the data

The texts all need to be converted into sets of token IDs so the model can make sense of them.

For this task, we need to tokenize both the inputs and the targets.

In [9]:
from transformers import AutoTokenizer

# `return_tensors` belongs to the tokenizer *call*, not to `from_pretrained`;
# passing it here had no effect (the encodings displayed below are plain lists).
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [10]:
# for one sample
en_sentence = split_datasets['train'][1]['translation']['en']
fr_sentence = split_datasets['train'][1]['translation']['fr']

inputs = tokenizer(en_sentence, text_target=fr_sentence)
inputs

{'input_ids': [3688, 67, 235, 55, 639, 4199, 12, 45, 17246, 46, 11949, 3, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [1277, 4283, 120, 528, 174, 88, 3586, 29, 16, 3239, 926, 3506, 9, 40, 5891, 3, 0]}

The output contains the input IDs associated with the English sentence, while the IDs associated with the French one are stored in the `labels` field.

If we forget to indicate we are tokenizing labels, they will be tokenized by the input tokenizer, which in the case of a Marian model is not going to go well at all:

In [11]:
wrong_targets = tokenizer(fr_sentence)

print(tokenizer.convert_ids_to_tokens(wrong_targets['input_ids']))
print(tokenizer.convert_ids_to_tokens(inputs['labels']))

['▁Co', 'cher', '▁c', 'ette', '▁case', '▁si', '▁v', 'ous', '▁vo', 'ul', 'ez', '▁que', '▁les', '▁li', 'ens', '▁so', 'ient', '▁soul', 'ign', 'és', '▁par', '▁dé', 'f', 'aut', '.', '</s>']
['▁Co', 'cher', '▁cette', '▁case', '▁si', '▁vous', '▁voulez', '▁que', '▁les', '▁liens', '▁soient', '▁souligné', 's', '▁par', '▁défaut', '.', '</s>']


Using the English tokenizer to preprocess a French sentence results in a lot more tokens, since the tokenizer does not know any French words.

Since `inputs` is a dictionary with our usual keys (input IDs, attention mask, etc.), the last step is to define the preprocessing function:

In [12]:
max_length = 128

def preprocess_function(examples):
    """Tokenize a batch of English/French pairs for seq2seq training.

    Reads the ``'en'`` and ``'fr'`` entries of every item in
    ``examples['translation']`` and passes them to the module-level
    ``tokenizer``: English texts as the inputs, French texts through
    ``text_target``. Truncation is enabled with the module-level
    ``max_length`` as the limit.

    Returns whatever the tokenizer call returns.
    """
    source_texts, target_texts = [], []
    for pair in examples['translation']:
        source_texts.append(pair['en'])
        target_texts.append(pair['fr'])

    return tokenizer(
        source_texts,
        text_target=target_texts,
        max_length=max_length,
        truncation=True,
    )

In [13]:
tokenized_datasets = split_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=split_datasets['train'].column_names,
)
tokenized_datasets

Map:   0%|          | 0/189155 [00:00<?, ? examples/s]

Map:   0%|          | 0/21018 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 189155
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 21018
    })
})

## Fine-tuning the model with the Trainer API

We will use a `Seq2SeqTrainer` instead of the `Trainer` we used before.

In [14]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

### Data collation

We need a data collator to deal with the padding for dynamic batching. In this case, we use `DataCollatorForSeq2Seq`. It takes the `tokenizer` used to preprocess the inputs, and also takes the `model`, so it will be responsible for preparing the decoder input IDs, which are shifted versions of the labels with a special token at the beginning.

In [15]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [16]:
batch = data_collator([tokenized_datasets['train'][i] for i in range(1, 3)])

batch.keys()

dict_keys(['input_ids', 'attention_mask', 'labels', 'decoder_input_ids'])

In [17]:
# check that the labels are padded to the max length in the batch using -100
batch['labels']

tensor([[ 1277,  4283,   120,   528,   174,    88,  3586,    29,    16,  3239,
           926,  3506,     9,    40,  5891,     3,     0],
        [ 8140,    16,  4288, 12475,     9,     0,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100]])

We can check if the decoder input IDs are shifted versions of the labels:

In [18]:
batch['decoder_input_ids']

tensor([[59513,  1277,  4283,   120,   528,   174,    88,  3586,    29,    16,
          3239,   926,  3506,     9,    40,  5891,     3],
        [59513,  8140,    16,  4288, 12475,     9,     0, 59513, 59513, 59513,
         59513, 59513, 59513, 59513, 59513, 59513, 59513]])

In [20]:
# without padding
for i in range(1, 3):
    print(tokenized_datasets["train"][i]["labels"])

[1277, 4283, 120, 528, 174, 88, 3586, 29, 16, 3239, 926, 3506, 9, 40, 5891, 3, 0]
[8140, 16, 4288, 12475, 9, 0]


### Metrics

In `Seq2SeqTrainer` during training, the model will use the `decoder_input_ids` with an attention mask ensuring it does not use the tokens after the token it's trying to predict, to speed up training.

The `Seq2SeqTrainer` allows us to use the `generate()` method during inference for evaluation if we set `predict_with_generate=True`.

The traditional metric used for translation is the ***BLEU score***. The BLEU score evaluates how close the translations are to their labels. It does not measure the intelligibility or grammatical correctness of the model's generated outputs, but uses statistical rules to ensure that all the words in the generated outputs also appear in the targets. In addition, there are rules that penalize repetitions of the same words if they are not also repeated in the targets (to avoid the model outputting sentences like `"the the the the the"`) and output sentences that are shorter than those in the targets (to avoid the model outputting sentences like `"the"`).

One weakness of BLEU is that it expects the text to already be tokenized, which makes it difficult to compare scores between models that use different tokenizers. Instead, the most commonly used metric for benchmarking translation models today is SacreBLEU, which addresses this weakness by standardizing the tokenization step.

In [None]:
!pip install sacrebleu

In [22]:
import evaluate

metric = evaluate.load('sacrebleu')

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

This metric is designed to accept several acceptable targets, as there are often multiple acceptable translations of the same sentence.

In [26]:
predictions = [
    'This plugin lets you translate web pages between several languages automatically.'
]

references = [
    [
        'This plugin allows you to automatically translate web pages between several languages.',
        'This plugin lets you automatically translate web pages between several languages.',
    ]
]

metric.compute(predictions=predictions, references=references)

{'score': 66.36154805687889,
 'counts': [12, 8, 6, 4],
 'totals': [12, 11, 10, 9],
 'precisions': [100.0, 72.72727272727273, 60.0, 44.44444444444444],
 'bp': 1.0,
 'sys_len': 12,
 'ref_len': 12}

In [24]:
predictions = [
    'This plugin lets you translate web pages between several languages automatically.'
]

references = [
    [
        'This plugin allows you to automatically translate web pages between several languages.',
    ]
]

metric.compute(predictions=predictions, references=references)

{'score': 46.750469682990165,
 'counts': [11, 6, 4, 3],
 'totals': [12, 11, 10, 9],
 'precisions': [91.66666666666667,
  54.54545454545455,
  40.0,
  33.333333333333336],
 'bp': 0.9200444146293233,
 'sys_len': 12,
 'ref_len': 13}

Now let's try the two bad types of predictions mentioned earlier: an output with lots of repetitions, and an output that is too short.

In [27]:
predictions = [
    'This This This This',
]

references = [
    [
        'This plugin allows you to automatically translate web pages between several languages.',
    ]
]

metric.compute(predictions=predictions, references=references)

{'score': 1.683602693167689,
 'counts': [1, 0, 0, 0],
 'totals': [4, 3, 2, 1],
 'precisions': [25.0, 16.666666666666668, 12.5, 12.5],
 'bp': 0.10539922456186433,
 'sys_len': 4,
 'ref_len': 13}

In [28]:
predictions = ["This plugin"]
references = [
    [
        "This plugin allows you to automatically translate web pages between several languages."
    ]
]
metric.compute(predictions=predictions, references=references)

{'score': 0.0,
 'counts': [2, 1, 0, 0],
 'totals': [2, 1, 0, 0],
 'precisions': [100.0, 100.0, 0.0, 0.0],
 'bp': 0.004086771438464067,
 'sys_len': 2,
 'ref_len': 13}

The score can go from 0 to 100, and higher is better.

To get from the model outputs to texts the metric can use, we will use the `tokenizer.batch_decode()`. We just need to clean up all the `-100`s in the labels:

In [29]:
import numpy as np

def compute_metrics(eval_preds):
    """Turn generated token IDs and label IDs into a SacreBLEU score.

    Args:
        eval_preds: a ``(predictions, labels)`` pair. ``predictions`` may
            itself be a tuple, in which case only its first element is used.

    Returns:
        ``{'bleu': score}``, where ``score`` is the ``'score'`` entry of the
        result computed by the module-level ``metric``.
    """
    preds, labels = eval_preds

    # in case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Padded label positions hold -100, which cannot be decoded: swap them for
    # the pad token so that `skip_special_tokens=True` drops them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # some simple post-processing; the metric expects a list of references
    # for every prediction, hence the single-element inner lists
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)

    return {'bleu': result['score']}

### Fine-tuning the model

We will use `Seq2SeqTrainingArguments`.

In [30]:
from transformers import Seq2SeqTrainingArguments

# No evaluation is scheduled during training ('no'); a checkpoint is saved at
# the end of every epoch, and `save_total_limit` caps how many are kept.
args = Seq2SeqTrainingArguments(
    'marian-finetuned-kde4-en-to-fr',  # plain string: there is nothing to interpolate
    evaluation_strategy='no',
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,  # evaluate with generate() so BLEU is computed on generated text
    fp16=True,
    push_to_hub=False,
)



Apart from the usual hyperparameters,
* We don't set any regular evaluation, as evaluation takes a while; we will evaluate our model once before training and after.
* We set `fp16=True`, which speeds up training on GPUs.
* We set `predict_with_generate=True`.



In [31]:
from transformers import Seq2SeqTrainer

# Wire the model, the training arguments, the tokenized splits, the seq2seq
# collator and the metric function together; evaluation uses the 'validation' split.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# before training
trainer.evaluate(max_length=max_length)

In [None]:
trainer.train()

In [None]:
# after training
trainer.evaluate(max_length=max_length)

## A custom training loop

### Preparing everything for training

In [None]:
from torch.utils.data import DataLoader

# Switch the tokenized splits to the 'torch' format before batching
# (the call is made for its effect on `tokenized_datasets`; nothing is reassigned).
tokenized_datasets.set_format('torch')

# Training batches are shuffled; the seq2seq collator builds each batch.
train_dataloader = DataLoader(
    tokenized_datasets['train'],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)

# Validation batches use the same collator and batch size, without shuffling.
eval_dataloader = DataLoader(
    tokenized_datasets['validation'],
    collate_fn=data_collator,
    batch_size=8,
)

In [32]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
# Use PyTorch's AdamW: the `AdamW` class shipped with transformers is deprecated
# in favour of it.
# NOTE(review): torch's defaults differ slightly from the deprecated class
# (e.g. weight_decay defaults to 0.01) — confirm this is acceptable.
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=2e-5)

In [None]:
from accelerate import Accelerator

accelerator = Accelerator()

model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader,
)

In [None]:
from transformers import get_scheduler

num_train_epochs = 3
# One optimizer update per batch. The length is read from `train_dataloader`
# as it stands after the `accelerator.prepare(...)` cell above reassigned it.
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# 'linear' schedule with no warmup steps, spanning all training steps.
lr_scheduler = get_scheduler(
    'linear',
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
output_dir = 'marian-finetuned-kde4-en-to-fr-accelerate'

### Training loop

In [34]:
def postprocess(predictions, labels):
    """Decode generated IDs and label IDs into stripped strings for the metric.

    Args:
        predictions: generated token IDs; must expose ``.cpu().numpy()``.
        labels: label IDs in which ignored positions hold ``-100``; must
            expose ``.cpu().numpy()``.

    Returns:
        ``(decoded_preds, decoded_labels)``: a list of strings, and a list of
        single-element lists of strings (one reference per prediction).
    """
    predictions = predictions.cpu().numpy()
    labels = labels.cpu().numpy()

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # -100 cannot be decoded: swap it for the pad token so that
    # `skip_special_tokens=True` drops those positions.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # some simple post-processing; each prediction gets a list of references
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    return decoded_preds, decoded_labels

We will use the `generate()` method to compute predictions, but this is a method on our base model, not the wrapped model. This is why we unwrap the model first, then call this method.

Like with token classification, two processes may have padded the inputs and labels to different shapes, so we use `accelerator.pad_across_processes()` to make the predictions and labels the same shape before calling the `gather()` method.

In [None]:
from tqdm.auto import tqdm
import torch

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        # the backward pass goes through the accelerator
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    for batch in tqdm(eval_dataloader):
        with torch.no_grad():
            # generate() is a method of the base model, so unwrap before calling it
            generated_tokens = accelerator.unwrap_model(model).generate(
                batch['input_ids'],
                attention_mask=batch['attention_mask'],
                max_length=max_length,
            )

        labels = batch['labels']

        # necessary to pad predictions and labels for being gathered
        generated_tokens = accelerator.pad_across_processes(
            generated_tokens,
            dim=1,
            pad_index=tokenizer.pad_token_id,
        )
        labels = accelerator.pad_across_processes(
            labels,
            dim=1,
            pad_index=-100,
        )

        predictions_gathered = accelerator.gather(generated_tokens)
        labels_gathered = accelerator.gather(labels)

        decoded_preds, decoded_labels = postprocess(predictions_gathered, labels_gathered)
        # the metric's keyword is `references` (plural)
        metric.add_batch(predictions=decoded_preds, references=decoded_labels)

    results = metric.compute()
    print(f"epoch {epoch}, BLEU score: {results['score']:.3f}")

    # save: wait for every process, then write the unwrapped model;
    # only the main process writes the tokenizer files
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)

## Using the fine-tuned model

In [None]:
from transformers import pipeline

model_checkpoint = 'huggingface-course/marian-finetuned-kde4-en-to-fr'
translator = pipeline('translation', model=model_checkpoint)

In [None]:
translator("Default to expanded threads")

In [None]:
translator(
    "Unable to import %1 using the OFX importer plugin. This file is not the correct format."
)