# Finetuning a PLM for MT

This notebook follows the Hugging Face translation task guide:

https://huggingface.co/docs/transformers/tasks/translation

In [1]:
# Install necessary libraries

!pip install transformers datasets evaluate sacrebleu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m35.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sacrebleu
  Downloading sacrebleu-2.3.1-py3-none-any.whl (118 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.9/118.9 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.w

In [2]:
# Log in to the Hugging Face account.
# Authentication is needed later in this notebook: the training arguments set
# push_to_hub=True and the trained model is uploaded with trainer.push_to_hub().

from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
# Load the "en-fr" configuration of the OPUS Books dataset.
# The download log below reports a "train" split of 127,085 examples.

from datasets import load_dataset

books = load_dataset("opus_books", "en-fr")

Downloading builder script:   0%|          | 0.00/6.08k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/161k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/20.5k [00:00<?, ?B/s]

Downloading and preparing dataset opus_books/en-fr to /root/.cache/huggingface/datasets/opus_books/en-fr/1.0.0/e8f950a4f32dc39b7f9088908216cd2d7e21ac35f893d04d39eb594746af2daf...


Downloading data:   0%|          | 0.00/12.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/127085 [00:00<?, ? examples/s]

Dataset opus_books downloaded and prepared to /root/.cache/huggingface/datasets/opus_books/en-fr/1.0.0/e8f950a4f32dc39b7f9088908216cd2d7e21ac35f893d04d39eb594746af2daf. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
# Split the dataset into a train and test set with the train_test_split method.
# test_size=0.2 holds out 20% of the examples for evaluation; the fixed seed keeps
# the partition identical across kernel restarts so scores stay comparable between runs.

books = books["train"].train_test_split(test_size=0.2, seed=42)

In [5]:
# Take a look at an example: each record holds an 'id' and a 'translation'
# dict keyed by language code ('en', 'fr').

books["train"][0]

{'id': '39094',
 'translation': {'en': '"Do you wish to see me again?"',
  'fr': '-- Y tenez-vous beaucoup à me revoir?'}}

# Preprocess

In [6]:
# Load a T5 tokenizer to process the English-French language pairs.
# `checkpoint` is reused later to load the model weights.

from transformers import AutoTokenizer

checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [7]:
# Define preprocessing function

# Keys of each "translation" record, and the instruction text prepended to every source sentence.
source_lang, target_lang = "en", "fr"
prefix = "translate English to French: "


def preprocess_function(examples):
    """Tokenize a batch of translation pairs.

    Args:
        examples: batch mapping whose "translation" entry is a sequence of
            dicts keyed by language code.

    Returns:
        The tokenizer output for the prefixed source sentences, with the
        target sentences passed as ``text_target`` (max_length=128,
        truncation enabled).
    """
    prompts, references = [], []
    for pair in examples["translation"]:
        prompts.append(prefix + pair[source_lang])
        references.append(pair[target_lang])
    return tokenizer(prompts, text_target=references, max_length=128, truncation=True)

In [8]:
# Use the Datasets map method to apply the preprocessing function over the entire dataset.
# batched=True passes a batch of examples per call, which is why preprocess_function
# iterates over examples["translation"].

tokenized_books = books.map(preprocess_function, batched=True)

Map:   0%|          | 0/101668 [00:00<?, ? examples/s]

Map:   0%|          | 0/25417 [00:00<?, ? examples/s]

In [9]:
# Create a batch of examples using DataCollatorForSeq2Seq.
# NOTE(review): `model` receives `checkpoint`, i.e. the string "t5-small", not a
# loaded model object (the model is only instantiated in a later cell) — confirm
# this is intended.

from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

# Evaluate

In [10]:
# Load the SacreBLEU metric.
# compute_metrics calls metric.compute and reports the "score" entry of its result as BLEU.

import evaluate

metric = evaluate.load("sacrebleu")

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [11]:
# Define a function that passes the predictions and labels to `metric.compute` to calculate the SacreBLEU score

import numpy as np


def postprocess_text(preds, labels):
    """Normalize decoded text for metric computation.

    Args:
        preds: iterable of decoded prediction strings.
        labels: iterable of decoded reference strings.

    Returns:
        Tuple ``(preds, labels)`` where every string has surrounding
        whitespace stripped and every label is wrapped in its own
        one-element list.
    """
    stripped_preds = []
    for text in preds:
        stripped_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        # One reference per prediction, kept as a single-item list.
        wrapped_labels.append([text.strip()])

    return stripped_preds, wrapped_labels


def compute_metrics(eval_preds):
    """Compute SacreBLEU and the mean generated length for evaluation output.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may itself be a tuple, in which case its first
            element is used.

    Returns:
        dict with ``"bleu"`` (the metric's ``"score"``) and ``"gen_len"``
        (mean count of non-pad prediction tokens), rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding/ignore marker, not a vocabulary id, so it cannot be
    # decoded. Map it to the pad token in BOTH arrays before decoding; doing
    # it for predictions also keeps those positions out of gen_len below.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Generated length = number of non-pad tokens in each prediction row.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

# Train

In [12]:
# Load T5 with AutoModelForSeq2SeqLM (the download log below fetches pytorch_model.bin).
# Seq2SeqTrainingArguments and Seq2SeqTrainer are imported here for the training cell.

from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [17]:
# Define training hyperparameters and build the trainer.
# NOTE(review): the keyword names below were run against the transformers release
# shown in the install log (4.28.1) — re-check them against the release notes
# before upgrading the library.

training_args = Seq2SeqTrainingArguments(
    output_dir="t5-mt-en-fr",  # local directory; the log below shows it is a clone of the Hub repo of the same name
    evaluation_strategy="epoch",  # the results table reports one evaluation row per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=5,
    predict_with_generate=True,  # presumably evaluation predictions come from generation, which compute_metrics decodes — confirm
    fp16=True,  # NOTE(review): mixed precision presumably needs a CUDA GPU runtime — confirm
    push_to_hub=True,  # requires the Hugging Face login performed earlier
)

# Wire together the model, the tokenized train/test splits, the collator and the metric function.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_books["train"],
    eval_dataset=tokenized_books["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

/content/t5-mt-en-fr is already a clone of https://huggingface.co/judithrosell/t5-mt-en-fr. Make sure you pull the latest changes with `repo.git_pull()`.


In [18]:
# Run fine-tuning. The recorded run took 31,775 steps and about 7,282 s (train_runtime).
trainer.train()



Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,1.7324,1.550377,6.013,17.5801
2,1.7308,1.520857,6.2068,17.5694
3,1.6891,1.503927,6.2946,17.5627
4,1.6755,1.494986,6.3487,17.5619
5,1.6828,1.492708,6.3847,17.5615


TrainOutput(global_step=31775, training_loss=1.705638352821044, metrics={'train_runtime': 7282.3496, 'train_samples_per_second': 69.804, 'train_steps_per_second': 4.363, 'total_flos': 1.251122874679296e+16, 'train_loss': 1.705638352821044, 'epoch': 5.0})

In [19]:
# Upload the trained model to the Hugging Face Hub repository.
trainer.push_to_hub()

Upload file pytorch_model.bin:   0%|          | 1.00/231M [00:00<?, ?B/s]

Upload file runs/May08_09-41-06_8a2ed3bfa50a/events.out.tfevents.1683538871.8a2ed3bfa50a.456.4:   0%|         …

To https://huggingface.co/judithrosell/t5-mt-en-fr
   e8bceab..9987762  main -> main

   e8bceab..9987762  main -> main

To https://huggingface.co/judithrosell/t5-mt-en-fr
   9987762..4927700  main -> main

   9987762..4927700  main -> main



'https://huggingface.co/judithrosell/t5-mt-en-fr/commit/998776279c89013b128955924d5c4d0208c96dd0'

# Inference

Now that we’ve finetuned a model, we can use it for inference!

In [20]:
# Input sentence, carrying the same task prefix that preprocess_function prepends during training.
text = "translate English to French: Legumes share resources with nitrogen-fixing bacteria."

The simplest way to try out your finetuned model for inference is to use it in a *pipeline()*. Instantiate a pipeline for translation with your model, and pass your text to it:

In [21]:
from transformers import pipeline

# Load the fine-tuned checkpoint pushed to the Hub and translate `text`;
# the last expression displays the pipeline output.
translator = pipeline("translation", model="judithrosell/t5-mt-en-fr")
translator(text)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]



[{'translation_text': 'Legumes partagent des ressources avec des bactéries fixantes de nitrogen.'}]