In [1]:
!pip install transformers datasets evaluate sacrebleu



In [2]:
from datasets import load_dataset

data = load_dataset("mt_eng_vietnamese",'iwslt2015-en-vi')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
data.shape

{'train': (133318, 1), 'validation': (1269, 1), 'test': (1269, 1)}

In [4]:
#data = data["train"].train_test_split(test_size=0.2)

In [5]:
data["validation"][1]

{'translation': {'en': 'And I was very proud .',
  'vi': 'Tôi đã rất tự hào về đất nước tôi .'}}

In [6]:
from transformers import AutoTokenizer

checkpoint = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [7]:
source_lang = "en"
target_lang = "vi"
prefix = "translate English to Vietnamese: "


def preprocess_function(examples):
    inputs = [prefix + example[source_lang] for example in examples["translation"]]
    targets = [example[target_lang] for example in examples["translation"]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
    return model_inputs

In [8]:
tokenized_data = data.map(preprocess_function, batched=True)

Map:   0%|          | 0/1269 [00:00<?, ? examples/s]

In [9]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

## 📌Evaluate

In [10]:
import evaluate

metric = evaluate.load("sacrebleu")

In [11]:
import numpy as np


def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

## 📌 Train

In [12]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [13]:
#!pip install accelerate -U

In [14]:
#model = model.to('cuda')

In [17]:
training_args = Seq2SeqTrainingArguments(
    output_dir="model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=1,
    predict_with_generate=True,
    fp16=True,
    #push_to_hub=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,1.8014,1.659609,0.0493,18.658




TrainOutput(global_step=4167, training_loss=1.9422175547245284, metrics={'train_runtime': 992.3208, 'train_samples_per_second': 134.35, 'train_steps_per_second': 4.199, 'total_flos': 3156588995346432.0, 'train_loss': 1.9422175547245284, 'epoch': 1.0})

In [None]:
trainer.evaluate()

In [18]:
text = "translate English to French: Legumes share resources with nitrogen-fixing bacteria."

In [None]:
trainer.save_model('./translator_model')

In [21]:
from transformers import pipeline
translator = pipeline("translation", model="./translator_model")
translator(text)



[{'translation_text': 'Legumes mt ca mt mt mt mt mt mt mt mt mt mt mt mt mt mt mt mt mt .'}]