In [1]:
!pip install -q transformers sentencepiece datasets accelerate evaluate sacrebleu

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [31]:
import os
from tqdm import tqdm
from datasets import load_dataset
import numpy as np
import evaluate
import matplotlib.pyplot as plt

import torch
from transformers import (
    AutoTokenizer,
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    pipeline,
    DefaultFlowCallback
)
import sacrebleu

In [3]:
ds = load_dataset("thainq107/iwslt2015-en-vi")

README.md:   0%|          | 0.00/522 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/17.8M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/181k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/133317 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1268 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1268 [00:00<?, ? examples/s]

In [4]:
ds

DatasetDict({
    train: Dataset({
        features: ['en', 'vi'],
        num_rows: 133317
    })
    validation: Dataset({
        features: ['en', 'vi'],
        num_rows: 1268
    })
    test: Dataset({
        features: ['en', 'vi'],
        num_rows: 1268
    })
})

In [5]:
single_sample = ds["train"][0]
single_sample

{'en': 'Rachel Pike : The science behind a climate headline',
 'vi': 'Khoa học đằng sau một tiêu đề về khí hậu'}

## Tokenizer

In [6]:
model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

In [7]:
print(tokenizer(single_sample["en"]))
print(tokenizer(single_sample["vi"]))

{'input_ids': [250004, 127055, 66937, 13, 152, 581, 41664, 50155, 10, 153552, 10336, 2256, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{'input_ids': [250004, 67766, 2546, 218877, 858, 889, 10037, 6248, 1893, 17964, 42254, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


## Encoding

In [8]:
MAX_LEN = 75

def preprocess_fnc(examples):
    input_ids = tokenizer(
        examples["en"], padding="max_length", truncation=True, max_length=MAX_LEN
    )["input_ids"]

    labels = tokenizer(
        examples["vi"], padding="max_length", truncation=True, max_length=MAX_LEN
    )["input_ids"]

    labels = [
        [-100 if item == tokenizer.pad_token_id else item for item in label]
        for label in labels
    ]

    return {"input_ids": torch.tensor(input_ids), "labels": torch.tensor(labels)}


preprocessed_ds = ds.map(preprocess_fnc, batched=True)

Map:   0%|          | 0/133317 [00:00<?, ? examples/s]

Map:   0%|          | 0/1268 [00:00<?, ? examples/s]

Map:   0%|          | 0/1268 [00:00<?, ? examples/s]

In [9]:
pre_single_sample = preprocess_fnc({
    "en": [single_sample["en"]],
    "vi": [single_sample["vi"]]
})
pre_single_sample

{'input_ids': tensor([[250004, 127055,  66937,     13,    152,    581,  41664,  50155,     10,
          153552,  10336,   2256,      2,      1,      1,      1,      1,      1,
               1,      1,      1,      1,      1,      1,      1,      1,      1,
               1,      1,      1,      1,      1,      1,      1,      1,      1,
               1,      1,      1,      1,      1,      1,      1,      1,      1,
               1,      1,      1,      1,      1,      1,      1,      1,      1,
               1,      1,      1,      1,      1,      1,      1,      1,      1,
               1,      1,      1,      1,      1,      1,      1,      1,      1,
               1,      1,      1]]),
 'labels': tensor([[250004,  67766,   2546, 218877,    858,    889,  10037,   6248,   1893,
           17964,  42254,      2,   -100,   -100,   -100,   -100,   -100,   -100,
            -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
            -100,   -100,   -100,   -1

## Model

In [10]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, ignore_mismatched_sizes=True)

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

## Evaluation

In [11]:
metric = evaluate.load("sacrebleu")


def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(
        preds, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(
        labels, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    
    return {"bleu": torch.tensor(result["score"])}

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

## Save Loss

In [12]:
train_losses, eval_losses, steps = [], [], []

class SimpleLossTracker(DefaultFlowCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs:
            if "loss" in logs:
                train_losses.append(logs["loss"])
                steps.append(state.global_step)
            if "eval_loss" in logs:
                eval_losses.append(logs["eval_loss"])

## Trainer

In [13]:
# os.environ["WANDB_DISABLED"] = "true"

In [14]:
output_dir = "./en-vi-mbart50"

training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    logging_dir="logs",
    logging_steps=5000,
    predict_with_generate=True,
    eval_strategy="steps",
    eval_steps=5000,
    save_steps=5000,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_total_limit=1,
    num_train_epochs=1,
    load_best_model_at_end=True,
    fp16=True,
    dataloader_num_workers=4,
    report_to="none"
)

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=-100
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=preprocessed_ds["train"],
    eval_dataset=preprocessed_ds["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[SimpleLossTracker()]
)

trainer.train()



Step,Training Loss,Validation Loss,Bleu
5000,1.2247,1.285045,34.020706


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

TrainOutput(global_step=8333, training_loss=1.1919141902394845, metrics={'train_runtime': 12062.0469, 'train_samples_per_second': 11.053, 'train_steps_per_second': 0.691, 'total_flos': 2.11607841263616e+16, 'train_loss': 1.1919141902394845, 'epoch': 1.0})

 Save model

In [15]:
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Model saved to {output_dir}")

Model saved to ./en-vi-mbart50


Plot training loss

In [None]:
plt.figure(figsize=(10, 6))
plt.plot(steps, train_losses, label='Training Loss')

if eval_losses:
    eval_indices = [
        steps.index(step) for step in steps if step % training_args.eval_steps == 0
    ][:len(eval_losses)]
    eval_steps = [steps[i] for i in eval_indices]
    plt.plot(eval_steps, eval_losses, 'r.-', label='Validation Loss')

plt.xlabel('Steps')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()
plt.show()

## Inference

Load model

In [17]:
tokenizer = AutoTokenizer.from_pretrained(output_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(output_dir)

In [18]:
translator = pipeline("translation", model=model, tokenizer=tokenizer)

Device set to use cuda:0


Test with greedy search

In [19]:
translated_text = translator("I go to school", src_lang="en_XX", tgt_lang="vi_VN", num_beams=1)
translated_text



[{'translation_text': 'Tôi đi học'}]

In [33]:
pred_sentences = []
for text in ds["test"]["en"]:
    output = translator(
        text, 
        max_length=MAX_LEN, 
        num_beams=1, 
        do_sample=False,
        src_lang="en_XX",
        tgt_lang="vi_VN"
    )
    pred_sentences.append(output[0]["translation_text"])

references = [[ref] for ref in ds["test"]["vi"]]
bleu_score = sacrebleu.corpus_bleu(pred_sentences, references, force=True)
print(f"BLEU Score (greedy): {bleu_score.score:.2f}")

Your input_length: 71 is bigger than 0.9 * max_length: 75. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 77 is bigger than 0.9 * max_length: 75. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 78 is bigger than 0.9 * max_length: 75. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 74 is bigger than 0.9 * max_length: 75. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 72 is bigger than 0.9 * max_length: 75. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 70 is bigger than 0.9 * max_length: 75. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 105 is bigger than 0.9 * max_length: 75. You might consid

BLEU Score (greedy): 45.28


Test with beam search

In [34]:
translated_text = translator("I go to school", src_lang="en_XX", tgt_lang="vi_VN", num_beams=2)
translated_text

[{'translation_text': 'Tôi đến trường'}]

In [35]:
pred_sentences_beam = []

pred_sentences_beam = []
for text in ds["test"]["en"]:
    output = translator(
        text, 
        max_length=MAX_LEN, 
        num_beams=5,
        src_lang="en_XX",
        tgt_lang="vi_VN"
    )
    pred_sentences_beam.append(output[0]["translation_text"])


bleu_score_beam = sacrebleu.corpus_bleu(pred_sentences_beam, references, force=True)
print(f"BLEU Score (beam search): {bleu_score_beam.score:.2f}")

Your input_length: 71 is bigger than 0.9 * max_length: 75. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 77 is bigger than 0.9 * max_length: 75. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 78 is bigger than 0.9 * max_length: 75. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 74 is bigger than 0.9 * max_length: 75. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 72 is bigger than 0.9 * max_length: 75. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 70 is bigger than 0.9 * max_length: 75. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)
Your input_length: 105 is bigger than 0.9 * max_length: 75. You might consid

BLEU Score (beam search): 53.93


In [36]:
print(f"BLEU Score (greedy): {bleu_score.score:.2f}")
print(f"BLEU Score (beam search): {bleu_score_beam.score:.2f}")

BLEU Score (greedy): 45.28
BLEU Score (beam search): 53.93
