In [1]:
import os
import random
import numpy as np
from dataclasses import dataclass, field
from typing import Optional, Dict

import torch
from datasets import load_dataset, Dataset, concatenate_datasets
from transformers import (
    MBartForConditionalGeneration,
    MBart50TokenizerFast,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
)
#import evaluate

2025-08-04 05:59:32.149574: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-08-04 05:59:32.149671: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-08-04 05:59:32.150883: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-08-04 05:59:32.157961: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load English→Spanish and English→Vietnamese from OPUS100
ds_en_es = load_dataset("opus100", "en-es")  # contains train/validation/test splits
ds_en_vi = load_dataset("opus100", "en-vi")

# For speed, take small slices: e.g., 2500 train, 500 val each
def small_split(ds, train_n=2500, val_n=500, seed=42):
    # `train` split exists; shuffle then select
    train_full = ds["train"].shuffle(seed=seed)
    selected = train_full.select(range(train_n + val_n))
    train = selected.select(range(train_n))
    val = selected.select(range(train_n, train_n + val_n))
    return train, val

es_train, es_val = small_split(ds_en_es)
vi_train, vi_val = small_split(ds_en_vi)

print(f"EN-ES train {len(es_train)} val {len(es_val)}")
print(f"EN-VI train {len(vi_train)} val {len(vi_val)}")

EN-ES train 2500 val 500
EN-VI train 2500 val 500


In [3]:
MODEL_NAME = "facebook/mbart-large-50"

# EN→ES
tokenizer_es = MBart50TokenizerFast.from_pretrained(MODEL_NAME)
model_es = MBartForConditionalGeneration.from_pretrained(MODEL_NAME)

# EN→VI
tokenizer_vi = MBart50TokenizerFast.from_pretrained(MODEL_NAME)
model_vi = MBartForConditionalGeneration.from_pretrained(MODEL_NAME)

In [4]:
MAX_LEN = 128

def preprocess_pair(dataset, src_lang_code, tgt_lang_code, tokenizer):
    # We keep batched=True and handle lists inside the function
    def fn(example):
        # example["translation"] is a list of dicts when batched
        srcs = [t[src_lang_code] for t in example["translation"]]
        tgts = [t[tgt_lang_code] for t in example["translation"]]

        # Set language codes on tokenizer
        tokenizer.src_lang = src_lang_code
        tokenizer.tgt_lang = tgt_lang_code

        inputs = tokenizer(srcs, max_length=MAX_LEN, truncation=True, padding="max_length")
        with tokenizer.as_target_tokenizer():
            labels = tokenizer(tgts, max_length=MAX_LEN, truncation=True, padding="max_length")

        # Replace pad token id with -100 for loss
        label_ids = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label_seq]
            for label_seq in labels["input_ids"]
        ]

        return {
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"],
            "labels": label_ids,
        }

    return dataset.map(fn, batched=True, remove_columns=["translation"])
    
# Apply to data
tokenized_es_train = preprocess_pair(es_train, "en", "es", tokenizer_es)
tokenized_es_val = preprocess_pair(es_val, "en", "es", tokenizer_es)

tokenized_vi_train = preprocess_pair(vi_train, "en", "vi", tokenizer_vi)
tokenized_vi_val = preprocess_pair(vi_val, "en", "vi", tokenizer_vi)

In [5]:
!pip install bert_score

[0m

In [6]:
from bert_score import score  # the function


In [7]:
def make_bertscore_fn(tokenizer, tgt_lang_code):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    def compute_metrics(eval_pred):
        preds, labels = eval_pred
        if isinstance(preds, tuple):
            preds = preds[0]
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        P, R, F1 = score(decoded_preds, decoded_labels, lang=tgt_lang_code, device=device, verbose=False)
        return {
            "bertscore_f1": float(F1.mean()),
            "bertscore_precision": float(P.mean()),
            "bertscore_recall": float(R.mean()),
        }
    return compute_metrics


In [8]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
import numpy as np
from transformers import EarlyStoppingCallback, Seq2SeqTrainer
data_collator_es = DataCollatorForSeq2Seq(tokenizer_es, model=model_es)
training_args_es = Seq2SeqTrainingArguments(
    output_dir="mbart50_opus_en_es",
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=3e-5,
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="bertscore_f1",
    greater_is_better=True,
    logging_steps=200,
    predict_with_generate=True,
    fp16=True,
    report_to="none",
)

trainer_es = Seq2SeqTrainer(
    model=model_es,
    args=training_args_es,
    train_dataset=tokenized_es_train,
    eval_dataset=tokenized_es_val,
    tokenizer=tokenizer_es,
    data_collator=data_collator_es,
    compute_metrics=make_bertscore_fn(tokenizer_es, "es"),
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

trainer_es.train()

You're using a MBart50TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Bertscore F1,Bertscore Precision,Bertscore Recall
500,2.6267,2.231945,0.826539,0.83186,0.822071
1000,1.3226,2.006746,0.852898,0.857851,0.84867
1500,0.9034,2.200711,0.849209,0.8516,0.847492
2000,0.5003,2.419944,0.850243,0.85247,0.848694
2500,6.6442,2.467292,0.847497,0.852006,0.843738
3000,0.2732,2.571812,0.85132,0.853766,0.84953


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=3125, training_loss=1.7962068701171876, metrics={'train_runtime': 1439.6026, 'train_samples_per_second': 8.683, 'train_steps_per_second': 2.171, 'total_flos': 3386140262400000.0, 'train_loss': 1.7962068701171876, 'epoch': 5.0})

In [9]:
import shutil
import os

# EN-ES best
best_es = trainer_es.state.best_model_checkpoint
print("Best EN-ES:", best_es)
shutil.make_archive("best_mbart50_en_es", "zip", best_es)  # creates best_mbart50_en_es.zip

Best EN-ES: mbart50_opus_en_es/checkpoint-1000


'/notebooks/notebooks/best_mbart50_en_es.zip'

In [None]:
data_collator_vi = DataCollatorForSeq2Seq(tokenizer_vi, model=model_vi)
training_args_vi = Seq2SeqTrainingArguments(
    output_dir="mbart50_opus_en_vi",
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=3e-5,
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="bertscore_f1",
    greater_is_better=True,
    logging_steps=200,
    predict_with_generate=True,
    fp16=True,
    report_to="none",
)

trainer_vi = Seq2SeqTrainer(
    model=model_vi,
    args=training_args_vi,
    train_dataset=tokenized_vi_train,
    eval_dataset=tokenized_vi_val,
    tokenizer=tokenizer_vi,
    data_collator=data_collator_vi,
    compute_metrics=make_bertscore_fn(tokenizer_vi, "vi"),
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

trainer_vi.train()

In [None]:
import shutil
import os

# EN-ES best
best_es = trainer_es.state.best_model_checkpoint
print("Best EN-ES:", best_es)
shutil.make_archive("best_mbart50_en_es", "zip", best_es)  # creates best_mbart50_en_es.zip

# EN-VI best
best_vi = trainer_vi.state.best_model_checkpoint
print("Best EN-VI:", best_vi)
shutil.make_archive("best_mbart50_en_vi", "zip", best_vi)  # creates best_mbart50_en_vi.zip

In [None]:
# from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
# import numpy as np
# from transformers import EarlyStoppingCallback, Seq2SeqTrainer


# # This will stop training if the 'loss' hasn't improved for 3 evaluation runs
# early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# training_args = Seq2SeqTrainingArguments(
#     output_dir="mbart50_en_vi_opus100",
#     #evaluation_strategy="epoch",
#     save_strategy="epoch",
#     learning_rate=5e-5,
#     per_device_train_batch_size=4,
#     per_device_eval_batch_size=4,
#     num_train_epochs=3,
#     weight_decay=0.01,
#     predict_with_generate=True,
#     fp16=True,
#     logging_steps=10,
#     save_total_limit=2,
#     callbacks = [early_stopping]
# )

# data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# import torch
# from bert_score import score

# # Assumes tokenizer already defined (e.g., MBart50TokenizerFast) and model outputs are token IDs

# def compute_metrics_bertscore(eval_pred):
#     preds, labels = eval_pred  # raw token IDs
#     # Decode to text
#     decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
#     # Replace -100 in labels as padding placeholder
#     import numpy as np
#     labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
#     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

#     # Compute BERTScore; specify language for multilingual (use 'en' for English reference if evaluating EN->VI, etc.)
#     P, R, F1 = score(decoded_preds, decoded_labels, lang="vi" if tokenizer.tgt_lang.startswith("vi") else "es", verbose=False, device="cuda" if torch.cuda.is_available() else "cpu")
#     # Return average F1 as the main metric
#     return {
#         "bertscore_f1": F1.mean().item(),
#         "bertscore_precision": P.mean().item(),
#         "bertscore_recall": R.mean().item(),
#     }

In [None]:
# trainer = Seq2SeqTrainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_train,
#     eval_dataset=tokenized_val,
#     tokenizer=tokenizer,
#     data_collator=data_collator,
#     compute_metrics=compute_metrics_bertscore,
# )

In [None]:
import torch

if torch.cuda.is_available():
    print(f"✅ GPU is available and ready!")
    print(f"Device: {torch.cuda.get_device_name(0)}")
else:
    print("❌ GPU is NOT available. Training is running on the CPU.")

In [None]:
trainer.train()

In [None]:
final_model_path = "/notebooks/my-final-mbart-model"

# 2. Use the trainer to save the model's weights and configuration
trainer.save_model(final_model_path)

# 3. IMPORTANT: You must also save the tokenizer with the model
tokenizer.save_pretrained(final_model_path)

print(f"✅ Final model and tokenizer saved to: {final_model_path}")