In [1]:
import torch
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType

In [2]:
# Use the GPU when CUDA is usable; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [3]:
# Load the "en-zh" configuration of the OPUS-100 dataset from the Hub.
dataset = load_dataset(path="Helsinki-NLP/opus-100", name="en-zh")

In [4]:
# One checkpoint name drives both loads, so tokenizer and model always match.
model_name = "google/mt5-small"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path=model_name)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

In [5]:
def print_trainable_parameters(model):
    """Print how many of ``model``'s parameters are trainable.

    Walks ``model.named_parameters()`` once, sums the element counts of all
    parameters and of those with ``requires_grad`` set, and prints both
    totals together with the trainable percentage.

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs; each parameter must provide
            ``numel()`` and ``requires_grad``.

    Raises:
        ZeroDivisionError: If the model has no parameter elements at all.
    """
    # Collect (size, is_trainable) per parameter so both sums share one pass.
    sizes = [(p.numel(), p.requires_grad) for _, p in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, trainable in sizes if trainable)
    print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}")

In [21]:
# LoRA settings for the seq2seq model; only modules named "q" and "v" are
# targeted for adapters.
lora_config = LoraConfig(
    task_type="SEQ_2_SEQ_LM",
    target_modules=["q", "v"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

In [22]:
# Wrap the base model with the LoRA adapters, then move it to the chosen device.
# NOTE: this rebinds `model`, so re-running the cell wraps the already-wrapped
# model again; reload the base model first if the cell must be re-run.
model = get_peft_model(model, lora_config)
model = model.to(device)

In [23]:
# Report how many weights are trainable on the current `model`.
print_trainable_parameters(model=model)

trainable params: 344064 || all params: 300520832 || trainable%: 0.11448923447676333


In [35]:
def preprocess_function(examples):
    """Tokenize a batch of English->Chinese translation pairs.

    Sequences are truncated to 128 tokens but deliberately NOT padded here.
    Padding the labels to ``max_length`` would fill them with the tokenizer's
    pad id, which the loss treats as a real target token. Leaving the
    sequences unpadded lets ``DataCollatorForSeq2Seq`` pad each batch
    dynamically and fill label padding with -100 (ignored by the loss).

    Args:
        examples: Batched mapping with a ``"translation"`` entry holding a
            list of dicts, each with ``"en"`` and ``"zh"`` strings.

    Returns:
        The tokenizer output for the English inputs, with a ``"labels"``
        entry added that holds the token ids of the Chinese targets.
    """
    pairs = examples["translation"]
    inputs = [pair["en"] for pair in pairs]
    targets = [pair["zh"] for pair in pairs]

    model_inputs = tokenizer(inputs, max_length=128, truncation=True)
    # `text_target` tokenizes the strings as targets (labels) rather than inputs.
    labels = tokenizer(text_target=targets, max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [40]:
# Tokenize every split in batches; the raw "translation" column is dropped so
# only the columns returned by preprocess_function remain.
tokenized_datasets = dataset.map(
    function=preprocess_function,
    batched=True,
    remove_columns=["translation"],
)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
# Build the Trainer on the *tokenized* splits. The raw `dataset` only has a
# "translation" column, which the model cannot consume, and per-epoch
# evaluation / best-model selection on eval_loss needs an eval_dataset.
trainer = Trainer(
    model=model,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    args=TrainingArguments(
        output_dir="./finetuned_mt5_v1",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        learning_rate=1e-3,
        num_train_epochs=3,
        # Prefer bf16 where the GPU supports it: T5-family models are reported
        # to overflow to NaN under fp16. GPUs without bf16 keep the original
        # fp16 setting; CPU runs stay in full precision.
        bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
        fp16=torch.cuda.is_available() and not torch.cuda.is_bf16_supported(),
    ),
    # Pads inputs per batch and pads labels with -100 so padding is not scored.
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
)
# The generation cache is not needed while training.
model.config.use_cache = False
trainer.train()

In [None]:
# Training already runs in the previous cell; calling trainer.train() again
# here would repeat the whole training loop before saving. This cell only
# persists the fine-tuned weights and the tokenizer side by side.
model.save_pretrained("./mt5_lora_adapter")
tokenizer.save_pretrained("./mt5_lora_adapter")