In [3]:
import torch
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType

In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [5]:
dataset = load_dataset("Helsinki-NLP/opus-100", "en-zh")

KeyboardInterrupt: 

In [5]:
model_name = "google-t5/t5-base" 
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
for name, module in model.named_modules():
    print(name)


shared
encoder
encoder.block
encoder.block.0
encoder.block.0.layer
encoder.block.0.layer.0
encoder.block.0.layer.0.SelfAttention
encoder.block.0.layer.0.SelfAttention.q
encoder.block.0.layer.0.SelfAttention.k
encoder.block.0.layer.0.SelfAttention.v
encoder.block.0.layer.0.SelfAttention.o
encoder.block.0.layer.0.SelfAttention.relative_attention_bias
encoder.block.0.layer.0.layer_norm
encoder.block.0.layer.0.dropout
encoder.block.0.layer.1
encoder.block.0.layer.1.DenseReluDense
encoder.block.0.layer.1.DenseReluDense.wi
encoder.block.0.layer.1.DenseReluDense.wo
encoder.block.0.layer.1.DenseReluDense.dropout
encoder.block.0.layer.1.DenseReluDense.act
encoder.block.0.layer.1.layer_norm
encoder.block.0.layer.1.dropout
encoder.block.1
encoder.block.1.layer
encoder.block.1.layer.0
encoder.block.1.layer.0.SelfAttention
encoder.block.1.layer.0.SelfAttention.q
encoder.block.1.layer.0.SelfAttention.k
encoder.block.1.layer.0.SelfAttention.v
encoder.block.1.layer.0.SelfAttention.o
encoder.block.1.l

In [5]:
def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}")

In [21]:
lora_config = LoraConfig(
    r = 8,
    lora_alpha = 16,
    target_modules=["q", "v"],
    lora_dropout = 0.1,
    bias = "none",
    task_type = "SEQ_2_SEQ_LM"
)

In [22]:
model = get_peft_model(model, lora_config).to(device)

In [23]:
print_trainable_parameters(model)

trainable params: 344064 || all params: 300520832 || trainable%: 0.11448923447676333


In [35]:
def preprocess_function(examples):
    inputs = [ex["en"] for ex in examples["translation"]]
    targets = [ex["zh"] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [40]:
tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=["translation"])

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
trainer = Trainer(
    model = model,
    train_dataset = dataset["train"],
    args = TrainingArguments(
        output_dir="./finetuned_mt5_v1",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",  
        greater_is_better=False, 
        learning_rate=1e-3,
        num_train_epochs=3,  
        fp16=True if torch.cuda.is_available() else False,
    ),
    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
)
model.config.use_cache = False
trainer.train()

In [None]:
trainer.train()
model.save_pretrained("./mt5_lora_adapter")
tokenizer.save_pretrained("./mt5_lora_adapter")