- Load a dataset

- Make it as a iterator

- Load a tokeniser

- Write a processing function 

- Map it to the dataset 

- Create a new tokeniser

- Train it with the dataset

- Write the post processing function

- Run the evaluation 

- work on multiple peft configuration

- load the model for inference

- load the model for training

- start the training with t5-small

In [1]:
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
)
from datasets import load_dataset

In [2]:
books = load_dataset("opus_books", "en-fr")

In [3]:
books = books['train'].train_test_split(test_size=0.2)

In [4]:
model_path = "t5-large"

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_path)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [6]:
source_lang = "en"
target_lang = "fr"
prefix = "translate English to French: "


def preprocess_function(examples):
    inputs = [prefix + example[source_lang] for example in examples["translation"]]
    targets = [example[target_lang] for example in examples["translation"]]
    model_inputs = tokenizer(inputs,
                             text_target=targets,
                             max_length=128,
                             truncation=True)
    return model_inputs

In [7]:
books_toked = books.map(preprocess_function, batched=True)

Map:   0%|          | 0/101668 [00:00<?, ? examples/s]

Map:   0%|          | 0/25417 [00:00<?, ? examples/s]

In [8]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_path)

In [9]:
from evaluate import load

metric = load("sacrebleu")

In [10]:
import numpy as np


def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [11]:
from transformers import BitsAndBytesConfig
import torch

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4"
)

In [13]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(
    model_path,
    device_map="auto",
    quantization_config=quant_config,)
    # torch_dtype=torch.bfloat16)

In [14]:
from peft import (
    get_peft_config,
    get_peft_model,
    PromptTuningConfig,
    PrefixTuningConfig,
    get_peft_model_state_dict,
    TaskType
)

peft_config = PrefixTuningConfig(task_type=TaskType.SEQ_2_SEQ_LM,
                                 inference_mode=False,
                                 num_virtual_tokens=20)

In [15]:
peft_adapter = get_peft_model(model, peft_config)
peft_adapter.print_trainable_parameters()

trainable params: 983,040 || all params: 738,651,136 || trainable%: 0.13308583065659835


In [16]:
training_args = Seq2SeqTrainingArguments(
    output_dir="/home/kamal/training_files/t5-test/",
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    num_train_epochs=2,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
    report_to="none",
)

trainer = Seq2SeqTrainer(
    model=peft_adapter,
    args=training_args,
    train_dataset=books_toked["train"],
    eval_dataset=books_toked["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [17]:
trainer.train()

Step,Training Loss,Validation Loss,Bleu,Gen Len
500,0.0,1.930361,3.1482,17.9434




KeyboardInterrupt: 