In [1]:
!pip install evaluate sacrebleu

Collecting evaluate
  Obtaining dependency information for evaluate from https://files.pythonhosted.org/packages/70/63/7644a1eb7b0297e585a6adec98ed9e575309bb973c33b394dae66bc35c69/evaluate-0.4.1-py3-none-any.whl.metadata
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting sacrebleu
  Obtaining dependency information for sacrebleu from https://files.pythonhosted.org/packages/df/c0/ff53cb76c1b050ad25d056877ba6d3f6fa964134370c4ccf57ad933d6f72/sacrebleu-2.3.2-py3-none-any.whl.metadata
  Downloading sacrebleu-2.3.2-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.4/57.4 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Obtaining dependency information for portalocker from https://files.pythonhosted.org/packages/17/9e/87671efcca80ba6203811540ed1f9c0462c1609d2281d7b7f53cef05da3d/portalocker-2.8.2-py3-none-any.whl.metadata
  Downloading portalocker-2.8.2-py3-none-any.whl.metadata (8

In [2]:
from transformers import (AutoTokenizer, DataCollatorForSeq2Seq,
AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer)
from datasets import load_dataset
from huggingface_hub import notebook_login

import numpy as np
import evaluate



In [3]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [34]:
def preprocess_function(examples):
    inputs = [PREFIX + example[SRC_LANG] for example in examples["translation"]]
    targets = [example[TGT_LANG] for example in examples["translation"]]
    model_inputs = tokenizer(inputs
                             , text_target = targets
                             , max_length = MAX_LENGTH
                             , padding = "max_length"
                             , truncation = True)
    return model_inputs


def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    #print(preds)
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    #preds = [tid for tid in preds.flatten() if tid >= 0 and tid < tokenizer.vocab_size]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [5]:
MODEL_CHECKPOINT = "t5-base"
SRC_LANG = "en"
TGT_LANG = "de"
PREFIX = "translate English to German: "
MAX_LENGTH = 128

In [6]:
raw_data = load_dataset("opus_books", lang1=TGT_LANG, lang2=SRC_LANG)

Downloading builder script:   0%|          | 0.00/2.40k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/7.98k [00:00<?, ?B/s]

Downloading and preparing dataset opus_books/de-en (download: 4.89 MiB, generated: 13.10 MiB, post-processed: Unknown size, total: 17.99 MiB) to /root/.cache/huggingface/datasets/opus_books/de-en-lang1=de,lang2=en/0.0.0/e8f950a4f32dc39b7f9088908216cd2d7e21ac35f893d04d39eb594746af2daf...


Downloading data:   0%|          | 0.00/5.12M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/51467 [00:00<?, ? examples/s]

Dataset opus_books downloaded and prepared to /root/.cache/huggingface/datasets/opus_books/de-en-lang1=de,lang2=en/0.0.0/e8f950a4f32dc39b7f9088908216cd2d7e21ac35f893d04d39eb594746af2daf. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [7]:
split_data = raw_data["train"].train_test_split(train_size=0.9, seed=0)

In [8]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

Downloading config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [9]:
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

Downloading model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [35]:
token_data = split_data.map(preprocess_function, batched=True)

In [17]:
data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer
                                       , model = MODEL_CHECKPOINT)

In [12]:
metric = evaluate.load("sacrebleu")

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [38]:
training_args = Seq2SeqTrainingArguments(
    output_dir = "t5-base_fine-tuned_opus-books_en-de",
    evaluation_strategy = "epoch",
    learning_rate = 5e-5,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    weight_decay = 0.01,
    save_total_limit = 3,
    num_train_epochs = 3,
    predict_with_generate = True,
    fp16 = True,
    push_to_hub = True,
    report_to="none"
)

trainer = Seq2SeqTrainer(
    model = model,
    args = training_args,
    train_dataset = token_data["train"],
    eval_dataset = token_data["test"],
    tokenizer = tokenizer,
    data_collator = data_collator,
    compute_metrics = compute_metrics,
)

In [39]:
trainer.evaluate(max_length=MAX_LENGTH)



{'eval_loss': 13.984762191772461,
 'eval_bleu': 11.8599,
 'eval_gen_len': 36.3622,
 'eval_runtime': 400.1844,
 'eval_samples_per_second': 12.862,
 'eval_steps_per_second': 0.402}

In [40]:
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,0.6463,0.550666,5.3063,17.027
2,0.598,0.536998,5.4708,17.0264
3,0.5818,0.534076,5.5597,17.0262




TrainOutput(global_step=4344, training_loss=0.6497323297883485, metrics={'train_runtime': 5923.0305, 'train_samples_per_second': 23.461, 'train_steps_per_second': 0.733, 'total_flos': 2.11551971180544e+16, 'train_loss': 0.6497323297883485, 'epoch': 3.0})

In [41]:
trainer.evaluate(max_length=MAX_LENGTH)

{'eval_loss': 0.5340757369995117,
 'eval_bleu': 17.263,
 'eval_gen_len': 38.3748,
 'eval_runtime': 446.7223,
 'eval_samples_per_second': 11.522,
 'eval_steps_per_second': 0.36,
 'epoch': 3.0}

In [42]:
trainer.push_to_hub(tags="translation", commit_message="Training complete")

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

'https://huggingface.co/jaymanvirk/t5-base_fine-tuned_opus-books_en-de/tree/main/'