In [1]:
## Config
random_seed = 100
data_path = "/kaggle/working/"

In [2]:
%%capture
!pip install -U datasets
!pip install transformers datasets evaluate rouge_score --quiet
!pip uninstall keras -y
!pip install keras==2.11
!pip install bert_score

KeyboardInterrupt: 

In [3]:
import evaluate
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch
import pandas as pd
from bert_score import score
import pickle
import os

## Load Data and Model

In [4]:
# Full dataset (split included)
dataset = load_dataset("EdinburghNLP/xsum")

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


In [5]:
# Sample a subset of the test set for evaluation

# Few-shot subset
train_examples = dataset["train"].shuffle(seed=random_seed).select(range(10))
val_examples = dataset["validation"].shuffle(seed=random_seed).select(range(5))

In [6]:
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("t5-small")
prefix = "summarize: "

def preprocess(example):
    input_text = prefix + example["document"]
    model_inputs = tokenizer(
        input_text, max_length=2024, truncation=True, padding="max_length"
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            example["summary"], max_length=64, truncation=True, padding="max_length"
        )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Remove non-tensor fields
tokenized_train = train_examples.map(preprocess, remove_columns=train_examples.column_names)
tokenized_val = val_examples.map(preprocess, remove_columns=val_examples.column_names)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [7]:
from transformers import T5ForConditionalGeneration

model = T5ForConditionalGeneration.from_pretrained("t5-small")


In [8]:
import torch
from torch.utils.data import DataLoader
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True)

train_loader = DataLoader(tokenized_train, batch_size=2, shuffle=True, collate_fn=data_collator)
val_loader = DataLoader(tokenized_val, batch_size=2, shuffle=False, collate_fn=data_collator)


In [9]:
import torch
from transformers.optimization import get_scheduler

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

num_epochs = 3
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_loader) * num_epochs
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
_ = model.to(device)


In [None]:
from tqdm import tqdm

model.train()
for epoch in range(num_epochs):
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}")
    for batch in progress_bar:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.set_postfix({"loss": loss.item()})


Epoch 1:   0%|          | 0/5 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Epoch 1:  80%|████████  | 4/5 [02:16<00:32, 32.59s/it, loss=9.36]

In [None]:
from evaluate import load

rouge = load("rouge")
model.eval()

predictions, references = [], []

for batch in val_loader:
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    with torch.no_grad():
        generated_ids = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=64)
    decoded_preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(batch["labels"], skip_special_tokens=True)
    predictions.extend(decoded_preds)
    references.extend(decoded_labels)

results = rouge.compute(predictions=predictions, references=references)
print({k: round(v * 100, 2) for k, v in results.items()})


In [None]:
model.save_pretrained("t5-small-xsum-finetuned")
tokenizer.save_pretrained("t5-small-xsum-finetuned")

In [None]:
from google.colab import files
import shutil

shutil.make_archive("t5-small-xsum-finetuned", 'zip', "t5-small-xsum-finetuned")
files.download("t5-small-xsum-finetuned.zip")
