In [1]:
## Config
random_seed = 100
data_path = "/kaggle/working/"

In [2]:
%%capture
!pip install -U datasets
!pip install transformers evaluate rouge_score --quiet
!pip uninstall keras -y
!pip install keras==2.11
!pip install bert_score

In [3]:
import evaluate
from evaluate import load
from datasets import load_dataset
from tqdm import tqdm
from transformers.optimization import get_scheduler
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, T5Tokenizer, T5ForConditionalGeneration, DataCollatorForSeq2Seq
import torch
from torch.utils.data import DataLoader
import pandas as pd
from bert_score import score
import pickle
import os

# For downloading the results as zip from COLAB (very usefull)
# from google.colab import files
# import shutil

2025-06-19 18:59:02.077456: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750359542.292346      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750359542.355519      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Load Data and Model

In [4]:
# Full dataset (split included)
dataset = load_dataset("EdinburghNLP/xsum")

README.md:   0%|          | 0.00/6.24k [00:00<?, ?B/s]

xsum.py:   0%|          | 0.00/5.76k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/304M [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/16.7M [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/17.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/204045 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11332 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11334 [00:00<?, ? examples/s]

In [5]:
# Sample a subset of the test set for evaluation

train_examples = dataset["train"].shuffle(seed=random_seed).select(range(10000))
val_examples = dataset["validation"].shuffle(seed=random_seed).select(range(1000))

In [6]:
tokenizer = T5Tokenizer.from_pretrained("t5-small")
prefix = "summarize: "

def preprocess(example):
    input_text = prefix + example["document"]
    model_inputs = tokenizer(
        input_text, max_length=2024, truncation=True, padding="max_length"
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            example["summary"], max_length=64, truncation=True, padding="max_length"
        )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Remove non-tensor fields
tokenized_train = train_examples.map(preprocess, remove_columns=train_examples.column_names)
tokenized_val = val_examples.map(preprocess, remove_columns=val_examples.column_names)


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [7]:
model = T5ForConditionalGeneration.from_pretrained("t5-small")

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [8]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True)

train_loader = DataLoader(tokenized_train, batch_size=2, shuffle=True, collate_fn=data_collator)
val_loader = DataLoader(tokenized_val, batch_size=2, shuffle=False, collate_fn=data_collator)

In [9]:
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

num_epochs = 10
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_loader) * num_epochs
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
_ = model.to(device)

In [10]:
model.train()
for epoch in range(num_epochs):
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}")
    for batch in progress_bar:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.set_postfix({"loss": loss.item()})

Epoch 1:   0%|          | 0/5000 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Epoch 1: 100%|██████████| 5000/5000 [42:37<00:00,  1.96it/s, loss=1.13] 
Epoch 2: 100%|██████████| 5000/5000 [42:42<00:00,  1.95it/s, loss=1.22] 
Epoch 3: 100%|██████████| 5000/5000 [42:42<00:00,  1.95it/s, loss=0.917]
Epoch 4: 100%|██████████| 5000/5000 [42:41<00:00,  1.95it/s, loss=1.18] 
Epoch 5: 100%|██████████| 5000/5000 [42:40<00:00,  1.95it/s, loss=0.924]
Epoch 6: 100%|██████████| 5000/5000 [42:42<00:00,  1.95it/s, loss=1.21] 
Epoch 7: 100%|██████████| 5000/5000 [42:42<00:00,  1.95it/s, loss=1.17] 
Epoch 8: 100%|██████████| 5000/5000 [42:41<00:00,  1.95it/s, loss=1.2]  
Epoch 9: 100%|██████████| 5000/5000 [42:41<00:00,  1.95it/s, loss=0.946]
Epoch 10: 100%|██████████| 5000/5000 [42:42<00:00,  1.95i

In [11]:
model_dir = "t5-small-xsum-finetuned"

model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

('t5-small-xsum-finetuned/tokenizer_config.json',
 't5-small-xsum-finetuned/special_tokens_map.json',
 't5-small-xsum-finetuned/spiece.model',
 't5-small-xsum-finetuned/added_tokens.json')

In [12]:
import shutil

shutil.make_archive(model_dir, 'zip', model_dir)
# files.download("t5-small-xsum-finetuned.zip")

'/kaggle/working/t5-small-xsum-finetuned.zip'

In [13]:
rouge = load("rouge")
model.eval()

predictions, references = [], []

for batch in tqdm(val_loader):
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    with torch.no_grad():
        generated_ids = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=64)
    decoded_preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(batch["labels"], skip_special_tokens=True)
    predictions.extend(decoded_preds)
    references.extend(decoded_labels)

results = rouge.compute(predictions=predictions, references=references)
print({k: round(v * 100, 2) for k, v in results.items()})


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

100%|██████████| 500/500 [03:58<00:00,  2.10it/s]


{'rouge1': 29.22, 'rouge2': 8.66, 'rougeL': 22.72, 'rougeLsum': 22.71}
