In [1]:
## Config
random_seed = 100
data_path = "/kaggle/working/"

In [2]:
%%capture
!pip install -U datasets
!pip install transformers evaluate rouge_score --quiet
!pip uninstall keras -y
!pip install keras==2.11
!pip install bert_score

In [3]:
import evaluate
from evaluate import load
from datasets import load_dataset
from tqdm import tqdm
from transformers.optimization import get_scheduler
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, T5Tokenizer, T5ForConditionalGeneration, DataCollatorForSeq2Seq
import torch
from torch.utils.data import DataLoader
import pandas as pd
from bert_score import score
import pickle
import os

# For downloading the results as zip from COLAB (very usefull)
# from google.colab import files
# import shutil

## Load Data and Model

In [4]:
# Full dataset (split included)
dataset = load_dataset("EdinburghNLP/xsum")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/6.24k [00:00<?, ?B/s]

xsum.py:   0%|          | 0.00/5.76k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/304M [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/16.7M [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/17.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/204045 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11332 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11334 [00:00<?, ? examples/s]

In [14]:
# Sample a subset of the test set for evaluation

# Few-shot subset
train_examples = dataset["train"].shuffle(seed=random_seed)
val_examples = dataset["validation"].shuffle(seed=random_seed)

In [15]:
tokenizer = T5Tokenizer.from_pretrained("t5-small")
prefix = "summarize: "

def preprocess(example):
    input_text = prefix + example["document"]
    model_inputs = tokenizer(
        input_text, max_length=2024, truncation=True, padding="max_length"
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            example["summary"], max_length=64, truncation=True, padding="max_length"
        )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Remove non-tensor fields
tokenized_train = train_examples.map(preprocess, remove_columns=train_examples.column_names)
tokenized_val = val_examples.map(preprocess, remove_columns=val_examples.column_names)


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

KeyboardInterrupt: 

In [7]:
model = T5ForConditionalGeneration.from_pretrained("t5-small")

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [8]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True)

train_loader = DataLoader(tokenized_train, batch_size=8, shuffle=True, collate_fn=data_collator)
val_loader = DataLoader(tokenized_val, batch_size=8, shuffle=False, collate_fn=data_collator)

In [9]:

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

num_epochs = 3
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_loader) * num_epochs
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
_ = model.to(device)


In [10]:
model.train()
for epoch in range(num_epochs):
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}")
    for batch in progress_bar:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.set_postfix({"loss": loss.item()})

Epoch 1:   0%|          | 0/5 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Epoch 1: 100%|██████████| 5/5 [03:31<00:00, 42.24s/it, loss=8.34]
Epoch 2: 100%|██████████| 5/5 [02:56<00:00, 35.29s/it, loss=8.08]
Epoch 3: 100%|██████████| 5/5 [02:52<00:00, 34.52s/it, loss=8.97]


In [11]:
rouge = load("rouge")
model.eval()

predictions, references = [], []

for batch in val_loader:
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    with torch.no_grad():
        generated_ids = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=64)
    decoded_preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(batch["labels"], skip_special_tokens=True)
    predictions.extend(decoded_preds)
    references.extend(decoded_labels)

results = rouge.compute(predictions=predictions, references=references)
print({k: round(v * 100, 2) for k, v in results.items()})


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

{'rouge1': np.float64(19.01), 'rouge2': np.float64(0.7), 'rougeL': np.float64(12.37), 'rougeLsum': np.float64(12.45)}


In [12]:
model.save_pretrained("t5-small-xsum-finetuned")
tokenizer.save_pretrained("t5-small-xsum-finetuned")

('t5-small-xsum-finetuned/tokenizer_config.json',
 't5-small-xsum-finetuned/special_tokens_map.json',
 't5-small-xsum-finetuned/spiece.model',
 't5-small-xsum-finetuned/added_tokens.json')

In [13]:
# shutil.make_archive("t5-small-xsum-finetuned", 'zip', "t5-small-xsum-finetuned")
# files.download("t5-small-xsum-finetuned.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>