In [1]:
import pandas as pd
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch
from torch.utils.data import Dataset, DataLoader

In [2]:
# --- Configuration ---
# Filesystem locations: the CSV read by the data-loading cell and the
# directory the fine-tuned model and tokenizer are written to.
DATA_PATH = "./data/book_genre_prediction.csv"
MODEL_SAVE_PATH = "./models/book_title_generator_t5"

# Name of the pretrained checkpoint handed to `from_pretrained`.
MODEL_NAME = "t5-base"

# Token-length caps used for padding/truncation of the input text and the
# target title, respectively.
MAX_INPUT_LENGTH, MAX_TARGET_LENGTH = 512, 64

# Training-loop settings: DataLoader batch size and number of passes over the data.
BATCH_SIZE, EPOCHS = 8, 3

In [4]:
# --- Data Preparation ---
class BookTitleDataset(Dataset):
    """Map-style dataset that turns (summary, title) rows into seq2seq tensors.

    Each item is built from one row of ``dataframe``: the ``summary`` column,
    prefixed with ``"generate title: "``, becomes the model input and the
    ``title`` column becomes the target.

    Args:
        tokenizer: Callable tokenizer exposing ``pad_token_id``; it is invoked
            with ``padding``, ``max_length``, ``truncation`` and
            ``return_tensors`` keyword arguments.
        dataframe: Table providing ``summary`` and ``title`` columns.
        max_input_length: Padded/truncated length of the input sequence.
        max_target_length: Padded/truncated length of the target sequence.
    """

    def __init__(self, tokenizer, dataframe, max_input_length, max_target_length):
        self.tokenizer, self.dataframe = tokenizer, dataframe
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length

    def __len__(self):
        """Return the number of rows (examples) available."""
        return len(self.dataframe)

    def _encode(self, text, max_length):
        """Tokenize ``text`` to exactly ``max_length`` tokens as PyTorch tensors."""
        return self.tokenizer(
            text,
            padding='max_length',
            max_length=max_length,
            truncation=True,
            return_tensors='pt'
        )

    def __getitem__(self, index):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for row ``index``.

        All three values are flattened to 1-D tensors.
        """
        record = self.dataframe.iloc[index]
        # str() guards against non-string cell values (e.g. numbers).
        summary_text, title_text = str(record['summary']), str(record['title'])

        source = self._encode("generate title: " + summary_text, self.max_input_length)
        target = self._encode(title_text, self.max_target_length)

        # Overwrite padding positions in the target with -100, the index
        # conventionally ignored by the loss, so padding is not scored.
        labels = target['input_ids']
        pad_positions = labels == self.tokenizer.pad_token_id
        labels[pad_positions] = -100

        return dict(
            input_ids=source['input_ids'].flatten(),
            attention_mask=source['attention_mask'].flatten(),
            labels=labels.flatten(),
        )

In [5]:
# Load the dataset and keep only rows that have both a title and a summary.
try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError as err:
    # Raise instead of calling exit(): exit() targets the interpreter/kernel
    # session itself, while an exception stops the run at this cell with a
    # clear traceback and leaves the kernel usable.
    raise FileNotFoundError(f"Error: Dataset not found at {DATA_PATH}") from err

# Non-mutating dropna keeps the cell idempotent when it is re-run.
df = df.dropna(subset=['title', 'summary'])

# An empty frame would produce an empty DataLoader and a division by zero
# when the training cell averages the loss, so fail early with a clear message.
if df.empty:
    raise ValueError(
        f"No usable rows in {DATA_PATH}: every row is missing 'title' or 'summary'."
    )

In [6]:
# Seed PyTorch's global RNG so the DataLoader shuffle order and dropout masks
# are repeatable across Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

# Load tokenizer and model from the pretrained checkpoint named by MODEL_NAME.
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

# Create dataset and dataloader; batches are reshuffled on every epoch.
dataset = BookTitleDataset(tokenizer, df, MAX_INPUT_LENGTH, MAX_TARGET_LENGTH)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# The optimizer is created after the model has been moved to `device`.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [7]:
# --- Training ---
# Plain fine-tuning loop: one optimizer step per batch, mean loss reported per epoch.
print(f"Starting training for {EPOCHS} epochs...")
for epoch_number in range(1, EPOCHS + 1):
    model.train()
    running_loss = 0
    for batch in dataloader:
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Move the tensors the model consumes onto the training device.
        model_inputs = {
            name: batch[name].to(device)
            for name in ('input_ids', 'attention_mask', 'labels')
        }

        # Passing `labels` makes the forward pass return the training loss.
        loss = model(**model_inputs).loss
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    mean_loss = running_loss / len(dataloader)
    print(f"Epoch {epoch_number}/{EPOCHS}, Loss: {mean_loss:.4f}")

print("Training finished.")

Starting training for 3 epochs...


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch 1/3, Loss: 2.6251
Epoch 2/3, Loss: 2.3121
Epoch 3/3, Loss: 2.0853
Training finished.


In [8]:
# Persist the fine-tuned model first, then its tokenizer, into the same directory.
print(f"Saving model to {MODEL_SAVE_PATH}...")
for artifact in (model, tokenizer):
    artifact.save_pretrained(MODEL_SAVE_PATH)
print("Model and tokenizer saved.")

Saving model to ./models/book_title_generator_t5...
Model and tokenizer saved.
