# Training Seq2Seq -model
Training summary model

## Import libraries

In [42]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import Trainer, TrainingArguments
from datasets import load_dataset, DatasetDict

Load Pretrained Model & Tokenizer

In [43]:
MODEL_NAME = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
model.to(device)
print(f"Model on device: {device}")

Model on device: cuda


Importing existing summarization dataset ([CNN / Daily Mail dataset](https://paperswithcode.com/dataset/cnn-daily-mail-1))

In [44]:
summarization_dataset = load_dataset("cnn_dailymail", "3.0.0")

Select a subset of the dataset for faster compute

In [45]:
train_size, val_size, test_size = 8000, 1000, 1000
summarization_subset = DatasetDict({
    "train": summarization_dataset["train"].select(range(train_size)),
    "validation": summarization_dataset["validation"].select(range(val_size)),
    "test": summarization_dataset["test"].select(range(test_size))
})

print(summarization_subset)

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 8000
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 1000
    })
})


Create methods for preprocessing the the summary dataset

In [46]:
def preprocess_function(examples):
    """Prepare dataset input with cleaned text."""
    inputs = [f"Summarize: {article}" for article in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(examples["highlights"], max_length=150, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

Preprocess the data in similar way as the Gutenberg dataset

In [47]:
train_set = summarization_subset["train"].map(preprocess_function, batched=True)
val_set = summarization_subset["validation"].map(preprocess_function, batched=True)
test_set = summarization_subset["test"].map(preprocess_function, batched=True)

Declare training parameters

In [49]:
training_args = TrainingArguments(
    output_dir="./summarizer_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    logging_dir="./logs",
    save_total_limit=2,
    push_to_hub=False,
    metric_for_best_model="eval_loss",
    load_best_model_at_end=True,
    eval_accumulation_steps=32, 
    fp16=True,
    dataloader_num_workers=4,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=val_set,
    tokenizer=tokenizer,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Train the model

In [50]:
import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.772301
2,1.898200,0.73335
3,1.898200,0.729631
4,0.948800,0.728507
5,0.948800,0.728516


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=1250, training_loss=1.3267453369140625, metrics={'train_runtime': 1115.1685, 'train_samples_per_second': 35.869, 'train_steps_per_second': 1.121, 'total_flos': 5413672058880000.0, 'train_loss': 1.3267453369140625, 'epoch': 5.0})

Evaluate the model

In [51]:
metrics = trainer.evaluate(test_set)
print(metrics)

{'eval_loss': 0.7281055450439453, 'eval_runtime': 9.5439, 'eval_samples_per_second': 104.779, 'eval_steps_per_second': 6.601, 'epoch': 5.0}


Logging to HuggingFace via API

In [58]:
from kaggle_secrets import UserSecretsClient
from huggingface_hub import login

user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("Hugging_Face_Token")
login(UserSecretsClient().get_secret("Hugging_Face_Token"))

Save the model into hugging face

In [59]:
repo = "Juh6973/t5-small-summarizer-cnn-dailymail"
model.push_to_hub(repo)
tokenizer.push_to_hub(repo)

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Juh6973/t5-small-summarizer-cnn-dailymail/commit/6943841634443573a40c6c72c15d4db46ee2ee30', commit_message='Upload tokenizer', commit_description='', oid='6943841634443573a40c6c72c15d4db46ee2ee30', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Juh6973/t5-small-summarizer-cnn-dailymail', endpoint='https://huggingface.co', repo_type='model', repo_id='Juh6973/t5-small-summarizer-cnn-dailymail'), pr_revision=None, pr_num=None)