In [None]:
import os, re, string
import pandas as pd
import nltk
# Download the NLTK "punkt" resource at notebook start-up.
# NOTE(review): nothing in the cells visible here uses nltk — confirm a later cell needs it.
nltk.download('punkt')

from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM, TrainingArguments, Trainer

import torch
# Report the installed PyTorch version and whether a CUDA device is usable,
# so the run's environment is recorded in the notebook output.
print('Torch:', torch.__version__)
print('CUDA available:', torch.cuda.is_available())

In [None]:
# Load configuration "3.0.0" of the "cnn_dailymail" dataset and show the
# returned object's summary.
dataset = load_dataset(path="cnn_dailymail", name="3.0.0")
print(dataset)

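## Custom CSV (alternative data source)
# The cells below read the "article" and "highlights" columns and evaluate on
# a "validation" split, so the CSV files must provide those columns and a
# validation file (or the later cells must be adapted to other names).
# dataset = load_dataset(
#     "csv",
#     data_files={"train": "train.csv", "validation": "validation.csv", "test": "test.csv"}
# )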

In [None]:
def clean_text(s: str) -> str:
    """Normalise whitespace in ``s``.

    Every run of whitespace characters is collapsed to a single space and
    leading/trailing whitespace is removed.

    Args:
        s: Text to normalise.

    Returns:
        The text with single-space separators and no surrounding whitespace.
    """
    # split() with no separator breaks on whitespace runs and discards empty
    # edge pieces, so joining with one space both collapses and trims.
    words = s.split()
    return " ".join(words)

def apply_clean(batch):
    """Build whitespace-normalised text columns for a batch of examples.

    Args:
        batch: Mapping that provides an iterable of strings under "article"
            and another under "highlights".

    Returns:
        A dict with "document" (cleaned articles) and "summary" (cleaned
        highlights), each a list in the same order as the input.
    """
    cleaned_articles = list(map(clean_text, batch["article"]))
    cleaned_highlights = list(map(clean_text, batch["highlights"]))
    return {"document": cleaned_articles, "summary": cleaned_highlights}

# Run apply_clean over the dataset in batches; the returned "document" and
# "summary" entries become columns. Then print the first training example.
dataset = dataset.map(function=apply_clean, batched=True)
print(dataset["train"][0])

In [None]:
# Truncation limits (in tokens) applied by tok_fn to documents and summaries.
max_input, max_target = 512, 128

# Checkpoint name shared by the tokenizer and the model.
model_ckpt = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

def tok_fn(batch):
    """Tokenize a batch of cleaned examples for seq2seq training.

    Args:
        batch: Mapping with "document" (source texts) and "summary"
            (target texts) entries.

    Returns:
        The tokenizer output for the documents, truncated to ``max_input``
        tokens, with an added "labels" entry holding the token ids of the
        summaries truncated to ``max_target`` tokens.
    """
    # NOTE(review): no task prefix (e.g. "summarize: ") is prepended to the
    # documents — confirm this is intended for a T5 checkpoint.
    model_inputs = tokenizer(batch["document"], max_length=max_input, truncation=True)
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager for encoding the target side.
    labels = tokenizer(text_target=batch["summary"], max_length=max_target, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize the dataset in batches, dropping the columns listed for the train
# split so only the tokenizer outputs remain, then print one encoded example.
tokenized = dataset.map(
    function=tok_fn,
    batched=True,
    remove_columns=dataset["train"].column_names,
)
print(tokenized["train"][0])

In [None]:
collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_ckpt)

In [None]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt)

# Training configuration for the fine-tuning run.
args = TrainingArguments(
    # Output locations for checkpoints and logs.
    output_dir="summarizer_out",
    logging_dir="logs",
    # Evaluate and save once per epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Optimisation settings.
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Request fp16 only when a CUDA device is available.
    fp16=torch.cuda.is_available(),
)

# Demo-sized run: only the first 5000 training and first 1000 validation
# examples are handed to the Trainer.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=collator,
    tokenizer=tokenizer,
    train_dataset=tokenized["train"].select(range(5000)),
    eval_dataset=tokenized["validation"].select(range(1000)),
)

In [None]:
# Start fine-tuning with the Trainer configured in the previous cell.
trainer.train()

In [None]:
# Write the model and the tokenizer into the same directory, then print a
# confirmation message.
model.save_pretrained(save_directory="summarizer_model")
tokenizer.save_pretrained(save_directory="summarizer_model")
print("✅ Summarizer trained & saved!")