In [None]:
!pip install transformers datasets sentencepiece accelerate -q

from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
import torch

In [None]:
!pip install --upgrade datasets huggingface_hub




In [None]:
!pip install datasets transformers --quiet
from datasets import load_dataset
from huggingface_hub import login

# 👇 Create a free token here: https://huggingface.co/settings/tokens
login()  # It will ask for your token (paste it in)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# 1️⃣ Load Dataset
# -------------------------
from datasets import load_dataset

# Fetch the SAMSum corpus from the Hub; later cells read its "dialogue",
# "summary" and "id" columns and its "train" / "test" splits.
dataset = load_dataset(path="knkarthick/samsum")




In [None]:
# 2️⃣ Load Tokenizer & Model
# -------------------------
# One checkpoint name drives both the network weights and the matching vocabulary.
model_name = "t5-small"  # small checkpoint: cheap to fine-tune
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
# 3️⃣ Preprocess the Data
# -------------------------
def preprocess_function(batch, max_input_length=512, max_target_length=128, prefix="summarize: "):
    """Tokenize a batch of dialogue/summary pairs for seq2seq fine-tuning.

    Args:
        batch: Mapping with parallel lists under the keys ``"dialogue"`` and
            ``"summary"`` (the layout ``Dataset.map(..., batched=True)`` passes in).
        max_input_length: Token limit for the prefixed dialogue; longer inputs
            are truncated.
        max_target_length: Token limit for the summary used as labels.
        prefix: Task prefix prepended to every dialogue before tokenization.

    Returns:
        The tokenizer output for the inputs, extended with a ``"labels"`` entry
        holding the token ids of the summaries.
    """
    # `or ""` keeps a missing (None) row from crashing the string concatenation.
    inputs = [prefix + (doc or "") for doc in batch["dialogue"]]
    targets = [summary or "" for summary in batch["summary"]]

    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    # `text_target=` is the documented way to tokenize seq2seq labels.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
# Tokenize in batches and drop the raw text columns so that only
# model-ready features remain in the result.
raw_columns = ["dialogue", "summary", "id"]
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_columns,
)

In [None]:
 # 4️⃣ Data Collator
# -------------------------
# Collator that assembles tokenized examples into padded seq2seq batches.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)




In [None]:
!pip install -U transformers




In [None]:

# 5️⃣ Training Arguments
from transformers import TrainingArguments

# Hyper-parameters gathered in one mapping so they can be inspected or tweaked
# before the TrainingArguments object is built.
training_config = dict(
    output_dir="./t5-summarizer",      # checkpoints are written here
    eval_strategy="epoch",             # evaluate once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs",
    report_to="none",                  # no external experiment trackers (e.g. wandb)
)
training_args = TrainingArguments(**training_config)



In [None]:
import transformers

# Record which transformers release produced the results below.
installed_version = transformers.__version__
print("transformers:", installed_version)


transformers: 4.57.0


In [None]:
#6️⃣ Trainer
# -------------------------
# Wire the model, hyper-parameters, tokenized data and collator into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    # Monitor training on the validation split; the test split stays untouched
    # so it can serve as an unbiased final check.
    eval_dataset=tokenized_datasets["validation"],
    # `processing_class` replaces the deprecated `tokenizer` argument.
    processing_class=tokenizer,
    data_collator=data_collator,
)

  trainer = Trainer(


In [None]:
 #7️⃣ Train the Model
# -------------------------
# Run fine-tuning; keep the returned summary (steps, loss, runtime metrics)
# in a variable and show it as the cell's output.
train_result = trainer.train()
train_result



Epoch,Training Loss,Validation Loss
1,2.015,1.822491




TrainOutput(global_step=7366, training_loss=2.0758426447714795, metrics={'train_runtime': 14928.4954, 'train_samples_per_second': 0.987, 'train_steps_per_second': 0.493, 'total_flos': 804744466268160.0, 'train_loss': 2.0758426447714795, 'epoch': 1.0})

In [None]:
# 8️⃣ Save Locally
# -------------------------
# Persist the fine-tuned weights and the tokenizer files side by side so the
# directory can be reloaded with `from_pretrained`.
save_dir = "./t5-finetuned-samsum"
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)

print("✅ Fine-tuning complete! Model saved to ./t5-finetuned-samsum")

✅ Fine-tuning complete! Model saved to ./t5-finetuned-samsum


In [None]:
#9️⃣ Test It
# -------------------------
# Summarize the first test dialogue as a quick sanity check.
sample = dataset["test"][0]["dialogue"]

# Switch off dropout so generation is deterministic after training.
model.eval()

# Tokenize and move the tensors to wherever the model lives; after training the
# model may sit on an accelerator, and CPU inputs would then fail in `generate`.
inputs = tokenizer("summarize: " + sample, return_tensors="pt", truncation=True).to(model.device)

summary_ids = model.generate(**inputs, max_length=150)
print("\n🧾 Original Text:\n", sample)
print("\n✨ Generated Summary:\n", tokenizer.decode(summary_ids[0], skip_special_tokens=True))


🧾 Original Text:
 Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye

✨ Generated Summary:
 Amanda has Betty's number. She can't find it.


In [None]:
# Authenticate again before uploading.
# NOTE(review): presumably needed to supply a token with write access — confirm.
import huggingface_hub

huggingface_hub.login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Upload the model to the Hub. The first argument of `Trainer.push_to_hub` is the
# commit message, not the repository name; the target repository comes from the
# training arguments (here it resolves to "t5-summarizer", matching `output_dir`).
trainer.push_to_hub(commit_message="Fine-tune t5-small on SAMSum for dialogue summarization")


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...5-summarizer/spiece.model: 100%|##########|  792kB /  792kB            

  ...marizer/model.safetensors:   0%|          |  552kB /  242MB            

  ...marizer/training_args.bin:  10%|#         |   584B / 5.78kB            

CommitInfo(commit_url='https://huggingface.co/kavya19566789/t5-summarizer/commit/6200e20cc5fcb23ad60f167423c78691a382fdfe', commit_message='t5-summarizer', commit_description='', oid='6200e20cc5fcb23ad60f167423c78691a382fdfe', pr_url=None, repo_url=RepoUrl('https://huggingface.co/kavya19566789/t5-summarizer', endpoint='https://huggingface.co', repo_type='model', repo_id='kavya19566789/t5-summarizer'), pr_revision=None, pr_num=None)

In [None]:
import shutil

from google.colab import files

# Bundle the saved model directory into a zip archive in the working directory,
# then hand it to the browser as a download (Colab only).
shutil.make_archive(
    base_name="t5_finetuned_samsum",
    format="zip",
    root_dir="./t5-finetuned-samsum",
)
files.download("t5_finetuned_samsum.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>