In [2]:
import pandas as pd

# Load the dataset
df = pd.read_csv("/content/news_summary.csv", encoding='latin-1')

# Check the column names
print("Columns available:", df.columns)

# Keep only the needed columns and rename them
df = df[['ctext', 'text']]
df.columns = ['article', 'summary']

# Drop any rows with missing values
df.dropna(inplace=True)

# Optional: Use a smaller subset for quick training/testing
df = df[:5000]

# Show a sample
print("✅ Article:\n", df['article'][0])
print("\n✅ Summary:\n", df['summary'][0])


Columns available: Index(['author', 'date', 'headlines', 'read_more', 'text', 'ctext'], dtype='object')
✅ Article:
 The Daman and Diu administration on Wednesday withdrew a circular that asked women staff to tie rakhis on male colleagues after the order triggered a backlash from employees and was ripped apart on social media.The union territory?s administration was forced to retreat within 24 hours of issuing the circular that made it compulsory for its staff to celebrate Rakshabandhan at workplace.?It has been decided to celebrate the festival of Rakshabandhan on August 7. In this connection, all offices/ departments shall remain open and celebrate the festival collectively at a suitable time wherein all the lady staff shall tie rakhis to their colleagues,? the order, issued on August 1 by Gurpreet Singh, deputy secretary (personnel), had said.To ensure that no one skipped office, an attendance report was to be sent to the government the next evening.The two notifications ? one mandat

In [3]:
!pip install transformers datasets sentencepiece




In [4]:
from transformers import T5Tokenizer

# Load T5 tokenizer (we're using the small version to train faster)
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Define a function to tokenize the data
def tokenize_function(example):
    # Prefixing with "summarize: " helps T5 understand the task
    input_text = "summarize: " + example["article"]
    target_text = example["summary"]

    model_inputs = tokenizer(
        input_text,
        max_length=512,
        padding="max_length",
        truncation=True
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            target_text,
            max_length=128,
            padding="max_length",
            truncation=True
        )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [5]:
from datasets import Dataset

# Convert pandas to HuggingFace Dataset
hf_dataset = Dataset.from_pandas(df)

# Apply tokenizer to dataset
tokenized_dataset = hf_dataset.map(tokenize_function)


Map:   0%|          | 0/4396 [00:00<?, ? examples/s]



In [6]:
from transformers import T5ForConditionalGeneration

# Load the model
model = T5ForConditionalGeneration.from_pretrained("t5-small")


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [9]:
!pip install --upgrade transformers


Collecting transformers
  Downloading transformers-4.54.0-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m330.2 kB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.34.2-py3-none-any.whl.metadata (14 kB)
Downloading transformers-4.54.0-py3-none-any.whl (11.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.2/11.2 MB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading huggingface_hub-0.34.2-py3-none-any.whl (558 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m558.8/558.8 kB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: huggingface-hub, transformers
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface-hub 0.33.5
    Uninstalling huggingface-hub-0.33.5:
      Successfully uninstalled huggingface-hub-0.33.5
  Attempting uninstall: transfor

In [7]:
from transformers import T5ForConditionalGeneration
model = T5ForConditionalGeneration.from_pretrained("t5-small")


In [8]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    num_train_epochs=2,
    logging_dir="./logs",
    logging_steps=100,
    save_strategy="no",  # You can change to "epoch" if you want saving per epoch
    report_to="none"     # Turn off wandb reporting
)


In [9]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

trainer.train()


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
100,2.5605
200,1.5568
300,1.4665
400,1.425
500,1.3671
600,1.3817
700,1.3761
800,1.3765
900,1.4088
1000,1.3157


TrainOutput(global_step=2198, training_loss=1.4171079335373244, metrics={'train_runtime': 420.3033, 'train_samples_per_second': 20.918, 'train_steps_per_second': 5.23, 'total_flos': 1189925118541824.0, 'train_loss': 1.4171079335373244, 'epoch': 2.0})

In [10]:
from transformers import T5ForConditionalGeneration, TrainingArguments, Trainer

model = T5ForConditionalGeneration.from_pretrained("t5-small")

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    num_train_epochs=2,
    logging_dir="./logs",
    logging_steps=100,
    save_strategy="no",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

trainer.train()


Step,Training Loss
100,2.5605
200,1.5568
300,1.4665
400,1.425
500,1.3671
600,1.3817
700,1.3761
800,1.3765
900,1.4088
1000,1.3157


TrainOutput(global_step=2198, training_loss=1.4171079335373244, metrics={'train_runtime': 423.5996, 'train_samples_per_second': 20.755, 'train_steps_per_second': 5.189, 'total_flos': 1189925118541824.0, 'train_loss': 1.4171079335373244, 'epoch': 2.0})

In [11]:
model.save_pretrained("t5-news-summary-model")
tokenizer.save_pretrained("t5-news-summary-model")


('t5-news-summary-model/tokenizer_config.json',
 't5-news-summary-model/special_tokens_map.json',
 't5-news-summary-model/spiece.model',
 't5-news-summary-model/added_tokens.json')

In [12]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("t5-news-summary-model")
tokenizer = T5Tokenizer.from_pretrained("t5-news-summary-model")

def summarize(text):
    input_text = "summarize: " + text
    input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
    summary_ids = model.generate(input_ids, max_length=150, min_length=30, length_penalty=2.0, num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
