# News Summary with T5-small model

# Import Datasets



In [6]:
import pandas as pd

splits = {'train': 'data/train-00000-of-00001-ebc48879f34571f6.parquet', 'test': 'data/test-00000-of-00001-6227bd8eb10a9b50.parquet'}
df_train = pd.read_parquet("hf://datasets/argilla/news-summary/" + splits["train"])
df_test = pd.read_parquet("hf://datasets/argilla/news-summary/" + splits["test"])

# Dataset split

In [7]:
from sklearn.model_selection import train_test_split

df = pd.concat([df_train, df_test])

# New train-test-validation split (80% train, 10% validation, 10% test)
df_train, df_temp = train_test_split(df, test_size=0.2, random_state=42)  # 80% train
df_val, df_test = train_test_split(df_temp, test_size=0.5, random_state=42)  # 10% val, 10% test

# Display dataset sizes
print(f"Train size: {len(df_train)}")
print(f"Validation size: {len(df_val)}")
print(f"Test size: {len(df_test)}")


Train size: 17133
Validation size: 2142
Test size: 2142


# Install datasets module

In [8]:
!pip install datasets



# Convert to Hugging Face Dataset

In [9]:
from datasets import Dataset

df_train = Dataset.from_pandas(df_train)
df_test = Dataset.from_pandas(df_test)
df_val = Dataset.from_pandas(df_val)

# Choosing a pre-trained model
- t5-small runs quick lightweight summarization and uses less GPU

In [5]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

# Tokenization

In [10]:
# Tokenization function
def preprocess_data(examples):
    inputs = ["summarize: " + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    summaries = [s[0]["text"] if isinstance(s, list) and len(s) > 0 else "" for s in examples["prediction"]]

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(summaries, max_length=150, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_train = df_train.map(preprocess_data, batched=True)
tokenized_test = df_test.map(preprocess_data, batched=True)
tokenized_val = df_val.map(preprocess_data, batched=True)

# Tokenize dataset
tokenized_dataset = {"train": tokenized_train, "test": tokenized_test, "val": tokenized_val}

Map:   0%|          | 0/17133 [00:00<?, ? examples/s]



Map:   0%|          | 0/2142 [00:00<?, ? examples/s]

Map:   0%|          | 0/2142 [00:00<?, ? examples/s]

# Example of an element

In [31]:
tokenized_dataset["test"][0]

{'input_ids': tensor([21603,    10,     3,  8855, 28027,  6038,    41, 18844,    61,     3,
            18,     3, 29541,    13,  3871,    29,     7,   130,  4973,    53,
            16,     8,  1784,  4675,     9,  6216,    11,  3119,   640,     8,
           684,    30,  1771,   581,  1390,    57,     8, 12346,  2730, 11882,
            12, 24254,     3, 18810,  6704,     5,    37,  1390,     6,  7513,
          2162,    57,     8,  4831,  6323,    16,  1660,    11,  1083,    16,
           365,  5054,    16, 20417,     6,   228,   474,     8,     3, 18810,
           358,   365,  1827,   610,    16,    80,    13,     8,  1611,  3545,
             3,     7,   167, 17261,  2315,     5,     3, 29541,    13, 30545,
            15,     7,     6, 21551,   343,  1661, 17566, 26131,     6,     8,
          1611,  3527,    11, 23661,     7,    43,    66,  7103,  2410,    81,
             8,  4382,  1281,  1112,     5,   389,  5861,   209,  8630,   151,
           130, 10556,    53,  1587, 20

# Training arguments

In [7]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True
)



# Fine-tune Model

In [8]:
from transformers import Seq2SeqTrainer, DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["val"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

  trainer = Seq2SeqTrainer(


In [9]:
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkashishtam[0m ([33mkashishtam-project[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.2686,0.229324
2,0.2481,0.221506
3,0.241,0.219748


TrainOutput(global_step=3213, training_loss=0.47164850742543113, metrics={'train_runtime': 1406.2107, 'train_samples_per_second': 36.551, 'train_steps_per_second': 2.285, 'total_flos': 6956433253859328.0, 'train_loss': 0.47164850742543113, 'epoch': 3.0})

# Save trained model

In [28]:
# Save to session storage first
save_directory = "./t5_summarization_model/final"
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

# Zip the directory
!zip -r t5_model.zip {save_directory}

# Download the zip file
from google.colab import files
files.download("t5_model.zip")

  adding: t5_summarization_model/final/ (stored 0%)
  adding: t5_summarization_model/final/tokenizer_config.json (deflated 94%)
  adding: t5_summarization_model/final/config.json (deflated 62%)
  adding: t5_summarization_model/final/model.safetensors (deflated 10%)
  adding: t5_summarization_model/final/special_tokens_map.json (deflated 85%)
  adding: t5_summarization_model/final/added_tokens.json (deflated 83%)
  adding: t5_summarization_model/final/generation_config.json (deflated 29%)
  adding: t5_summarization_model/final/spiece.model (deflated 48%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Load Saved model if needed

In [2]:
!unzip t5_model.zip -d ./t5_model

Archive:  t5_model.zip
   creating: ./t5_model/t5_summarization_model/final/
  inflating: ./t5_model/t5_summarization_model/final/tokenizer_config.json  
  inflating: ./t5_model/t5_summarization_model/final/config.json  
  inflating: ./t5_model/t5_summarization_model/final/model.safetensors  
  inflating: ./t5_model/t5_summarization_model/final/special_tokens_map.json  
  inflating: ./t5_model/t5_summarization_model/final/added_tokens.json  
  inflating: ./t5_model/t5_summarization_model/final/generation_config.json  
  inflating: ./t5_model/t5_summarization_model/final/spiece.model  


In [3]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Define the path to your model files
model_path = "./t5_model/t5_summarization_model/final"  # Adjust if your unzip path differs

# Load model and tokenizer
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained(model_path)

# Move to GPU if available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()

print("Model and tokenizer loaded successfully!")

Model and tokenizer loaded successfully!


# Using ROUGE to evaluate the model

## Install ROGUE library

In [2]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=2a8f52f7da055da9722aa4d0497d72edf2542d231a300d0250f59ffdc23a710a
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


## Load ROUGE metric

In [3]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py311-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
import evaluate

rouge = evaluate.load("rouge")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

## Pass the test dataset to the model

In [13]:
import torch

# Move model to GPU if available, otherwise CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()

# Lists to store generated and reference summaries
generated_summaries = []
reference_summaries = []

# Iterate over the test dataset
for example in tokenized_dataset["test"]:
    # Prepare input (already tokenized in your dataset)
    input_ids = torch.tensor(example["input_ids"]).unsqueeze(0).to(device)  # Shape: [1, 512]
    attention_mask = torch.tensor(example["attention_mask"]).unsqueeze(0).to(device)

    # Generate summary
    summary_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=80,
        min_length=15,
        length_penalty=3.0,
        num_beams=2,
        early_stopping=True
    )

    # Decode generated summary
    generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    generated_summaries.append(generated_summary)

    # Get reference summary from original data
    ref_summary = tokenizer.decode(example["labels"], skip_special_tokens=True)
    reference_summaries.append(ref_summary)

## Compute rouge score

In [14]:
from rouge_score import rouge_scorer

# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Compute scores
rouge_scores = []
for ref, gen in zip(reference_summaries, generated_summaries):
    scores = scorer.score(ref, gen)
    rouge_scores.append(scores)

# Aggregate averages
avg_rouge = {
    "rouge1": {"precision": 0, "recall": 0, "fmeasure": 0},
    "rouge2": {"precision": 0, "recall": 0, "fmeasure": 0},
    "rougeL": {"precision": 0, "recall": 0, "fmeasure": 0}
}

for scores in rouge_scores:
    for metric in avg_rouge:
        avg_rouge[metric]["precision"] += scores[metric].precision
        avg_rouge[metric]["recall"] += scores[metric].recall
        avg_rouge[metric]["fmeasure"] += scores[metric].fmeasure

n = len(rouge_scores)
for metric in avg_rouge:
    avg_rouge[metric]["precision"] /= n
    avg_rouge[metric]["recall"] /= n
    avg_rouge[metric]["fmeasure"] /= n

# Print results
print("Average ROUGE Scores:")
for metric, values in avg_rouge.items():
    print(f"{metric}: Precision={values['precision']:.4f}, Recall={values['recall']:.4f}, F1={values['fmeasure']:.4f}")


Average ROUGE Scores:
rouge1: Precision=0.4264, Recall=0.4930, F1=0.4521
rouge2: Precision=0.1961, Recall=0.2270, F1=0.2077
rougeL: Precision=0.3813, Recall=0.4408, F1=0.4042


# Qualitative check

In [11]:
# Pick a few examples (e.g., first 3 from test set)
num_examples = 3
for i, example in enumerate(tokenized_dataset["test"]):
    if i >= num_examples:
        break

    # Decode original article (remove "summarize: " prefix if present)
    article_text = tokenizer.decode(example["input_ids"], skip_special_tokens=True)
    if article_text.startswith("summarize: "):
        article_text = article_text[len("summarize: "):]

    # Prepare input tensors
    input_ids = torch.tensor(example["input_ids"]).unsqueeze(0).to(device)
    attention_mask = torch.tensor(example["attention_mask"]).unsqueeze(0).to(device)

    # Generate summary
    summary_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=60,
        min_length=15,
        length_penalty=3.0,
        num_beams=1,
        early_stopping=True
    )
    generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Decode reference summary from labels
    reference_summary = tokenizer.decode(example["labels"], skip_special_tokens=True)

    # Print results
    print(f"\nExample {i+1}:")
    print(f"Article Text: {article_text[:500]}...")  # Truncate for readability
    print(f"Generated Summary: {generated_summary}")
    print(f"Reference Summary: {reference_summary}")
    print("-" * 50)




Example 1:
Article Text: BUCHAREST (Reuters) - Thousands of Romanians were protesting in the capital Bucharest and cities across the country on Sunday against plans by the ruling Social Democrats to overhaul judicial legislation. The plans, initially announced by the justice minister in August and currently in under debate in parliament, could put the judicial system under political control in one of the European Union s most corrupt states. Thousands of magistrates, centrist President Klaus Iohannis, the European Commi...
Generated Summary: Romanians protest against reforms of judicial legislation in Bucharest
Reference Summary: Thousands of Romanians protest ruling party's judicial overhaul plans
--------------------------------------------------

Example 2:
Article Text: NEW YORK (Reuters) - Democrats beat Republicans in U.S. television ratings, according to Nielsen data released on Tuesday for the first night of the Democratic National Convention. An estimated 26 million people wa

# Function to generate summary

In [4]:
def generate_summary(article_text):
    """
    Generate a summary for the given article text using the fine-tuned T5-small model.

    Args:
        article_text (str): The article text to summarize.

    Returns:
        str: The generated summary.
    """
    # Preprocess the article
    input_text = "summarize: " + article_text.strip()
    inputs = tokenizer(input_text, max_length=512, truncation=True, padding="max_length", return_tensors="pt")

    # Move tensors to the same device as the model
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    # Generate summary
    summary_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=60,  # Adjustable
        min_length=15,
        length_penalty=3.0,
        num_beams=1,
        early_stopping=True
    )

    # Decode and return the summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Try any article to summarize

In [8]:
article = """
A United Airlines flight had to make an emergency landing in Waco, Texas, on Sunday evening after it hit severe turbulence that injured five passengers, officials said.

SkyWest Flight 5690, which was operating as United Express, a United Airlines regional flight service network, took off from Springfield-Branson National Airport in Missouri at 4:48 p.m. CT and was headed to George Bush Intercontinental Airport in Houston, according to data on FlightAware.com.

The Bombardier CRJ-200 experienced turbulence and had to make an emergency landing at Waco Regional Airport, SkyWest said in a statement.

SkyWest did not clarify where the flight hit rough air.

"Medical personnel met and evaluated passengers, and five passengers were transported to the hospital," the statement read. "SkyWest and United are making sure all customers get the care they need and dispatched another aircraft to fly the customers from Waco to Houston."

The five passengers had minor injuries, the airline said. There were 29 passengers and three crew members on the flight.

The Waco Fire Department responded to the plane with five units and 13 personnel, acting Fire Chief Robby Bergerson said.
"""
generated_summary = generate_summary(article)
print("Generated summary: ", generated_summary)

Generated summary:  United Airlines flight had to make emergency landing in Waco, Texas, after turbulence


##