## **Objective :** Train or fine-tune a model for text summarization.

In [None]:
# !python -m spacy download en_core_web_sm

In [None]:
# !pip install rouge_score
# !pip install evaluate

In [None]:
from datasets import load_dataset
from transformers import T5Tokenizer , T5ForConditionalGeneration
import shutil
import os
import zipfile
from datasets import load_from_disk
import numpy as np
import evaluate
from transformers import Seq2SeqTrainingArguments
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer
import spacy

### Step 1: Load and preprocess dataset

In [None]:
from datasets import load_dataset

In [None]:
data = load_dataset("cnn_dailymail", "3.0.0")
data.shape

README.md:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

{'train': (287113, 3), 'validation': (13368, 3), 'test': (11490, 3)}

In [None]:
data

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

In [None]:
# Function to check null or empty strings
def check_missing(data, field):
    nulls = sum([1 for x in data[field] if x is None or str(x).strip() == ""])
    print(f"Missing values in '{field}': {nulls} out of {len(data[field])}")

for split in ["train", "validation", "test"]:
    print(f"\nChecking {split} split:")
    check_missing(data[split], "article")
    check_missing(data[split], "highlights")


Checking train split:
Missing values in 'article': 0 out of 287113
Missing values in 'highlights': 0 out of 287113

Checking validation split:
Missing values in 'article': 0 out of 13368
Missing values in 'highlights': 0 out of 13368

Checking test split:
Missing values in 'article': 0 out of 11490
Missing values in 'highlights': 0 out of 11490


In [None]:
# selecting less datapoints
train_ds = data["train"].select(range(18000))
val_ds = data["validation"].select(range(2000))
test_ds = data["test"].select(range(2000))
train_ds.shape , val_ds.shape , test_ds.shape

((18000, 3), (2000, 3), (2000, 3))

In [None]:
train_ds[0]["article"]

'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office chart. Details o

In [None]:
train_ds[0]["highlights"]

"Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .\nYoung actor says he has no plans to fritter his cash away .\nRadcliffe's earnings from first five Potter films have been held in trust fund ."

### Step 2 : Tokenization

In [None]:
from transformers import T5Tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small")

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
def preprocess(batch):
    inputs = ["summarize: " + doc for doc in batch["article"]]
    model_inp = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(batch["highlights"], max_length=150, truncation=True, padding="max_length")

    model_inp["labels"] = labels["input_ids"]
    return model_inp

In [None]:
train_tok = train_ds.map(preprocess, batched=True, batch_size=16)
val_tok = val_ds.map(preprocess, batched=True, batch_size=16)
test_tok = test_ds.map(preprocess, batched=True, batch_size=16)

Map:   0%|          | 0/18000 [00:00<?, ? examples/s]



Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
print(f"Sample tokenized input IDs: {train_tok[0]['input_ids'][:50]}")

Sample tokenized input IDs: [21603, 10, 301, 24796, 4170, 6, 2789, 41, 18844, 61, 1636, 8929, 16023, 2213, 4173, 6324, 12591, 15, 11391, 592, 12, 3, 9, 2196, 3996, 1755, 770, 8785, 591, 11039, 770, 61, 13462, 38, 3, 88, 5050, 507, 30, 2089, 6, 68, 3, 88, 10419, 7, 8, 540, 751, 31]


In [None]:
train_tok.save_to_disk("/kaggle/working/train_tok")
val_tok.save_to_disk("/kaggle/working/val_tok")
test_tok.save_to_disk("/kaggle/working/test_tok")

Saving the dataset (0/1 shards):   0%|          | 0/18000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2000 [00:00<?, ? examples/s]

### Step 3 : Saving the Tokenized sets

In [None]:
import shutil
import os
import zipfile

In [None]:
os.makedirs("/kaggle/working/all_tok", exist_ok=True)

In [None]:
shutil.copytree("/kaggle/working/train_tok", "/kaggle/working/all_tok/train_tok")
shutil.copytree("/kaggle/working/val_tok", "/kaggle/working/all_tok/val_tok")
shutil.copytree("/kaggle/working/test_tok", "/kaggle/working/all_tok/test_tok")

# Now zip the combined folder
shutil.make_archive("/kaggle/working/all_tokenized_data", 'zip', "/kaggle/working/all_tok")

'/kaggle/working/all_tokenized_data.zip'

### Step 4 : Loading the Tokenized sets

In [None]:
zip_path = "/content/all_tokenized_data.zip"

In [None]:
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall("/content/")

In [None]:
from datasets import load_from_disk

train_tok = load_from_disk("/content/train_tok")
val_tok = load_from_disk("/content/val_tok")
test_tok = load_from_disk("/content/test_tok")

In [None]:
print(train_tok)
print(val_tok)
print(test_tok)

Dataset({
    features: ['article', 'highlights', 'id', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 18000
})
Dataset({
    features: ['article', 'highlights', 'id', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 2000
})
Dataset({
    features: ['article', 'highlights', 'id', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 2000
})


### Step 5 : Load the Model

In [None]:
from transformers import T5ForConditionalGeneration

model = T5ForConditionalGeneration.from_pretrained("t5-small")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# !pip install rouge_score
!pip install evaluate

In [None]:
import numpy as np
import evaluate

In [None]:
# using the rouge score for evaluation
rouge = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 2) for k, v in result.items()}

    gen_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = round(np.mean(gen_lens), 2)

    return result

### Step 6 : Set Training Arguments

In [None]:
from transformers import Seq2SeqTrainingArguments

In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=5,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_strategy="epoch",
    generation_max_length=150,
    generation_num_beams=4,
    fp16=True
)

### Step 7 : Setup Trainer and Start Training

In [None]:
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
trainer = Seq2SeqTrainer(
    model= model,
    args= training_args,
    train_dataset= train_tok,
    eval_dataset= val_tok,
    tokenizer= tokenizer,
    data_collator= data_collator,
    compute_metrics= compute_metrics
)

  trainer = Seq2SeqTrainer(


In [None]:
trainer.train()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkodamkarthik03[0m ([33mkodamkarthik03-innomatics-research-labs[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,0.8722,0.823182,34.01,13.99,23.81,23.81,97.22
2,0.8602,0.82343,34.71,14.31,23.98,23.96,101.18
3,0.8206,0.828837,34.54,14.18,23.95,23.95,101.16
4,0.7905,0.832386,34.78,14.36,24.15,24.12,99.27
5,0.7692,0.836604,35.08,14.57,24.38,24.37,98.61


TrainOutput(global_step=11250, training_loss=0.8225454969618056, metrics={'train_runtime': 5095.9781, 'train_samples_per_second': 17.661, 'train_steps_per_second': 2.208, 'total_flos': 1.218076213248e+16, 'train_loss': 0.8225454969618056, 'epoch': 5.0})

### Step 8 : Save the Model and Tokenizer

In [None]:
model_path = "/content/t5_cnn_summary_model"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

In [None]:
model_path = "/content/t5_cnn_summary_model"
shutil.make_archive("t5_cnn_summary_model", 'zip', model_path)

### Step 9 : Evaluate on Test Set

In [None]:
trainer.evaluate(test_tok)

{'eval_loss': 0.8116223216056824,
 'eval_rouge1': 36.33,
 'eval_rouge2': 15.67,
 'eval_rougeL': 25.67,
 'eval_rougeLsum': 25.65,
 'eval_gen_len': 98.72,
 'eval_runtime': 550.5062,
 'eval_samples_per_second': 3.633,
 'eval_steps_per_second': 0.454,
 'epoch': 5.0}

### Step 10 : Predictions

In [None]:
import zipfile
import os

with zipfile.ZipFile("t5_cnn_summary_model.zip", 'r') as zip_ref:
    zip_ref.extractall("t5_cnn_summary_model")

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the saved model and tokenizer
model = T5ForConditionalGeneration.from_pretrained('t5_cnn_summary_model')
tokenizer = T5Tokenizer.from_pretrained('t5_cnn_summary_model')

In [None]:
sample = test_tok[0]
input_ids = tokenizer.encode(sample["article"], return_tensors="pt", max_length=512, truncation=True)

output_ids = model.generate(input_ids, max_length=150, num_beams=4, early_stopping=True)
summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print("Generated Summary:\n", summary)
print("\nReference Summary:\n", sample["highlights"])

Generated Summary:
 Palestinian Authority officially becomes 123rd member of the International Criminal Court. The move gives the court jurisdiction over alleged crimes in Palestinian territories. Palestinians signed the ICC's founding Rome Statute in January. Palestinians may be subject to counter-charges as well as counter-charges.

Reference Summary:
 Membership gives the ICC jurisdiction over alleged crimes committed in Palestinian territories since last June .
Israel and the United States opposed the move, which could open the door to war crimes investigations against Israelis .


### Predictions with Test Data

In [None]:
def generate_summary(sample, model, tokenizer, max_input_len=512, max_output_len=150, num_beams=4):

  input_ids = tokenizer.encode(
    "summarize: " + sample["article"], return_tensors= "pt",
    max_length= max_input_len, truncation= True)

  output_ids = model.generate(input_ids,max_length=max_output_len,num_beams=num_beams,early_stopping=True)

  summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)

  reference = sample.get("highlights", None)
  if reference:
    return summary, reference
  return summary

In [None]:
sample = test_tok[0]

generated, reference = generate_summary(sample, model, tokenizer)
print("Generated Summary:\n", generated)
print("\nReference Summary:\n", reference)

Generated Summary:
 The Palestinian Authority officially became the 123rd member of the International Criminal Court. The formal accession was marked with a ceremony at The Hague, in the Netherlands. The ICC opened a preliminary examination into the situation in Palestinian territories.

Reference Summary:
 Membership gives the ICC jurisdiction over alleged crimes committed in Palestinian territories since last June .
Israel and the United States opposed the move, which could open the door to war crimes investigations against Israelis .


In [None]:
sample = test_tok[732]

generated, reference = generate_summary(sample, model, tokenizer)
print("Generated Summary:\n", generated)
print("\nReference Summary:\n", reference)

Generated Summary:
 Suzanne Crough played youngest daughter Tracy on "The Partridge Family" The group consisted of a widowed mom, played by Shirley Jones, and her five children. Redheaded Crough was raised in Los Angeles, the youngest of eight children.

Reference Summary:
 Suzanne Crough was the youngest member of TV's "Partridge Family"
Crough died Monday at 52 in Nevada .


In [None]:
sample = test_tok[1532]

generated, reference = generate_summary(sample, model, tokenizer)
print("Generated Summary:\n", generated)
print("\nReference Summary:\n", reference)

Generated Summary:
 Gary Bowyer challenged Jordan Rhodes to become an FA Cup hero. The 17-goal striker scored the winner in Wednesday night's sixth-round replay. The championship underdogs face Liverpool in the first full house at Ewood Park since 2011.

Reference Summary:
 Gary Bowyer hopes Jordan Rhodes can fire Blackburn into next round .
Blackburn face Liverpool at home after drawing with the Reds at Anfield .
The Championship outfit will have their first sell-out crowd since 2011 .


In [None]:
sample1 = {"article": """(Reuters) - U.S. President Joe Biden said on Monday that he was prepared to negotiate on the details of his $2 trillion
            infrastructure plan, but would not be willing to “do nothing.” Biden said at a White House meeting with a bipartisan group of lawmakers that
            he was open to compromise on how to pay for the package, which aims to fix roads and bridges, expand broadband access, and boost electric
            vehicle use. However, he emphasized the importance of bold investment to ensure the U.S. remains competitive in the global economy."""}

summary1 = generate_summary(sample1, model, tokenizer)
print("Summary 1: ", summary1)

Summary 1:  Joe Biden said he was prepared to negotiate on the details of his $2 trillion infrastructure plan. Biden said he was open to compromise on how to pay for the package. He emphasized the importance of bold investment to ensure the U.S. remains competitive in the global economy.


In [None]:
sample2 = {"article": """LONDON (Reuters) - Britain’s economy grew by 0.4% in February 2021 as companies adapted to lockdown restrictions, official data
           4showed on Tuesday, offering some hope that a recovery from the coronavirus crisis is underway. The increase was driven by growth in manufacturing
           and trade, as well as the reopening of schools. However, the economy remained 7.8% smaller than in February 2020, before the pandemic hit.
           Economists said the figures were encouraging but cautioned that the road to full recovery remained long."""}

summary2 = generate_summary(sample2, model, tokenizer)
print("Summary 2: ", summary2)

Summary 2:  Britain's economy grew by 0.4% in February 2021 as companies adapted to lockdown restrictions. Growth was driven by growth in manufacturing and trade, as well as the reopening of schools.


### Prediction with Sample input

In [None]:
def summarize_text(text, model, tokenizer, max_input_len=512, max_output_len=150, num_beams=4):

    input_ids = tokenizer.encode(
        "summarize: " + text,return_tensors="pt",
        max_length=max_input_len,truncation=True)

    output_ids = model.generate(input_ids,max_length=max_output_len,num_beams=num_beams,early_stopping=True)

    summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return summary

In [None]:
user_input = """Indian Space Research Organisation (ISRO) successfully launched the Chandrayaan-3 mission to explore the lunar surface & conduct scientific
experiments. This mission is expected to demonstrate the ability to perform a soft landing on the Moon and deploy a rover to study lunar soil and rocks."""

summary = summarize_text(user_input, model, tokenizer)
print("Summary:", summary)


Summary: The Chandrayaan-3 mission is expected to demonstrate the ability to perform a soft landing on the Moon. The mission is expected to demonstrate the ability to perform a soft landing on the Moon.


In [None]:
sample2 = '''LONDON (Reuters) - Britain’s economy grew by 0.4% in February 2021 as companies adapted to lockdown restrictions, official data 4
            showed on Tuesday, offering some hope that a recovery from the coronavirus crisis is underway. The increase was driven by growth in manufacturing
           and trade, as well as the reopening of schools. However, the economy remained 7.8% smaller than in February 2020, before the pandemic hit.
           Economists said the figures were encouraging but cautioned that the road to full recovery remained long.'''

summary2 = summarize_text(sample2, model, tokenizer)
print("Summary 2 :\n", summary2)

Summary 2 :
 Britain's economy grew by 0.4% in February 2021 as companies adapt to lockdown restrictions. Growth was driven by growth in manufacturing and trade, as well as the reopening of schools.


### Building Extractive Summary (spaCy version)

In [1]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [1]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [2]:
def extractive_summary(text, top_n=2):
    doc = nlp(text)
    sentence_scores = []

    for sent in doc.sents:
        score = sum(1 for token in sent if token.ent_type_)
        sentence_scores.append((score, sent.text))

    top_sentences = sorted(sentence_scores, reverse=True)[:top_n]
    summary = " ".join([sent for _, sent in top_sentences])

    return summary

In [4]:
user_input = """Indian Space Research Organisation (ISRO) successfully launched the Chandrayaan-3 mission to explore the lunar surface & conduct scientific
experiments. This mission is expected to demonstrate the ability to perform a soft landing on the Moon and deploy a rover to study lunar soil and rocks."""

extractive_summary(user_input)

'Indian Space Research Organisation (ISRO) successfully launched the Chandrayaan-3 mission to explore the lunar surface & conduct scientific\nexperiments. This mission is expected to demonstrate the ability to perform a soft landing on the Moon and deploy a rover to study lunar soil and rocks.'

In [5]:
sample2 = '''LONDON (Reuters) - Britain’s economy grew by 0.4% in February 2021 as companies adapted to lockdown restrictions, official data 4
            showed on Tuesday, offering some hope that a recovery from the coronavirus crisis is underway. The increase was driven by growth in manufacturing
           and trade, as well as the reopening of schools. However, the economy remained 7.8% smaller than in February 2020, before the pandemic hit.
           Economists said the figures were encouraging but cautioned that the road to full recovery remained long.'''

extractive_summary(sample2)

'LONDON (Reuters) - Britain’s economy grew by 0.4% in February 2021 as companies adapted to lockdown restrictions, official data 4\n            showed on Tuesday, offering some hope that a recovery from the coronavirus crisis is underway. However, the economy remained 7.8% smaller than in February 2020, before the pandemic hit.\n           '