In [1]:
from transformers import pipeline, set_seed, AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq, TrainingArguments, Trainer
from datasets import load_dataset, load_from_disk, load_metric
import matplotlib.pyplot as plt
import pandas as pd

import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\itadi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
import torch
import torchvision

print(torch.cuda.is_available())
print("Device:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")

True
Device: NVIDIA GeForce RTX 3060 Ti


In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model_ckpt = 'google/pegasus-cnn_dailymail'

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
data = load_dataset('Samsung/samsum', trust_remote_code=True)

In [5]:
data

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [6]:
data['train'][:2]

{'id': ['13818513', '13728867'],
 'dialogue': ["Amanda: I baked  cookies. Do you want some?\r\nJerry: Sure!\r\nAmanda: I'll bring you tomorrow :-)",
  'Olivia: Who are you voting for in this election? \r\nOliver: Liberals as always.\r\nOlivia: Me too!!\r\nOliver: Great'],
 'summary': ['Amanda baked cookies and will bring Jerry some tomorrow.',
  'Olivia and Olivier are voting for liberals in this election. ']}

In [7]:
split_lengths = [len(data[split]) for split in data]

print(f"Split Lengths: {split_lengths}")
print(f"Features: {data['train'].column_names}")
print(f"Dialogue: {data['test'][0]['dialogue']}")
print(f"\nSummary: {data['test'][0]['dialogue']}")

Split Lengths: [14732, 819, 818]
Features: ['id', 'dialogue', 'summary']
Dialogue: Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye

Summary: Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye


In [8]:
def generate_features(batch):
    """Tokenizes input and target text for model training.

    Args:
        batch (dict): A dictionary containing 'dialogue' and 'summary' fields.
            - 'dialogue' (str): The input text to be tokenized.
            - 'summary' (str): The target summary text to be tokenized.

    Returns:
         dict: A dictionary with the following keys:
            - 'input_ids' (list of int): Token IDs for the input dialogue.
            - 'attention_mask' (list of int): Attention mask for the input dialogue.
            - 'labels' (list of int): Token IDs for the target summary.
    """
    try:
        # Tokenize the input dialogue and target summary together
        encodings = tokenizer(batch['dialogue'], 
                              text_target=batch['summary'], 
                              max_length=1024, 
                              padding=True, 
                              truncation=True, 
                              return_tensors='pt')
    except ValueError as e:
        print(f"Error processing batch: {e}")

    return {
        'input_ids': encodings['input_ids'].to(device),
        'attention_mask': encodings['attention_mask'].to(device),
        'labels': encodings['labels'].to(device)
    }


In [9]:
data_pt = data.map(generate_features, batched=True)

Map: 100%|██████████| 819/819 [00:00<00:00, 1536.15 examples/s]


In [10]:
data_pt['train']

Dataset({
    features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 14732
})

In [11]:
data_pt['train'][0]

{'id': '13818513',
 'dialogue': "Amanda: I baked  cookies. Do you want some?\r\nJerry: Sure!\r\nAmanda: I'll bring you tomorrow :-)",
 'summary': 'Amanda baked cookies and will bring Jerry some tomorrow.',
 'input_ids': [12195,
  151,
  125,
  7091,
  3659,
  107,
  842,
  119,
  245,
  181,
  152,
  10508,
  151,
  7435,
  147,
  12195,
  151,
  125,
  131,
  267,
  650,
  119,
  3469,
  29344,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
 

In [12]:
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)

trainer_args = TrainingArguments(
    output_dir='pegasus-samsum', num_train_epochs=10, warmup_steps=500,
    per_device_train_batch_size=2, per_device_eval_batch_size=1,
    weight_decay=0.01, logging_steps=50,
    eval_strategy='steps', eval_steps=1000, save_steps=2000,
    gradient_accumulation_steps=8, fp16=True
)

In [None]:
trainer = Trainer(model=model_pegasus, args=trainer_args,
                  tokenizer=tokenizer, data_collator=seq2seq_data_collator,
                  train_dataset=data_pt["train"].select(range(1000)), 
                  eval_dataset=data_pt["validation"])

trainer.train()

                                      
  0%|          | 0/10 [2:43:06<?, ?it/s]          

{'loss': 9.203, 'grad_norm': 22.2099609375, 'learning_rate': 4.5e-06, 'epoch': 0.8}




In [None]:
def generate_batch_sized_chunks(list_of_elements, batch_size):
    """Generates chunks of data of a specified batch size from a list of elements.

    Args:
        list_of_elements (list): The list of elements to divide into batches.
        batch_size (int): The number of elements in each batch.

    Yields:
        list: A batch of elements with length up to the specified batch size.
    """
    for i in range(0, len(list_of_elements), batch_size):
        yield list_of_elements[i : i + batch_size]



def calculate_metric_on_test_ds(dataset, metric, model, tokenizer, 
                               batch_size=16, device=device, 
                               column_text="article", 
                               column_summary="highlights"):
    """Calculates evaluation metrics on a test dataset using a given model and metric.

    Args:
        dataset (Dataset): The dataset containing the text and summaries.
        metric (Metric): The metric to calculate on the model's predictions.
        model (Model): The language generation model to use for generating summaries.
        tokenizer (Tokenizer): The tokenizer to preprocess input text.
        batch_size (int, optional): The number of examples per batch. Defaults to 16.
        device (torch.device, optional): The device to run the model on. Defaults to `device`.
        column_text (str, optional): The name of the column in the dataset containing the input text. Defaults to "article".
        column_summary (str, optional): The name of the column in the dataset containing the target summaries. Defaults to "highlights".

    Returns:
        dict: A dictionary containing the computed scores for the specified metric.
    """
    article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
    target_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))

    for article_batch, target_batch in tqdm(
        zip(article_batches, target_batches), total=len(article_batches)):
        
        inputs = tokenizer(article_batch, max_length=1024,  truncation=True, 
                        padding="max_length", return_tensors="pt")
        
        summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                         attention_mask=inputs["attention_mask"].to(device), 
                         length_penalty=0.8, num_beams=8, max_length=128)
        ''' parameter for length penalty ensures that the model does not generate sequences that are too long. '''
        
        # Finally, we decode the generated texts, 
        # replace the  token, and add the decoded texts with the references to the metric.
        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True, 
                                clean_up_tokenization_spaces=True) 
               for s in summaries]      
        
        decoded_summaries = [d.replace("", " ") for d in decoded_summaries]
        
        
        metric.add_batch(predictions=decoded_summaries, references=target_batch)
        
    #  Finally compute and return the ROUGE scores.
    score = metric.compute()
    return score

In [None]:
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
rouge_metric = load_metric('rouge')

In [None]:
score = calculate_metric_on_test_ds(
    data['test'], rouge_metric, trainer.model, tokenizer, batch_size = 2, column_text = 'dialogue', column_summary= 'summary'
)

rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_names )

pd.DataFrame(rouge_dict, index = [f'pegasus'] )

In [None]:
model_pegasus.save_pretrained("pegasus-samsum-model")

In [None]:
tokenizer.save_pretrained("tokenizer")

In [None]:
tokenizer = AutoTokenizer.from_pretrained("/content/tokenizer")

In [None]:
gen_kwargs = {"length_penalty": 0.8, "num_beams":8, "max_length": 128}

sample_text = data["test"][0]["dialogue"]

reference = data["test"][0]["summary"]

pipe = pipeline("summarization", model="pegasus-samsum-model",tokenizer=tokenizer)

## 
print("Dialogue:")
print(sample_text)


print("\nReference Summary:")
print(reference)


print("\nModel Summary:")
print(pipe(sample_text, **gen_kwargs)[0]["summary_text"])