# Introduction to Pegasus

# 1. Preprocessing data

PEGASUS (Pre-training with Extracted Gap-sentences for Abstractive Summarization) is a state-of-the-art language model developed by Google Research, designed specifically for abstractive text summarization tasks. Unlike traditional models, PEGASUS introduces a novel pretraining objective called Gap Sentence Generation (GSG), which closely aligns with the summarization process.

During pretraining, PEGASUS is trained by masking entire sentences in a document and learning to generate these missing sentences from the surrounding context. This approach enables the model to focus on understanding the core ideas of a document, simulating the process of creating summaries. By leveraging GSG and massive-scale datasets, PEGASUS achieves remarkable performance on a variety of summarization benchmarks, such as CNN/DailyMail and XSum, often exceeding prior models in terms of coherence and informativeness.

PEGASUS is built on the Transformer architecture and benefits from its scalability and parallelization, making it suitable for a wide range of natural language processing applications beyond summarization. Its implementation and pretrained weights are accessible through the [Hugging Face](https://huggingface.co/docs/transformers/en/model_doc/pegasus) Transformers library, enabling researchers and developers to explore its capabilities and adapt it for specific use cases with ease.

*(Text generated with the help of ChatGPT)*

In [None]:
from IPython import get_ipython
# Set the running IPython shell's output cache size to 0.
# NOTE(review): presumably meant to stop cell results being retained in the
# output history so large objects can be freed — confirm it takes effect when
# set at runtime.
get_ipython().cache_size = 0  # Disable output cache

# Clean workspace
import gc
import torch

# Clear unnecessary variables
def clean_workspace():
    """Delete user-defined globals and release cached CPU/GPU memory.

    Every name in the module/notebook global namespace is removed except:

    * ``gc``, ``torch`` and ``clean_workspace`` itself (needed below),
    * the handles IPython injects into the user namespace (``In``, ``Out``,
      ``get_ipython``, ``exit``, ``quit``, ``open``),
    * any name starting with an underscore (interpreter internals such as
      ``__name__`` / ``__builtins__`` and IPython's history slots).

    Returns:
        None. Progress messages are printed to stdout.
    """
    print("Cleaning workspace...")

    protected = {
        "gc", "torch", "clean_workspace",
        "In", "Out", "get_ipython", "exit", "quit", "open",
    }
    namespace = globals()
    # Iterate over a snapshot of the keys: the dict is mutated inside the loop.
    for var in list(namespace):
        # Removing dunder names or IPython's own handles would break the
        # running session, so they are skipped.
        if var in protected or var.startswith("_"):
            continue
        del namespace[var]

    # Collect garbage first so tensors that just became unreachable are
    # released before the CUDA cache is emptied.
    print("Running garbage collection...")
    gc.collect()

    # Clear GPU memory (only when a CUDA device exists; synchronize() raises
    # on CPU-only machines).
    print("Clearing GPU memory...")
    if torch.cuda.is_available():
        torch.cuda.synchronize()
        torch.cuda.empty_cache()

    print("Workspace cleaned successfully!")


# Call the function
clean_workspace()

## 1.1 Import necessary libraries

In [None]:
%%capture captured_output
!pip install sentence_transformers bert_score evaluate
! pip install rouge_score

import pandas as pd 
import shutil
import random
from transformers import PegasusTokenizer, PegasusForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset

# for evaluation
import os
import torch
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import evaluate
from bert_score import score
import numpy as np

# Load sentence transformer for embeddings
# (used later to embed reference and generated summaries for cosine similarity)
sentence_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Load ROUGE metric
# (reused for both the pre-trained and the fine-tuned evaluation)
rouge = evaluate.load("rouge")

## 1.2 Create datasets for train and test

In [None]:
# Function to load data from text files
def load_data(judgement_folder, summary_folder, max_files=6000):
    """Load paired judgement/summary texts from two folders.

    A judgement and its summary are matched by file name (the same name must
    exist in both folders). Files present in only one folder are skipped, so
    a missing file can no longer shift every later pair out of alignment.

    Args:
        judgement_folder: Directory containing the judgement text files.
        summary_folder: Directory containing the summary text files.
        max_files: Maximum number of pairs to load, taken in sorted
            file-name order.

    Returns:
        A list of ``{"judgement": str, "summary": str}`` dicts ordered by
        file name.
    """
    summary_names = set(os.listdir(summary_folder))
    # Pair by name rather than by position in the two sorted listings.
    paired_names = [
        name
        for name in sorted(os.listdir(judgement_folder))
        if name in summary_names
    ][:max_files]

    data = []
    for name in paired_names:
        # Read as UTF-8 explicitly, matching how the evaluation cells read
        # the same files, instead of relying on the platform default.
        with open(os.path.join(judgement_folder, name), 'r', encoding='utf-8') as j_f:
            judgement_text = j_f.read()
        with open(os.path.join(summary_folder, name), 'r', encoding='utf-8') as s_f:
            summary_text = s_f.read()

        # Append data to list
        data.append({"judgement": judgement_text, "summary": summary_text})

    return data

In [None]:
# Read the IN-Abs train and test splits (judgement/summary folder pairs)
train_data = load_data(
    judgement_folder="/kaggle/input/legal-case-document-summarization/dataset/IN-Abs/train-data/judgement",
    summary_folder="/kaggle/input/legal-case-document-summarization/dataset/IN-Abs/train-data/summary",
)
test_data = load_data(
    judgement_folder="/kaggle/input/legal-case-document-summarization/dataset/IN-Abs/test-data/judgement",
    summary_folder="/kaggle/input/legal-case-document-summarization/dataset/IN-Abs/test-data/summary",
)

# Wrap the record lists as Hugging Face Datasets
test_dataset = Dataset.from_list(test_data)
train_dataset = Dataset.from_list(train_data)

# 2. Train Pegasus Model

In [None]:
%%capture captured_output
# Check if GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE: this cell runs under `%%capture captured_output`, so this message is
# stored in `captured_output` rather than displayed.
print(f"Training on: {device}")

# Load the pre-trained Pegasus tokenizer and model; the model is moved to the
# selected device.
tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-large")
model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-large").to(device)

## 2.1 Generated summaries with Pre-trained model

To compare the performance of the pre-trained Pegasus model with that of the fine-tuned Pegasus model, we first generate summaries with the pre-trained model and compute their ROUGE scores and BERTScore.

In [None]:
%%capture captured_output

test_dir = '/kaggle/input/legal-case-document-summarization/dataset/IN-Abs/test-data/judgement'
summary_dir = '/kaggle/input/legal-case-document-summarization/dataset/IN-Abs/test-data/summary'

pre_generated_summaries = []
reference_summaries = []
pre_cosine_similarities = []

# Walk the test judgements; each reference summary is stored under the same
# file name in `summary_dir`.
for filename in os.listdir(test_dir):
    # Guard clause: only .txt documents are processed
    if not filename.endswith('.txt'):
        continue

    # Judgement to summarise
    with open(os.path.join(test_dir, filename), 'r', encoding='utf-8') as file:
        test_document = file.read()

    # Matching reference summary
    with open(os.path.join(summary_dir, filename), 'r', encoding='utf-8') as ref_file:
        reference_summary = ref_file.read()
    reference_summaries.append(reference_summary)

    # Encode the document (truncated to 1024 tokens) and move tensors to the device
    inputs = tokenizer(test_document, return_tensors="pt", max_length=1024, truncation=True)
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    # Beam-search generation with the pre-trained Pegasus weights
    summary_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=150,
        min_length=40,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    pre_generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    pre_generated_summaries.append(pre_generated_summary)

    # Sentence-embedding cosine similarity between reference and generated summary
    ref_embedding = sentence_model.encode([reference_summary])[0]
    gen_embedding = sentence_model.encode([pre_generated_summary])[0]
    pre_cosine_similarities.append(
        cosine_similarity([ref_embedding], [gen_embedding])[0][0]
    )

In [None]:
# Calculate average cosine similarity
# (mean of the per-document similarities collected in the generation loop)
pre_average_cosine_similarity = np.mean(pre_cosine_similarities)
print(f"Average Cosine Similarity for Pre Trained Pegasus: {pre_average_cosine_similarity}")

# Calculate ROUGE score
# Predictions and references are parallel lists filled in the same loop iteration.
rouge_scores = rouge.compute(predictions=pre_generated_summaries, references=reference_summaries)
print(f"ROUGE scores for Pre Trained Pegasus: {rouge_scores}")

# Calculate BERTScore
# `score` returns three values (bound to P, R, F1); only the mean F1 is reported.
P, R, F1 = score(pre_generated_summaries, reference_summaries, lang='en', rescale_with_baseline=True)
avg_f1 = np.mean(F1.numpy())
print(f"BERTScore F1 for Pre Trained Pegasus: {avg_f1}")

## 2.2 Fine tune the model

In [None]:
# Tokenization
def preprocess_function(examples):
    """Tokenize a batch of judgement/summary pairs for seq2seq fine-tuning.

    Args:
        examples: A batch as passed by ``Dataset.map(batched=True)``, with
            "judgement" and "summary" columns.

    Returns:
        The tokenizer output for the judgements (truncated/padded to 1024
        tokens) with an added "labels" entry holding the summary token ids
        (truncated/padded to 256 tokens). Padding positions in the labels
        are set to -100.
    """
    model_inputs = tokenizer(examples["judgement"], max_length=1024, truncation=True, padding="max_length")
    # `text_target` replaces the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=examples["summary"], max_length=256, truncation=True, padding="max_length")

    # Labels equal to -100 are ignored by the loss. Without this, the model is
    # also trained to predict the padding that fills every summary up to 256
    # tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs

# Tokenize the datasets
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)

The training arguments are adapted from [this Kaggle notebook](https://www.kaggle.com/code/sathwikareddy28/casemain).

In [None]:
# Define Training Arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",   # Evaluate every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=1,  # Adjust batch size based on GPU memory
    per_device_eval_batch_size=1,   # Adjust batch size based on GPU memory
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    # No intermediate checkpoints: the final model is saved explicitly after
    # training. This states the intent of the former `save_steps=0` directly.
    save_strategy="no",
    report_to="none",  # Disable reports to WandB, etc.
    # Mixed precision is only supported on CUDA devices; fall back to full
    # precision on CPU instead of raising at construction time.
    fp16=torch.cuda.is_available(),
)
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    # NOTE(review): the per-epoch evaluation loss is computed on the test
    # split, since no separate validation set is loaded in this notebook.
    eval_dataset=tokenized_test_dataset,
)

In [None]:
# Start Training
# Runs the fine-tuning loop configured by `training_args` on the tokenized training set.
trainer.train()

# Save the fine-tuned model
# The tokenizer is written to the same directory as the model weights.
model.save_pretrained('./fine_tuned_pegasus_model')
tokenizer.save_pretrained('./fine_tuned_pegasus_model')

print("Training complete and model saved.")

# 3. Evaluation

In [None]:
# Check if GPU is available and set the device for the model
device = 'cuda' if torch.cuda.is_available() else 'cpu'
torch.cuda.empty_cache()

# Load the fine-tuned Pegasus model and its tokenizer for summary generation.
# Both come from the directory written after training (the tokenizer is saved
# there alongside the weights). This keeps the pair in sync, avoids a second
# hub download, and matches how the serving cell loads them.
fine_tuned_dir = "/kaggle/working/fine_tuned_pegasus_model"
model = PegasusForConditionalGeneration.from_pretrained(fine_tuned_dir).to(device)
tokenizer = PegasusTokenizer.from_pretrained(fine_tuned_dir)

In [None]:
%%capture captured_output

test_dir = '/kaggle/input/legal-case-document-summarization/dataset/IN-Abs/test-data/judgement'
summary_dir = '/kaggle/input/legal-case-document-summarization/dataset/IN-Abs/test-data/summary'

generated_summaries = []
reference_summaries = []
cosine_similarities = []

# Walk the test judgements; each reference summary is stored under the same
# file name in `summary_dir`.
for filename in os.listdir(test_dir):
    # Guard clause: only .txt documents are processed
    if not filename.endswith('.txt'):
        continue

    # Judgement to summarise
    with open(os.path.join(test_dir, filename), 'r', encoding='utf-8') as file:
        test_document = file.read()

    # Matching reference summary, recorded together with its file name
    with open(os.path.join(summary_dir, filename), 'r', encoding='utf-8') as ref_file:
        reference_summary = ref_file.read()
    reference_summaries.append({'id': filename, 'summary': reference_summary})

    # Encode the document (truncated to 1024 tokens) and move tensors to the device
    inputs = tokenizer(test_document, return_tensors="pt", max_length=1024, truncation=True)
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    # Beam-search generation with the fine-tuned Pegasus weights
    summary_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=150,
        min_length=40,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    generated_summaries.append({'id': filename, 'summary': generated_summary})

    # Sentence-embedding cosine similarity between reference and generated summary
    ref_embedding = sentence_model.encode([reference_summary])[0]
    gen_embedding = sentence_model.encode([generated_summary])[0]
    cosine_similarities.append(
        cosine_similarity([ref_embedding], [gen_embedding])[0][0]
    )

In [None]:
# Calculate average cosine similarity
# (mean of the per-document similarities collected in the generation loop)
average_cosine_similarity = np.mean(cosine_similarities)
print(f"Average Cosine Similarity for Pegasus: {average_cosine_similarity}")

# Extract summaries text
# Generated summaries become a flat list of strings; each reference is wrapped
# in its own single-element list.
plain_generated_summaries = [item['summary'] for item in generated_summaries]
plain_reference_summaries = [[item['summary']] for item in reference_summaries]

# Calculate ROUGE score
rouge_scores = rouge.compute(predictions=plain_generated_summaries, references=plain_reference_summaries)
print(f"ROUGE scores for Pegasus: {rouge_scores}")

# Calculate BERTScore
# `score` returns three values (bound to P, R, F1); only the mean F1 is reported.
P, R, F1 = score(plain_generated_summaries, plain_reference_summaries, lang='en', rescale_with_baseline=True)
avg_f1 = np.mean(F1.numpy())
print(f"BERTScore F1 for Pegasus: {avg_f1}")

After obtaining the ROUGE scores and BERTScore of the fine-tuned Pegasus model, we can see that they are higher than those of the pre-trained model. Thus, fine-tuning improved the model's performance.

# 4. Download generated summaries and reference summaries

Here, we export the summaries generated by the fine-tuned Pegasus model for a later classification task that distinguishes Pegasus-generated summaries from GPT-4t-generated summaries.

In [None]:
# Build one record per test file: generated and reference summaries are
# parallel lists, so zipping them pairs each prediction with its reference.
dataset = [
    {
        'id': gen['id'],  # File name as id
        'generated_summary': gen['summary'],
        'reference_summary': ref['summary'],
        'label': 'Pegasus',  # Constant label marking the summary's origin
    }
    for gen, ref in zip(generated_summaries, reference_summaries)
]

# Tabulate the records
df = pd.DataFrame(dataset)

# Persist to CSV without the index column
df.to_csv("summary_dataset.csv", index=False, encoding='utf-8')

# Preview the first rows
print(df.head())

In [None]:
# Example legal document text
document_text = """This Non-Disclosure Agreement (NDA) is made and entered into on February 21, 2025, by and between Alpha Innovations, a corporation duly organized under the laws of Texas, and Beta Solutions, a limited liability company registered in New York. The parties agree to maintain the confidentiality of proprietary information exchanged during discussions related to potential business collaboration"""

# Get the summary
# NOTE: `summarize_legal_document` is only defined in cells further down this
# notebook; one of them must have been executed before this cell, otherwise
# this call raises NameError.
summary = summarize_legal_document(document_text)

# Print the summary
print("Generated Summary:")
print(summary)


In [None]:
!ngrok config add-authtoken 2tM3DYgLA5nFdgtHTAeERq6sLJC_2Vsp1uCzyVVJc7qQLSNid
!pip install pyngrok
!pip install cors
!pip install flask-cors
import os
from flask import Flask, request, jsonify
from flask_cors import CORS
from transformers import pipeline
import os
import torch
from flask import Flask, request, jsonify
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from pyngrok import ngrok

# Pick the compute device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Prefer the locally saved fine-tuned checkpoint (model and tokenizer files);
# fall back to the base "google/pegasus-large" checkpoint when that directory
# does not exist.
model_path = "/kaggle/working/fine_tuned_pegasus_model"
checkpoint = model_path if os.path.exists(model_path) else "google/pegasus-large"
model = PegasusForConditionalGeneration.from_pretrained(checkpoint).to(device)
tokenizer = PegasusTokenizer.from_pretrained(checkpoint)

# Summarization function
def summarize_legal_document(document_text, max_length=150, min_length=40, num_beams=4):
    """Summarize one document with the loaded Pegasus model.

    Args:
        document_text: Text to summarize; it is truncated to 1024 tokens.
        max_length: Maximum length passed to ``model.generate``.
        min_length: Minimum length passed to ``model.generate``.
        num_beams: Beam width passed to ``model.generate``.

    Returns:
        The decoded summary string with special tokens removed.
    """
    encoded = tokenizer(document_text, return_tensors="pt", max_length=1024, truncation=True)
    generation_kwargs = {
        "attention_mask": encoded["attention_mask"].to(device),
        "max_length": max_length,
        "min_length": min_length,
        "length_penalty": 2.0,
        "num_beams": num_beams,
        "early_stopping": True,
    }
    output_ids = model.generate(encoded["input_ids"].to(device), **generation_kwargs)
    # Only the first (and only) sequence of the batch is decoded
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Initialize Flask app
app = Flask(__name__)
# NOTE(review): every origin is allowed on every route. Combined with a public
# tunnel this exposes the endpoint to any website — restrict `origins` if this
# is used beyond a demo.
CORS(app, resources={r"/*": {"origins": "*"}})

# Endpoint for summarizing plain text
@app.route('/summarize_text', methods=['POST'])
def summarize_text_api():
    """Summarize the ``document_text`` field of a JSON POST body.

    Returns:
        ``{"summary": ...}`` on success, or ``{"error": ...}`` with HTTP 400
        when the body is not a JSON object, lacks ``document_text``, or
        ``document_text`` is not a non-empty string.
    """
    # silent=True yields None for a missing/invalid JSON body instead of
    # aborting, so the explicit 400 response below is what the client gets.
    data = request.get_json(silent=True)
    # A JSON array or scalar would pass a plain truthiness check and then
    # fail on indexing; require an object.
    if not isinstance(data, dict) or "document_text" not in data:
        return jsonify({"error": "Please provide document_text in JSON payload"}), 400

    document_text = data["document_text"]
    if not isinstance(document_text, str) or not document_text.strip():
        return jsonify({"error": "document_text must be a non-empty string"}), 400

    summary = summarize_legal_document(document_text)
    return jsonify({"summary": summary})

if __name__ == '__main__':
    # Open an ngrok tunnel to local port 5000 and show its public URL
    public_url = ngrok.connect(5000).public_url
    print("Public URL:", public_url)
    # Serve the Flask app on all interfaces, on the same port the tunnel targets
    app.run(host='0.0.0.0', port=5000)


In [None]:
def summarize_legal_document(document_text, max_length=150, min_length=40, num_beams=4):
    """Return a Pegasus-generated summary of ``document_text``.

    The text is tokenized (truncated to 1024 tokens), moved to ``device``,
    summarized with beam search, and decoded without special tokens.
    """
    # Tokenize and place both tensors on the active device
    batch = tokenizer(document_text, return_tensors="pt", max_length=1024, truncation=True)
    ids_on_device = batch["input_ids"].to(device)
    mask_on_device = batch["attention_mask"].to(device)

    # Beam-search generation
    generated = model.generate(
        ids_on_device,
        attention_mask=mask_on_device,
        max_length=max_length,
        min_length=min_length,
        length_penalty=2.0,
        num_beams=num_beams,
        early_stopping=True,
    )

    # Decode the first sequence of the batch
    return tokenizer.decode(generated[0], skip_special_tokens=True)


In [None]:
!pip install pyngrok