In [1]:
!pip install transformers




[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
!pip install sentence-transformers



[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip




In [3]:
!pip install torch





[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
!pip install summa





[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip





In [5]:
!pip install numpy





[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
!pip install scikit-learn





[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [9]:
# Import necessary libraries
import os
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer
import torch
from summa import summarizer  # Using Summa for final summarization

# Model identifiers, named once so the tokenizer and the seq2seq model
# are always loaded from the same checkpoint.
SUMMARIZER_CHECKPOINT = "nsi319/legal-pegasus"
SENTENCE_EMBEDDING_MODEL = "average_word_embeddings_glove.6B.300d"

# Summarization model and its matching tokenizer.
tokenizer = AutoTokenizer.from_pretrained(SUMMARIZER_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(SUMMARIZER_CHECKPOINT)

# Sentence encoder used to embed lines of text for similarity computations.
sentence_model = SentenceTransformer(SENTENCE_EMBEDDING_MODEL)

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Function to preprocess text and split into chunks
def preprocess_and_chunk(text, chunk_size=1000, max_length=1024):
    """Split ``text`` into fixed-size character chunks and tokenize each one.

    Chunks are cut purely by character offset, so a boundary may fall in the
    middle of a word or sentence.

    Args:
        text: The document text to split.
        chunk_size: Number of characters per chunk (the last chunk may be
            shorter). Must be positive.
        max_length: Maximum number of tokens kept per chunk; the tokenizer
            truncates anything longer.

    Returns:
        A list with one ``tokenizer.encode(..., return_tensors='pt')`` result
        per chunk, in document order. Empty when ``text`` is empty.

    Raises:
        ValueError: If ``chunk_size`` is not positive. A negative step would
            otherwise silently produce no chunks and drop the whole document.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    return [
        tokenizer.encode(
            text[start:start + chunk_size],
            return_tensors='pt',
            max_length=max_length,
            truncation=True,
        )
        for start in range(0, len(text), chunk_size)
    ]

In [11]:
# Function to calculate mean cosine similarity
def calculate_mean_cosine_similarity(document_chunk, summary_vectors):
    """Return the mean cosine similarity between a chunk's lines and summary vectors.

    The chunk is decoded back to text, split on newlines, each non-blank line
    is embedded with ``sentence_model``, and every line embedding is compared
    against every row of ``summary_vectors``.

    Args:
        document_chunk: Tokenized chunk; only ``document_chunk[0]`` (the first
            sequence of token ids) is decoded.
        summary_vectors: Embeddings to compare against, in a form accepted by
            ``cosine_similarity`` as its second argument.

    Returns:
        The mean of the full pairwise similarity matrix.
    """
    # Skip tokenizer-specific special tokens so they are not embedded as if
    # they were part of the document text.
    chunk_text = tokenizer.decode(document_chunk[0], skip_special_tokens=True)

    # Blank lines carry no content and would only drag the mean down.
    lines = [line for line in chunk_text.split('\n') if line.strip()]
    if not lines:
        # Keep at least one entry so the encoder still receives an input.
        lines = [chunk_text]

    chunk_vectors = sentence_model.encode(lines)
    return np.mean(cosine_similarity(chunk_vectors, summary_vectors))


In [12]:
# Function to generate summaries for each chunk
def generate_chunk_summaries(document_chunks, summary_vectors):
    """Generate one summary string per tokenized document chunk.

    Args:
        document_chunks: Iterable of tokenized chunks, each passed as-is to
            ``model.generate``.
        summary_vectors: Accepted for interface compatibility; not used by
            this function.

    Returns:
        A list of decoded summaries, one per chunk, in input order.
    """
    # Beam-search settings shared by every chunk.
    generation_options = dict(
        num_beams=9,
        no_repeat_ngram_size=3,
        length_penalty=2.0,
        min_length=150,
        max_length=250,
        early_stopping=True,
    )

    summaries = []
    # Inference only: no gradients are needed for generation.
    with torch.no_grad():
        for token_ids in document_chunks:
            generated = model.generate(token_ids, **generation_options)
            # Only the first returned sequence is kept as the chunk's summary.
            first_summary = tokenizer.decode(
                generated[0],
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False,
            )
            summaries.append(first_summary)
    return summaries

In [None]:
# Function to generate and save combined summaries
def _find_summary_file(document_stem, summary_files):
    """Pick the summary file for a document: exact stem match first, else the first prefix match."""
    prefix_match = None
    for candidate in summary_files:
        if os.path.splitext(candidate)[0] == document_stem:
            return candidate
        if prefix_match is None and candidate.startswith(document_stem):
            prefix_match = candidate
    return prefix_match


def generate_and_save_combined_summaries(judgement_folder, summary_folder, output_folder):
    """Summarize every ``.txt`` judgement that has a matching summary file.

    For each ``.txt`` file in ``judgement_folder`` the matching file in
    ``summary_folder`` is located (same stem preferred, otherwise the first
    name starting with the stem). The judgement is chunked, each chunk is
    summarized, the chunk summaries are joined and condensed with
    ``summarizer.summarize``, and the result is written to ``output_folder``
    under the judgement's file name.

    Args:
        judgement_folder: Directory containing the judgement ``.txt`` files.
        summary_folder: Directory containing the reference summary files.
        output_folder: Directory the final summaries are written to; created
            if it does not exist.

    Returns:
        None. Judgements without a matching summary are reported via ``print``
        and skipped.
    """
    # Writing into a missing directory would fail on the first document.
    os.makedirs(output_folder, exist_ok=True)

    # List once, sorted, so matching is deterministic across platforms.
    summary_files = sorted(os.listdir(summary_folder))

    for file_name in sorted(os.listdir(judgement_folder)):
        if not file_name.endswith(".txt"):
            continue

        # Strip only the trailing extension, not every ".txt" in the name.
        document_stem = file_name[:-len(".txt")]
        summary_file_name = _find_summary_file(document_stem, summary_files)
        if summary_file_name is None:
            print(f"No summary found for {file_name}")
            continue

        with open(os.path.join(judgement_folder, file_name), 'r', encoding='utf-8') as f:
            document_text = f.read()
        with open(os.path.join(summary_folder, summary_file_name), 'r', encoding='utf-8') as f:
            summary_text = f.read()

        document_chunks = preprocess_and_chunk(document_text)
        summary_vectors = sentence_model.encode(summary_text.split('\n'))
        chunk_summaries = generate_chunk_summaries(document_chunks, summary_vectors)

        # Combine the per-chunk results into one labelled text.
        combined_summary = "\n".join(
            f"Chunk {i+1}: Summary - {summary}" for i, summary in enumerate(chunk_summaries)
        )

        # Condense all chunk summaries; adjust the ratio as needed.
        final_summary = summarizer.summarize(combined_summary, ratio=0.3)

        # Save the final summary under the judgement's own file name.
        with open(os.path.join(output_folder, file_name), 'w', encoding='utf-8') as f:
            f.write(final_summary)

# Input and output locations, given as paths relative to the working directory.
judgement_folder, summary_folder, output_folder = "judgement", "summary", "New_summary"

# Summarize every judgement that has a matching summary and write the results to disk.
generate_and_save_combined_summaries(
    judgement_folder=judgement_folder,
    summary_folder=summary_folder,
    output_folder=output_folder,
)