In [1]:
!pip install transformers



In [1]:
import os
import torch

print("CUDA Available:", torch.cuda.is_available())
print("Number of CUDA devices:", torch.cuda.device_count())

print("Number of CUDA devices:", torch.cuda.device_count())
for i in range(torch.cuda.device_count()):
    print("CUDA Device {}: {}".format(i, torch.cuda.get_device_name(i)))

# Check for the availability of CUDA-enabled GPU
if torch.cuda.is_available():
    # Set device to GPU
    torch.cuda.set_device(0)
    device = torch.device("cuda")
    print("Switched to GPU")
else:
    # Use CPU if no GPU is available
    device = torch.device("cpu")
    print("No GPU available, using CPU")

print("Using device:", device)


CUDA Available: True
Number of CUDA devices: 1
Number of CUDA devices: 1
CUDA Device 0: NVIDIA GeForce RTX 3060 Laptop GPU
Switched to GPU
Using device: cuda


In [4]:
!pip install sentence-transformers




In [5]:
!pip install torch




In [7]:
!pip install summa




In [6]:
!pip install numpy




In [8]:
!pip install scikit-learn




In [55]:
import os
import re
from transformers import PegasusTokenizer

# Initialize Legal-PEGASUS tokenizer
tokenizer = PegasusTokenizer.from_pretrained("nsi319/legal-pegasus")

def preprocess_text(text):
    # Convert text to lowercase and remove unnecessary characters or metadata
    cleaned_text = text.lower()
    cleaned_text = re.sub(r'\n+', ' ', cleaned_text)  # Remove extra newlines
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)  # Remove extra whitespaces
    return cleaned_text.strip()


def tokenize_text(text):
    # Tokenize text using Legal-PEGASUS tokenizer
    tokens = tokenizer.tokenize(text)
    return tokens

def load_dataset(folder_path):
    dataset = []
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
            text = preprocess_text(text)
            tokens = tokenize_text(text)
            dataset.append((tokens, folder_path.split("/")[-1]))  # Pairing text with its folder name (judgment/summary)
    return dataset

# Example usage
judgment_dir = "judgment"  # Replace this with the actual path to your judgment folder
summary_dir = "summary"    # Replace this with the actual path to your summary folder

judgment_dataset = load_dataset(judgment_dir)
summary_dataset = load_dataset(summary_dir)

print("Judgment dataset size:", len(judgment_dataset))
print("Summary dataset size:", len(summary_dataset))


Judgment dataset size: 7030
Summary dataset size: 7030


In [57]:
import os
import re
def chunk_document_flexible(text, max_chunk_size):
    chunks = []
    current_chunk = []
    current_size = 0

    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)  # Split text into sentences
    for sentence in sentences:
        sentence_size = len(sentence.split())
        if current_size + sentence_size > max_chunk_size and current_chunk:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_size = 0
        current_chunk.append(sentence)
        current_size += sentence_size
    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

def process_judgment_files(input_folder, output_folder, max_chunk_size):
    os.makedirs(output_folder, exist_ok=True)
    for file_name in os.listdir(input_folder):
        input_file_path = os.path.join(input_folder, file_name)
        output_base_name = os.path.splitext(file_name)[0] + "_chunk"
        with open(input_file_path, 'r', encoding='utf-8') as input_file:
            text = input_file.read()
            text = preprocess_text(text)
            chunks = chunk_document_flexible(text, max_chunk_size)
            for i, chunk in enumerate(chunks, start=1):
                output_file_path = os.path.join(output_folder, f"{output_base_name}{i}.txt")
                with open(output_file_path, 'w', encoding='utf-8') as output_file:
                    output_file.write(chunk)


In [59]:
judgment_input_folder = "judgment"  
judgment_output_folder = "judgment_chunk" 
max_chunk_size = 950  

process_judgment_files(judgment_input_folder, judgment_output_folder, max_chunk_size)

In [26]:
import os
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

def get_embedding(text):
    input_ids = tokenizer.encode(text, return_tensors="pt", max_length=1024, truncation=True)
    with torch.no_grad():
        output = model(input_ids)
    return output.last_hidden_state.mean(dim=1).squeeze().numpy()

def calculate_mean_cosine_similarity(judgment_embedding, summary_embedding):
    similarities = cosine_similarity(judgment_embedding.reshape(1, -1), summary_embedding)
    mean_similarity = np.mean(similarities)
    return mean_similarity

def generate_summary(judgment_chunk, summary_file):
    judgment_embedding = get_embedding(judgment_chunk)
    
    with open(summary_file, 'r', encoding='utf-8') as file:
        summary_text = file.read()
        summary_text = preprocess_text(summary_text)
        summary_embedding = get_embedding(summary_text)
    
    similarity = calculate_mean_cosine_similarity(judgment_embedding, summary_embedding)
    return summary_text if similarity > 0 else ""

def process_judgment_chunks(judgment_chunk_folder, summary_folder, output_folder):
    os.makedirs(output_folder, exist_ok=True)
    for chunk_file_name in os.listdir(judgment_chunk_folder):
        judgment_chunk_path = os.path.join(judgment_chunk_folder, chunk_file_name)
        summary_file_name = chunk_file_name.split("_")[0] + ".txt"
        summary_file_path = os.path.join(summary_folder, summary_file_name)
        output_file_path = os.path.join(output_folder, chunk_file_name.replace("_chunk", "_chunksum"))
        
        with open(judgment_chunk_path, 'r', encoding='utf-8') as judgment_chunk_file, \
             open(output_file_path, 'w', encoding='utf-8') as output_file:
            judgment_chunk = judgment_chunk_file.read()
            summary = generate_summary(judgment_chunk, summary_file_path)
            output_file.write(summary)

In [100]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Function to preprocess text
def preprocess_text(text):
    cleaned_text = text.lower()  # Convert text to lowercase
    cleaned_text = re.sub(r'\n+', ' ', cleaned_text)  # Remove extra newlines
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)  # Remove extra whitespaces
    return cleaned_text.strip()

   

In [44]:
# Function to process judgment chunks
def process_specified_chunks(judgment_chunk_folder, summary_folder, output_folder, chunk_names):
    os.makedirs(output_folder, exist_ok=True)
    for chunk_name in chunk_names:
        judgment_chunk_file = os.path.join(judgment_chunk_folder, f"{chunk_name}.txt")
        summary_file = os.path.join(summary_folder, "1.txt")  # Assuming all chunks correspond to 1.txt summary
        
        output_file_path = os.path.join(output_folder, f"{chunk_name.replace('_chunk', '_chunksum')}.txt")
        
        with open(judgment_chunk_file, 'r', encoding='utf-8') as file:
            judgment_chunk = file.read()
            summary = generate_summary(judgment_chunk, summary_file)
            with open(output_file_path, 'w', encoding='utf-8') as output_file:
                output_file.write(summary)

# Example usage
judgment_chunk_folder = "judgment_chunk"  # Replace this with the actual path to your judgment chunk folder
summary_folder = "summary"  # Replace this with the actual path to your summary folder
summary_chunk_folder = "summary_chunk"  # Replace this with the desired output path for chunked summaries
chunk_names = ["1_chunk1", "1_chunk2", "1_chunk3", "1_chunk4"]  # Specify the chunk names to be processed

process_specified_chunks(judgment_chunk_folder, summary_folder, summary_chunk_folder, chunk_names)


In [60]:

import os
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer
import torch
from summa import summarizer  # Using Summa for final summarization

# Initialize the models
tokenizer = AutoTokenizer.from_pretrained("nsi319/legal-pegasus")
model = AutoModelForSeq2SeqLM.from_pretrained("nsi319/legal-pegasus")
sentence_model = SentenceTransformer("average_word_embeddings_glove.6B.300d")

In [77]:
def calculate_mean_cosine_similarity(document_chunk, summary_vectors):
    chunk_vector = sentence_model.encode(document_chunk)
    # Reshape summary vectors to match the shape of chunk vector
    summary_vectors = summary_vectors.reshape(len(summary_vectors), -1)
    similarities = cosine_similarity(chunk_vector, summary_vectors)
    return np.mean(similarities)


In [90]:
# Function to generate summaries for each chunk
# Function to generate summaries for each chunk
def generate_chunk_summaries(document_chunks, summary_vectors, model, tokenizer):
    chunk_summaries = []
    for chunk in document_chunks:
        # Encode the chunk text
        inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            # Generate summary
            summary_ids = model.generate(inputs.input_ids, num_beams=4, min_length=10, max_length=150, early_stopping=True)
            # Decode summary
            summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
            chunk_summaries.append(summary)
    return chunk_summaries


In [135]:
import os
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Function to calculate mean cosine similarity
def calculate_mean_cosine_similarity(document_chunk, summary_vectors):
    chunk_vector = sentence_model.encode([document_chunk])[0]
    summary_vectors = np.vstack(summary_vectors)
    similarities = cosine_similarity(chunk_vector.reshape(1, -1), summary_vectors)
    mean_similarity = np.mean(similarities)
    return mean_similarity

# Function to generate and save chunk summaries separately
def generate_and_save_chunk_summaries(judgment_chunk_folder, summary_folder, output_folder):
    for chunk_file_name in os.listdir(judgment_chunk_folder):
        if chunk_file_name.endswith(".txt"):
            with open(os.path.join(judgment_chunk_folder, chunk_file_name), 'r', encoding='utf-8') as f:
                document_text = f.read()

            # Find the corresponding summary file
            summary_file_name = "1.txt"  # Assuming all summaries are in the same file for this example
            summary_file_path = os.path.join(summary_folder, summary_file_name)

            if os.path.exists(summary_file_path):
                with open(summary_file_path, 'r', encoding='utf-8') as f:
                    summary_text = f.read()

                # Preprocess text
                document_chunks = document_text.split('\n')
                summary_vectors = sentence_model.encode(summary_text.split('\n'))

                # Generate summaries for each chunk
                chunk_summaries = []
                for i, chunk in enumerate(document_chunks, start=1):
                    similarity = calculate_mean_cosine_similarity(chunk, summary_vectors)
                    if similarity > 0:  # If similarity is found
                        # Summarize the chunk
                        chunk_summary = summarizer.summarize(chunk)
                        chunk_summaries.append(chunk_summary)
                    else:
                        chunk_summaries.append("")  # Empty summary if no similarity is found

                # Save each chunk summary separately
                for i, summary in enumerate(chunk_summaries, start=1):
                    chunk_summary_file_name = f"{os.path.splitext(chunk_file_name)[0]}_sum{i}.txt"
                    with open(os.path.join(output_folder, chunk_summary_file_name), 'w', encoding='utf-8') as f:
                        f.write(summary)
            else:
                print(f"No summary found for {chunk_file_name}")

# Paths
judgment_chunk_folder = "judgment_chunk"  # Folder containing judgment chunks
summary_folder = "summary"  # Folder containing summary files
output_folder = "summary_chunk_final"  # Output folder to store chunk summaries separately

# Create the output directory if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Call the function
generate_and_save_chunk_summaries(judgment_chunk_folder, summary_folder, output_folder)


In [19]:
import os

def count_files_in_folder(folder_path):
    if not os.path.isdir(folder_path):
        return 0
    return len([name for name in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, name))])

judgment_chunk_path = "judgment_chunk"
summary_chunk_final_path = "summary_chunk_final"

judgment_chunk_count = count_files_in_folder(judgment_chunk_path)
summary_chunk_final_count = count_files_in_folder(summary_chunk_final_path)

total_files_count = judgment_chunk_count + summary_chunk_final_count

print("Total number of files in judgment_chunk:", judgment_chunk_count)
print("Total number of files in summary_chunk_final:", summary_chunk_final_count)
print("Total number of files in both folders:", total_files_count)


Total number of files in judgment_chunk: 36469
Total number of files in summary_chunk_final: 36469
Total number of files in both folders: 72938


In [124]:
import os

def rename_files_in_folder(folder_path, suffix):
    if not os.path.isdir(folder_path):
        print(f"The folder '{folder_path}' does not exist.")
        return
    
    processed_folder_path = "processed_" + os.path.basename(folder_path)
    processed_folder_full_path = os.path.join(os.path.dirname(folder_path), processed_folder_path)
    os.makedirs(processed_folder_full_path, exist_ok=True)

    # Iterate over files in the folder
    for filename in os.listdir(folder_path):
        if filename.endswith(suffix):
            # Extract the prefix and chunk number from the filename
            prefix, chunk_number = filename.split('_')[0], filename.split('_')[1]
            # Construct the new filename
            new_filename = f"{prefix}_{chunk_number}.{suffix}"
            # Rename the file
            os.rename(os.path.join(folder_path, filename), os.path.join(processed_folder_full_path, new_filename))
            print(f"Renamed '{filename}' to '{new_filename}'")

summary_chunk_final_path = "summary_chunk_final"
suffix = "sum1.txt"

rename_files_in_folder(summary_chunk_final_path, suffix)


In [137]:
import os
import shutil

# Source and destination directories
source_folder = "summary_chunk_final"
destination_folder = "summary_chunk"

# Create the destination folder if it doesn't exist
os.makedirs(destination_folder, exist_ok=True)

# Iterate over files in the source folder
for filename in os.listdir(source_folder):
    source_file_path = os.path.join(source_folder, filename)
    destination_file_path = os.path.join(destination_folder, filename)
    # Copy the file to the destination folder
    shutil.copy(source_file_path, destination_file_path)

print("Files copied successfully!")


Files copied successfully!


In [17]:
import pandas as pd
import os

# Path to the folders
judgment_chunk_path = "judgment_chunk"
summary_chunk_final_path = "summary_chunk_final"

# List all files in the folders
judgment_files = sorted(os.listdir(judgment_chunk_path))
summary_files = sorted(os.listdir(summary_chunk_final_path))

# Initialize lists to store file contents and names
judgment_contents = []
summary_contents = []
judgment_filenames = []
summary_filenames = []

# Read contents of judgment files and save file names
for file in judgment_files:
    with open(os.path.join(judgment_chunk_path, file), 'r') as f:
        judgment_contents.append(f.read())
    judgment_filenames.append(file)

# Read contents of summary files and save file names
for file in summary_files:
    with open(os.path.join(summary_chunk_final_path, file), 'r') as f:
        summary_contents.append(f.read())
    summary_filenames.append(file)

# Create DataFrame
data = {
    
    'judgment': judgment_contents,
    'summary_filename': summary_filenames,
    'judgment_filename': judgment_filenames,
    'summary': summary_contents
}
df = pd.DataFrame(data)

# Save DataFrame to CSV
df.to_csv('output.csv', index=False)


In [18]:
df

Unnamed: 0,judgment,summary_filename,judgment_filename,summary
0,appeal no. 388 of 1960. appeal by special leav...,1000_chunk1_sum1.txt,1000_chunk1.txt,"on february 1, 1957, the nomination paper file..."
1,"section 7(d), as it stood at the material time...",1000_chunk2_sum1.txt,1000_chunk2.txt,"section 7(d), as it stood at the material time..."
2,"varma, however, contends that the service unde...",1000_chunk3_sum1.txt,1000_chunk3.txt,"varma, however, contends that the service unde..."
3,in that case the appellant who was a contracto...,1000_chunk4_sum1.txt,1000_chunk4.txt,"""it cannot be gainsaid"", observed sinha, j., a..."
4,appeal no. 198 of 1954. appeal from the judgme...,1001_chunk1_sum1.txt,1001_chunk1.txt,this is an appeal from the judgment of the nag...
...,...,...,...,...
36464,the learned advocate for the appellant therefo...,9_chunk2_sum1.txt,9_chunk2.txt,that the judgment debtor respondent suppressed...
36465,"in the insolvency court, he set up the plea th...",9_chunk3_sum1.txt,9_chunk3.txt,(2) nothing in this section shall be deemed (a...
36466,having thus got over the difficulty in his way...,9_chunk4_sum1.txt,9_chunk4.txt,if the facts proved and found as established a...
36467,concealing from a person the knowledge of his ...,9_chunk5_sum1.txt,9_chunk5.txt,the decree holder must have been taking steps ...


In [20]:
# Assuming you have already created the DataFrame 'df'
row = df[df['summary_filename'] == '1_chunk4_sum1.txt']

# Printing the row
print(row)


                                               judgment   summary_filename  \
6108  it is elementary that the primary duty of a co...  1_chunk4_sum1.txt   

     judgment_filename                                            summary  
6108      1_chunk4.txt  in the above case in arriving at his conclusio...  


In [23]:
# Assuming you have already created the DataFrame 'df'
df1 = df.drop(columns=['summary_filename', 'judgment_filename'])

# Print the modified DataFrame
df1


Unnamed: 0,judgment,summary
0,appeal no. 388 of 1960. appeal by special leav...,"on february 1, 1957, the nomination paper file..."
1,"section 7(d), as it stood at the material time...","section 7(d), as it stood at the material time..."
2,"varma, however, contends that the service unde...","varma, however, contends that the service unde..."
3,in that case the appellant who was a contracto...,"""it cannot be gainsaid"", observed sinha, j., a..."
4,appeal no. 198 of 1954. appeal from the judgme...,this is an appeal from the judgment of the nag...
...,...,...
36464,the learned advocate for the appellant therefo...,that the judgment debtor respondent suppressed...
36465,"in the insolvency court, he set up the plea th...",(2) nothing in this section shall be deemed (a...
36466,having thus got over the difficulty in his way...,if the facts proved and found as established a...
36467,concealing from a person the knowledge of his ...,the decree holder must have been taking steps ...


In [31]:
file_path = 'final_dataframe.csv'

# Save the DataFrame to a CSV file
df.to_csv(file_path, index=False)


In [27]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, AdamW
import torch
import pandas as pd
import time

# Load Legal PEGASUS tokenizer
tokenizer = PegasusTokenizer.from_pretrained("nsi319/legal-pegasus")

# Prepare data
judgments = df1['judgment'].tolist()
summaries = df1['summary'].tolist()

# Tokenize and encode data
input_ids = tokenizer.prepare_seq2seq_batch(judgments, summaries, truncation=True, padding=True, return_tensors='pt')

# Load Legal PEGASUS model
model = PegasusForConditionalGeneration.from_pretrained("nsi319/legal-pegasus")

# Define training arguments




In [30]:
num_train_epochs = 2
learning_rate = 5e-5
optimizer = AdamW(model.parameters(), lr=learning_rate)

# Start time
start_time = time.time()

# Training loop
for epoch in range(num_train_epochs):
    # Train the model
    model.train()
    optimizer.zero_grad()
    outputs = model(**input_ids)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    
    # Print elapsed time
    elapsed_time = time.time() - start_time
    print(f"Epoch {epoch + 1}/{num_train_epochs} - Elapsed Time: {elapsed_time:.2f} seconds")

# Save the fine-tuned model
model.save_pretrained("./fine_tuned_nsi319_legal_pegasus")


RuntimeError: [enforce fail at alloc_cpu.cpp:114] data. DefaultCPUAllocator: not enough memory: you tried to allocate 152962072576 bytes.