In [2]:
import os
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer
import torch
from summa import summarizer

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
import os
import nltk
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
nltk.download('punkt')




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [11]:
# Function to read text files from a directory
def read_files_from_directory(directory):
    files = []
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
                files.append((filename, file.read()))
    return files

# Function to chunk text into smaller parts
def chunk_text(text, chunk_size=1000):
    chunks = []
    words = nltk.word_tokenize(text)
    for i in range(0, len(words), chunk_size):
        chunks.append(' '.join(words[i:i+chunk_size]))
    return chunks

# Function to calculate Mean Cosine Similarity (MCS)
def mean_cosine_similarity(reference_sentence, sentences):
    vectorizer = CountVectorizer().fit_transform([reference_sentence] + sentences)
    vectors = vectorizer.toarray()
    return np.mean(cosine_similarity(vectors)[0, 1:])


In [19]:

# Directory paths
judgment_directory = 'judgement'
summary_directory = 'summary'

# Read judgments and summaries
judgments = read_files_from_directory(judgment_directory)[:50]
summaries = read_files_from_directory(summary_directory)[:50]
# Step 1: Chunking and Summarization
judgment_chunks = []
summary_chunks = []

# Iterate through judgment files
for judgment_filename, judgment_text in judgments:
    # Chunking text
    chunks = chunk_text(judgment_text)
    for i, chunk in enumerate(chunks):
        # Calculate MCS with corresponding summary
        print(i)
        summary_filename, summary_text = next(filter(lambda x: x[0] == judgment_filename, summaries), (None, None))
        if summary_text:
            summary_sentences = nltk.sent_tokenize(summary_text)
            chunk_sentences = nltk.sent_tokenize(chunk)
            combined_sentences = []
            for reference_sentence in summary_sentences:
                similarity_scores = [(mean_cosine_similarity(reference_sentence, chunk_sentences), sentence) for sentence in chunk_sentences]
                most_similar_sentence = max(similarity_scores)[1]
                combined_sentences.append(most_similar_sentence)
            generated_summary = ' '.join(combined_sentences)
        else:
            generated_summary = "No summary available"
        # Save judgment chunk and its corresponding summary chunk
        judgment_chunks.append((judgment_filename, i, chunk))
        summary_chunks.append((judgment_filename, i, generated_summary))


0
1
2
3
0
1
2
0
1
2
3
0
1
2
3
0
1
2
0
1
2
0
1
2
3
4
0
1
2
0
1
2
0
1
2
3
4
5
0
1
2
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
0
1
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
0
1
2
3
4
0
1
2
3
4
0
1
2
0
1
2
3
4
0
1
2
3
4
0
1
2
0
1
2
3
4
5
0
1
2
3
0
1
2
0
1
2
3
0
1
2
3
4
5
6
7
0
1
2
0
1
2
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
0
1
2
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
0
1
2
0
1
2
0
1
2
0
1
2
0
1
0
1
2
0
1
0
1
0
1
0
1
2
3
4
5
6
0
1
2
0
1
2
0
1
2
3
4
5
6
0
1
2
0
1
0
1
2
3
0
1
2
3
4
5
6
0
1
2
0
1
2
0
1
2


In [20]:
# Step 2: Saving Chunks
def save_chunks(chunks, folder):
    if not os.path.exists(folder):
        os.makedirs(folder)
    for filename, chunk_index, chunk_text in chunks:
        filename = os.path.join(folder, f'{filename}_chunk_{chunk_index}.txt')
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(chunk_text)

# Save judgment chunks
save_chunks(judgment_chunks, 'judgment_chunk')

# Save summary chunks
save_chunks(summary_chunks, 'summary_chunk')

In [33]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import Dataset

# Load LegalPegasus tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("nsi319/legal-pegasus")
model = AutoModelForSeq2SeqLM.from_pretrained("nsi319/legal-pegasus")

# Define function to read text files from directory
def read_files_from_directory(directory):
    files = []
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
                files.append(file.read())
    return files

# Read input texts and target texts from judgment_chunk and summary_chunk folders
input_texts = read_files_from_directory('judgment_chunk')
target_texts = read_files_from_directory('summary_chunk')

# Prepare data
train_dataset = Dataset.from_dict({
    'input_text': input_texts,
    'target_text': target_texts
})

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=1000,
    save_steps=500,
    save_total_limit=2,
    overwrite_output_dir=True,
    report_to="tensorboard",
)

# Initialize trainer
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
trainer.save_model("fine_tuned_legal_pegasus")


TypeError: expected string or bytes-like object

In [1]:
import os
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments

# Function to read text files from a directory
def read_files_from_directory(directory):
    texts = []
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
                texts.append(file.read())
    return texts

# Load LegalPegasus tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("nsi319/legal-pegasus")
model = AutoModelForSeq2SeqLM.from_pretrained("nsi319/legal-pegasus")

# Read input texts from judgment_chunk folder
input_texts = read_files_from_directory("judgment_chunk")

# Read target texts from summary_chunk folder
target_texts = read_files_from_directory("summary_chunk")

# Make sure the number of input and target texts match
assert len(input_texts) == len(target_texts)

# Prepare training data
train_dataset = list(zip(input_texts, target_texts))

# Configure training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir='./output',  # Specify the output directory
    predict_with_generate=True,
    evaluation_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=1000,
    save_steps=500,
    save_total_limit=2,
    overwrite_output_dir=True,
    report_to="tensorboard",
)

# Initialize trainer
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
trainer.save_model("fine_tuned_legal_pegasus")


  from .autonotebook import tqdm as notebook_tqdm


RuntimeError: Failed to import transformers.trainer_seq2seq because of the following error (look up to see its traceback):
expected string or bytes-like object

In [3]:
import os

judgement_folder1 = os.path.join(os.getcwd(), "judgment_chunk")
summary_folder1 = os.path.join(os.getcwd(), "summary_chunk")

judgements1 = []
summaries1 = []

for filename in os.listdir(judgement_folder1):
    with open(os.path.join(judgement_folder1, filename), 'r', encoding='utf-8') as file:
        judgements1.append(file.read())
    
    with open(os.path.join(summary_folder1, filename), 'r', encoding='utf-8') as file:
        summaries1.append(file.read())

In [4]:
judgements1

["Appeal No . LXVI of 1949 . Appeal from the High Court of judicature , Bombay , in a reference under section 66 of the Indian Income tax Act , 1022 . K.M . Munshi ( N. P. Nathvani , with him ) , for the appel lant . ' M.C . Setalvad , Attorney General for India ( H. J. Umrigar , with him ) , for the respondent . 1950 . May 26 . The judgment of the Court was delivered by MEHR CHAND MAHAJAN J . This is an appeal against a judgment of the High Court of Judicature at Bombay in an income tax matter and it raises the question whether munici pal property tax and urban immoveable property tax payable under the relevant Bombay Acts are allowable deductions under section 9 ( 1 ) ( iv ) of the Indian Income tax Act . The assessee company is an investment company deriving its income from properties in the city of Bombay . For the assessment year 1940 41 the net income of the assessee under the head `` property '' was computed by the Income tax Officer in the sum of Rs . 6,21,764 after deducting f

In [5]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from torch.utils.data import DataLoader, Dataset
from torch.optim import Adam

class LegalDataset(Dataset):
    def __init__(self, judgements1, summaries1, tokenizer, max_length=1024):
        self.judgements1 = judgements1
        self.summaries1 = summaries1
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.judgements1)

    def __getitem__(self, idx):
        input_text = self.judgements1[idx]
        target_text = self.summaries1[idx]

        encoding = self.tokenizer(input_text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors="pt")
        labels = self.tokenizer(target_text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors="pt").input_ids

        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': labels.squeeze()
        }

# Load pre-trained model and tokenizer
tokenizer = PegasusTokenizer.from_pretrained('nsi319/legal-pegasus')
model = PegasusForConditionalGeneration.from_pretrained('nsi319/legal-pegasus')

# Create an optimizer
optimizer = Adam(model.parameters(), lr=1e-4)

# Instantiate the dataset and dataloader
dataset = LegalDataset(judgements1, summaries1, tokenizer)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Fine-tune the model using your legal dataset
for epoch in range(1):
    for batch in dataloader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

# Save the fine-tuned model
model.save_pretrained('fine_tuned_legal_summarizer')


: 