In [2]:
import os
import shutil

import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM

# Directory paths for the training corpus: `source_dir` holds the judgement
# texts and `target_dir` the summaries. load_data_from_txt pairs the files of
# the two directories by identical filename.
# NOTE(review): these are absolute paths on a local D: drive - adjust them
# before running the notebook on another machine.
source_dir = r"D:/Sem-5/IMD/Project/LawSage.AI/static/dataset/IN-Abs/train-data/judgement"
target_dir = r"D:/Sem-5/IMD/Project/LawSage.AI/static/dataset/IN-Abs/train-data/summary"

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Function to read txt files from the directories
def load_data_from_txt(source_dir, target_dir):
    """Read paired judgement/summary text files from two directories.

    Every regular file in ``source_dir`` is paired with the file of the same
    name in ``target_dir``. Files are visited in sorted filename order so the
    resulting dataset order is reproducible across runs and platforms.

    Args:
        source_dir: Directory containing the judgement (source) files.
        target_dir: Directory containing the summary (target) files.

    Returns:
        A ``(source_texts, target_texts)`` tuple of equally long lists, where
        ``target_texts[i]`` is the summary for ``source_texts[i]``.

    Raises:
        FileNotFoundError: If a source file has no same-named file in
            ``target_dir`` (raised by ``open``).
    """
    source_texts = []
    target_texts = []

    # os.listdir returns entries in arbitrary order; sort for determinism.
    for filename in sorted(os.listdir(source_dir)):
        source_file_path = os.path.join(source_dir, filename)
        target_file_path = os.path.join(target_dir, filename)  # Same filename in both directories

        # Skip subdirectories and other non-file entries instead of crashing in open().
        if not os.path.isfile(source_file_path):
            continue

        # Read source file (judgement)
        with open(source_file_path, 'r', encoding='utf-8') as file:
            source_text = file.read()

        # Read target file (summary)
        with open(target_file_path, 'r', encoding='utf-8') as file:
            target_text = file.read()

        # Append only after both reads succeeded so the lists stay aligned.
        source_texts.append(source_text)
        target_texts.append(target_text)

    return source_texts, target_texts

# Load every judgement/summary pair into memory: train_texts receives the
# judgement documents (model inputs) and train_labels the matching summaries.
train_texts, train_labels = load_data_from_txt(source_dir, target_dir)

In [8]:
# Pegasus dataset class
class PegasusDataset(torch.utils.data.Dataset):
    """Map-style dataset exposing tokenized inputs together with their labels.

    ``encodings`` is a mapping from field name to an indexable collection of
    per-example values; ``labels`` is a mapping that provides the target token
    ids under the ``'input_ids'`` key.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of examples is defined by the number of target sequences.
        target_ids = self.labels['input_ids']
        return len(target_ids)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including ``'labels'``."""
        sample = {}
        for field, rows in self.encodings.items():
            sample[field] = torch.tensor(rows[idx])
        # The target token ids are exposed under the 'labels' key.
        sample['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return sample

In [9]:
# Prepare data for tokenization and model
def prepare_data(model_name, train_texts, train_labels, val_texts=None, val_labels=None, test_texts=None, test_labels=None):
    """Tokenize text/summary pairs and wrap them in ``PegasusDataset`` objects.

    Args:
        model_name: Checkpoint name or path passed to
            ``AutoTokenizer.from_pretrained``.
        train_texts: Source documents for training.
        train_labels: Target summaries for training.
        val_texts, val_labels: Optional validation pairs.
        test_texts, test_labels: Optional test pairs.

    Returns:
        ``(train_dataset, val_dataset, test_dataset, tokenizer)``. The
        validation/test dataset is ``None`` when its texts or labels are
        missing or empty.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    pad_token_id = tokenizer.pad_token_id
    # Label value ignored by the cross-entropy loss of Hugging Face seq2seq models.
    ignore_index = -100

    def tokenize_data(texts, labels):
        # Sources are truncated to 512 tokens, targets to 256; both are padded
        # to the longest sequence of their own collection.
        encodings = tokenizer(texts, truncation=True, padding=True, max_length=512)
        decodings = tokenizer(labels, truncation=True, padding=True, max_length=256)

        # Padding positions in the targets must not contribute to the loss:
        # replace the pad id with the ignore index before the ids become labels.
        if pad_token_id is not None:
            decodings['input_ids'] = [
                [ignore_index if token_id == pad_token_id else token_id for token_id in sequence]
                for sequence in decodings['input_ids']
            ]
        return PegasusDataset(encodings, decodings)

    train_dataset = tokenize_data(train_texts, train_labels)
    val_dataset = tokenize_data(val_texts, val_labels) if val_texts and val_labels else None
    test_dataset = tokenize_data(test_texts, test_labels) if test_texts and test_labels else None

    return train_dataset, val_dataset, test_dataset, tokenizer

In [10]:
# Prepare fine-tuning function
def prepare_fine_tuning(model_name, tokenizer, train_dataset, val_dataset=None, freeze_encoder=False, output_dir='./results'):
    """Build a ``Trainer`` for fine-tuning a Pegasus checkpoint.

    Args:
        model_name: Checkpoint name or path passed to
            ``PegasusForConditionalGeneration.from_pretrained``.
        tokenizer: Tokenizer handed to the ``Trainer``.
        train_dataset: Dataset used for training.
        val_dataset: Optional dataset passed as ``eval_dataset``.
        freeze_encoder: When true, ``requires_grad`` is switched off for every
            encoder parameter so only the rest of the model is updated.
        output_dir: Value of ``TrainingArguments.output_dir``.

    Returns:
        The configured (not yet started) ``Trainer``.
    """
    # Use the GPU when one is visible to torch, otherwise the CPU.
    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'

    seq2seq_model = PegasusForConditionalGeneration.from_pretrained(model_name)
    seq2seq_model = seq2seq_model.to(device)

    if freeze_encoder:
        for weight in seq2seq_model.model.encoder.parameters():
            weight.requires_grad = False

    # Fixed training hyperparameters; only the output directory is configurable.
    hyperparameters = {
        'output_dir': output_dir,
        'num_train_epochs': 2,
        'per_device_train_batch_size': 1,
        'save_steps': 500,
        'save_total_limit': 5,
        'warmup_steps': 500,
        'weight_decay': 0.01,
        'logging_dir': './logs',
        'logging_steps': 100,
    }
    arguments = TrainingArguments(**hyperparameters)

    return Trainer(
        model=seq2seq_model,
        args=arguments,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
    )

In [11]:
# Checkpoint used for both the tokenizer (prepare_data) and the model weights
# (prepare_fine_tuning).
model_name = 'nsi319/legal-pegasus'
# Only training pairs are supplied, so the validation and test datasets come
# back as None and are discarded.
# NOTE: the ValueError recorded in this cell's output reports that the backend
# tokenizer could not be instantiated because `sentencepiece` is not installed;
# install it in the kernel's environment before re-running this cell.
train_dataset, _, _, tokenizer = prepare_data(model_name, train_texts, train_labels)

# Fine-tune the model
trainer = prepare_fine_tuning(model_name, tokenizer, train_dataset)
trainer.train()

ValueError: Couldn't instantiate the backend tokenizer from one of: 
(1) a `tokenizers` library serialization file, 
(2) a slow tokenizer instance to convert or 
(3) an equivalent slow tokenizer class to instantiate and convert. 
You need to have sentencepiece installed to convert a slow tokenizer to a fast one.

In [None]:
# Save the trained model
# exist_ok=True creates the directory only when it is missing, without a
# separate existence check.
os.makedirs('./output_model/', exist_ok=True)
trainer.model.save_pretrained("./output_model/")
# Bundle the saved model into output_model.zip (entries stored under
# "output_model/") using the standard library, so the cell does not depend on
# an external `zip` executable being installed.
shutil.make_archive('output_model', 'zip', root_dir='.', base_dir='output_model')