In [1]:
import pandas as pd
# Load the document/summary pairs and display how many rows were read.
data = pd.read_csv("legal_summaries.csv")
data.shape[0]

7823

In [3]:
# Character counts of the first row: reference summary first, then source document.
for column in ('summary_text', 'input_text'):
    print(len(data[column][0]))

837
18759


In [None]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

class LegalSummarizer:
    """Summarize a document with a pretrained T5 checkpoint (no fine-tuning)."""

    def __init__(self, model_name="t5-base"):
        """Load the tokenizer and the model for ``model_name``.

        Args:
            model_name: Checkpoint name or path handed to ``from_pretrained``.
        """
        self.tokenizer = T5Tokenizer.from_pretrained(model_name)
        self.model = T5ForConditionalGeneration.from_pretrained(model_name)

    def summarize(self, document, max_length=250, min_length=100):
        """Generate a summary of ``document`` with 4-beam search.

        Args:
            document: Text to summarize; it is prefixed with ``"summarize: "``
                and truncated to 1024 tokens before generation.
            max_length: Upper bound passed to ``generate`` for the output length.
            min_length: Lower bound passed to ``generate`` for the output length.

        Returns:
            The decoded first generated sequence, with special tokens removed.
        """
        prompt = f"summarize: {document}"
        encoded = self.tokenizer(prompt, max_length=1024, truncation=True, return_tensors="pt")

        generation_options = {
            "max_length": max_length,
            "min_length": min_length,
            "num_beams": 4,
            "early_stopping": True,
        }
        generated = self.model.generate(encoded['input_ids'], **generation_options)

        # Only the first returned sequence is decoded.
        best_sequence = generated[0]
        return self.tokenizer.decode(best_sequence, skip_special_tokens=True)

# Usage: summarize the first document (row 0) with the pretrained, not yet
# fine-tuned, model as a quick baseline.
long_legal_document = data['input_text'][0]
summarizer = LegalSummarizer()
summary = summarizer.summarize(long_legal_document)
print(summary)

the appeal arises out of a suit brought by the respondent in the court of the Subordinate Judge, Dhanbad . the court held that the compromise decree not having been registered was inadmissible in evidence . the second defendant has preferred this appeal; the facts are not now in dispute . the appeal will be heard on thursday, january 2, at 8 p.m. et in the supreme court of india .


In [24]:
# Display whether PyTorch can use a CUDA device from this kernel.
torch.cuda.is_available()

True

In [28]:
import torch
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset

class LegalSummarizerTrainer:
    """Fine-tune a T5 checkpoint on (document, summary) pairs and summarize with it.

    The workflow is ``prepare_dataset`` -> ``preprocess_data`` -> ``Seq2SeqTrainer``;
    ``fine_tune`` runs all three and saves the result, and ``summarize`` generates
    a summary with the model currently held in memory.
    """

    def __init__(self, model_name="t5-base"):
        """Load tokenizer and model and move the model to the GPU when one is available.

        Args:
            model_name: Checkpoint name or path handed to ``from_pretrained``.
        """
        self.tokenizer = T5Tokenizer.from_pretrained(model_name)
        self.model = T5ForConditionalGeneration.from_pretrained(model_name)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

        # The collator pads each batch to its longest member. Labels are padded
        # with -100 so that padded positions are ignored by the loss.
        self.data_collator = DataCollatorForSeq2Seq(
            tokenizer=self.tokenizer,
            model=self.model,
            padding=True,
            label_pad_token_id=-100
        )

    def prepare_dataset(self, data, test_size=0.1, seed=42):
        """Build train/validation datasets from raw document/summary pairs.

        Rows whose document or summary is missing or blank are dropped instead
        of being turned into empty-string training examples.

        Args:
            data: DataFrame (or anything ``pd.DataFrame`` accepts) with
                ``input_text`` and ``summary_text`` columns.
            test_size: Share of rows held out for validation.
            seed: Seed for the shuffle performed by the split, so that the
                split is reproducible across runs.

        Returns:
            A ``(train_dataset, validation_dataset)`` tuple with ``input_text``
            and ``target_text`` columns.

        Raises:
            KeyError: If a required column is absent.
            ValueError: If fewer than two usable rows remain after cleaning.
        """
        # Convert to DataFrame if not already
        if not isinstance(data, pd.DataFrame):
            data = pd.DataFrame(data)

        required_columns = ['input_text', 'summary_text']
        absent = [name for name in required_columns if name not in data.columns]
        if absent:
            raise KeyError(f"data is missing required column(s): {absent}")

        # Selecting the columns yields a new frame, so the caller's data is not modified.
        pairs = data[required_columns].dropna().astype(str)
        has_document = pairs['input_text'].str.strip().ne('')
        has_summary = pairs['summary_text'].str.strip().ne('')
        pairs = pairs[has_document & has_summary]

        if len(pairs) < 2:
            raise ValueError(
                "need at least two rows with non-empty 'input_text' and "
                "'summary_text' to build a train/validation split"
            )

        # Convert to HuggingFace Dataset
        dataset = Dataset.from_dict({
            'input_text': pairs['input_text'].tolist(),
            'target_text': pairs['summary_text'].tolist()
        })

        # Split dataset
        dataset = dataset.train_test_split(test_size=test_size, seed=seed)
        return dataset['train'], dataset['test']

    def preprocess_data(self, dataset, max_input_length=512, max_target_length=128):
        """Tokenize documents and summaries into model inputs and labels.

        Sequences are truncated but deliberately NOT padded here: padding labels
        with the pad token id would make the loss count padded positions. Padding
        is left to ``self.data_collator``, which pads per batch and fills label
        padding with -100.

        Args:
            dataset: Dataset with ``input_text`` and ``target_text`` columns.
            max_input_length: Token limit for the prefixed document.
            max_target_length: Token limit for the summary.

        Returns:
            The result of ``dataset.map`` with the original columns removed and
            the tokenizer outputs plus a ``labels`` column added.
        """
        def preprocess_function(examples):
            # Prefix input text with task description
            inputs = [f"summarize: {doc}" for doc in examples['input_text']]

            # Plain Python lists are returned (no return_tensors) because the
            # sequences now have different lengths.
            model_inputs = self.tokenizer(
                inputs,
                max_length=max_input_length,
                truncation=True
            )

            labels = self.tokenizer(
                examples['target_text'],
                max_length=max_target_length,
                truncation=True
            )

            model_inputs['labels'] = labels['input_ids']
            return model_inputs

        return dataset.map(
            preprocess_function,
            batched=True,
            remove_columns=dataset.column_names
        )

    def fine_tune(self, data, output_dir='./legal_summarizer_model',
                 num_epochs=3, batch_size=8, learning_rate=5e-5):
        """Fine-tune the model on ``data`` and save model and tokenizer.

        Args:
            data: Raw pairs, forwarded to ``prepare_dataset``.
            output_dir: Directory for checkpoints; the final model and tokenizer
                are written to ``{output_dir}/final``.
            num_epochs: Number of training epochs.
            batch_size: Per-device batch size for training and evaluation.
            learning_rate: Initial learning rate.
        """
        import inspect

        # Prepare datasets
        train_dataset, val_dataset = self.prepare_dataset(data)

        # Preprocess datasets
        train_dataset = self.preprocess_data(train_dataset)
        val_dataset = self.preprocess_data(val_dataset)

        # Evaluate and checkpoint once per epoch. With fixed step intervals
        # (evaluate every 500, save every 1000) a short run can finish before
        # the first save, leaving load_best_model_at_end without a checkpoint.
        training_args = Seq2SeqTrainingArguments(
            output_dir=output_dir,
            num_train_epochs=num_epochs,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            warmup_ratio=0.1,
            weight_decay=0.01,
            logging_dir='./logs',
            logging_steps=100,
            eval_strategy="epoch",
            save_strategy="epoch",
            save_total_limit=2,
            learning_rate=learning_rate,
            load_best_model_at_end=True,
            metric_for_best_model="loss",
            greater_is_better=False,
            gradient_accumulation_steps=4,
            fp16=torch.cuda.is_available()  # Use mixed precision if GPU available
        )

        # Newer transformers releases take the tokenizer as `processing_class`
        # and warn about `tokenizer`; older ones only know `tokenizer`.
        trainer_parameters = inspect.signature(Seq2SeqTrainer.__init__).parameters
        if "processing_class" in trainer_parameters:
            tokenizer_argument = {"processing_class": self.tokenizer}
        else:
            tokenizer_argument = {"tokenizer": self.tokenizer}

        # Initialize trainer with data collator
        trainer = Seq2SeqTrainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            data_collator=self.data_collator,
            **tokenizer_argument
        )

        # Train and save
        trainer.train()
        trainer.save_model(f'{output_dir}/final')
        self.tokenizer.save_pretrained(f'{output_dir}/final')

    def summarize(self, document, max_length=128, min_length=30):
        """Generate a summary of ``document`` with the in-memory model.

        Args:
            document: Text to summarize; it is prefixed with ``"summarize: "``
                and truncated to 512 tokens.
            max_length: Upper bound passed to ``generate`` for the output length.
            min_length: Lower bound passed to ``generate`` for the output length.

        Returns:
            The decoded first generated sequence, with special tokens removed.
        """
        inputs = self.tokenizer(
            f"summarize: {document}",
            max_length=512,
            truncation=True,
            return_tensors="pt"
        ).to(self.device)

        # Training may leave the model in train mode; switch dropout off so
        # generation is deterministic.
        self.model.eval()

        summary_ids = self.model.generate(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_length=max_length,
            min_length=min_length,
            num_beams=4,
            length_penalty=2.0,
            early_stopping=True,
            no_repeat_ngram_size=3
        )

        return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Usage example
if __name__ == "__main__":
    # `data` must already be a DataFrame with 'input_text' and 'summary_text'
    # columns, e.g. data = pd.read_csv('your_legal_dataset.csv')

    trainer = LegalSummarizerTrainer()
    trainer.fine_tune(
        data=data,
        output_dir='./legal_summarizer_model',
        learning_rate=5e-5,
        batch_size=8,
        num_epochs=3,
    )

Map:   0%|          | 0/7040 [00:00<?, ? examples/s]

Map:   0%|          | 0/783 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss


RuntimeError: trivial_device_copy D->H failed: cudaErrorUnknown: unknown error

In [None]:
# Reload the data and retry fine-tuning with settings sized for a 6GB GPU.
data = pd.read_csv('legal_summaries.csv')
trainer = LegalSummarizerTrainer()
trainer.fine_tune(
    data=data,
    output_dir='./legal_summarizer_model',
    learning_rate=3e-5,  # slightly lower learning rate for stability
    batch_size=4,  # reduced from 8 to 4 for 6GB VRAM
    num_epochs=3,
)