In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import LEDForConditionalGeneration, LEDTokenizer
from transformers import Trainer, TrainingArguments




In [8]:
class CourtCaseDataset(Dataset):
    def __init__(self, tokenizer, csv_file, max_input_length=2048, max_output_length=2048):
        self.tokenizer = tokenizer
        self.data = pd.read_csv(csv_file)
        self.max_input_length = max_input_length
        self.max_output_length = max_output_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Retrieve the case data
        facts = self.data.iloc[idx]['facts']
        issues = self.data.iloc[idx]['issues']
        ruling = self.data.iloc[idx]['ruling']
        
        # Combine sections into a single input string
        input_text = f"{facts} [SEP] {issues} [SEP] {ruling}"
        labels = self.data.iloc[idx]['whole_text']  # The entire text as the target
        print(input_text)
        print(labels)
        
        # Process inputs and labels
        inputs = self.tokenizer(
            input_text,
            padding="max_length",
            truncation=True,
            max_length=self.max_input_length,
            return_tensors="pt"
        )
        
        outputs = self.tokenizer(
            labels,
            padding="max_length",
            truncation=True,
            max_length=self.max_output_length,
            return_tensors="pt"
        )

        # Create attention masks
        batch = {
            "input_ids": inputs["input_ids"].flatten(),
            "attention_mask": inputs["attention_mask"].flatten(),
            "labels": outputs["input_ids"].flatten()
        }

        # Create global attention mask with the first token set to 1
        batch["global_attention_mask"] = torch.zeros_like(batch["input_ids"]).tolist()
        batch["global_attention_mask"][0] = 1

        # Make sure that the PAD token is ignored
        batch["labels"] = [
            [-100 if token == self.tokenizer.pad_token_id else token for token in outputs["input_ids"].flatten().tolist()]
        ]

        return batch

In [3]:
# Load the model and tokenizer
model_name = "allenai/led-large-16384"
tokenizer = LEDTokenizer.from_pretrained(model_name)
model = LEDForConditionalGeneration.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/1.84G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

In [9]:
# Prepare the dataset
train_dataset = CourtCaseDataset(tokenizer, 'new_court_cases.csv')  
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)

In [10]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=2,
    num_train_epochs=3,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=500,
    evaluation_strategy="steps",
    eval_steps=500,
    load_best_model_at_end=True
)



In [11]:
# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset
)

In [12]:
# Train the model
trainer.train()

Antecedents
In a memorandum dated 28 September 2004, issued by Engineer Alexander D. Paltao of the Technical Services COA Regional Office No. IV, and in an audit observation memorandum (AOM) dated 14 October 2004 issued by the Audit Team Leader of the municipality of Cabuyao, Laguna, Mayor Proceso Aguillo (Mayor Aguillo), municipal accountant Atty. Felix L. Galang, Jr. (Atty. Galang), municipal treasurer Elena A. Estalilla (Estalilla), municipal treasurer, building official Engineer Manolito Barundia (Barundia), and all the members of the Bids and Awards Committee (BAC) were requested to submit certain documents to facilitate the COA’s investigation, viz.: the approved detailed plan, “approved” statement of work accomplished, copies of contracts and bidding documents. The requested documents pertain to anomalous projects entered into by the municipal government of Cabuyao, Laguna with Golden Deer Enterprises and RDC Construction Development Corporation.

On 26 February 2007, Notice of 

OutOfMemoryError: CUDA out of memory. Tried to allocate 386.00 MiB (GPU 0; 2.00 GiB total capacity; 16.43 GiB already allocated; 0 bytes free; 16.96 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Save the fine-tuned model
trainer.save_model('./fine_tuned_led_2')