In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import LEDForConditionalGeneration, LEDTokenizer
from transformers import Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class CourtCaseDataset(Dataset):
    def __init__(self, tokenizer, csv_file, max_input_length=4096, max_output_length=4096):
        self.tokenizer = tokenizer
        self.data = pd.read_csv(csv_file)
        self.max_input_length = max_input_length
        self.max_output_length = max_output_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Retrieve the case data
        facts = self.data.iloc[idx]['facts']
        issues = self.data.iloc[idx]['issues']
        ruling = self.data.iloc[idx]['ruling']
        
        # Combine sections into a single input string
        input_text = f"{facts} [SEP] {issues} [SEP] {ruling}"
        labels = self.data.iloc[idx]['whole_text']  # The entire text as the target
        print(input_text)
        print(labels)
        
        # Process inputs and labels
        inputs = self.tokenizer(
            input_text,
            padding="max_length",
            truncation=True,
            max_length=self.max_input_length,
            return_tensors="pt"
        )
        
        outputs = self.tokenizer(
            labels,
            padding="max_length",
            truncation=True,
            max_length=self.max_output_length,
            return_tensors="pt"
        )

        # Create attention masks
        batch = {
            "input_ids": inputs["input_ids"].flatten(),
            "attention_mask": inputs["attention_mask"].flatten(),
            "labels": outputs["input_ids"].flatten()
        }

        # Create global attention mask with the first token set to 1
        batch["global_attention_mask"] = torch.zeros_like(batch["input_ids"]).tolist()
        batch["global_attention_mask"][0] = 1

        # Make sure that the PAD token is ignored
        batch["labels"] = [
            [-100 if token == self.tokenizer.pad_token_id else token for token in outputs["input_ids"].flatten().tolist()]
        ]

        return batch

In [3]:
# Load the model and tokenizer
model_name = "allenai/led-large-16384"
tokenizer = LEDTokenizer.from_pretrained(model_name)
model = LEDForConditionalGeneration.from_pretrained(model_name)



In [4]:
# Prepare the dataset
train_dataset = CourtCaseDataset(tokenizer, 'new_court_cases.csv')  
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)

In [5]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=2,
    num_train_epochs=3,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=500,
    evaluation_strategy="steps",
    eval_steps=500,
    load_best_model_at_end=True
)



In [6]:
# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset
)

: 

In [7]:
# Train the model
trainer.train()

  0%|          | 0/15 [00:00<?, ?it/s]

The Facts and the Case
The City Prosecutor of Manila charged the accused Luis Pajarin and Efren Pallaya before the Regional Trial Court (RTC) of Manila in Criminal Cases 05-237756 and 05-237757 with violation of Section 5 in relation to Sections 26 and 11 (3) in relation to Section 13, respectively, of Article II of Republic Act (R.A.) 9165 or the Comprehensive Dangerous Drugs Act of 2002.

The prosecution presented PO2 Nestor Lehetemas, member of the buy-bust team and PO2 James Nolan Ibañez, the poseur-buyer. They testified that on June 1, 2005 at around 10:00 p.m., an informant arrived at their Station Anti-Illegal Drugs (SAID) with the report that drugs would be sold on P. Ocampo and Dominga Streets the next day at around 5:00 pm. As the poseur-buyer, PO2 Ibañez marked a P500.00 bill with SAID on top of its serial number.

On June 2, 2005 the buy-bust team went to the site of the operation on board a Tamaraw FX which they parked near Dominga Street. The informant pointed to the two 

In [None]:
# Save the fine-tuned model
trainer.save_model('./fine_tuned_led_2')