In [1]:
from preprocessing_v2 import *




In [2]:
from transformers import LEDTokenizer, LEDForConditionalGeneration
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW
import time
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

In [3]:
# Prepare the data, model, and tokenizer before training
preprocessor = preprocess('new_court_cases.csv')
model, tokenizer, train_data, eval_data = preprocessor.return_model_tokenizer_data()

  return self.fget.__get__(instance, owner)()


In [4]:
class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        # Return the data directly
        item = self.data[idx]
        
        # Convert to tensors and ensure they have the correct shape
        input_ids = torch.tensor(item['input_ids']).squeeze(0)  # shape should be [4096]
        attention_mask = torch.tensor(item['attention_mask']).squeeze(0)
        global_attention_mask = torch.tensor(item['global_attention_mask']).squeeze(0)
        labels = torch.tensor(item['labels']).squeeze(0)

        # Add batch dimension
        return {
            "input_ids": input_ids.unsqueeze(0),  # Now shape: [1, 4096]
            "attention_mask": attention_mask.unsqueeze(0),
            "global_attention_mask": global_attention_mask.unsqueeze(0),
            "labels": labels.unsqueeze(0)
        }


In [5]:
class modelling:
    def __init__(self, model, tokenizer, train_data, eval_data, epochs=3):
        self.train_data = train_data
        self.eval_data = eval_data
        self.model = model
        self.tokenizer = tokenizer
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        self.model.gradient_checkpointing_enable()
        self.model.config.use_cache = False

        # Optimizer and criterion
        self.optimizer = AdamW(self.model.parameters(), lr=5e-5)
        self.criterion = torch.nn.CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id)

        # Dataset and Dataloaders
        self.train_dataset = CustomDataset(data=self.train_data)
        self.eval_dataset = CustomDataset(data=self.eval_data)

        # Training arguments
        self.training_args = Seq2SeqTrainingArguments(
            output_dir="./results",
            num_train_epochs=epochs,
            per_device_train_batch_size=2,
            per_device_eval_batch_size=2,
            fp16=True,
            evaluation_strategy="steps",
            logging_dir="./logs",
            logging_steps=10,
            save_steps=500,
            gradient_checkpointing=True,
            label_smoothing_factor=0.1,
            load_best_model_at_end=True,
            metric_for_best_model="rouge2",
        )

        # Trainer object
        self.trainer = Seq2SeqTrainer(
            model=self.model,
            args=self.training_args,
            train_dataset=self.train_dataset,
            eval_dataset=self.eval_dataset,
            compute_metrics=self.compute_metrics,  # Custom metrics
        )

    # Custom metric calculation (adjust if needed)
    def compute_metrics(self, eval_pred):
        predictions, labels = eval_pred
        decoded_preds = self.tokenizer.batch_decode(predictions, skip_special_tokens=True)
        decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)

        # Compute ROUGE score
        result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
        return {k: round(v.mid.fmeasure * 100, 2) for k, v in result.items()}

    # Fine-tuning function
    def finetune(self):
        self.trainer.train()

In [6]:
modeller = modelling(model, tokenizer, train_data, eval_data)



In [7]:
modeller.finetune()

  input_ids = torch.tensor(item['input_ids']).squeeze(0)  # shape should be [4096]
  attention_mask = torch.tensor(item['attention_mask']).squeeze(0)
  global_attention_mask = torch.tensor(item['global_attention_mask']).squeeze(0)
  labels = torch.tensor(item['labels']).squeeze(0)
Input ids are automatically padded from 1 to 1024 to be a multiple of `config.attention_window`: 1024


RuntimeError: Tensors must have same number of dimensions: got 4 and 3