In [1]:
from preprocessing_v3 import *




In [2]:
import torch
from datasets import Dataset
from transformers import AdamW
import time
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
import evaluate
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [3]:
def process_data_to_model_inputs(batch, max_length=1024):
    """Tokenize a batch and attach labels plus a global attention mask.

    Written to be passed to ``Dataset.map(..., batched=True)``. Uses the
    module-level ``tokenizer``.

    Args:
        batch: Mapping with an ``"encoder_input_string"`` column (texts to
            tokenize) and a ``"segment_label"`` column (one label per text).
        max_length: Length every sequence is padded/truncated to. Defaults
            to 1024.

    Returns:
        The same ``batch`` object with ``"input_ids"`` and
        ``"attention_mask"`` taken from the tokenizer output, and
        ``"global_attention_mask"`` and ``"labels"`` added as ``torch.long``
        tensors.
    """
    encoded = tokenizer(
        batch["encoder_input_string"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

    batch["input_ids"] = encoded["input_ids"]
    batch["attention_mask"] = encoded["attention_mask"]

    # Global attention on the first token only; each mask is exactly as long
    # as its own row of input_ids.
    global_attention_mask = [
        [1] + [0] * (len(token_ids) - 1) for token_ids in encoded["input_ids"]
    ]
    # One label per tokenized row, in the same order.
    labels = [batch["segment_label"][i] for i in range(len(encoded["input_ids"]))]

    # No per-example printing here: this function runs over the whole dataset,
    # so debug output would flood the notebook and slow the map down.
    batch["global_attention_mask"] = torch.tensor(global_attention_mask, dtype=torch.long)
    batch["labels"] = torch.tensor(labels, dtype=torch.long)

    return batch

def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision/recall/F1 for an evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores, or a tuple whose first element
            holds the per-class scores).

    Returns:
        Dict with the keys ``"accuracy"``, ``"f1"``, ``"precision"`` and
        ``"recall"``.
    """
    labels = pred.label_ids
    scores = pred.predictions
    # The Trainer can return a tuple of model outputs rather than a bare
    # array (seen with encoder-decoder models); the class scores come first.
    if isinstance(scores, (tuple, list)):
        scores = scores[0]
    preds = scores.argmax(-1)  # predicted class = index of the highest score

    # zero_division=0 yields the same 0.0 score as the default for classes
    # with no predicted/true samples, but without emitting a warning.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='macro', zero_division=0
    )
    acc = accuracy_score(labels, preds)

    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall,
    }

In [4]:
# Build the preprocessor, then unpack the model, tokenizer and the two data objects it returns
preprocessor = preprocess('court_cases_headings_labels.csv')
model, tokenizer, xdata, ydata = preprocessor.return_model_tokenizer_data()
# Keep only the first 10 entries of each for a quick run
xdata, ydata = xdata[:10], ydata[:10]



added 0 new tokens



In [12]:
# Wrap both pandas objects as Hugging Face datasets (xdata -> train, ydata -> eval)
train_data, eval_data = (Dataset.from_pandas(frame) for frame in (xdata, ydata))
# Display the first training record
train_data[0]

{'encoder_input_string': '4 batolacongan d . abdullah abdullah director finance budget and management services',
 'segment_label': 0,
 '__index_level_0__': 15191}

In [6]:
# Use the GPU when one is available, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model.to(device)
device

device(type='cuda')

In [7]:
# Tokenize both splits in batches and drop the raw text/label columns afterwards
_RAW_COLUMNS = ("encoder_input_string", "segment_label")

train_dataset = train_data.map(
    process_data_to_model_inputs, batched=True, remove_columns=list(_RAW_COLUMNS)
)

eval_dataset = eval_data.map(
    process_data_to_model_inputs, batched=True, remove_columns=list(_RAW_COLUMNS)
)

Map:   0%|          | 0/53146 [00:00<?, ? examples/s]

Map:   0%|          | 0/5906 [00:00<?, ? examples/s]

In [8]:
# Have both mapped datasets return torch tensors for the four columns
# produced by the mapping step.
train_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "global_attention_mask", "labels"],
)
# The mapped evaluation split is named `eval_dataset`; `val_dataset` is never
# assigned in this notebook and raises NameError on a fresh kernel.
eval_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "global_attention_mask", "labels"],
)

In [9]:
# Print the shape of each formatted training column, one per line
for column_name in ('input_ids', 'global_attention_mask', 'attention_mask', 'labels'):
    print(train_dataset[column_name].shape)

torch.Size([53146, 1024])
torch.Size([53146, 1024])
torch.Size([53146, 1024])
torch.Size([53146])


In [10]:
# Print the encoder limit, then the decoder limit, on position embeddings
for side in ("encoder", "decoder"):
    print(getattr(model.config, f"max_{side}_position_embeddings"))

1024
1024


In [11]:
# Print the full model config followed by the number of entries in xdata
print(model.config, len(xdata), sep="\n")

LEDConfig {
  "_name_or_path": "./",
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "architectures": [
    "LEDForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "attention_window": [
    1024,
    1024,
    1024,
    1024,
    1024,
    1024
  ],
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 768,
  "decoder_attention_heads": 12,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "encoder_attention_heads": 12,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_decoder_position_embeddings": 1024,
  "max_encoder_position_embeddings": 1024,
  "model_type"

In [None]:
# Set training arguments
training_args = Seq2SeqTrainingArguments(
    evaluation_strategy="steps",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    output_dir="./",
    logging_steps=5,
    eval_steps=10,
    save_steps=10,
    save_total_limit=2,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    # Mixed precision needs a CUDA device; fall back to full precision on CPU
    # so this cell also works when the device cell selected 'cpu'.
    fp16=torch.cuda.is_available(),
)

# Initialize the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    # The mapped evaluation split is named `eval_dataset`; `val_dataset` is
    # never assigned in this notebook and raises NameError on a fresh kernel.
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics
)

In [None]:
# Start training with the `trainer` configured in the previous cell.
trainer.train()