In [3]:
pip install datasets



In [None]:
import argparse
import json
from pathlib import Path
import torch
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    TrainingArguments,
    Trainer,
    default_data_collator,
    EarlyStoppingCallback
)

import os
os.environ["WANDB_DISABLED"] = "true"


file_path = "/content/drive/My Drive/Dissertation/cuad_qa_dataset.json"


model_candidates = [
    "Jasu/bert-finetuned-squad-legalbert",
    "nlpaueb/legal-bert-base-uncased",
    "atharvamundada99/bert-large-question-answering-finetuned-legal",
    "facebook/bart-base"
]

# ========================================================
# 1. Mount Google Drive
# ========================================================

from google.colab import drive
drive.mount('/content/drive')

#========================================================
# 2. Load dataset
# ========================================================


def read_json(file_path):
    # Assumes the JSON file is a list of examples.
    print("Loading dataset from:", file_path)

    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return data


def data_preparation(data):
    # We'll just take a small subset (5430) for demonstration
    raw_data = data["train"][:3430]


    # Convert dataset to a huggingface Dataset
    dataset = Dataset.from_list(raw_data)
    print(dataset)

    split_dataset = dataset.train_test_split(test_size=0.1, seed=42)

    dataset_dict = DatasetDict({
        "train": split_dataset["train"],    # 90%
        "validation": split_dataset["test"], # 10%
    })

    print("DataSet Structure: ", dataset_dict)
    print("Train size:", len(dataset_dict["train"]))
    print("Validation size:", len(dataset_dict["validation"]))


    return dataset_dict


def prepare_features(entries, tokenizer, max_length, doc_stride):
    """
    Tokenizes a batch of examples with the fast tokenizer.
    Splits contexts into overlapping chunks.
    Returns a dict with the tokenized inputs as well as computed start_positions and end_positions.
    """
    # Tokenize with truncation on context only
    encodings = tokenizer(
        entries["question"],
        entries["context"],
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Debug: Print tokenization keys and sample mapping
    print("Tokenization keys:", list(encodings.keys()))
    print("Sample mapping (first 3):", encodings.get("overflow_to_sample_mapping", [])[:3])

    sample_mapping = encodings.pop("overflow_to_sample_mapping")
    offset_mapping = encodings.pop("offset_mapping")

    start_positions = []
    end_positions = []

    # Loop over each chunk and add debug prints
    for i, offsets in enumerate(offset_mapping):
        encoding = encodings.encodings[i]
        sequence_ids = encoding.sequence_ids

        # print(f"\n--- Chunk {i} Debug ---")
        # print("Sequence IDs:", sequence_ids)
        # print("Offset mapping:", offsets)

        sample_index = sample_mapping[i]
        # Get the original context for this example.
        context = entries["context"][sample_index]

        # --- Print the context chunk text for debugging ---
        # context_token_indices = [j for j, sid in enumerate(sequence_ids) if sid == 1]
        # if context_token_indices:
        #     ctx_start = offsets[context_token_indices[0]][0]
        #     ctx_end = offsets[context_token_indices[-1]][1]
        #     context_chunk = context[ctx_start:ctx_end]
        #     print("Context chunk text:", context_chunk)
        # else:
        #     print("No context tokens found in this chunk.")

        answers = entries["answers"][sample_index]
        # print("Original question:", entries["question"][sample_index])
        # print("Original answer info:", answers)

        # If no answer is provided, label as unanswerable.
        if len(answers["answer_start"]) == 0:
            start_positions.append(0)
            end_positions.append(0)
        else:
            answer_start = answers["answer_start"][0]
            answer_text = answers["text"][0]
            answer_end = answer_start + len(answer_text)

            token_start_index = 0
            while token_start_index < len(sequence_ids) and sequence_ids[token_start_index] != 1:
                token_start_index += 1
            token_end_index = len(sequence_ids) - 1
            while token_end_index >= 0 and sequence_ids[token_end_index] != 1:
                token_end_index -= 1

            # print("Token start index before check:", token_start_index)
            # print("Token end index before check:", token_end_index)
            # print("Answer span in context (char indices):", answer_start, answer_end)

            if not (offsets[token_start_index][0] <= answer_start and offsets[token_end_index][1] >= answer_end):
                # print("Answer not fully in this chunk.")
                start_positions.append(0)
                end_positions.append(0)
            else:
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= answer_start:
                    token_start_index += 1
                start_pos = token_start_index - 1

                while token_end_index >= 0 and offsets[token_end_index][1] >= answer_end:
                    token_end_index -= 1
                end_pos = token_end_index + 1

                # print("Computed start_pos:", start_pos)
                # print("Computed end_pos:", end_pos)
                # # Print the text for the computed tokens using the context and offsets.
                # token_start_text = context[offsets[start_pos][0]:offsets[start_pos][1]]
                # token_end_text = context[offsets[end_pos][0]:offsets[end_pos][1]]
                # print(f"Token {start_pos} text: '{token_start_text}'")
                # print(f"Token {end_pos} text: '{token_end_text}'")

                start_positions.append(start_pos)
                end_positions.append(end_pos)

    encodings["start_positions"] = start_positions
    encodings["end_positions"] = end_positions

    return encodings



def main():
    raw_data = read_json(file_path)
    data = data_preparation(raw_data)

    for model_name in model_candidates:
        print(f"\n==== Training model: {model_name} ====\n")


        # Create a safe folder name from the model name (replace "/" with "_")
        safe_model_name = model_name.replace("/", "_")
        output_dir = f"/content/drive/My Drive/Dissertation/model_output/{safe_model_name}"
        os.makedirs(output_dir, exist_ok=True)

        # Load tokenizer and model.
        tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        model = AutoModelForQuestionAnswering.from_pretrained(model_name)

        # Preprocess the dataset: tokenize and chunk.
        tokenized_train_dataset = data['train'].map(
            lambda examples: prepare_features(examples, tokenizer, 512, 128),
            batched=True,
            remove_columns=data['train'].column_names,
        )

                # Assuming your dataset_dict has a "validation" split:
        tokenized_val_dataset = data['validation'].map(
            lambda examples: prepare_features(examples, tokenizer, 512, 128),
            batched=True,
            remove_columns=data['validation'].column_names,
        )

        # print("Tokenized dataset samples:")
        # for entry in tokenized_train_dataset.select(range(3)):
        #     print(entry)


        # Define training arguments.
        training_args = TrainingArguments(
            output_dir=output_dir,
            evaluation_strategy="steps",
            learning_rate=5e-5,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=2,
            weight_decay=0.01,
            save_strategy="steps",
            logging_steps=500,
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            greater_is_better=False,
            save_total_limit=1,
            gradient_accumulation_steps=1,
            disable_tqdm=False,
            push_to_hub=False,
            fp16=True,
        )

        # Create the Trainer.
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_train_dataset,
            eval_dataset=tokenized_val_dataset,
            tokenizer=tokenizer,
            data_collator=default_data_collator,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
        )

        # Train and save the fine-tuned model.
        trainer.train()
        trainer.save_model(output_dir)
        print(f"Model {model_name} saved to {output_dir}")


if __name__ == "__main__":
    main()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading dataset from: /content/drive/My Drive/Dissertation/cuad_qa_dataset.json
Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 3430
})
DataSet Structure:  DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 3087
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 343
    })
})
Train size: 3087
Validation size: 343

==== Training model: Jasu/bert-finetuned-squad-legalbert ====



Map:   0%|          | 0/3087 [00:00<?, ? examples/s]

Tokenization keys: ['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping']
Sample mapping (first 3): [0, 0, 0]
Tokenization keys: ['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping']
Sample mapping (first 3): [0, 0, 0]
Tokenization keys: ['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping']
Sample mapping (first 3): [0, 0, 0]
Tokenization keys: ['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping']
Sample mapping (first 3): [0, 0, 0]


Map:   0%|          | 0/343 [00:00<?, ? examples/s]

Tokenization keys: ['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping']
Sample mapping (first 3): [0, 0, 0]


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Step,Training Loss,Validation Loss
500,0.1339,0.108094
1000,0.0968,0.081144
1500,0.0853,0.11017
2000,0.0849,0.106172


Model Jasu/bert-finetuned-squad-legalbert saved to /content/drive/My Drive/Dissertation/model_output/Jasu_bert-finetuned-squad-legalbert

==== Training model: nlpaueb/legal-bert-base-uncased ====



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/222k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3087 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Tokenization keys: ['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping']
Sample mapping (first 3): [0, 0, 0]
Tokenization keys: ['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping']
Sample mapping (first 3): [0, 0, 0]
Tokenization keys: ['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping']
Sample mapping (first 3): [0, 0, 0]
Tokenization keys: ['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping']
Sample mapping (first 3): [0, 0, 0]


Map:   0%|          | 0/343 [00:00<?, ? examples/s]

Tokenization keys: ['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping']
Sample mapping (first 3): [0, 0, 0]


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Step,Training Loss,Validation Loss
500,0.25,0.130446
1000,0.1202,0.124364
1500,0.108,0.129207
2000,0.1152,0.120429
2500,0.1183,0.103892
3000,0.0986,0.101809
3500,0.1021,0.078523
4000,0.0955,0.073253
4500,0.0915,0.10213
5000,0.0936,0.068156


Model nlpaueb/legal-bert-base-uncased saved to /content/drive/My Drive/Dissertation/model_output/nlpaueb_legal-bert-base-uncased

==== Training model: atharvamundada99/bert-large-question-answering-finetuned-legal ====



tokenizer_config.json:   0%|          | 0.00/321 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/651 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Map:   0%|          | 0/3087 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Tokenization keys: ['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping']
Sample mapping (first 3): [0, 0, 0]
Tokenization keys: ['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping']
Sample mapping (first 3): [0, 0, 0]
Tokenization keys: ['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping']
Sample mapping (first 3): [0, 0, 0]
Tokenization keys: ['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping']
Sample mapping (first 3): [0, 0, 0]


Map:   0%|          | 0/343 [00:00<?, ? examples/s]

Tokenization keys: ['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping']
Sample mapping (first 3): [0, 0, 0]


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Step,Training Loss,Validation Loss
500,0.2785,0.141475
1000,0.173,0.122654
1500,0.1492,0.118574
2000,0.2556,0.126748
2500,0.1523,0.135229


Model atharvamundada99/bert-large-question-answering-finetuned-legal saved to /content/drive/My Drive/Dissertation/model_output/atharvamundada99_bert-large-question-answering-finetuned-legal

==== Training model: facebook/bart-base ====



config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

Some weights of BartForQuestionAnswering were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3087 [00:00<?, ? examples/s]

Tokenization keys: ['input_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping']
Sample mapping (first 3): [0, 0, 0]
Tokenization keys: ['input_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping']
Sample mapping (first 3): [0, 0, 0]
Tokenization keys: ['input_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping']
Sample mapping (first 3): [0, 0, 0]
Tokenization keys: ['input_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping']
Sample mapping (first 3): [0, 0, 0]


Map:   0%|          | 0/343 [00:00<?, ? examples/s]

Tokenization keys: ['input_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping']
Sample mapping (first 3): [0, 0, 0]


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Step,Training Loss,Validation Loss
500,0.2408,0.128869


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0}


Step,Training Loss,Validation Loss
500,0.2408,0.128869


In [None]:
from google.colab import files
files.download('/content/drive/My Drive/Colab Notebooks/main_new.ipynb')