In [1]:
import os
os.chdir('../')
os.chdir('../')
%pwd

'e:\\Deep Learning\\pytorch\\PDF_Q&A'

In [2]:
import json
import torch
from datasets import Dataset, DatasetDict
from transformers import BertTokenizerFast, BertForQuestionAnswering, TrainingArguments, Trainer

In [3]:
# Name of the pretrained checkpoint shared by the tokenizer and the model.
checkpoint_name = 'bert-base-uncased'

# A "fast" tokenizer is used because preprocessing later asks it for
# character offset mappings (return_offsets_mapping=True).
tokenizer = BertTokenizerFast.from_pretrained(checkpoint_name)

# The question-answering head (qa_outputs.weight / qa_outputs.bias) is not part
# of this checkpoint and is newly initialised, so the model has to be
# fine-tuned before its predictions are meaningful.
model = BertForQuestionAnswering.from_pretrained(checkpoint_name)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Show whether PyTorch can see a CUDA device; the boolean is the cell's output.
torch.cuda.is_available()

True

In [5]:
# Read the SQuAD v1.1 train and dev splits from disk.
# The encoding is passed explicitly: without it open() falls back to the
# platform's locale encoding (e.g. cp1252 on Windows), which either fails on
# non-ASCII text or mis-decodes it. Mis-decoding changes the number of
# characters in a context and therefore breaks the character-based
# `answer_start` offsets that preprocessing relies on.
with open('datasets/train-v1.1.json', 'r', encoding='utf-8') as f:
    squad_train = json.load(f)

with open('datasets/dev-v1.1.json', 'r', encoding='utf-8') as f:
    squad_dev = json.load(f)

In [6]:
def squad_json_to_dataset(squad_dict):
    """Flatten a SQuAD-style dictionary into a `Dataset` with one row per answer.

    Args:
        squad_dict: Parsed SQuAD JSON. Expected layout:
            ``{'data': [{'paragraphs': [{'context': str,
            'qas': [{'question': str, 'answers': [{'answer_start': int,
            'text': str}, ...]}, ...]}, ...]}, ...]}``.

    Returns:
        A `Dataset` with three parallel columns: 'context', 'question' and
        'answers' (a single answer dict per row). A question with several
        reference answers yields one row per answer. A question without any
        answer (key missing, ``None``, or an empty list) yields a single row
        whose answer is the placeholder ``{'answer_start': -1, 'text': ''}``.
    """
    contexts = []
    questions = []
    answers = []

    for group in squad_dict['data']:
        for paragraph in group['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                # Treat a missing key, None and an empty list the same way.
                # Checking only for the key would silently drop questions
                # whose 'answers' list is present but empty.
                qa_answers = qa.get('answers') or [{'answer_start': -1, 'text': ''}]
                for answer in qa_answers:
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)

    return Dataset.from_dict({'context': contexts, 'question': questions, 'answers': answers})

In [7]:
# Flatten both loaded SQuAD splits (train first, then dev) into Dataset objects.
train_dataset, dev_dataset = (
    squad_json_to_dataset(split) for split in (squad_train, squad_dev)
)

In [8]:
# Bundle the two splits under the keys used by later cells
# ('train' for fitting, 'validation' for evaluation).
datasets = DatasetDict(
    {
        'train': train_dataset,
        'validation': dev_dataset,
    }
)

In [9]:
def preprocess_function(examples):
    """Tokenize a batch of question/context pairs and label the answer spans.

    Args:
        examples: Batch mapping with parallel lists under 'question',
            'context' and 'answers'. Each answer is a dict with
            'answer_start' (character offset into the context, or -1 when
            there is no answer) and 'text'.

    Returns:
        The tokenizer output (without 'offset_mapping') extended with
        'start_positions' and 'end_positions': the indices of the tokens that
        contain the first and last answer character. Both are 0 when the
        example has no answer or when the answer does not lie entirely inside
        the context tokens that survived truncation.
    """
    # Only the questions are stripped. The contexts must stay untouched:
    # 'answer_start' is a character offset into the raw context, so removing
    # leading whitespace would shift every offset and mislabel the span.
    questions = [q.strip() for q in examples['question']]
    contexts = list(examples['context'])

    # Tokenize the pairs; only the context (second sequence) may be truncated.
    inputs = tokenizer(
        questions,
        contexts,
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Offsets are only needed to compute labels, so drop them from the output.
    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        answer = answers[i]
        start_token_idx = None
        end_token_idx = None

        if answer['answer_start'] != -1:
            start_char = answer['answer_start']
            end_char = start_char + len(answer['text'])
            sequence_ids = inputs.sequence_ids(i)

            for idx, (start, end) in enumerate(offsets):
                # Question and special tokens carry offsets in a different
                # coordinate space; only context tokens (sequence id 1) count.
                if sequence_ids[idx] != 1:
                    continue
                if start <= start_char < end:
                    start_token_idx = idx
                if start < end_char <= end:
                    end_token_idx = idx

        # Fall back to position 0 when either end of the span was not found
        # among the context tokens (no answer, or answer truncated away).
        if start_token_idx is None or end_token_idx is None:
            start_positions.append(0)
            end_positions.append(0)
        else:
            start_positions.append(start_token_idx)
            end_positions.append(end_token_idx)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [10]:
# Run the preprocessing over every split in batches. The original columns are
# removed so that only the fields returned by preprocess_function remain.
tokenized_dataset = datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=datasets["train"].column_names,
)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/34726 [00:00<?, ? examples/s]

In [11]:
# Hyperparameters for fine-tuning.
# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    # Where the Trainer writes its outputs.
    output_dir='./results',
    # Schedule: five passes over the training split, evaluating after each one.
    num_train_epochs=5,
    evaluation_strategy="epoch",
    # Batch sizes per device.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Optimisation settings.
    learning_rate=2e-5,
    weight_decay=0.01,
)

# Wire the model, the arguments and the tokenized splits together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)



In [12]:
# Fine-tune the model; this is the long-running step of the notebook.
trainer.train()

# Evaluate on the validation split and print the resulting metrics.
eval_results = trainer.evaluate()
print(eval_results)

# Persist the fine-tuned weights and the tokenizer files to local directories
# (relative to the current working directory) so they can be reloaded later.
model.save_pretrained('model_bert_uncased')
tokenizer.save_pretrained('tokenizer_bert_uncased')

  0%|          | 0/27375 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


KeyboardInterrupt: 