In [1]:
!pip install sentencepiece



In [2]:
import requests
import json
import torch
import os
from tqdm import tqdm
from transformers import RobertaTokenizer
from torch.utils.data import DataLoader
from transformers import DebertaV2TokenizerFast, DebertaV2ForQuestionAnswering
from transformers import AdamW

In [3]:
def read_data(path):  
  
  with open(path, 'rb') as f:
    contract = json.load(f)

  contexts = []
  questions = []
  answers = []

  for c in contract:
    context = c['context']
    for i in range(len(c['questions'])):
        question = c["questions"][i]['input_text']
        questions.append(question)
    for i in range(len(c['answers'])):
        answer = c["answers"][i]
        contexts.append(context)
        answers.append(answer)

  return contexts, questions, answers

In [4]:
train_contexts, train_questions, train_answers = read_data('data/def_qa2.json')
valid_contexts, valid_questions, valid_answers = read_data('data/val_qa.json')

In [5]:
# getting the model and its tokenizer (currently training on only 1000 rows as it is very time consuming)

tokenizer = DebertaV2TokenizerFast.from_pretrained("microsoft/deberta-v3-large")

train_encodings = tokenizer(train_contexts[:1000], train_questions[:1000], truncation=True, padding=True)
valid_encodings = tokenizer(valid_contexts[:100], valid_questions[:100], truncation=True, padding=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [6]:
# adding the answers in the training set for fine tuning
def add_token_positions(encodings, answers):
  start_positions = []
  end_positions = []
  for i in range(len(answers)):
    start_positions.append(encodings.char_to_token(i, answers[i]['start']))
    end_positions.append(encodings.char_to_token(i, answers[i]['end'] - 1))

    # if start position is None, the answer passage has been truncated
    if start_positions[-1] is None:
      start_positions[-1] = tokenizer.model_max_length
    if end_positions[-1] is None:
      end_positions[-1] = tokenizer.model_max_length

  encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

add_token_positions(train_encodings, train_answers)
add_token_positions(valid_encodings, valid_answers)

In [7]:
# creating the dataset in the format it is required for fine tuning BERT
class Def_Dataset(torch.utils.data.Dataset):
  def __init__(self, encodings):
    self.encodings = encodings
  def __getitem__(self, idx):
    return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  def __len__(self):
    return len(self.encodings.input_ids)

In [8]:
train_dataset = Def_Dataset(train_encodings)
valid_dataset = Def_Dataset(valid_encodings)

In [9]:
# Define the dataloaders
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=2)

In [10]:
# loading the BERT model which we will fine tune
model = DebertaV2ForQuestionAnswering.from_pretrained("microsoft/deberta-v3-large")

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2ForQuestionAnswering: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'deberta.embeddings.position_embeddings.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.weight', 'mask_predictions.classifier.bias']
- This IS expected if you are initializing DebertaV2ForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertFor

In [11]:
# checking the device
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Working on {device}')

Working on cuda


In [12]:
# Fine tuning it per batch
N_EPOCHS = 5
optim = AdamW(model.parameters(), lr=5e-5)

model.to(device)
model.train()

for epoch in range(N_EPOCHS):
  loop = tqdm(train_loader, leave=True)
  for batch in loop:
    optim.zero_grad()
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    start_positions = batch['start_positions'].to(device)
    end_positions = batch['end_positions'].to(device)
    outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
    loss = outputs[0]
    loss.backward()
    optim.step()

    loop.set_description(f'Epoch {epoch+1}')
    loop.set_postfix(loss=loss.item())

Epoch 1: 100%|██████████| 19/19 [00:04<00:00,  4.17it/s, loss=4.36]
Epoch 2: 100%|██████████| 19/19 [00:04<00:00,  4.57it/s, loss=4.42]
Epoch 3: 100%|██████████| 19/19 [00:04<00:00,  4.56it/s, loss=1.78]
Epoch 4: 100%|██████████| 19/19 [00:04<00:00,  4.57it/s, loss=1.88] 
Epoch 5: 100%|██████████| 19/19 [00:04<00:00,  4.57it/s, loss=0.945]


In [13]:
# checking the performance
model.eval()

acc = []

for batch in tqdm(valid_loader):
  with torch.no_grad():
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    start_true = batch['start_positions'].to(device)
    end_true = batch['end_positions'].to(device)
    
    outputs = model(input_ids, attention_mask=attention_mask)

    start_pred = torch.argmax(outputs['start_logits'], dim=1)
    end_pred = torch.argmax(outputs['end_logits'], dim=1)

    acc.append(((start_pred == start_true).sum()/len(start_pred)).item())
    acc.append(((end_pred == end_true).sum()/len(end_pred)).item())

acc = sum(acc)/len(acc)

100%|██████████| 4/4 [00:00<00:00, 19.39it/s]


In [14]:
acc

0.4375