<a href="https://colab.research.google.com/github/hjunjie0324/SQuAD_project/blob/main/setup2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [43]:
import numpy as np
import pandas as pd
import json
from tqdm import tqdm
import spacy

In [44]:
!pip install transformers



In [45]:
import urllib
import urllib.request

import requests

In [46]:
def load_data(train_df):
    """Flatten a SQuAD-style frame into parallel context/question/answer lists.

    One entry is emitted per answer: a question with several answers is
    repeated once for each of them, and a question whose ``answers`` list is
    empty contributes nothing.

    Args:
        train_df: Object whose ``'data'`` column holds one record per topic,
            each record having a ``'paragraphs'`` list of dicts with
            ``'context'`` and ``'qas'`` keys.

    Returns:
        A tuple ``(contexts, questions, answers)`` of equal-length lists,
        where ``answers`` holds the answer dicts exactly as found in the input.
    """
    contexts, questions, answers = [], [], []
    n_topics = train_df['data'].shape[0]
    for row in range(n_topics):
        paragraphs = train_df['data'].iloc[row]['paragraphs']
        for paragraph in paragraphs:
            passage = paragraph['context']
            for qa in paragraph['qas']:
                prompt = qa['question']
                # Repeat the passage and question once per gold answer so the
                # three lists stay index-aligned.
                for gold in qa['answers']:
                    contexts.append(passage)
                    questions.append(prompt)
                    answers.append(gold)

    return contexts, questions, answers

In [47]:
# Download the SQuAD v2.0 training split. The context manager closes the HTTP
# response once pandas has finished parsing it, instead of leaving it open.
with urllib.request.urlopen("https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json") as train_response:
    train_raw = pd.read_json(train_response)

In [48]:
# Download the SQuAD v2.0 dev split. The context manager closes the HTTP
# response once pandas has finished parsing it, instead of leaving it open.
with urllib.request.urlopen("https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json") as val_response:
    val_raw = pd.read_json(val_response)

In [49]:
train_contexts, train_questions, train_answers = load_data(train_raw)
val_contexts, val_questions, val_answers = load_data(val_raw)

In [50]:
len(train_questions)

86821

In [51]:
len(train_answers)

86821

In [52]:
len(val_questions)

20302

In [53]:
def add_end_idx(answers,contexts):
  """Add a character-level ``answer_end`` to every answer dict, in place.

  The recorded ``answer_start`` is sometimes off by a character or two, so the
  answer text is looked for at the recorded offset and then one and two
  characters to its left. On a match, ``answer_start``/``answer_end`` are set
  to the matching span. If nothing matches, the recorded start is kept and
  ``answer_end`` is still set (start + len(text)), so later code can rely on
  the key being present.

  Args:
    answers: List of dicts with ``'text'`` and ``'answer_start'`` keys;
      mutated in place.
    contexts: List of context strings, index-aligned with ``answers``.

  Returns:
    None.
  """
  for answer, context in zip(answers, contexts):
    gold_text = answer['text']
    start_idx = answer['answer_start']
    end_idx = start_idx + len(gold_text)

    # First left-shift (0, 1 or 2 chars) at which the context really contains
    # the answer text. Negative starts are skipped so the slice never wraps
    # around the end of the string. Falls back to 0 (offsets left as recorded).
    shift = next(
      (s for s in (0, 1, 2)
       if start_idx - s >= 0
       and context[start_idx - s:end_idx - s] == gold_text),
      0,
    )
    answer['answer_start'] = start_idx - shift
    answer['answer_end'] = end_idx - shift

In [54]:
add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

In [55]:
from transformers import BertTokenizer, BertForQuestionAnswering, BertTokenizerFast
import torch

In [56]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [57]:
small_train_contexts = train_contexts[0:100]
small_train_questions = train_questions[0:100]
small_train_answers = train_answers[0:100]

In [58]:
small_val_contexts = val_contexts[0:100]
small_val_questions = val_questions[0:100]
small_val_answers = val_answers[0:100]

In [59]:
# Tokenize (context, question) pairs with truncation and padding enabled.
# The context is passed first; add_token_positions later maps the answers'
# character offsets (which index into the context) onto these encodings.
small_train_encodings = tokenizer(small_train_contexts, small_train_questions, truncation=True, padding=True)
small_val_encodings = tokenizer(small_val_contexts, small_val_questions, truncation=True, padding=True)

In [60]:
# SQuAD stores answer spans as character offsets; convert them to token positions.
def add_token_positions(encodings, answers):
  """Attach token-level ``start_positions``/``end_positions`` to ``encodings``.

  For answer ``i`` the start character and the last answer character
  (``answer_end - 1``) are mapped to token indices with
  ``encodings.char_to_token(i, ...)``. Whenever that lookup returns ``None``
  (presumably because the answer was truncated away -- confirm against the
  tokenizer documentation) the position is replaced by the module-level
  ``tokenizer.model_max_length``.

  Args:
    encodings: Tokenizer output providing ``char_to_token`` and ``update``;
      mutated in place.
    answers: List of dicts with ``'answer_start'`` and normally
      ``'answer_end'``. If ``'answer_end'`` is missing it is derived as
      ``answer_start + len(text)``.

  Returns:
    None.
  """
  start_positions = []
  end_positions = []
  for i, answer in enumerate(answers):
    start_char = answer['answer_start']
    if 'answer_end' in answer:
      end_char = answer['answer_end']
    else:
      # Tolerate answers whose end offset was never filled in.
      end_char = start_char + len(answer['text'])

    start_token = encodings.char_to_token(i, start_char)
    end_token = encodings.char_to_token(i, end_char - 1)
    if start_token is None:
      start_token = tokenizer.model_max_length
    if end_token is None:
      end_token = tokenizer.model_max_length
    start_positions.append(start_token)
    end_positions.append(end_token)

  # Update once, after the loop, so an empty answer list still yields the two
  # (empty) keys and the encodings are not rewritten on every iteration.
  encodings.update({'start_positions':start_positions,'end_positions':end_positions})

In [61]:
add_token_positions(small_train_encodings,small_train_answers)
add_token_positions(small_val_encodings,small_val_answers)

In [62]:
class SquadDataset(torch.utils.data.Dataset):
  """Dataset that serves tokenizer encodings as per-example dicts of tensors."""

  def __init__(self,encodings):
    # Only a reference is kept; nothing is copied or converted up front.
    self.encodings = encodings

  def __len__(self):
    """Number of examples, taken from the length of ``encodings.input_ids``."""
    return len(self.encodings.input_ids)

  def __getitem__(self,idx):
    """Return ``{field: tensor}`` for example ``idx`` over every encoding field."""
    example = {}
    for field, column in self.encodings.items():
      example[field] = torch.tensor(column[idx])
    return example

In [63]:
small_train_dataset = SquadDataset(small_train_encodings)
small_val_dataset = SquadDataset(small_val_encodings)

In [64]:
small_train_encodings.items()

dict_items([('input_ids', [[101, 20773, 21025, 19358, 22815, 1011, 5708, 1006, 1013, 12170, 23432, 29715, 3501, 29678, 12325, 29685, 1013, 10506, 1011, 10930, 2078, 1011, 2360, 1007, 1006, 2141, 2244, 1018, 1010, 3261, 1007, 2003, 2019, 2137, 3220, 1010, 6009, 1010, 2501, 3135, 1998, 3883, 1012, 2141, 1998, 2992, 1999, 5395, 1010, 3146, 1010, 2016, 2864, 1999, 2536, 4823, 1998, 5613, 6479, 2004, 1037, 2775, 1010, 1998, 3123, 2000, 4476, 1999, 1996, 2397, 4134, 2004, 2599, 3220, 1997, 1054, 1004, 1038, 2611, 1011, 2177, 10461, 1005, 1055, 2775, 1012, 3266, 2011, 2014, 2269, 1010, 25436, 22815, 1010, 1996, 2177, 2150, 2028, 1997, 1996, 2088, 1005, 1055, 2190, 1011, 4855, 2611, 2967, 1997, 2035, 2051, 1012, 2037, 14221, 2387, 1996, 2713, 1997, 20773, 1005, 1055, 2834, 2201, 1010, 20754, 1999, 2293, 1006, 2494, 1007, 1010, 2029, 2511, 2014, 2004, 1037, 3948, 3063, 4969, 1010, 3687, 2274, 8922, 2982, 1998, 2956, 1996, 4908, 2980, 2531, 2193, 1011, 2028, 3895, 1000, 4689, 1999, 2293, 1000, 1

In [65]:
from torch.utils.data import DataLoader
from transformers import AdamW

In [66]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [67]:
model = BertForQuestionAnswering.from_pretrained("bert-base-uncased",return_dict=True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased a

In [68]:
model.to(device)
model.train()

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_

In [69]:
train_loader = DataLoader(small_train_dataset,batch_size=10,shuffle=True)

In [70]:
optim = AdamW(model.parameters(),lr=5e-5)

In [71]:
# Training process: 3 passes over train_loader, one optimizer step per batch.
for epoch in range(3):
  for batch in train_loader:
    # Reset gradients before computing this batch's backward pass.
    optim.zero_grad()
    # Move the batch tensors to the same `device` the model was moved to.
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    start_positions = batch['start_positions'].to(device)
    end_positions = batch['end_positions'].to(device)
    # The gold start/end token positions are passed in with the inputs; the
    # first element of the returned outputs is then used as the loss.
    outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions,
                    end_positions=end_positions)
    loss = outputs[0]
    loss.backward()
    optim.step()

In [89]:
val_loader = DataLoader(small_val_dataset, batch_size=10, shuffle=False)

In [120]:
ii = small_val_dataset[0]['input_ids']
at = small_val_dataset[0]['attention_mask']

In [128]:
output = model(torch.unsqueeze(ii,0),torch.unsqueeze(at,0))

In [142]:
torch.argmax(output[0][0])

tensor(84)

In [153]:
torch.argmax(output[1][0])

tensor(85)

In [140]:
output[0][0][41]

tensor(1.5695, grad_fn=<SelectBackward>)

In [143]:
output[0][0][84]

tensor(2.6893, grad_fn=<SelectBackward>)

In [165]:
nlp = spacy.blank("en")

In [166]:
def word_tokenize(sent):
    """Split ``sent`` into token strings using the module-level ``nlp`` pipeline.

    Returns:
        List with the ``.text`` of each token, in document order.
    """
    return [tok.text for tok in nlp(sent)]

In [172]:
import collections

In [192]:
def get_F1_score(golden_answer, prediction):
    """Token-overlap F1 between a reference answer and a predicted answer.

    Both strings are split with ``word_tokenize``. The overlap is the multiset
    intersection of the two token lists; precision is overlap / predicted
    tokens and recall is overlap / reference tokens.

    Args:
        golden_answer: Reference answer text.
        prediction: Predicted answer text.

    Returns:
        The F1 score in ``[0, 1]``. Returns 0 when either side has no tokens
        (the no-answer case is ignored at this stage) or when there is no
        token overlap.
    """
    golden_tokens = word_tokenize(golden_answer)
    pred_tokens = word_tokenize(prediction)

    # Ignore the case of no-answer at this stage. The token lists must be
    # tested for emptiness: comparing a list to 0 is always False and let
    # empty input reach the divisions below.
    if not pred_tokens or not golden_tokens:
        return 0

    common = collections.Counter(golden_tokens) & collections.Counter(pred_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        # Precision and recall are both zero; avoid dividing 0 by 0.
        return 0

    precision = num_same / len(pred_tokens)
    recall = num_same / len(golden_tokens)
    return (2 * precision * recall) / (precision + recall)

In [174]:
collections.Counter(a)

Counter({'I': 1, 'love': 1, 'you': 1})

In [193]:
def evaluate(eval_dataset, answers):
    """Score the module-level ``model`` on a dataset by exact match and F1.

    For each example the most likely start and end token are taken from the
    model's first two outputs, the tokens between them are decoded back to
    text with the module-level ``tokenizer``, and the result is compared with
    the reference answer text.

    Inference runs with the model in eval mode and without gradient tracking;
    the model's previous train/eval mode is restored afterwards. Inputs are
    moved to whatever device the model's parameters live on.

    Both strings are stripped and lower-cased before comparison, because the
    tokenizer used in this notebook is uncased. Decoded text can still differ
    from the reference in spacing around punctuation, so exact match remains a
    conservative estimate.

    Args:
        eval_dataset: Indexable dataset whose items are dicts with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        answers: List of answer dicts with a ``'text'`` key, index-aligned
            with ``eval_dataset``.

    Returns:
        Tuple ``(accuracy, f1)``: the exact-match rate and the mean F1 score.
        Both are 0.0 for an empty dataset.
    """
    n = len(eval_dataset)
    if n == 0:
        return 0.0, 0.0

    model_device = next(model.parameters()).device
    was_training = model.training
    model.eval()  # disable dropout so predictions are deterministic

    exact_match = 0
    f1_sum = 0
    try:
        with torch.no_grad():
            for i in range(n):
                example = eval_dataset[i]
                input_ids = example['input_ids']
                attention_mask = example['attention_mask']
                golden_answer = answers[i]['text'].strip().lower()

                output = model(torch.unsqueeze(input_ids, 0).to(model_device),
                               attention_mask=torch.unsqueeze(attention_mask, 0).to(model_device))
                start = int(torch.argmax(output[0][0]))
                end = int(torch.argmax(output[1][0]))

                if end < start:
                    # Inconsistent span: treat it as an empty prediction.
                    prediction = ''
                else:
                    # decode() merges word-pieces back into words, unlike
                    # joining the raw tokens with spaces.
                    span_ids = input_ids[start:end + 1].tolist()
                    prediction = tokenizer.decode(span_ids, skip_special_tokens=True).strip().lower()

                #exact match
                if prediction == golden_answer:
                    exact_match = exact_match + 1

                #F1_score (an empty side contributes 0 without calling the helper)
                if prediction and golden_answer:
                    f1_sum = f1_sum + get_F1_score(golden_answer, prediction)
    finally:
        model.train(was_training)

    accuracy = exact_match/n
    f1 = f1_sum / n
    return accuracy, f1


In [None]:
evaluate(small_val_dataset, small_val_answers)

In [178]:
a = "I love you"
b = "I love"

In [179]:
get_F1_score(a,b)

0.8

In [151]:
tokens = tokenizer.convert_ids_to_tokens(ii)

In [155]:
tokens[84:86]

['charles', 'iii']

In [156]:
answer = ' '.join(tokens[84:85+1])
answer

'charles iii'