In [1]:
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import collections, time, spacy, copy, string, re, json, matplotlib
from layers.bert_plus_bidaf import BERT_plus_BiDAF
from eval_test import evaluate
from utils import data_processing
from torch.utils.data import DataLoader
from transformers import BertTokenizerFast
from training import SquadDataset

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [3]:
val_encodings = torch.load(r'D:\OneDrive\Courses\ECS289 NLP\val_encodings.pt')
val_answer=torch.load(r'D:\OneDrive\Courses\ECS289 NLP\val_answer.pt')
val_dataset = SquadDataset(val_encodings)
nlp = spacy.blank("en")
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [4]:
model = BERT_plus_BiDAF(if_bidirectional=True, if_extra_modeling=True)
model.load_state_dict(torch.load(r'D:\OneDrive\Courses\ECS289 NLP\bert_bidaf_bidirectionalLSTM.pt'))

<All keys matched successfully>

In [5]:
logits = torch.load('pred_logits.pt')

In [6]:
def predict(logits_start, logits_end, threshold = 0.1):
    """
    Input:
    logits_start, logits_end: torch.tensor() of shape [batch_size, sequence length]
    return the index i,j such that i<=j and logits_start[i]+logits[j] is maximized
    """
    # compute probability
    p_start = F.softmax(logits_start, dim=-1)
    p_end = F.softmax(logits_end, dim=-1)
    # compute joint probability
    p_joint = torch.triu(torch.bmm(p_start.unsqueeze(dim=2), p_end.unsqueeze(dim=1)))
    # get the batchwise indices
    max_row, _ = torch.max(p_joint, dim=2)
    max_col, _ = torch.max(p_joint, dim=1)
    start = torch.argmax(max_row, dim=-1)
    end = torch.argmax(max_col, dim=-1)
    # check if indices are greater than no answer probability by threshold
    p_na = p_joint[:,0,0]
    max_prob,_ = torch.max(max_row,dim=-1)
    start[p_na + threshold > max_prob] = 0
    end[p_na + threshold > max_prob] = 0
    return start, end
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
        return re.sub(regex, ' ', text)
    def white_space_fix(text):
        return ' '.join(text.split())
    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)
    def lower(text):
        return text.lower()
    return white_space_fix(remove_articles(remove_punc(lower(s))))
def compare(logits, eval_dataset, tokenizer, nlp, threshold):
    """ To compare the predictions and answers, run this function"""
    n = len(eval_dataset)
    exact_match = 0
    incorrectIdx = []
    for i in range(n):
        if i%1000==0:
            print('compared {}/{}:'.format(i, n))
        input_ids = eval_dataset[i]['input_ids']
        tokens = tokenizer.convert_ids_to_tokens(input_ids)
        golden_start, golden_end = eval_dataset[i]['start_positions'], eval_dataset[i]['end_positions']
        if golden_start == 0 and golden_end == 0:
            golden_answer = "noanswer"
        else:
            golden_answer = normalize_answer(' '.join(tokens[golden_start:golden_end + 1]))
        
        start_logits, end_logits = logits[i]['start_logits'], logits[i]['end_logits']
        # compute null score and make prediction:
        pred_start, pred_end = predict(torch.unsqueeze(start_logits,dim=0),torch.unsqueeze(end_logits,dim=0), threshold)
        # adjust to the context padding
        pred_start[pred_start!=0] += 62
        pred_end[pred_end!=0] += 62
        if pred_start == 0 or pred_end == 0:
                pred_answer = "noanswer"
        else:
            pred_answer = normalize_answer(' '.join(tokens[pred_start:pred_end + 1]))
        if pred_answer == golden_answer:
            exact_match += 1
        else:
            incorrectIdx.append(i)
    acc = 100 * exact_match / n
    return acc, incorrectIdx

In [7]:
acc, idx = compare(logits, val_dataset, tokenizer, nlp, threshold = 0)

compared 0/11873:
compared 1000/11873:
compared 2000/11873:
compared 3000/11873:
compared 4000/11873:
compared 5000/11873:
compared 6000/11873:
compared 7000/11873:
compared 8000/11873:
compared 9000/11873:
compared 10000/11873:
compared 11000/11873:


In [13]:
num_error = len(idx)

In [87]:
print(acc, num_error)

61.65248884022572 4691


In [18]:
samples = torch.randperm(num_error)[0:50]
print(samples)

tensor([2332, 3764, 4499, 1724, 2693, 2526,  587, 3405, 2259, 2729, 3505, 3081,
        2572, 4233, 4168, 2234, 2765,  182, 4138, 2265, 3192,  823, 1600, 3499,
         556, 2013, 4511, 2521, 4433,  174, 3637,  985, 2774, 3503,  883, 2692,
        1156, 1138,  956, 4471, 4010, 3569, 1373,  913, 1986, 4553, 3073, 4244,
        1293, 4645])


In [23]:
sampled_errors = [idx[i] for i in samples]
print(sampled_errors)

[6003, 9663, 11453, 4350, 6919, 6568, 1450, 8809, 5726, 7000, 9024, 7872, 6685, 10851, 10670, 5668, 7097, 480, 10577, 5737, 8174, 2060, 3984, 9011, 1381, 5172, 11488, 6560, 11321, 454, 9317, 2478, 7129, 9022, 2245, 6918, 2788, 2755, 2412, 11417, 10286, 9156, 3395, 2329, 5061, 11571, 7857, 10872, 3151, 11780]


In [8]:
import pandas as pd
import urllib
val_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json"
response = urllib.request.urlopen(val_url)
raw = pd.read_json(response)
contexts, questions, answers, ids = data_processing.load_data(raw)

In [50]:
errors = []
for error_idx in sampled_errors:
    error_data = {}
    input_ids = val_dataset[error_idx]['input_ids']
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    start_logits, end_logits = logits[error_idx]['start_logits'], logits[error_idx]['end_logits']
    pred_start, pred_end = predict(torch.unsqueeze(start_logits,dim=0),torch.unsqueeze(end_logits,dim=0), 0)
    # adjust to the context padding
    pred_start[pred_start!=0] += 62
    pred_end[pred_end!=0] += 62
    if pred_start == 0 and pred_end == 0:
                pred_answer = "noanswer"
    else:
            pred_answer = normalize_answer(' '.join(tokens[pred_start:pred_end + 1]))
    error_data['context'] = contexts[error_idx]
    error_data['question'] = questions[error_idx]
    error_data['answer'] = val_answer[error_idx]['text']
    error_data['predict answer'] = pred_answer
    error_data['answer span'] = (val_dataset[error_idx]['start_positions'].item(), val_dataset[error_idx]['end_positions'].item())
    error_data['predict answer span'] = (pred_start.item(), pred_end.item())
    errors.append(error_data)

In [51]:
with open('error.json', 'w') as output:
    json.dump(errors, output)

In [179]:
index = 18
print(sampled_errors[index])
for key in errors[index].keys():
    print(key+':', errors[index][key])

10577
context: One key figure in the plans for what would come to be known as American Empire, was a geographer named Isiah Bowman. Bowman was the director of the American Geographical Society in 1914. Three years later in 1917, he was appointed to then President Woodrow Wilson's inquiry in 1917. The inquiry was the idea of President Wilson and the American delegation from the Paris Peace Conference. The point of this inquiry was to build a premise that would allow for U.S authorship of a 'new world' which was to be characterized by geographical order. As a result of his role in the inquiry, Isiah Bowman would come to be known as Wilson's geographer. 
question: What was the premise of Woodrow Wilson's inquiry?
answer: U.S authorship of a 'new world'
predict answer: noanswer
answer span: (153, 162)
predict answer span: (0, 0)
