# Setup

In [1]:
import os
import re
import json
import string
import numpy as np
from pprint import pprint
from tqdm.notebook import tqdm 

from tensorflow import keras
import torch

from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, BertModel

In [2]:
from utils import progress_bar

In [3]:
import warnings
warnings.filterwarnings(action='ignore')

## Configuration

In [4]:
# Fixed token length of one (context + question) input: shorter inputs are
# zero-padded up to this length and longer ones are skipped during preprocessing.
max_len = 384
# Mini-batch size used by both data loaders.
batch_size = 16 # 1080ti

In [5]:
# Prefer the second GPU when the host has more than one, otherwise fall back to
# the first GPU and finally to the CPU, so the notebook also runs on
# single-GPU machines (where 'cuda:1' is an invalid device ordinal).
if torch.cuda.is_available():
    device = torch.device('cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0')
else:
    device = torch.device('cpu')

# Set-up BERT tokenizer

In [6]:
# Save the slow pretrained tokenizer so that its vocabulary file exists locally
slow_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
save_path = "bert_base_uncased/"
os.makedirs(save_path, exist_ok=True)  # no separate existence check needed
slow_tokenizer.save_pretrained(save_path)

# Load the fast tokenizer from the vocabulary that was just saved; the path is
# derived from save_path so the two locations cannot drift apart.
tokenizer = BertWordPieceTokenizer(os.path.join(save_path, "vocab.txt"), lowercase=True)

# Load the data

In [7]:
# Fetch the SQuAD v1.1 train and dev JSON files through keras.utils.get_file,
# which returns the local path of each file.
train_data_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json"
train_path = keras.utils.get_file("train.json", train_data_url)
eval_data_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json"
eval_path = keras.utils.get_file("eval.json", eval_data_url)

# Data structure

```bash
train.json
├── version
└── data
    ├── data[0]
    │   ├── title
    │   └── paragraphs
    │       ├── paragraphs[0]
    │       │   ├── context
    │       │   └── qas
    │       │       ├── qas[0]
    │       │       │   ├── answers
    │       │       │   │   ├── answer_start
    │       │       │   │   └── text
    │       │       │   ├── id
    │       │       │   └── question
    │       │       │   
    │       │       │   ...
    │       │       │   
    │       │       └── qas[Q]
    │       │       
    │       │   ...    
    │       │       
    │       └── paragraphs[P]
    │      
    │   ...
    │      
    └── data[N]

```

In [8]:
# Parse both dataset files. The encoding is given explicitly because the SQuAD
# JSON files are UTF-8 and the platform default encoding may differ.
with open(train_path, encoding="utf-8") as f:
    raw_train_data = json.load(f)

with open(eval_path, encoding="utf-8") as f:
    raw_eval_data = json.load(f)

In [9]:
# Display the local path where the training file was stored
train_path

'/root/.keras/datasets/train.json'

In [10]:
# Show only the top-level keys of the parsed training file
pprint(raw_train_data, depth=1)

{'data': [...], 'version': '1.1'}


In [11]:
# Each entry of raw_train_data['data'] holds one title and its paragraphs
nb_data = len(raw_train_data['data'])
print('The number of title: ',nb_data)

The number of title:  442


In [12]:
# Total number of paragraphs, summed over every title in the training set
nb_paragraphs = sum(
    len(raw_train_data['data'][title_idx]['paragraphs'])
    for title_idx in range(nb_data)
)

print('The number of paragraphs: ',nb_paragraphs)
print('The average of the number of paragraphs: ',np.around(nb_paragraphs / nb_data, 2))

The number of paragraphs:  18896
The average of the number of paragraphs:  42.75


In [13]:
# Total number of questions, summed over every paragraph of every title
nb_questions = 0
for i in range(nb_data):
    for paragraph in raw_train_data['data'][i]['paragraphs']:
        nb_questions += len(paragraph['qas'])

print('The number of questions: ',nb_questions)
# The ratio below is questions per paragraph, so the label says so.
print('The average number of questions per paragraph: ',np.around(nb_questions / nb_paragraphs, 2))

The number of questions:  87599
The average of the number of paragraphs:  4.64


# Preprocessing the data

In [14]:
class SquadExample:
    """One SQuAD question/answer pair and the model inputs derived from it.

    After ``preprocess()`` either ``skip`` is True (the example could not be
    converted) or the instance carries ``input_ids``, ``token_type_ids``,
    ``attention_mask``, ``start_token_idx``, ``end_token_idx`` and
    ``context_token_to_char``.
    """

    def __init__(self, question, context, start_char_idx, answer_text, all_answers):
        """Store the raw fields; no processing happens here.

        Args:
            question: question text.
            context: paragraph text the answer is taken from.
            start_char_idx: character index in the context where the answer starts.
            answer_text: text of the answer used as training target.
            all_answers: every reference answer text for the question.
        """
        self.question = question
        self.context = context
        self.start_char_idx = start_char_idx
        self.answer_text = answer_text
        self.all_answers = all_answers
        # Set to True by preprocess() when the example cannot be used.
        self.skip = False

    def preprocess(self):
        """Tokenize context and question and locate the answer token span.

        Reads the module-level ``tokenizer`` and ``max_len``. Sets
        ``self.skip = True`` and returns early when the answer span runs past
        the context, when no context token overlaps the answer, or when the
        combined input is longer than ``max_len``.
        """
        start_char_idx = self.start_char_idx

        # Collapse every run of whitespace into a single space.
        # NOTE(review): start_char_idx is applied to the normalized context
        # without adjustment — confirm it still lines up when the raw context
        # contains repeated whitespace.
        context = " ".join(str(self.context).split())
        question = " ".join(str(self.question).split())
        answer = " ".join(str(self.answer_text).split())

        # end_char_idx is exclusive, so an answer that ends exactly at the end
        # of the context (end_char_idx == len(context)) is still valid.
        end_char_idx = start_char_idx + len(answer)
        if end_char_idx > len(context):
            self.skip = True
            return

        # Mark the character indexes in context that are in answer
        is_char_in_ans = [0] * len(context)
        for idx in range(start_char_idx, end_char_idx):
            is_char_in_ans[idx] = 1

        # Tokenize context
        tokenized_context = tokenizer.encode(context)

        # Keep the tokens whose character span overlaps the answer characters
        ans_token_idx = []
        for idx, (start, end) in enumerate(tokenized_context.offsets):
            if sum(is_char_in_ans[start:end]) > 0:
                ans_token_idx.append(idx)

        if len(ans_token_idx) == 0:
            self.skip = True
            return

        # First and last overlapping token delimit the answer span
        start_token_idx = ans_token_idx[0]
        end_token_idx = ans_token_idx[-1]

        # Tokenize question
        tokenized_question = tokenizer.encode(question)

        # Concatenate context and question; the first question id is dropped
        # (presumably the leading special token — TODO confirm).
        input_ids = tokenized_context.ids + tokenized_question.ids[1:]
        token_type_ids = [0] * len(tokenized_context.ids) + [1] * len(
            tokenized_question.ids[1:]
        )
        attention_mask = [1] * len(input_ids)

        # Pad with zeros up to max_len; inputs that would need truncation are skipped
        padding_length = max_len - len(input_ids)
        if padding_length > 0:  # pad
            input_ids = input_ids + ([0] * padding_length)
            attention_mask = attention_mask + ([0] * padding_length)
            token_type_ids = token_type_ids + ([0] * padding_length)
        elif padding_length < 0:  # skip
            self.skip = True
            return

        self.input_ids = input_ids
        self.token_type_ids = token_type_ids
        self.attention_mask = attention_mask
        self.start_token_idx = start_token_idx
        self.end_token_idx = end_token_idx
        # Character (start, end) offsets of every context token
        self.context_token_to_char = tokenized_context.offsets

def create_squad_examples(raw_data, max_paragraphs=2):
    """Build and preprocess a SquadExample for every question in raw_data.

    Args:
        raw_data: parsed SQuAD JSON with a top-level "data" list.
        max_paragraphs: number of leading paragraphs used per title. The
            default of 2 keeps the original sampling for a fast training
            test; pass None to use every paragraph.

    Returns:
        List of SquadExample objects, including those flagged ``skip=True``.
    """
    squad_examples = []
    for item in tqdm(raw_data["data"], desc='DATA'):
        # Slicing with None keeps the whole list
        for para in item["paragraphs"][:max_paragraphs]:
            context = para["context"]
            for qa in para["qas"]:
                question = qa["question"]
                # The first answer is the training target; all answers are
                # kept for exact-match evaluation.
                answer_text = qa["answers"][0]["text"]
                all_answers = [_["text"] for _ in qa["answers"]]
                start_char_idx = qa["answers"][0]["answer_start"]
                squad_eg = SquadExample(
                    question, context, start_char_idx, answer_text, all_answers
                )
                squad_eg.preprocess()
                squad_examples.append(squad_eg)
    return squad_examples


def create_inputs_targets(squad_examples):
    """Stack the per-example fields of all non-skipped examples into arrays.

    Returns:
        (x, y) where x is [input_ids, token_type_ids, attention_mask] and
        y is [start_token_idx, end_token_idx], each a numpy array with one
        entry per usable example.
    """
    feature_keys = ("input_ids", "token_type_ids", "attention_mask")
    target_keys = ("start_token_idx", "end_token_idx")

    # Skipped examples never received the attributes read below
    usable = [example for example in squad_examples if not example.skip]

    stacked = {
        key: np.array([getattr(example, key) for example in usable])
        for key in feature_keys + target_keys
    }

    x = [stacked[key] for key in feature_keys]
    y = [stacked[key] for key in target_keys]
    return x, y


# Build examples and model-ready arrays for the training split.
# NOTE: the printed count is the length of the example list, which still
# contains examples flagged skip=True; x_train / y_train exclude them.
train_squad_examples = create_squad_examples(raw_train_data)
x_train, y_train = create_inputs_targets(train_squad_examples)
print(f"{len(train_squad_examples)} training points created.")

# Same steps for the evaluation split.
eval_squad_examples = create_squad_examples(raw_eval_data)
x_eval, y_eval = create_inputs_targets(eval_squad_examples)
print(f"{len(eval_squad_examples)} evaluation points created.")

HBox(children=(FloatProgress(value=0.0, description='DATA', max=442.0, style=ProgressStyle(description_width='…


4980 training points created.


HBox(children=(FloatProgress(value=0.0, description='DATA', max=48.0, style=ProgressStyle(description_width='i…


627 evaluation points created.


Create the Question-Answering Model using BERT and Functional API

# Create dataloader

In [15]:
class SquadDataset(torch.utils.data.Dataset):
    """Map-style dataset that serves one example as (inputs dict, targets dict)."""

    def __init__(self, x_data, y_data):
        """Unpack the three input sequences and the two target sequences.

        Args:
            x_data: [input_ids, token_type_ids, attention_mask].
            y_data: [start_token_idx, end_token_idx].
        """
        super().__init__()
        input_ids, token_type_ids, attention_mask = x_data
        start_token_idx, end_token_idx = y_data

        self.input_ids_lst = input_ids
        self.token_type_ids_lst = token_type_ids
        self.attention_mask_lst = attention_mask
        self.start_token_idx_lst = start_token_idx
        self.end_token_idx_lst = end_token_idx

    def __len__(self):
        """Number of examples, taken from the input_ids sequence."""
        return len(self.input_ids_lst)

    def __getitem__(self, idx):
        """Return the inputs and targets stored at position idx."""
        features = dict(
            input_ids=self.input_ids_lst[idx],
            token_type_ids=self.token_type_ids_lst[idx],
            attention_mask=self.attention_mask_lst[idx],
        )
        targets = dict(
            start_token_idx=self.start_token_idx_lst[idx],
            end_token_idx=self.end_token_idx_lst[idx],
        )
        return features, targets

In [16]:
# Wrap the preprocessed arrays of each split in a dataset
trainset = SquadDataset(x_data=x_train, y_data=y_train)
validset = SquadDataset(x_data=x_eval, y_data=y_eval)

In [17]:
# Training batches are shuffled. Validation order is kept fixed because
# ExactMatch.evaluate pairs predictions with examples by position.
trainloader = torch.utils.data.DataLoader(trainset,
                                          batch_size=batch_size,
                                          shuffle=True)
validloader = torch.utils.data.DataLoader(validset,
                                          batch_size=batch_size,
                                          shuffle=False)

# Build Model

In [18]:
class QABert(torch.nn.Module):
    """BERT encoder with two bias-free linear heads scoring answer start and end."""

    def __init__(self):
        """Load the pretrained encoder and create the start / end heads."""
        super().__init__()
        # Bert encoder
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        hidden_size = self.bert.config.hidden_size

        # One score per token for the answer start position
        self.linear_start = torch.nn.Linear(hidden_size, 1, bias=False)
        # One score per token for the answer end position
        self.linear_end = torch.nn.Linear(hidden_size, 1, bias=False)

    def forward(self, input_ids, token_type_ids, attention_mask):
        """Return (start_output, end_output) scores for the given inputs.

        Note the parameter order: token_type_ids comes before attention_mask.
        """
        encoder_outputs = self.bert(
            input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
        )
        # First encoder output — presumably the per-token hidden states
        # (batch, seq, hidden); TODO confirm.
        sequence_output = encoder_outputs[0]

        # NOTE(review): squeeze() without a dim removes every size-1 axis, so
        # a batch of one sample would also lose its batch axis — callers
        # compensate for that today.
        start_output = self.linear_start(sequence_output).squeeze()
        end_output = self.linear_end(sequence_output).squeeze()

        return start_output, end_output

In [19]:
# Instantiate the model and move its parameters to the selected device
model = QABert().to(device)

In [20]:
# Sanity check: the PyTorch model should have as many parameters as the
# reference count recorded for the Keras model.
keras_model_nb_params = 109483776
# Sum of the element counts of every parameter tensor
torch_model_nb_params = np.sum([np.prod(param.size()) for param in model.parameters()])
print(f'Keras model params - Pytorch model params')
print(f'{keras_model_nb_params} - {torch_model_nb_params}: ',
      keras_model_nb_params - torch_model_nb_params)

Keras model params - Pytorch model params
109483776 - 109483776:  0


**[Keras] CategoricalCrossEntropy VS SparseCategoricalCrossEntropy**

- 설명: https://ahnjg.tistory.com/88

In [21]:
def normalize_text(text):
    """Lower-case text, strip ASCII punctuation and articles, squeeze whitespace."""
    lowered = text.lower()

    # Drop every character listed in string.punctuation
    without_punct = lowered.translate(str.maketrans("", "", string.punctuation))

    # Replace the standalone words a / an / the with a space
    without_articles = re.sub(r"\b(a|an|the)\b", " ", without_punct, flags=re.UNICODE)

    # Collapse runs of whitespace and trim both ends
    return " ".join(without_articles.split())

class ExactMatch(object):
    """Exact-match scorer for predicted answer spans.

    Holds the non-skipped examples in their original order; predictions
    passed to ``evaluate`` are matched to them by position.
    """

    def __init__(self, squad_examples=None):
        """Keep only the examples that were not skipped during preprocessing.

        Args:
            squad_examples: iterable of examples exposing ``skip``; None is
                treated as an empty collection.
        """
        if squad_examples is None:
            squad_examples = []
        self.squad_examples = [_ for _ in squad_examples if not _.skip]

    def evaluate(self, start_preds, end_preds):
        """Print and return the fraction of predictions that match a reference.

        Args:
            start_preds: predicted start token index per example.
            end_preds: predicted end token index per example.

        Returns:
            The exact-match score in [0, 1]; 0.0 when there are no predictions.
        """
        count = 0

        for idx, (start, end) in enumerate(zip(start_preds, end_preds)):
            squad_eg = self.squad_examples[idx]
            pred_ans, true_ans = self._inference(start=start,
                                                 end=end,
                                                 squad_example=squad_eg)
            # A start index beyond the context tokens yields no answer; it
            # still counts in the denominator below.
            if (pred_ans is None) or (true_ans is None):
                continue

            if pred_ans in true_ans:
                count += 1

        total = len(start_preds)
        acc = count / total if total else 0.0
        print(f"Exact Match Score={acc:.2%}")
        return acc

    def _inference(self, start, end, squad_example):
        """Instance-level alias of the static ``inference`` helper."""
        return self.inference(start, end, squad_example)

    @staticmethod
    def inference(start, end, squad_example):
        """Turn a predicted token span into normalized answer text.

        Args:
            start: predicted start token index.
            end: predicted end token index.
            squad_example: example exposing ``context_token_to_char``,
                ``context`` and ``all_answers``.

        Returns:
            (normalized predicted answer, list of normalized reference
            answers), or (None, None) when ``start`` lies beyond the context
            tokens.
        """
        offsets = squad_example.context_token_to_char

        # if answer start token index larger than offset length, then return None
        if start >= len(offsets):
            return None, None

        pred_char_start = offsets[start][0]

        # NOTE(review): offsets come from the whitespace-normalized context
        # while the slice below uses squad_example.context as stored —
        # confirm both agree for contexts with repeated whitespace.
        if end < len(offsets):
            pred_char_end = offsets[end][1]
            pred_ans = squad_example.context[pred_char_start:pred_char_end]
        else:
            # End index beyond the context tokens: take everything to the end
            pred_ans = squad_example.context[pred_char_start:]

        normalized_pred_ans = normalize_text(pred_ans)
        normalized_true_ans = [normalize_text(_) for _ in squad_example.all_answers]

        return normalized_pred_ans, normalized_true_ans
    
    


In [22]:
def train(model, dataloader, criterion, optimizer, device):
    """Run one training epoch and return the average batch loss.

    Args:
        model: module called with ``input_ids``, ``token_type_ids`` and
            ``attention_mask`` that returns (start scores, end scores).
        dataloader: iterable of (inputs dict, targets dict) batches.
        criterion: loss applied separately to start and end scores.
        optimizer: optimizer stepping the model parameters.
        device: device the batch tensors are moved to.

    Returns:
        Mean of the per-batch losses (0.0 for an empty dataloader).
    """
    total_loss = 0
    nb_batches = len(dataloader)

    model.train()
    for batch_idx, batch_i in enumerate(dataloader):
        # inputs and targets
        input_ids = batch_i[0]['input_ids'].to(device)
        attention_mask = batch_i[0]['attention_mask'].to(device)
        token_type_ids = batch_i[0]['token_type_ids'].to(device)
        start_targets = batch_i[1]['start_token_idx'].to(device)
        end_targets = batch_i[1]['end_token_idx'].to(device)

        # reset optimizer
        optimizer.zero_grad()

        # Keyword arguments: the forward signature is
        # (input_ids, token_type_ids, attention_mask), so passing the mask
        # second by position would swap it with the segment ids.
        start_outputs, end_outputs = model(input_ids=input_ids,
                                           token_type_ids=token_type_ids,
                                           attention_mask=attention_mask)

        # Restore the (batch, seq) shape in case the model squeezed away the
        # batch axis of a single-sample batch.
        start_outputs = start_outputs.reshape(input_ids.size(0), -1)
        end_outputs = end_outputs.reshape(input_ids.size(0), -1)

        # loss
        start_loss = criterion(start_outputs, start_targets)
        end_loss = criterion(end_outputs, end_targets)
        loss = start_loss + end_loss
        loss.backward()

        # update parameters
        optimizer.step()

        total_loss += loss.item()

        # progress message with the running average loss
        progress_bar(current=batch_idx,
                     total=nb_batches,
                     msg='Loss: %.3f' % (total_loss/(batch_idx + 1)),
                     term_width=100,
                     notebook=True)

    return total_loss / nb_batches if nb_batches else 0.0
        

# Validation with Exact Match scoring
def validation(model, dataloader, criterion, device, exactmatch):
    """Evaluate the model on a dataloader and report the exact-match score.

    Args:
        model: module called with ``input_ids``, ``token_type_ids`` and
            ``attention_mask`` that returns (start scores, end scores).
        dataloader: iterable of (inputs dict, targets dict) batches, in the
            same order as the examples held by ``exactmatch``.
        criterion: loss applied separately to start and end scores.
        device: device the batch tensors are moved to.
        exactmatch: object whose ``evaluate(start_preds, end_preds)`` scores
            the collected predictions.

    Returns:
        Mean of the per-batch losses (0.0 for an empty dataloader).
    """
    total_loss = 0
    nb_batches = len(dataloader)

    total_start_preds, total_end_preds = [], []

    model.eval()
    with torch.no_grad():
        for batch_idx, batch_i in enumerate(dataloader):
            # inputs and targets
            input_ids = batch_i[0]['input_ids'].to(device)
            attention_mask = batch_i[0]['attention_mask'].to(device)
            token_type_ids = batch_i[0]['token_type_ids'].to(device)
            start_targets = batch_i[1]['start_token_idx'].to(device)
            end_targets = batch_i[1]['end_token_idx'].to(device)

            # Keyword arguments: the forward signature is
            # (input_ids, token_type_ids, attention_mask), so passing the
            # mask second by position would swap it with the segment ids.
            start_outputs, end_outputs = model(input_ids=input_ids,
                                               token_type_ids=token_type_ids,
                                               attention_mask=attention_mask)

            # Restore the (batch, seq) shape in case the model squeezed away
            # the batch axis of a single-sample batch.
            start_outputs = start_outputs.reshape(input_ids.size(0), -1)
            end_outputs = end_outputs.reshape(input_ids.size(0), -1)

            # Highest-scoring token index per example
            _, start_preds = start_outputs.max(1)
            _, end_preds = end_outputs.max(1)
            total_start_preds.extend(start_preds.cpu().numpy())
            total_end_preds.extend(end_preds.cpu().numpy())

            # loss
            start_loss = criterion(start_outputs, start_targets)
            end_loss = criterion(end_outputs, end_targets)
            loss = start_loss + end_loss

            total_loss += loss.item()

            # progress message with the running average loss
            progress_bar(current=batch_idx,
                         total=nb_batches,
                         msg='Loss: %.3f' % (total_loss/(batch_idx + 1)),
                         term_width=100,
                         notebook=True)

        exactmatch.evaluate(start_preds=total_start_preds,
                            end_preds=total_end_preds)

    return total_loss / nb_batches if nb_batches else 0.0


            
def fit(model, epochs, trainloader, criterion, optimizer, device, validloader=None, eval_squad_examples=None):
    """Train for the given number of epochs, validating after each one.

    Args:
        model: model to optimize.
        epochs: number of passes over ``trainloader``.
        trainloader: batches used for training.
        criterion: loss function handed to train / validation.
        optimizer: optimizer handed to train.
        device: device the batches are moved to.
        validloader: optional validation batches; validation is skipped
            when None.
        eval_squad_examples: examples matching ``validloader``, used for
            exact-match scoring.
    """
    print('Fit start')

    # The scorer depends only on the evaluation examples, so build it once
    # instead of once per epoch.
    eval_exactmatch = None
    if validloader is not None:
        eval_exactmatch = ExactMatch(squad_examples=eval_squad_examples)

    for epoch in range(epochs):
        print(f'\nEpochs: {epoch+1}/{epochs}')
        train(model, trainloader, criterion, optimizer, device)
        if validloader is not None:
            validation(model, validloader, criterion, device, eval_exactmatch)

In [23]:
# Cross-entropy over token positions, applied separately to the start and
# end scores inside train / validation.
criterion = torch.nn.CrossEntropyLoss()
# Adam over every model parameter (encoder and both heads)
optimizer = torch.optim.Adam(model.parameters(), lr=0.00005)

In [24]:
# fit(model=model,
#     epochs=1,
#     trainloader=trainloader,
#     validloader=validloader,
#     criterion=criterion,
#     optimizer=optimizer,
#     device=device,
#     eval_squad_examples=eval_squad_examples)

# Inference Exact Match

In [25]:
# Restore the fine-tuned weights. map_location remaps tensors saved on another
# device (e.g. a different GPU index) onto the device used in this session.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a
# trusted source.
save_info = torch.load('./Text_Extraction_with_BERT/QABert.pth', map_location=device)
model.load_state_dict(save_info['params'])

<All keys matched successfully>

In [26]:
# sample index
sample_idx = 0
# Index with a list so each array keeps a leading batch axis of size 1
sample_input_ids = torch.LongTensor(x_eval[0][[sample_idx]])
sample_token_type_ids = torch.LongTensor(x_eval[1][[sample_idx]])
sample_attention_mask = torch.LongTensor(x_eval[2][[sample_idx]])

# predict start and end index in inference mode (no autograd graph is built)
model.eval()
with torch.no_grad():
    start_logits, end_logits = model(input_ids=sample_input_ids.to(device),
                                     token_type_ids=sample_token_type_ids.to(device),
                                     attention_mask=sample_attention_mask.to(device))

# Flatten before argmax so this works whether or not the model kept the batch
# axis for a single sample; plain ints are what ExactMatch.inference indexes with.
start = int(start_logits.reshape(-1).argmax())
end = int(end_logits.reshape(-1).argmax())

# inference predict answer
# Skipped examples are absent from x_eval, so filter them to keep indices aligned
eval_examples_no_skip = [_ for _ in eval_squad_examples if _.skip == False]

pred_ans, true_ans = ExactMatch.inference(start=start,
                                          end=end,
                                          squad_example=eval_examples_no_skip[sample_idx])

print('[Predict Answer]')
print(pred_ans)
print()
print('[True Answer]')
for idx, ans in enumerate(true_ans):
    print(f'- Asnwer{idx}: {ans}')

[Predict Answer]
50 was american football game to determine champion of national football league nfl for 2015 season american football conference afc champion denver broncos defeated national football conference nfc champion carolina panthers 24–10 to earn their third super bowl title game was played on february 7 2016 at levis stadium in san francisco bay area at santa clara california as this was 50th super bowl league emphasized golden anniversary

[True Answer]
- Asnwer0: denver broncos
- Asnwer1: denver broncos
- Asnwer2: denver broncos


In [26]:
# NOTE(review): this cell repeats the previous inference cell with the same
# sample index; consider changing sample_idx or removing the duplicate.
# sample index
sample_idx = 0
# Index with a list so each array keeps a leading batch axis of size 1
sample_input_ids = torch.LongTensor(x_eval[0][[sample_idx]])
sample_token_type_ids = torch.LongTensor(x_eval[1][[sample_idx]])
sample_attention_mask = torch.LongTensor(x_eval[2][[sample_idx]])

# predict start and end index in inference mode (no autograd graph is built)
model.eval()
with torch.no_grad():
    start_logits, end_logits = model(input_ids=sample_input_ids.to(device),
                                     token_type_ids=sample_token_type_ids.to(device),
                                     attention_mask=sample_attention_mask.to(device))

# Flatten before argmax so this works whether or not the model kept the batch
# axis for a single sample; plain ints are what ExactMatch.inference indexes with.
start = int(start_logits.reshape(-1).argmax())
end = int(end_logits.reshape(-1).argmax())

# inference predict answer
# Skipped examples are absent from x_eval, so filter them to keep indices aligned
eval_examples_no_skip = [_ for _ in eval_squad_examples if _.skip == False]

pred_ans, true_ans = ExactMatch.inference(start=start,
                                          end=end,
                                          squad_example=eval_examples_no_skip[sample_idx])

print('[Predict Answer]')
print(pred_ans)
print()
print('[True Answer]')
for idx, ans in enumerate(true_ans):
    print(f'- Asnwer{idx}: {ans}')

[Predict Answer]
50 was american football game to determine champion of national football league nfl for 2015 season american football conference afc champion denver broncos defeated national football conference nfc champion carolina panthers 24–10 to earn their third super bowl title game was played on february 7 2016 at levis stadium in san francisco bay area at santa clara california as this was 50th super bowl league emphasized golden anniversary

[True Answer]
- Asnwer0: denver broncos
- Asnwer1: denver broncos
- Asnwer2: denver broncos
