In this notebook we train our model: a BERT-like model with attention flow inspired by BiDAF.

Dependencies:

In [2]:
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import collections, time, spacy, copy
from layers.bert_plus_bidaf import BERT_plus_BiDAF
from utils import data_processing
from torch.utils.data import DataLoader

This section loads and preprocesses the data.

Input: SQuAD dataset handler/url/json

Output: processed encodings (`train_encodings`, `val_encodings`) covering the question, context, and answer of each example (train_question, train_context, train_answer).

In [3]:
# Training split: SQuAD v2.0 JSON location, then the project's preprocessing step.
train_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json"
train_encodings = data_processing.data_processing(train_url)

# Validation (dev) split, processed the same way.
val_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json"
val_encodings = data_processing.data_processing(val_url)

Create a smaller dataset for debugging

In [62]:
# Debug subset: truncate every field of the training encodings to its first
# 100 entries. This overwrites `train_encodings` in place.
for key in list(train_encodings.keys()):
    train_encodings[key] = train_encodings[key][:100]

In [67]:
# Debug subset: truncate every field of the validation encodings to its first
# 100 entries. This overwrites `val_encodings` in place.
for key in list(val_encodings.keys()):
    val_encodings[key] = val_encodings[key][:100]

Save/load templates: cache the processed encodings on disk so preprocessing does not have to be re-run.

In [68]:
# Cache the processed encodings on disk.
# NOTE(review): these are absolute, machine-specific paths; they must match
# the paths used by the loading cell.
torch.save(obj=train_encodings, f=r'D:\OneDrive\Courses\ECS289 NLP\train_encodings.pt')
torch.save(obj=val_encodings, f=r'D:\OneDrive\Courses\ECS289 NLP\val_encodings.pt')

In [3]:
# Restore the cached encodings instead of re-running preprocessing.
# torch.load unpickles the file, so only load files you created yourself.
train_encodings = torch.load(f=r'D:\OneDrive\Courses\ECS289 NLP\train_encodings.pt')
val_encodings = torch.load(f=r'D:\OneDrive\Courses\ECS289 NLP\val_encodings.pt')

In [4]:
class SquadDataset(torch.utils.data.Dataset):
  """Map-style dataset over pre-computed encodings.

  `encodings` is any mapping from field name to an indexable sequence with
  one entry per example. It must contain an 'input_ids' field, which
  defines the dataset length.
  """

  def __init__(self, encodings):
    self.encodings = encodings

  def __getitem__(self, idx):
    """Return example `idx` as a dict of tensors, one per encoding field."""
    return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

  def __len__(self):
    # Key access (not attribute access) so that a plain dict works as well
    # as dict-like objects that additionally expose fields as attributes.
    return len(self.encodings['input_ids'])

In [5]:
# Wrap both encoding sets so they can be indexed example by example.
train_dataset, val_dataset = SquadDataset(train_encodings), SquadDataset(val_encodings)

This section constructs the model.

In [6]:
# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


In [7]:
# Instantiate the project model imported from layers.bert_plus_bidaf.
# What `if_extra_modeling=True` toggles is defined in that module.
model = BERT_plus_BiDAF(if_extra_modeling=True)

In [8]:
# Move the model's parameters to `device`. As the last expression of the
# cell, the returned module's repr is displayed below.
model.to(device)

BERT_plus_BiDAF(
  (bert_layer): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
           

This section declares the optimizer. The loss is not declared separately: the model returns it as the first element of its output during training.

In [9]:
# List every parameter tensor that will receive gradient updates.
print("Parameters to learn:")
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print("\t", name)

# A single Adam optimizer over all model parameters.
parameters = model.parameters()
optimizer = optim.Adam(parameters, lr=5e-5)

Parameters to learn:
	 bert_layer.embeddings.word_embeddings.weight
	 bert_layer.embeddings.position_embeddings.weight
	 bert_layer.embeddings.token_type_embeddings.weight
	 bert_layer.embeddings.LayerNorm.weight
	 bert_layer.embeddings.LayerNorm.bias
	 bert_layer.encoder.layer.0.attention.self.query.weight
	 bert_layer.encoder.layer.0.attention.self.query.bias
	 bert_layer.encoder.layer.0.attention.self.key.weight
	 bert_layer.encoder.layer.0.attention.self.key.bias
	 bert_layer.encoder.layer.0.attention.self.value.weight
	 bert_layer.encoder.layer.0.attention.self.value.bias
	 bert_layer.encoder.layer.0.attention.output.dense.weight
	 bert_layer.encoder.layer.0.attention.output.dense.bias
	 bert_layer.encoder.layer.0.attention.output.LayerNorm.weight
	 bert_layer.encoder.layer.0.attention.output.LayerNorm.bias
	 bert_layer.encoder.layer.0.intermediate.dense.weight
	 bert_layer.encoder.layer.0.intermediate.dense.bias
	 bert_layer.encoder.layer.0.output.dense.weight
	 bert_layer.encode

This section defines the prediction, training, and evaluation routines:

In [10]:
def predict(logits_start, logits_end, threshold = 0.1):
    """
    Pick the most probable answer span for every example in a batch.

    Input:
    logits_start, logits_end: torch.tensor() of shape [batch_size, sequence length]
    threshold: margin by which the best span probability must exceed the
        no-answer probability (the joint probability of position (0, 0));
        otherwise the example is predicted as "no answer".

    Returns:
    (start, end): two integer tensors of shape [batch_size] holding the
    indices i <= j that maximize softmax(logits_start)[i] * softmax(logits_end)[j].
    Non-zero indices are shifted by 63; (0, 0) denotes "no answer".
    """
    # compute probability
    p_start = F.softmax(logits_start, dim=-1)
    p_end = F.softmax(logits_end, dim=-1)
    # compute joint probability: batched outer product, with the lower
    # triangle zeroed so that only spans with start <= end remain
    p_joint = torch.triu(torch.bmm(p_start.unsqueeze(dim=2), p_end.unsqueeze(dim=1)))
    # get the batchwise indices: best value in each row (start candidate)
    # and in each column (end candidate)
    max_row, _ = torch.max(p_joint, dim=2)
    max_col, _ = torch.max(p_joint, dim=1)
    start = torch.argmax(max_row, dim=-1)
    end = torch.argmax(max_col, dim=-1)
    # check if indices are greater than no answer probability by threshold
    p_na = p_joint[:, 0, 0]
    # torch.max with `dim` returns (values, indices); only the values are needed
    max_prob, _ = torch.max(max_row, dim=-1)
    no_answer = p_na + threshold > max_prob
    start[no_answer] = 0
    end[no_answer] = 0
    # adjust to the encoding structure
    # NOTE(review): the fixed offset 63 presumably matches the layout produced
    # by the preprocessing / model code - confirm against those modules.
    start[start != 0] += 63
    end[end != 0] += 63
    return start, end

In [11]:
def train(model, optimizer, dataloader, num_epochs = 3):
    """
    Run the training loop and return a snapshot of the trained weights.

    Inputs:
    model: a pytorch model; called as
        model(input_ids, attention_mask, start_positions, end_positions)
        and expected to return a sequence whose first element is the loss
    optimizer: an optimizer: e.g. torch.optim.SGD()
    dataloader: a pytorch dataloader (or any iterable) yielding dict batches
        with keys 'input_ids', 'attention_mask', 'start_positions',
        'end_positions'
    num_epochs: number of passes over `dataloader`

    Returns:
    A deep copy of model.state_dict() taken after the last epoch (a state
    dict, not the model object itself).
    """
    # Send batches to wherever the model lives instead of relying on a
    # module-level `device` variable.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    start = time.time()

    for epoch in range(num_epochs):
        print('Epoch {}/{}:'.format(epoch, num_epochs - 1))
        print('-'*10)
        model.train()

        # Sum of the per-batch losses in this epoch
        running_loss = 0.0

        for batch in dataloader:
            # zero the parameter gradients
            optimizer.zero_grad()
            # Send data to the model's device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)
            # Forward computation
            outputs = model(input_ids, attention_mask, start_positions, end_positions)
            loss = outputs[0]
            # Backprop and optimize
            loss.backward()
            optimizer.step()
            # Accumulate a Python float: adding the loss tensor itself would
            # keep a graph-attached tensor alive across iterations.
            running_loss += loss.item()

        epoch_loss = running_loss
        print('Loss: {:.4f}'.format(epoch_loss))

    # Output info after training
    time_elapsed = time.time() - start
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    return copy.deepcopy(model.state_dict())

In [12]:
# Blank English pipeline from spaCy, used here for tokenization.
nlp = spacy.blank("en")
def word_tokenize(sent, nlp=nlp):
    """Split `sent` into a list of token strings.

    `nlp` is the callable that turns the string into an iterable of tokens
    exposing `.text`. It defaults to the module-level blank English pipeline
    (bound when this function is defined), so callers may pass the sentence
    alone.
    """
    return [token.text for token in nlp(sent)]

In [13]:
def compute_f1(a_gold, a_pred):
    """Token-level F1 between a gold answer string and a predicted one.

    Both strings are tokenized with `word_tokenize` using the module-level
    `nlp` pipeline. Returns 1 or 0 (agreement) when either side has no
    tokens, 0 when no tokens overlap, and otherwise the harmonic mean of
    token precision and recall.
    """
    # word_tokenize takes the pipeline as its second argument
    gold_toks = word_tokenize(a_gold, nlp)
    pred_toks = word_tokenize(a_pred, nlp)
    # Multiset intersection: each shared token counts as often as it
    # appears in both answers.
    common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
    num_same = sum(common.values())
    if len(gold_toks) == 0 or len(pred_toks) == 0:
        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
        return int(gold_toks == pred_toks)
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(pred_toks)
    recall = 1.0 * num_same / len(gold_toks)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1

In [14]:
def evaluate(model, eval_dataset, answers, threshold=0.1):
    """Compute exact-match accuracy and mean F1 of `model` on `eval_dataset`.

    Inputs:
    model: a pytorch model; called as model(input_ids, attention_mask) on a
        batch of one example and expected to return a 3-tuple whose last two
        elements are the start and end logits
    eval_dataset: indexable dataset whose items are dicts with
        'input_ids' and 'attention_mask' tensors
    answers: indexable collection where answers[i]['text'] is the gold
        answer string of example i
    threshold: no-answer margin forwarded to predict()

    Returns:
    (accuracy, f1): fraction of exact string matches and the mean
    compute_f1 score; (0.0, 0.0) for an empty dataset.

    NOTE(review): relies on a module-level `tokenizer` exposing
    `convert_ids_to_tokens`, which is not defined in the visible notebook and
    must be created before calling this function.
    NOTE(review): the prediction is the raw tokens joined by spaces, so
    sub-word pieces are presumably not merged back into words - confirm
    before trusting the exact-match number.
    """
    n = len(eval_dataset)
    if n == 0:
        # Nothing to score; avoid dividing by zero below.
        return 0.0, 0.0

    # Run inputs on the device the model lives on.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    exact_match = 0
    f1_sum = 0
    model.eval()
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for i in range(n):
            example = eval_dataset[i]
            input_ids = example['input_ids']
            attention_mask = example['attention_mask']
            golden_answer = answers[i]['text']

            _, start_logits, end_logits = model(
                torch.unsqueeze(input_ids, 0).to(model_device),
                torch.unsqueeze(attention_mask, 0).to(model_device),
            )

            # compute null score and make prediction:
            start, end = predict(start_logits, end_logits, threshold)
            # Batch of one: turn the index tensors into plain ints for slicing.
            start, end = int(start.item()), int(end.item())
            if start == 0 and end == 0:
                prediction = ""
            else:
                tokens = tokenizer.convert_ids_to_tokens(input_ids.tolist())
                prediction = ' '.join(tokens[start:end+1])

            # exact match
            if prediction == golden_answer:
                exact_match = exact_match + 1
            # F1 score
            f1_sum = f1_sum + compute_f1(golden_answer, prediction)
    accuracy = exact_match / n
    f1 = f1_sum / n
    return accuracy, f1

The remaining cells run experiments:

In [25]:
# Shuffled mini-batches of two examples drawn from the training set.
dataloader = DataLoader(dataset=train_dataset, batch_size=2, shuffle=True)

In [27]:
# Train for 30 epochs.
# NOTE(review): train() returns copy.deepcopy(model.state_dict()), so despite
# its name `trained_model` holds a state dict, not a model object.
trained_model = train(model, optimizer, dataloader, num_epochs=30)

Epoch 0/29:
----------
Loss: 299.2440
Epoch 1/29:
----------
Loss: 304.1715
Epoch 2/29:
----------
Loss: 305.5480
Epoch 3/29:
----------


RuntimeError: CUDA error: CUBLAS_STATUS_INTERNAL_ERROR when calling `cublasSgemm( handle, opa, opb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc)`
Exception raised from gemm at ..\aten\src\ATen\cuda\CUDABlas.cpp:165 (most recent call first):
00007FF8577075A200007FF857707540 c10.dll!c10::Error::Error [<unknown file> @ <unknown line number>]
00007FFFE110384600007FFFE1102810 torch_cuda.dll!at::native::sparse_mask_cuda [<unknown file> @ <unknown line number>]
00007FFFE060C89700007FFFE060B790 torch_cuda.dll!at::native::lerp_cuda_tensor_out [<unknown file> @ <unknown line number>]
00007FFFE060E2D200007FFFE060DD60 torch_cuda.dll!at::native::addmm_out_cuda [<unknown file> @ <unknown line number>]
00007FFFE060F44300007FFFE060F360 torch_cuda.dll!at::native::mm_cuda [<unknown file> @ <unknown line number>]
00007FFFE1171E6F00007FFFE110E400 torch_cuda.dll!at::native::set_storage_cuda_ [<unknown file> @ <unknown line number>]
00007FFFE1161E8200007FFFE110E400 torch_cuda.dll!at::native::set_storage_cuda_ [<unknown file> @ <unknown line number>]
00007FF81B40D94900007FF81B408FA0 torch_cpu.dll!at::bucketize_out [<unknown file> @ <unknown line number>]
00007FF81B44057700007FF81B440520 torch_cpu.dll!at::mm [<unknown file> @ <unknown line number>]
00007FF81C79EC7900007FF81C6AE010 torch_cpu.dll!torch::autograd::GraphRoot::apply [<unknown file> @ <unknown line number>]
00007FF81AF5715700007FF81AF56290 torch_cpu.dll!at::indexing::TensorIndex::boolean [<unknown file> @ <unknown line number>]
00007FF81B40D94900007FF81B408FA0 torch_cpu.dll!at::bucketize_out [<unknown file> @ <unknown line number>]
00007FF81B52210700007FF81B5220B0 torch_cpu.dll!at::Tensor::mm [<unknown file> @ <unknown line number>]
00007FF81C611F1600007FF81C611B30 torch_cpu.dll!torch::autograd::generated::MmBackward::apply [<unknown file> @ <unknown line number>]
00007FF81C5E7E9100007FF81C5E7B50 torch_cpu.dll!torch::autograd::Node::operator() [<unknown file> @ <unknown line number>]
00007FF81CB4F9BA00007FF81CB4F300 torch_cpu.dll!torch::autograd::Engine::add_thread_pool_task [<unknown file> @ <unknown line number>]
00007FF81CB503AD00007FF81CB4FFD0 torch_cpu.dll!torch::autograd::Engine::evaluate_function [<unknown file> @ <unknown line number>]
00007FF81CB54FE200007FF81CB54CA0 torch_cpu.dll!torch::autograd::Engine::thread_main [<unknown file> @ <unknown line number>]
00007FF81CB54C4100007FF81CB54BC0 torch_cpu.dll!torch::autograd::Engine::thread_init [<unknown file> @ <unknown line number>]
00007FF801AE0A2700007FF801ABA100 torch_python.dll!THPShortStorage_New [<unknown file> @ <unknown line number>]
00007FF81CB4BF1400007FF81CB4B780 torch_cpu.dll!torch::autograd::Engine::get_base_engine [<unknown file> @ <unknown line number>]
00007FF887FFE3FE00007FF887FFE3A0 ucrtbase.dll!o_strcat_s [<unknown file> @ <unknown line number>]
00007FF88A08403400007FF88A084020 KERNEL32.DLL!BaseThreadInitThunk [<unknown file> @ <unknown line number>]
00007FF88BCB369100007FF88BCB3670 ntdll.dll!RtlUserThreadStart [<unknown file> @ <unknown line number>]


In [None]:
# NOTE(review): this call cannot run as written. evaluate() requires an
# `answers` argument that is not passed here, and `trained_model` is the
# state dict returned by train(), while evaluate() calls model.eval() and
# model(...) on its first argument.
em, f1 = evaluate(trained_model, val_dataset, )

In [17]:
# Make `device` the current CUDA device.
# NOTE(review): presumably only valid when `device` is a CUDA device - confirm
# behavior on CPU-only machines.
torch.cuda.set_device(device)

In [24]:
# Debugging aid: GPU memory (in bytes) currently occupied by tensors on `device`.
torch.cuda.memory_allocated(device)

2512487424