In this notebook we train our model: a BERT-like model with attention flow inspired by BiDAF.

Dependencies:

In [2]:
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

This part should be data loading and processing.

Input: SQuAD dataset handler/url/json

Output: processed dict/list/whatever: train_question, train_context, train_answer

In [3]:
from utils import data_processing
encodings =  data_processing.data_processing("https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json")

max question length: 62


Test on the encodings

In [4]:
print(encodings.keys())

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'])


In [5]:
input_ids = torch.tensor(encodings['input_ids'], dtype = torch.long)
input_ids.shape

torch.Size([130319, 512])

In [6]:
from transformers import BertTokenizerFast
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [7]:
from pytorch_pretrained_bert import BertModel
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [8]:
model = BertModel.from_pretrained('bert-base-uncased')
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Lin

In [9]:
attention_mask = torch.tensor(encodings['attention_mask'], dtype = torch.long)
print(attention_mask[:, 0:10:1])

tensor([[1, 1, 1,  ..., 1, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]])


In [14]:
dataset = torch.utils.data.TensorDataset(input_ids[0:10:1,:], attention_mask[0:10:1,:])
dataloader = torch.utils.data.DataLoader(dataset, batch_size=1)
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Lin

In [16]:
hidden_states = []
count = 0
for ids, masks in dataloader:
    ids = ids.to(device)
    masks = masks.to(device)
    hidden_state, _ = model(input_ids=ids, token_type_ids = None, attention_mask = masks, output_all_encoded_layers=False)
    print(hidden_state)
    hidden_states.append(hidden_state)
    count += 1
    if count == 5:
        break

tensor([[[-0.5419, -0.9272, -0.8189,  ...,  0.4710,  0.3021, -0.0118],
         [-0.3399, -0.1935, -0.5332,  ...,  0.1112,  0.4785,  0.3461],
         [-0.0205, -0.7843, -0.0974,  ...,  0.4526,  0.6314, -0.1924],
         ...,
         [-0.3781, -0.2497,  0.3438,  ...,  0.0324,  0.3491, -0.6030],
         [-0.4485, -0.1967,  0.4778,  ...,  0.1938,  0.2390, -0.3650],
         [-0.4070, -0.5381,  0.5768,  ...,  0.3203,  0.3056, -0.4724]]],
       device='cuda:0', grad_fn=<AddBackward0>)
tensor([[[-0.5570, -0.8969, -0.7864,  ...,  0.4562,  0.2554, -0.0233],
         [ 0.0074, -0.4338, -0.7164,  ...,  0.4659,  0.2227, -0.2710],
         [-0.3958,  0.1218, -0.2656,  ...,  0.1296, -0.0840,  0.1145],
         ...,
         [-0.4158, -0.2677,  0.3231,  ...,  0.0601,  0.3443, -0.5011],
         [-0.4683, -0.2284,  0.4723,  ...,  0.2165,  0.2381, -0.3401],
         [-0.4227, -0.5225,  0.5778,  ...,  0.3117,  0.3053, -0.4476]]],
       device='cuda:0', grad_fn=<AddBackward0>)
tensor([[[-0.5842, -

In [17]:
hidden_states[0].shape

torch.Size([1, 512, 768])

This part should be model construction.

In [12]:
from layers.bert_plus_bidaf import BERT_plus_BiDAF
""" TODO: implement BERT_plus_BiDAF model class """

SyntaxError: invalid syntax (bert_plus_bidaf.py, line 23)

This part should be declaration of the optimizer and the loss function. 

In [None]:
""" TODO: optimizer, and loss function(not urgent)"""

This part should be the definition of training process:

In [None]:
def train(model, inputs):
""" TODO: determine inputs """

Rest part is for experiments: