In this notebook we train our model: a BERT-like model with attention flow inspired by BiDAF.

Dependencies:

In [1]:
import copy
import time

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

This part should be data loading and processing.

Input: URLs of the SQuAD v2.0 train and dev JSON files.

Output: a dict of encodings (`input_ids`, `token_type_ids`, `attention_mask`, `start_positions`, `end_positions`), split by row into `train_encodings` and `val_encodings`.

In [2]:
from utils import data_processing

# Row index at which the combined encodings are cut: rows [0, NUM_TRAIN_DATA)
# go to the training split and the remaining rows to the validation split.
# NOTE(review): this presumes data_processing returns the train examples
# first, followed by the dev examples -- confirm against utils.data_processing.
NUM_TRAIN_DATA = 130319

# SQuAD v2.0 train / dev JSON files.
train_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json"
val_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json"

encodings = data_processing.data_processing(train_url, val_url)

# Apply the same row split to every field of the encodings.
train_encodings = {
    key: values[:NUM_TRAIN_DATA, :] for key, values in encodings.items()
}
val_encodings = {
    key: values[NUM_TRAIN_DATA:, :] for key, values in encodings.items()
}

max question length: 62
max question length: 39


Test on the encodings

In [4]:
# Show which fields the processed encodings contain.
print(encodings.keys())

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'])


In [18]:
# Token ids of the training split as an int64 tensor; the trailing expression
# displays its shape as the cell output.
input_ids = torch.tensor(train_encodings['input_ids'], dtype = torch.long)
input_ids.shape

torch.Size([130319, 512])

In [19]:
# Build the mask from the training split (not the full `encodings`) so that it
# has exactly one row per row of `input_ids`; TensorDataset requires all of its
# tensors to share the same first dimension.
attention_mask = torch.tensor(train_encodings['attention_mask'], dtype = torch.long)
# Sanity check: mask value at the first token position of every example.
print(attention_mask[:, 0])

tensor([1, 1, 1,  ..., 1, 1, 1])


In [21]:
# Pair each row of input_ids with its attention mask and serve them in
# mini-batches of 16.
# NOTE(review): this loader yields (input_ids, attention_mask) with no labels
# and is a single DataLoader, whereas train() later in this notebook indexes
# its `dataloader` argument by 'Train'/'Val' and unpacks (inputs, labels).
dataset = torch.utils.data.TensorDataset(input_ids, attention_mask)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=16)

This part should be model construction.

In [35]:
# Use the first CUDA GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


In [36]:
# Project-local model class, instantiated with its default constructor
# arguments.
from layers.bert_plus_bidaf import BERT_plus_BiDAF
model = BERT_plus_BiDAF()

In [37]:
# Move the model to the selected device. As the cell's last expression, the
# call's return value is displayed, which is why the module tree is printed.
model.to(device)

BERT_plus_BiDAF(
  (bert_layer): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
           

This part declares the optimizer and the loss function.

In [38]:
# Fine-tuning step size. Adam's default (1e-3) is far larger than what is
# normally used to fine-tune BERT; 5e-5 matches the `learning_rate` default
# advertised by train() in this notebook.
LEARNING_RATE = 5e-5

print("Parameters to learn:")
for name, param in model.named_parameters():
    if param.requires_grad:
        print("\t", name)

# Materialise the trainable parameters as a list: model.parameters() is a
# one-shot generator, so after being handed to the optimizer it would be empty
# for any later use. Filtering on requires_grad keeps the optimizer consistent
# with the list printed above.
parameters = [param for param in model.parameters() if param.requires_grad]
optimizer = optim.Adam(parameters, lr=LEARNING_RATE)

Parameters to learn:
	 bert_layer.embeddings.word_embeddings.weight
	 bert_layer.embeddings.position_embeddings.weight
	 bert_layer.embeddings.token_type_embeddings.weight
	 bert_layer.embeddings.LayerNorm.weight
	 bert_layer.embeddings.LayerNorm.bias
	 bert_layer.encoder.layer.0.attention.self.query.weight
	 bert_layer.encoder.layer.0.attention.self.query.bias
	 bert_layer.encoder.layer.0.attention.self.key.weight
	 bert_layer.encoder.layer.0.attention.self.key.bias
	 bert_layer.encoder.layer.0.attention.self.value.weight
	 bert_layer.encoder.layer.0.attention.self.value.bias
	 bert_layer.encoder.layer.0.attention.output.dense.weight
	 bert_layer.encoder.layer.0.attention.output.dense.bias
	 bert_layer.encoder.layer.0.attention.output.LayerNorm.weight
	 bert_layer.encoder.layer.0.attention.output.LayerNorm.bias
	 bert_layer.encoder.layer.0.intermediate.dense.weight
	 bert_layer.encoder.layer.0.intermediate.dense.bias
	 bert_layer.encoder.layer.0.output.dense.weight
	 bert_layer.encode

In [39]:
# Cross-entropy criterion. Presumably this is what gets passed as the `loss`
# argument of train(); no call to train() is visible in this section.
loss = nn.CrossEntropyLoss()

This part defines the training process:

In [None]:
def train(model, dataloader, loss, optimizer, num_epochs = 5, learning_rate = 5e-5):
    """
    Train `model`, validate it after every epoch, and keep the best weights.

    Inputs:
    model: a pytorch model; it is called as model(inputs) and its output is
        treated as per-class scores (argmax over dim 1 is the prediction)
    dataloader: a dict with the keys 'Train' and 'Val', each mapping to a
        pytorch dataloader that yields (inputs, labels) batches
    loss: a pytorch criterion, e.g. torch.nn.CrossEntropyLoss()
    optimizer: an optimizer: e.g. torch.optim.SGD()
    num_epochs: number of passes over the 'Train' and 'Val' loaders
    learning_rate: currently unused -- the step size is whatever `optimizer`
        was constructed with. Kept so existing call sites keep working.

    Returns:
    (model, val_acc_history): the same model object with the weights of the
    epoch that reached the highest validation accuracy loaded, and the list
    of per-epoch validation accuracies (0-dim float64 tensors).
    """

    # The criterion arrives as `loss`; give it a distinct local name so the
    # per-batch loss value below never shadows it.
    loss_func = loss

    # Run on whatever device the model already lives on instead of relying on
    # a notebook-level `device` global.
    try:
        device = next(model.parameters()).device
    except StopIteration:
        # Parameter-free model: nothing tells us the device, stay on the CPU.
        device = torch.device("cpu")

    start = time.time()
    val_acc_history = []

    best_model = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}:'.format(epoch, num_epochs - 1))
        print('-'*10)

        # Each epoch we make a training and a validation phase
        for phase in ['Train', 'Val']:
            is_training = phase == 'Train'
            if is_training:
                model.train() # This sets the model in train mode instead of training the model
            else:
                model.eval()  # Similarly this sets the model in evaluate mode

            # Initialize the loss and classification accuracy counters in each epoch
            running_loss = 0.0
            running_corrects = 0

            for inputs, labels in dataloader[phase]:
                # Send data to the model's device
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()
                # Forward computation; gradients are only tracked in the training phase
                with torch.set_grad_enabled(is_training):
                    # Get the model outputs
                    outputs = model(inputs)
                    batch_loss = loss_func(outputs, labels)

                    # Make predictions
                    _, preds = torch.max(outputs, 1)

                    # In training phase, backprop and optimize
                    if is_training:
                        batch_loss.backward()
                        optimizer.step()

                # Compute running loss/accuracy. The criterion returns a batch
                # mean, so weight it by the batch size to get a sample-weighted sum.
                running_loss += batch_loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            num_samples = len(dataloader[phase].dataset)
            epoch_loss = running_loss / num_samples
            epoch_acc = running_corrects.double() / num_samples

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

            # Deepcopy the best model so far
            if phase == 'Val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model = copy.deepcopy(model.state_dict())
            if phase == 'Val':
                val_acc_history.append(epoch_acc)

        print()

    # Output info after training
    time_elapsed = time.time() - start
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # Return the best model
    model.load_state_dict(best_model)
    return model, val_acc_history

In [None]:
def compute_F1():
    """
    Placeholder for the F1-score helper.

    Not implemented yet: it takes no arguments and returns None.
    TODO: implement the actual F1 computation.
    """
    return None

The rest of the notebook is for experiments: