## Import

In [1]:
import torch
from tqdm import tqdm_notebook as tqdm
from transformers import BertTokenizer, PreTrainedTokenizer, BertModel, BertForMaskedLM, BertForQuestionAnswering

I0813 07:43:35.758250 140504480094016 file_utils.py:39] PyTorch version 1.2.0+cu92 available.


In [2]:
import pandas as pd
import numpy as np

In [3]:
from torch.utils.data import DataLoader

In [4]:
from torch.nn import functional as F
from torch.nn.utils.rnn import pad_sequence 
import torchvision
import torch.nn as nn

In [5]:
from transformers.optimization import AdamW
from transformers.optimization import AdamW, get_linear_schedule_with_warmup

In [6]:
device = torch.device("cuda")

In [7]:
dev_data = pd.read_json("FGC_release_1.7.13/FGC_release_all_dev.json")
training_data = pd.read_json("FGC_release_1.7.13/FGC_release_all_train.json")
test_data = pd.read_json("FGC_release_1.7.13/FGC_release_all_test.json")

In [8]:
# Remove all the questions where there's no supporting evidence to it
training_data = training_data[training_data['QUESTIONS'].apply(lambda x: len(x[0]['SHINT_']) > 0)]
dev_data = dev_data[dev_data['QUESTIONS'].apply(lambda x: len(x[0]['SHINT_']) > 0)]
test_data = test_data[test_data['QUESTIONS'].apply(lambda x: len(x[0]['SHINT_']) > 0)]

## Data Preprocessing

In [9]:
#tokenizer = PreTrainedTokenizer(model_max_length = 512, padding_side = 'right', model_input_names = ['token_ids', 'token_type_ids', 'attention_mask'])
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

I0813 07:43:47.067914 140504480094016 tokenization_utils_base.py:1254] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt from cache at /root/.cache/torch/transformers/8a0c070123c1f794c42a29c6904beb7c1b8715741e235bee04aca2c7636fc83f.9b42061518a39ca00b8b52059fd2bede8daa613f8a8671500e518a8c29de8c00


In [12]:
def data_preprocessing(data):
    all_instances = []
    questions = data['QUESTIONS'].apply(lambda x: [x[0]['QTEXT_CN'], len(x[0]['SHINT'][1])]).tolist()
    sentences = [sentence['text'] for sentence_dict in data['SENTS'] for sentence in sentence_dict]
    lengths = np.array(questions)[:, 1].astype(int).tolist()
    cumulative_lengths = np.array(lengths).cumsum()
    indices = data['QUESTIONS'].apply(lambda x: x[0]['SHINT_']).tolist()
    labels = [[0] * length for length in lengths]
    all_labels = []
    # Inpute index into labels
    for i in range(len(labels)):
        np_label = np.array(labels[i])
        np_index = np.array(indices[i])
        np_label[np_index] = 1
        label = np_label.tolist()
        #labels[i] = label
        all_labels = all_labels + label

    counter = 0
    for question in questions:
        question_text = question[0]
        for j in range(counter, counter + question[1]):
            all_instances.append([question_text, sentences[j]])
        counter = counter + question[1]

    all_tokenized = []
    pad_left = False
    pad_right = False
    for i in range(len(all_instances)):
        instance = []
        
        if i == 0:
            pad_left = True
            instance.append(all_instances[i])
            instance.append(all_instances[i+1])
            #print("instance:", all_instances[i], "label:", all_labels[i])
        elif i + 1 in cumulative_lengths:
            pad_right = True
            instance.append(all_instances[i-1])
            instance.append(all_instances[i])
            #print("instance:", all_instances[i], "label:", all_labels[i])

        elif i in cumulative_lengths:
            pad_left = True
            instance.append(all_instances[i])
            instance.append(all_instances[i+1])
            #print("instance:", all_instances[i], "label:", all_labels[i])

        else:
            instance.append(all_instances[i-1])
            instance.append(all_instances[i])
            instance.append(all_instances[i+1])
        """
        for j in range(i - 1, i + 2):
            if j < 0:
                pad_left = True
            elif j + 1 in cumulative_lengths:
                pad_right = True
                instance.append(all_instances[j])
                break
            elif j in cumulative_lengths:
                pad_left = True
                instance.append(all_instances[j])
            elif j >= len(all_instances):
                pad_right = True
            else:
                instance.append(all_instances[j])
        """
        # originally it was [all_instances[i]]
        tokenized = tokenizer(instance, padding='max_length', truncation='longest_first', max_length=300, return_tensors = 'pt')
        if pad_left:
            tokenized['input_ids'] = F.pad(tokenized['input_ids'], (0,0,1,0), 'constant', 0)
            tokenized['token_type_ids'] = F.pad(tokenized['token_type_ids'], (0,0,1,0), 'constant', 0)
            tokenized['attention_mask'] = F.pad(tokenized['attention_mask'], (0,0,1,0), 'constant', 0)
            pad_left = False
        if pad_right:
            tokenized['input_ids'] = F.pad(tokenized['input_ids'], (0,0,0,1), 'constant', 0)
            tokenized['token_type_ids'] = F.pad(tokenized['token_type_ids'], (0,0,0,1), 'constant', 0)
            tokenized['attention_mask'] = F.pad(tokenized['attention_mask'], (0,0,0,1), 'constant', 0)
            pad_right = False
        tokenized['input_ids'] = tokenized['input_ids'].to(device)
        tokenized['input_ids'] = tokenized['input_ids'].squeeze(0)
        tokenized['token_type_ids'] = tokenized['token_type_ids'].to(device)
        tokenized['token_type_ids'] = tokenized['token_type_ids'].squeeze(0)
        tokenized['attention_mask'] = tokenized['attention_mask'].to(device)
        tokenized['attention_mask'] = tokenized['attention_mask'].squeeze(0)
        tokenized['label'] = torch.tensor([all_labels[i]])
        tokenized['label'] = tokenized['label'].to(device, dtype=float)
        all_tokenized.append(tokenized)
    return all_tokenized

In [13]:
train_all_instances = data_preprocessing(training_data)
#dev_all_instances = data_preprocessing(dev_data)
#test_all_instances = data_preprocessing(test_data)

In [34]:
dev_all_instances = data_preprocessing(dev_data)

In [14]:
dataloader_train = DataLoader(train_all_instances[:100], shuffle=True, batch_size=2)
#dataloader_dev = DataLoader(dev_all_instances, shuffle=True, batch_size=2)
#dataloader_test = DataLoader(test_all_instances, shuffle=True, batch_size=2)

In [35]:
dataloader_dev = DataLoader(dev_all_instances, batch_size = 2)

In [30]:
%store dataloader_train

Stored 'dataloader_train' (DataLoader)


In [36]:
%store training_data

Stored 'training_data' (DataFrame)


## Baseline Model

In [55]:
class baseline_model(nn.Module):

    def __init__(self):
        
        super(baseline_model, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-chinese')
        self.linear = nn.Linear(768, 1)

    def forward(self, batch):
        # batch['ids'] = (batch_size, sent_len)
        # batch['segment_ids'] = (batch_size, sent_len)
        # batch['mask_ids'] = (batch_size, sent_len)
        # pooler_output = (batch_size, 768)
        # output = (batch_size, 1)
        
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        token_type_ids = batch['token_type_ids']
        
        hidden_state, pooler_output = self.bert(input_ids = input_ids, 
                                                attention_mask = attention_mask,
                                                token_type_ids = token_type_ids)        
        linear_output = self.linear(pooler_output)

        return linear_output

    def loss(self, batch):
        
        loss_fn = nn.BCEWithLogitsLoss()
        output = self.forward(batch)
        target = batch['labels'].float().to(device)
        
        return loss_fn(output, target)
    
    def _predict(self, batch):
        
        output = self.forward(batch)
        scores = torch.sigmoid(output)
        scores = scores.cpu().numpy()[:,0].tolist()
        
        return scores
    
    def predict_fgc(self, batch, threshold=0.5):
        
        scores = self._predict(batch)
        max_i = 0
        max_score = 0
        sp = []
        
        for i, score in enumerate(scores):

            if score > max_score:
                max_i = i
                max_score = score
            if score >= threshold:
                sp.append(i)

        if not sp:
            sp.append(max_i)

        return {'sp': sp, 'sp_scores': scores}

In [56]:
baseline = baseline_model()
baseline.to(device)

I0813 07:52:35.799878 140504480094016 configuration_utils.py:264] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json from cache at /root/.cache/torch/transformers/8a3b1cfe5da58286e12a0f5d7d182b8d6eca88c08e26c332ee3817548cf7e60a.f12a4f986e43d8b328f5b067a641064d67b91597567a06c7b122d1ca7dfd9741
I0813 07:52:35.802062 140504480094016 configuration_utils.py:300] Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_he

baseline_model(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tru

In [57]:
baseline.load_state_dict(torch.load('Models_SEs/model_epoch0_eval_em:0.180_precision:0.732_recall:0.454_f1:0.528_loss:0.246.m'))

<All keys matched successfully>

## Improved Model

In [34]:
class baselineAgg(nn.Module):

    def __init__(self, number_of_sentence, baseline_model):
        
        super(baselineAgg, self).__init__()
        self.number_of_sentence = number_of_sentence
        self.baseline = baseline_model
        self.softmax = nn.Softmax(dim=1)
        self.linearAgg = nn.Linear(1536, 1)
        self.linear = baseline_model.linear#nn.Linear(768, 1)
        
    def forward_nn(self, batch, adjust_weight=False):
        # batch = (batch_size, sentence length)
        # pooler_output = (batch_size, 768)
        # Sent objects into CUDA
        
        batch_size = batch['input_ids'].shape[0]
        max_sentence_length = batch['input_ids'].shape[2]

        input_ids = batch['input_ids'].view(-1, max_sentence_length)
        token_type_ids = batch['token_type_ids'].view(-1, max_sentence_length)
        attention_mask = batch['attention_mask'].view(-1, max_sentence_length)
        hidden_state, pooler_output = self.baseline.bert(input_ids=input_ids,
                                                         attention_mask=attention_mask, 
                                                         token_type_ids=token_type_ids)
        
        # Aggregate
        pooler_output = pooler_output.view(batch_size, -1, 768) # (batch, 3, 768)
        
        
        if adjust_weight:      
            target_pair = pooler_output[:, self.number_of_sentence // 2, :].unsqueeze(1) # (batch, 1, 768)
            target_pair = target_pair.expand(-1, self.number_of_sentence, -1) # (batch, 3, 768)
            concatenated = torch.cat((target_pair, pooler_output), dim=-1) # (batch, 3, 768*2)
            weight = self.linearAgg(concatenated)
            weight = self.softmax(weight)
        else:
            weight = torch.tensor([[0.0], [1.0], [0.0]])
            
        weight = weight.to(device)
        aggregated_sentence = torch.matmul(weight.transpose(0, 1), pooler_output) # (batch, 1, 768)
        aggregated_sentence = aggregated_sentence.squeeze(1) # (batch, 768)
        
        final_output = self.linear(aggregated_sentence) # (batch, 1)
        
        return final_output
    
    def forward(self, batch):
        
        output = self.forward_nn(batch)
        labels = batch['label'].type(torch.float)
        loss_fn = nn.BCEWithLogitsLoss()
        loss = loss_fn(output, labels)
        #print("output:", output)
        #print("labels:", labels)
        print("loss:",  loss)
        

        return loss
    
    def _predict(self, batch):
        
        with torch.no_grad():
            
            output, att_weight = self.forward_nn(batch)
            scores = torch.sigmoid(output)
            scores = scores.cpu().numpy().tolist()
        
        return scores
    
    def predict_fgc(self, batch, threshold=0.5):
        scores = self._predict(batch)
        max_i = 0
        max_score = 0
        sp = []

        for i, score in enumerate(scores[0]):

            if score > max_score:
                max_i = i
                max_score = score
            if score >= threshold:
                sp.append(i)

        # This is to ensure there's no empty supporting evidences
        if not sp:
            sp.append(max_i)
        return {'sp': sp, 'sp_scores': scores}

In [35]:
baselineAgg = baselineAgg(3, baseline)
baselineAgg.to(device)

baselineAgg(
  (baseline): baseline_model(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(21128, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
           

## Test forward_nn

In [32]:
#baselineAgg(a)
c, d = baselineAgg.forward_nn(a)

RuntimeError: Expected object of backend CUDA but got backend CPU for argument #2 'mat2'

In [37]:
test_linear = nn.Linear(768, 1)
test_linear.to(device)

Linear(in_features=768, out_features=1, bias=True)

In [39]:
output = test_linear(torch.matmul(c.transpose(0,1).to(device), d).squeeze(1))

In [51]:
l = torch.tensor([[0], [0]])
l = l.to(device, dtype=torch.float)

In [52]:
loss_fn = nn.BCEWithLogitsLoss()

In [53]:
loss_fn(output, l)

tensor(0.7396, device='cuda:0', grad_fn=<BinaryCrossEntropyWithLogitsBackward>)

## Train

In [49]:
def optim(nn, num_epochs, lr):
    param_optimizer = list(nn.bert.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    num_epochs = num_epochs
    num_train_optimization_steps = len(dataloader_train) * num_epochs
    optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                     num_warmup_steps=int(
                                                         num_train_optimization_steps * 0.1),
                                                     num_training_steps=num_train_optimization_steps)
    return optimizer, scheduler

In [50]:
def _update_sp(metrics, sp_gold, sp_pred):
    tp, fp, fn = 0, 0, 0
        
    for p in sp_pred:
        if p in sp_gold:
            tp += 1
        else:
            fp += 1
    for g in sp_gold:
        if g not in sp_pred:
            fn += 1
            
    precision = 1.0 * tp / (tp + fp) if tp + fp > 0 else 0.0
    recall = 1.0 * tp / (tp + fn) if tp + fn > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
    em = 1.0 if fp + fn == 0 else 0.0
    
    metrics['sp_em'] += em
    metrics['sp_f1'] += f1
    metrics['sp_prec'] += precision
    metrics['sp_recall'] += recall
    
    return precision, recall, f1

In [51]:
def eval_sp_fgc(sp_golds, sp_preds):
    
    metrics = {'sp_em': 0, 'sp_prec': 0, 'sp_recall': 0, 'sp_f1': 0}
    
    assert len(sp_golds) == len(sp_preds)
    
    for sp_gold, sp_pred in zip(sp_golds, sp_preds):
        _update_sp(metrics, sp_gold, sp_pred)
    
    N = len(sp_golds)
    for k in metrics.keys():
        metrics[k] /= N
        metrics[k] = round(metrics[k], 3)
    print(metrics)
    return metrics

In [52]:
def eval(network, dev_batches, current_epoch, sp_golds, avg_loss):
    
    network.eval()
    
    with torch.no_grad():
        sp_preds = []
        for batch in tqdm(dev_batches):
            
            out_dct = network.predict_fgc(batch)
            sp_preds.append(out_dct['sp'])
                
    metrics = eval_sp_fgc(sp_golds, sp_preds)
    print('epoch %d eval_recall: %.3f eval_f1: %.3f loss: %.3f' % (
            current_epoch, metrics['sp_recall'], metrics['sp_f1'], avg_loss))
        
    #torch.save(network.state_dict(), "Models_SEs/model_epoch{0}_eval_em:{1:.3f}_precision:{2:.3f}_recall:{3:.3f}_f1:{4:.3f}_loss:{5:.3f}.m".format(current_epoch, metrics['sp_em'], metrics['sp_prec'], metrics['sp_recall'], metrics['sp_f1'], avg_loss))
    
    return sp_preds, sp_golds

In [53]:
def train(network, data, dev_batches, num_epochs, lr):
    
    loss_fn = nn.BCEWithLogitsLoss()
    optimizer, scheduler = optim(network, num_epochs, lr)
    
    sp_golds = dev_data['QUESTIONS'].apply(lambda x: x[0]['SHINT_']).tolist()
    
    """
    for name, param in network.baseline.named_parameters():
        if 'bert' in name:
            param.requires_grad = False
    """     
    for current_epoch in range(num_epochs):
        network.train()
        running_loss = 0.0
        for batch in tqdm(data):
            current_loss = network(batch)
            current_loss.backward()
            torch.nn.utils.clip_grad_norm_(network.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            running_loss += current_loss.item()
            
        learning_rate_scalar = scheduler.get_lr()[0]
        print('lr = %f' % learning_rate_scalar)
        avg_loss = running_loss/len(data)
        print('epoch %d train_loss: %.3f' % (current_epoch, avg_loss))
        #eval(network, dev_batches, current_epoch, sp_golds, avg_loss)

In [58]:
train(baseline, dataloader_train, [], 20, 2e-5)

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




RuntimeError: The expanded size of the tensor (300) must match the existing size (3) at non-singleton dimension 2.  Target sizes: [2, 3, 300].  Tensor sizes: [1, 3]

## Evaluate

In [38]:
def sent_eval_preprocessing(data, dataset): 
    
    len_array = np.cumsum(np.array(data['SENTS'].apply(lambda x: len(x))))

    dictionary_lists = []
    batches = []
    for i in range(len(dataset)):
        
        instance = dataset.instances[i]
        dictionary_lists.append(instance)
        
        if i in len_array - 1:

            
            padded_ids = pad_sequence([torch.tensor(instance['ids']) for instance in dictionary_lists], batch_first=True)
            padded_ids = padded_ids.to(device)

            #padded_segment_ids = pad_sequence([torch.tensor(instance['segment_ids']) for instance in dictionary_lists], batch_first=True)
            #padded_segment_ids = padded_segment_ids.to(device)

            padded_mask_ids = pad_sequence([torch.tensor(instance['mask_ids']) for instance in dictionary_lists], batch_first=True)
            padded_mask_ids = padded_mask_ids.to(device)
            
            padded_sentence_masks = pad_sequence([torch.tensor(instance['sentence_mask']) for instance in dictionary_lists], batch_first=True)
            padded_sentence_masks = padded_sentence_masks.to(device)
            
            padded_q_ids = pad_sequence([torch.tensor(instance['q_ids']) for instance in dictionary_lists], batch_first=True)
            padded_q_ids = padded_q_ids.to(device)
            
            padded_q_mask_ids = pad_sequence([torch.tensor(instance['q_mask_ids']) for instance in dictionary_lists], batch_first=True)
            padded_q_mask_ids = padded_q_mask_ids.to(device)

            labels = torch.stack([torch.tensor(instance['labels']) for instance in dictionary_lists])
            labels = labels.to(device)

            
            current_dev_batch = {'ids': padded_ids, 'mask_ids': padded_mask_ids, 'sentence_mask': padded_sentence_masks,
                                 'labels': labels, 'q_ids': padded_q_ids, "q_mask_ids": padded_q_mask_ids}

            batches.append(current_dev_batch)
            dictionary_lists = []

    return batches

In [39]:
sent_dev_batches = sent_eval_preprocessing(dev_data, dataloader_dev)

AttributeError: 'DataLoader' object has no attribute 'instances'