In [1]:
import torch
from tqdm import tqdm_notebook as tqdm

from transformers import BertTokenizer, BertModel, BertForMaskedLM

I0708 06:09:54.228258 139711450031936 file_utils.py:39] PyTorch version 1.1.0 available.


In [2]:
import pandas as pd
import numpy as np

In [3]:
from torch.nn.utils.rnn import pad_sequence 
import torchvision
import torch.nn as nn

In [4]:
from transformers.optimization import AdamW
from transformers.optimization import AdamW, get_linear_schedule_with_warmup

In [5]:
# Load the dev / train / test splits of the FGC 1.7.13 release into DataFrames.
# Later cells read the 'QUESTIONS' and 'SENTS' columns of these frames.
validation_data = pd.read_json("FGC_release_1.7.13/FGC_release_all_dev.json")
training_data = pd.read_json("FGC_release_1.7.13/FGC_release_all_train.json")
test_data = pd.read_json("FGC_release_1.7.13/FGC_release_all_test.json")

In [6]:
# Prefer the GPU when one is visible, but fall back to the CPU so the notebook
# still runs (slowly) on machines without CUDA instead of failing on the first
# `.to(device)` call.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Data Preprocessing

In [7]:
def datapreprocessing(data, return_df=False, tokenizer=None, max_len=512):
    """Turn an FGC split into BERT-ready (question, sentence) pair instances.

    Parameters
    ----------
    data : mapping or DataFrame with 'QUESTIONS' and 'SENTS' entries
        Every 'QUESTIONS' entry is an iterable of dicts providing 'QTEXT_CN'
        (question text) and 'SHINT_' (indices of the supporting sentences).
        Every 'SENTS' entry is an iterable of dicts providing 'text'.
        The i-th question overall is paired with the i-th 'SENTS' entry.
    return_df : bool, default False
        When True, stop before tokenization and return the per-question
        DataFrame (columns: Question, Sentence_List, SE_Index, Label, Length).
    tokenizer : object with ``tokenize`` and ``convert_tokens_to_ids``, optional
        Defaults to ``BertTokenizer.from_pretrained('bert-base-chinese')``.
        Passing one in avoids reloading the vocabulary on every call.
    max_len : int, default 512
        Maximum number of tokens per instance, special tokens included.

    Returns
    -------
    DataFrame if ``return_df`` else a 5-tuple
        ``(All_instances, ids, segment_ids, mask_ids, labels)``, each a list
        with one entry per (question, sentence) pair. ``labels`` entries are
        one-element lists ``[0]`` / ``[1]``.

    Raises
    ------
    ValueError
        If the number of questions differs from the number of 'SENTS' entries.
    IndexError
        If a 'SHINT_' index points outside its paragraph.
    """
    # Collect every question, its supporting-sentence indices and its paragraph.
    questions, hint_indices = [], []
    for question_group in data['QUESTIONS']:
        for question in question_group:
            questions.append(question['QTEXT_CN'])
            hint_indices.append(question['SHINT_'])
    sentence_lists = [[sentence['text'] for sentence in sentences]
                      for sentences in data['SENTS']]

    if len(questions) != len(sentence_lists):
        raise ValueError(
            "datapreprocessing expects one question per paragraph, got "
            "%d questions and %d paragraphs" % (len(questions), len(sentence_lists)))

    # One 0/1 flag per sentence: 1 when the sentence is supporting evidence.
    label_vectors = []
    for sentences, hints in zip(sentence_lists, hint_indices):
        one_hot = [0] * len(sentences)
        for index in hints:
            one_hot[index] = 1
        label_vectors.append(one_hot)

    # 'SE_Index' keeps its historical (label_vector, hint_indices) tuple shape
    # because downstream cells only drop it by name.
    QandA_label = pd.DataFrame({
        'Question': questions,
        'Sentence_List': sentence_lists,
        'SE_Index': list(zip(label_vectors, hint_indices)),
        'Label': label_vectors,
        'Length': [len(sentences) for sentences in sentence_lists],
    })

    if return_df:
        return QandA_label

    if tokenizer is None:
        tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

    # Build "[CLS] question [SEP] sentence [SEP]" for every pair.
    All_instances = []
    for question, sentences in zip(questions, sentence_lists):
        # The question is the same for every sentence of the paragraph, so
        # tokenize it once instead of once per sentence.
        question_tokens = tokenizer.tokenize(question)
        for sentence in sentences:
            instance = (['[CLS]'] + question_tokens + ['[SEP]']
                        + tokenizer.tokenize(sentence) + ['[SEP]'])
            if len(instance) > max_len:
                # Truncate the tail but keep the closing [SEP] so the input
                # still has the sentence-pair layout BERT was trained on.
                instance = instance[:max_len - 1] + ['[SEP]']
            All_instances.append(instance)

    # Segment 0 covers "[CLS] question [SEP]"; segment 1 covers the rest.
    segment_ids = []
    for instance in All_instances:
        question_span = instance.index('[SEP]') + 1
        segment_ids.append([0] * question_span + [1] * (len(instance) - question_span))

    ids = [tokenizer.convert_tokens_to_ids(instance) for instance in All_instances]

    # No padding happens here, so every position is a real token.
    mask_ids = [[1] * len(instance) for instance in All_instances]

    # Flatten the per-paragraph label vectors in the same order as the instances.
    labels = [[flag] for one_hot in label_vectors for flag in one_hot]

    return All_instances, ids, segment_ids, mask_ids, labels

In [8]:
# Tokenize the dev and train splits into flat per-(question, sentence) lists.
# Each call loads the 'bert-base-chinese' tokenizer (see the log lines below).
dev_instances, dev_ids, dev_seg_ids, dev_mask_ids, dev_labels = datapreprocessing(validation_data)
train_instances, train_ids, train_seg_ids, train_mask_ids, train_labels = datapreprocessing(training_data)

I0708 06:09:58.958140 139711450031936 tokenization_utils.py:375] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt from cache at /root/.cache/torch/transformers/8a0c070123c1f794c42a29c6904beb7c1b8715741e235bee04aca2c7636fc83f.9b42061518a39ca00b8b52059fd2bede8daa613f8a8671500e518a8c29de8c00
I0708 06:10:03.892944 139711450031936 tokenization_utils.py:375] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt from cache at /root/.cache/torch/transformers/8a0c070123c1f794c42a29c6904beb7c1b8715741e235bee04aca2c7636fc83f.9b42061518a39ca00b8b52059fd2bede8daa613f8a8671500e518a8c29de8c00


## Loading Data

In [9]:
from torch.utils.data import Dataset

In [10]:
class SentenceDataset(Dataset):
    """In-memory dataset of (question, sentence) pair instances.

    Each item is a dict with the keys "ids", "segment_ids", "mask_ids" and
    "labels", built by zipping the four parallel input sequences.
    """

    _FIELDS = ("ids", "segment_ids", "mask_ids", "labels")

    def __init__(self, ids, segment_ids, mask_ids, labels):
        # zip stops at the shortest input, exactly like the field-by-field loop.
        self.instances = [
            dict(zip(self._FIELDS, row))
            for row in zip(ids, segment_ids, mask_ids, labels)
        ]

    def __len__(self):
        """Number of stored instances."""
        return len(self.instances)

    def __getitem__(self, idx):
        """Return the instance dict stored at position ``idx``."""
        return self.instances[idx]

In [11]:
# One dataset item per (question, sentence) pair of the training split.
train_dataset = SentenceDataset(train_ids, train_seg_ids, train_mask_ids, train_labels)

In [12]:
# One dataset item per (question, sentence) pair of the dev split.
dev_dataset = SentenceDataset(dev_ids, dev_seg_ids, dev_mask_ids, dev_labels)

In [13]:
from torch.utils.data import DataLoader

In [14]:
def collate(batch):
    """Pad a list of instance dicts into one batch of tensors on ``device``.

    ``batch`` is a list of dicts with the keys 'ids', 'segment_ids',
    'mask_ids' and 'labels'. The three token-level fields are right-padded
    with zeros to the longest instance; 'labels' are stacked as-is.
    Returns a dict with the same four keys holding the batched tensors.
    """
    def _padded(field):
        # Right-pad every instance's `field` with zeros and move it to the device.
        rows = [torch.tensor(item[field]) for item in batch]
        return pad_sequence(rows, batch_first=True).to(device)

    ids_tensor = _padded('ids')
    segment_tensor = _padded('segment_ids')
    mask_tensor = _padded('mask_ids')
    label_tensor = torch.stack([torch.tensor(item['labels']) for item in batch]).to(device)

    return {
        'ids': ids_tensor,
        'mask_ids': mask_tensor,
        'segment_ids': segment_tensor,
        'labels': label_tensor,
    }

In [15]:
# Shuffled training batches of 8 (question, sentence) pairs; `collate` pads each
# batch and moves it to `device`.
dataloader_train = DataLoader(train_dataset, batch_size=8, shuffle = True, collate_fn = collate)

In [16]:
def eval_preprocessing(data, dataset):
    """Group a flat pair dataset into one padded batch per question.

    Parameters
    ----------
    data : DataFrame with a 'SENTS' column
        The length of each 'SENTS' entry is the number of consecutive
        instances in ``dataset`` that belong to the same question.
    dataset : object with an ``instances`` list
        Instance dicts in the same order as produced from ``data``.

    Returns
    -------
    list of dict
        One batch (as returned by ``collate``) per question, in data order.
        Instances after the last boundary are not emitted, as before.
    """
    # Index of the last instance of every question. A set gives O(1)
    # membership instead of rebuilding and scanning an array per instance.
    sentences_per_question = np.array(data['SENTS'].apply(len))
    last_indices = set((np.cumsum(sentences_per_question) - 1).tolist())

    batches = []
    pending = []
    for i, instance in enumerate(dataset.instances):
        pending.append(instance)
        if i in last_indices:
            # Same padding / device placement as the training batches.
            batches.append(collate(pending))
            pending = []

    return batches

## Creating Baseline Neural Network

In [17]:
class FGC_Network(nn.Module):
    """Baseline supporting-evidence classifier: BERT pooled output -> 1 logit.

    Every (question, sentence) pair is scored independently; a positive logit
    means the sentence is predicted to be supporting evidence.
    """

    def __init__(self):
        super(FGC_Network, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-chinese')
        # 768 for bert-base-chinese; read it from the config rather than hard-coding.
        self.linear = nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, batch):
        """Return logits of shape (batch_size, 1).

        batch['ids'], batch['segment_ids'] and batch['mask_ids'] are all
        (batch_size, sent_len) tensors.
        """
        # Keyword arguments: the positional order of attention_mask and
        # token_type_ids has differed between library releases.
        bert_outputs = self.bert(input_ids=batch['ids'],
                                 attention_mask=batch['mask_ids'],
                                 token_type_ids=batch['segment_ids'])
        # Index 1 is the pooled [CLS] representation for both tuple and
        # ModelOutput return types.
        pooler_output = bert_outputs[1]
        return self.linear(pooler_output)

    def loss(self, batch):
        """Binary cross-entropy between the logits and batch['labels']."""
        output = self(batch)
        # Follow the logits' device instead of relying on a notebook global.
        target = batch['labels'].float().to(output.device)
        return nn.BCEWithLogitsLoss()(output, target)

    def _predict(self, batch):
        """Return one sigmoid score per instance as a plain Python list."""
        output = self(batch)
        # detach() so this also works when called outside torch.no_grad().
        scores = torch.sigmoid(output).detach().cpu().numpy()[:, 0].tolist()
        return scores

    def predict_fgc(self, batch, threshold=0.5):
        """Select supporting sentences for one question.

        Returns {'sp': indices with score >= threshold, or the single
        best-scoring index when none reaches it, 'sp_scores': all scores}.
        """
        scores = self._predict(batch)

        sp = [i for i, score in enumerate(scores) if score >= threshold]
        if not sp:
            # Always predict at least one sentence. The first maximum wins
            # ties, and index 0 is used for an empty score list.
            sp.append(max(range(len(scores)), key=scores.__getitem__, default=0))

        return {'sp': sp, 'sp_scores': scores}

In [18]:
# Instantiate the baseline model (loads pretrained 'bert-base-chinese') and move it to `device`.
network = FGC_Network()
network.to(device)

I0708 06:10:19.353571 139711450031936 configuration_utils.py:152] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json from cache at /root/.cache/torch/transformers/8a3b1cfe5da58286e12a0f5d7d182b8d6eca88c08e26c332ee3817548cf7e60a.f12a4f986e43d8b328f5b067a641064d67b91597567a06c7b122d1ca7dfd9741
I0708 06:10:19.356597 139711450031936 configuration_utils.py:169] Model config {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pad_token_id": 0,
  

FGC_Network(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm(torch.Size([768]), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm(torch.Size([768]), eps=1e-12, elementwise_affine=True)
        

## Training & Evaluating Data 

In [19]:
def optim(nn, num_epochs, lr, steps_per_epoch=None):
    """Build the AdamW optimizer and linear warmup/decay schedule for a model.

    Parameters
    ----------
    nn : torch module
        The network to optimize. NOTE: inside this function the name shadows
        the ``torch.nn`` alias; it is kept for interface compatibility.
    num_epochs : int
        Number of training epochs the schedule should span.
    lr : float
        Peak learning rate.
    steps_per_epoch : int, optional
        Optimizer steps per epoch. Defaults to ``len(dataloader_train)``.

    Returns
    -------
    (optimizer, scheduler)
        The schedule warms up over the first 10% of all steps, then decays
        linearly to zero.
    """
    # Use ALL parameters of the model. Restricting this to `nn.bert` left the
    # classification head out of the optimizer, so it stayed at its random
    # initialization for the whole run.
    no_decay = ('bias', 'LayerNorm.bias', 'LayerNorm.weight')
    decayed, not_decayed = [], []
    for name, param in nn.named_parameters():
        if any(tag in name for tag in no_decay):
            not_decayed.append(param)
        else:
            decayed.append(param)
    optimizer_grouped_parameters = [
        {'params': decayed, 'weight_decay': 0.01},
        {'params': not_decayed, 'weight_decay': 0.0},
    ]

    if steps_per_epoch is None:
        steps_per_epoch = len(dataloader_train)
    num_train_optimization_steps = steps_per_epoch * num_epochs

    optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(num_train_optimization_steps * 0.1),
        num_training_steps=num_train_optimization_steps)
    return optimizer, scheduler

In [20]:
def _update_sp(metrics, sp_gold, sp_pred):
    tp, fp, fn = 0, 0, 0
        
    for p in sp_pred:
        if p in sp_gold:
            tp += 1
        else:
            fp += 1
    for g in sp_gold:
        if g not in sp_pred:
            fn += 1
            
    precision = 1.0 * tp / (tp + fp) if tp + fp > 0 else 0.0
    recall = 1.0 * tp / (tp + fn) if tp + fn > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
    em = 1.0 if fp + fn == 0 else 0.0
    
    metrics['sp_em'] += em
    metrics['sp_f1'] += f1
    metrics['sp_prec'] += precision
    metrics['sp_recall'] += recall
    
    return precision, recall, f1

In [21]:
def eval_sp_fgc(sp_golds, sp_preds):
    """Average supporting-evidence metrics over all questions.

    ``sp_golds`` and ``sp_preds`` are parallel lists of index lists. Prints
    and returns a dict with 'sp_em', 'sp_prec', 'sp_recall' and 'sp_f1',
    each averaged over the questions and rounded to three decimals.
    """
    assert len(sp_golds) == len(sp_preds)

    totals = dict.fromkeys(('sp_em', 'sp_prec', 'sp_recall', 'sp_f1'), 0)
    for gold, pred in zip(sp_golds, sp_preds):
        _update_sp(totals, gold, pred)

    # Turn the running sums into per-question means.
    question_count = len(sp_golds)
    for name in totals:
        totals[name] = round(totals[name] / question_count, 3)

    print(totals)
    return totals

In [22]:
def eval_fgc_atype(atype_golds, atype_preds):
    """Return the fraction of predicted answer types that equal the gold type.

    Pairs are compared position by position and the count of matches is
    divided by ``len(atype_preds)``. Returns 0.0 when there are no
    predictions instead of dividing by zero.
    """
    # The previous version divided by the undefined name `atypes_preds`
    # and therefore always raised NameError.
    if len(atype_preds) == 0:
        return 0.0

    correct = sum(1 for gold, pred in zip(atype_golds, atype_preds) if pred == gold)
    return correct / len(atype_preds)

In [23]:
def eval(network, dev_batches, current_epoch, sp_golds, avg_loss):
    """Run the network over per-question batches and report SP metrics.

    Puts ``network`` in eval mode, predicts the supporting sentences of every
    batch in ``dev_batches`` without tracking gradients, prints the metrics
    from ``eval_sp_fgc`` plus a one-line epoch summary, and returns
    ``(sp_preds, sp_golds)``. ``current_epoch`` and ``avg_loss`` are only
    used in the printed summary.

    NOTE: this name shadows the built-in ``eval`` for the rest of the notebook.
    """
    network.eval()

    sp_preds = []
    with torch.no_grad():
        for batch in tqdm(dev_batches):
            sp_preds.append(network.predict_fgc(batch)['sp'])

    metrics = eval_sp_fgc(sp_golds, sp_preds)
    summary = 'epoch %d eval_recall: %.3f eval_f1: %.3f loss: %.3f' % (
        current_epoch, metrics['sp_recall'], metrics['sp_f1'], avg_loss)
    print(summary)

    # Checkpointing is currently disabled; re-enable this line to save one model per epoch:
    #torch.save(network.state_dict(), "FGC_release_1.7.13/models_with_scheduler/model_epoch{0}_eval_em:{1:.3f}_precision:{2:.3f}_recall:{3:.3f}_f1:{4:.3f}_loss:{5:.3f}.m".format(current_epoch, metrics['sp_em'], metrics['sp_prec'], metrics['sp_recall'], metrics['sp_f1'], avg_loss))

    return sp_preds, sp_golds

In [24]:
def train(network, num_epochs, lr, dev_batches=None):
    """Fine-tune ``network`` on ``dataloader_train`` and evaluate every epoch.

    Parameters
    ----------
    network : torch module
        Model whose forward pass returns logits for a collated batch.
    num_epochs : int
        Number of passes over ``dataloader_train``.
    lr : float
        Peak learning rate handed to ``optim``.
    dev_batches : list of dict, optional
        Per-question evaluation batches. When omitted they are built from
        ``validation_data`` / ``dev_dataset`` here, so training no longer
        depends on a global that is only defined in a later cell.
    """
    loss_fn = nn.BCEWithLogitsLoss()
    optimizer, scheduler = optim(network, num_epochs, lr)

    sp_golds = validation_data['QUESTIONS'].apply(lambda x: x[0]['SHINT_']).tolist()
    if dev_batches is None:
        dev_batches = eval_preprocessing(validation_data, dev_dataset)

    for current_epoch in range(num_epochs):
        network.train()
        running_loss = 0.0
        for batch in tqdm(dataloader_train):
            optimizer.zero_grad()
            current_output = network(batch)
            current_target = batch['labels'].to(dtype=torch.float, device=device)
            current_loss = loss_fn(current_output, current_target)

            current_loss.backward()
            # Clip the global gradient norm to 1.0 before the update.
            torch.nn.utils.clip_grad_norm_(network.parameters(), 1.0)
            optimizer.step()
            # The schedule is per optimizer step, not per epoch.
            scheduler.step()
            running_loss += current_loss.item()

        # get_last_lr() is the supported accessor where it exists; older
        # PyTorch releases only offer get_lr().
        read_lr = getattr(scheduler, 'get_last_lr', scheduler.get_lr)
        learning_rate_scalar = read_lr()[0]
        print('lr = %f' % learning_rate_scalar)
        avg_loss = running_loss/len(dataloader_train)
        print('epoch %d train_loss: %.3f' % (current_epoch, avg_loss))
        eval(network, dev_batches, current_epoch, sp_golds, avg_loss)

In [25]:
# Training run (20 epochs, lr 2e-5) is left disabled; the next cell loads a saved
# checkpoint instead. Uncomment to retrain from scratch.
#train(network, 20, 0.00002)

In [26]:
trained_network = FGC_Network()
# map_location remaps the stored tensors onto `device`, so the checkpoint also
# loads on a machine whose GPU layout differs from the one it was saved on.
trained_network.load_state_dict(torch.load('FGC_release_1.7.13/models_with_scheduler/model_epoch19_eval_em:0.154_precision:0.599_recall:0.609_f1:0.547_loss:0.001.m', map_location=device))

I0708 06:10:27.493682 139711450031936 configuration_utils.py:152] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json from cache at /root/.cache/torch/transformers/8a3b1cfe5da58286e12a0f5d7d182b8d6eca88c08e26c332ee3817548cf7e60a.f12a4f986e43d8b328f5b067a641064d67b91597567a06c7b122d1ca7dfd9741
I0708 06:10:27.497238 139711450031936 configuration_utils.py:169] Model config {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pad_token_id": 0,
  

IncompatibleKeys(missing_keys=[], unexpected_keys=[])

In [27]:
# Evaluate the loaded checkpoint on the TRAINING split (one batch per question).
sp_golds = training_data['QUESTIONS'].apply(lambda x: x[0]['SHINT_']).tolist()
batches = eval_preprocessing(training_data, train_dataset)

# Use the notebook-wide `device` rather than a hard-coded "cuda" string so the
# model ends up on the same device as the batches built by eval_preprocessing.
trained_network.to(device)
# The epoch (0) and loss (0.001) arguments only affect the printed summary line.
train_pred, train_obs = eval(trained_network, batches, 0, sp_golds, 0.001)


HBox(children=(IntProgress(value=0, max=882), HTML(value='')))


{'sp_em': 0.985, 'sp_prec': 0.986, 'sp_recall': 0.988, 'sp_f1': 0.987}
epoch 0 eval_recall: 0.988 eval_f1: 0.987 loss: 0.001


In [28]:
# Evaluate the same checkpoint on the dev split. `sp_golds` is rebound here to the
# dev gold indices; the 0 / 0.001 arguments only affect the printed summary line.
sp_golds = validation_data['QUESTIONS'].apply(lambda x: x[0]['SHINT_']).tolist()
dev_batches = eval_preprocessing(validation_data, dev_dataset)
dev_preds, dev_obs = eval(trained_network, dev_batches, 0, sp_golds, 0.001)

HBox(children=(IntProgress(value=0, max=247), HTML(value='')))


{'sp_em': 0.154, 'sp_prec': 0.599, 'sp_recall': 0.609, 'sp_f1': 0.547}
epoch 0 eval_recall: 0.609 eval_f1: 0.547 loss: 0.001


## Training Data Error Analysis

In [47]:
# return_df=True: get the per-question frames (question, sentence list, labels)
# without tokenizing, to attach predictions to them below.
validation_data_with_performance = datapreprocessing(validation_data, True)
training_data_with_performance = datapreprocessing(training_data, True)

In [48]:
# Attach predicted and gold sentence indices; `train_obs` is the gold list that
# eval() passes back unchanged.
training_data_with_performance['train_pred'] = train_pred
training_data_with_performance['train_obs'] = train_obs

In [49]:
# Translate the predicted sentence indices of every question back into sentence text.
correct_sp = [
    [paragraph[sentence_index] for sentence_index in predicted]
    for paragraph, predicted in zip(training_data_with_performance['Sentence_List'],
                                    training_data_with_performance['train_pred'])
]
training_data_with_performance['Pred_List'] = correct_sp

# Same lookup for the gold (observed) sentence indices.
correct_sp = [
    [paragraph[sentence_index] for sentence_index in observed]
    for paragraph, observed in zip(training_data_with_performance['Sentence_List'],
                                   training_data_with_performance['train_obs'])
]
training_data_with_performance['Obs_List'] = correct_sp

In [50]:
# Drop the index/label helper columns in place. Re-running this cell raises
# KeyError because the columns are already gone.
training_data_with_performance.drop(['SE_Index', 'Label', 'train_pred', 'train_obs'], axis=1, inplace=True)

In [51]:
# Keep only the questions whose predicted sentence list differs from the gold list.
train_mismatch = training_data_with_performance[training_data_with_performance['Pred_List'] != training_data_with_performance['Obs_List']]
train_mismatch

Unnamed: 0,Question,Sentence_List,Length,Pred_List,Obs_List
20,「阿拉伯之春」运动中，走上街头的民众的诉求为何?,[阿拉伯之春（阿拉伯语：الثورات العربية‎）是西方主流媒体所称的阿拉伯世界的一...,18,[只有突尼西亚成为阿拉伯之春中，],[]
70,第二次签订的北美贸易协定从签署至生效过了几日?,[北美自由贸易协定（英语：North American Free Trade Agreeme...,15,[美国、墨西哥和加拿大就更新北美自由贸易协定达成一致，],[]
156,聊天机器人仰赖哪些方法让回答愈来愈准确?,"[聊天机器人并非最近几年出现的新应用，, 麻省理工学院（MIT）人工智慧实验室早在1966年...",33,[麻省理工学院（MIT）人工智慧实验室早在1966年即研发出名为「Eliza」的机器人，],[]
284,不可再生能源的意义是什么？,"[在自然界中，, 能源可以采取几种不同的形式存在：热，, 电，辐射，化学能等。, 许多这些形...",21,[许多这些形式可以很容易转化为另一种的帮助下，],[]
324,伊甸基金会成立的宗旨为何?,[财团法人伊甸社会福利基金会（英语：Eden Social Welfare Foundati...,27,[因著上帝的呼召及一颗爱身心障碍者的同理心，],[]
370,三大健康照护体系保险制度中，政府涉入程度低的是哪一种？,"[根据政府公权力介入的程度，, 健康照护体系一般可以区分为以下三种不同的体系。, \n社会保...",33,"[\n公医制（政府介入最多）：以英国为代表。, \n自由市场（政府一般不介入）：以2013年...",[\n自由市场（政府一般不介入）：以2013年前的美国为代表。]
371,三大健康照护体系保险制度中，政府涉入程度高的是哪一种？,"[根据政府公权力介入的程度，, 健康照护体系一般可以区分为以下三种不同的体系。, \n社会保...",33,"[\n公医制（政府介入最多）：以英国为代表。, \n自由市场（政府一般不介入）：以2013年...",[\n公医制（政府介入最多）：以英国为代表。]
395,熬夜是否能减低得到癌症的风险?,"[在流行病学家及医学研究者继续探讨癌症的相关生活因素的同时，, 美国医学会所出版的著名医学杂...",22,[皆强烈建议减少或避免动物性食品摄取，],[]
449,高屏地区国庆烟火试放管制时间是从晚上几点开始？,"[国庆焰火在9月24号晚间试放3分钟，, 为确保施放安全、顺利，, 屏东县政府表示24号当天...",22,[屏东县政府表示24号当天屏东河滨公园将管制不开放，],[]
502,为何圣伯多禄大殿只能重建不能整修就好?,"[到了15世纪，, 圣伯多禄大殿结构日益变旧，, 亚维农教廷迁回罗马后，, 莱昂·巴蒂斯塔·...",20,[教宗犹利二世决定重建圣伯多禄大殿，],[]


#20 (Pred:[9], Actual:[])

Question:「阿拉伯之春」运动中，走上街头的民众的诉求为何? <br>
Predicted SP: 只有突尼西亚成为阿拉伯之春中，<br>
Actual: None

Comment: While the predicted SP is incorrect, there is supporting evidence in the paragraph (...要求推翻本国的专制政体的行动). This might be a case of incorrect input (i.e., the gold annotation may be missing).

#70 (Pred:[11], Actual:[])

Question: 第二次签订的北美贸易协定从签署至生效过了几日? <br>
Predicted SP: 美国、墨西哥和加拿大就更新北美自由贸易协定达成一致，<br>
Actual: None

Comment: Same as #20. (美国、加拿大及墨西哥在1992年8月12日签署了关于三国间全面贸易的协议。...，北美自由贸易协议于1994年1月1日正式生效。)

#156 (Pred:[1], Actual:[])

Question: 聊天机器人仰赖哪些方法让回答愈来愈准确? <br>
Predicted SP: 麻省理工学院（MIT）人工智慧实验室早在1966年即研发出名为「Eliza」的机器人， <br>
Actual: None <br>

Comment: Same (聊天机器人的作答准确度要透过程式化的方法改善)

#284 (Pred:[3], Actual:[])

Question: 不可再生能源的意义是什么？ <br>
Predicted SP: 许多这些形式可以很容易转化为另一种的帮助下， <br>
Actual: None <br>

Comment: Same (是无法经过短时间内再生的能源，而且它们的消耗速度远远超过它们再生的速度)

#324 (Pred:[4], Actual:[])

Question: 伊甸基金会成立的宗旨为何? <br>
Predicted SP: 因著上帝的呼召及一颗爱身心障碍者的同理心，<br>
Actual: None <br>

Comment: There is no gold SP in the paragraph, but I think the predicted SP is pretty close to being supporting evidence. This must be a borderline case.

#370 (Pred:[12,25], Actual:[25])

Question: 三大健康照护体系保险制度中，政府涉入程度低的是哪一种？<br>
Predicted SP: 公医制（政府介入最多）：以英国为代表。 AND 自由市场（政府一般不介入）：以2013年前的美国为代表。<br>
Actual: 自由市场（政府一般不介入）：以2013年前的美国为代表。<br>

Comment: I think this is a very reasonable mismatch, as the two candidate sentences are syntactically very similar but drastically different in meaning.

#371 (Pred:[12,25], Actual:[12])

Question: 三大健康照护体系保险制度中，政府涉入程度高的是哪一种？ <br>
Predicted SP: 公医制（政府介入最多）：以英国为代表。 AND 自由市场（政府一般不介入）：以2013年前的美国为代表。<br>
Actual: 公医制（政府介入最多）：以英国为代表。<br>

Comment: Same as #370

#395 (Pred:[10], Actual:[])

Question: 熬夜是否能减低得到癌症的风险? <br>
Predicted SP: 皆强烈建议减少或避免动物性食品摄取， <br>
Actual: None <br>

Comment: Another potential case of incorrect input. In the paragraph I found this sentence (所以防癌守则：...，注重睡眠品质)

#449 (Pred:[2], Actual:[])

Question: 高屏地区国庆烟火试放管制时间是从晚上几点开始？ <br>
Predicted SP: 屏东县政府表示24号当天屏东河滨公园将管制不开放， <br>
Actual: None <br>

Comment: Another similar case. This time I strongly believe this is an incorrect input. 
当晚7时并会进行全面清场 <- This is sufficient to be a supporting evidence

#502 (Pred:[7], Actual:[])

Question: 为何圣伯多禄大殿只能重建不能整修就好? <br>
Predicted SP: 教宗犹利二世决定重建圣伯多禄大殿 <br>
Actual: None 

Comment: Another similar case. (无疑再改动有机会让建筑倒塌)

#630 (Pred:[62], Actual:[])

Question: 毛笔、铅笔、钢笔，这三种笔中哪个笔尖的硬度高？ <br>
Predicted SP: 更进一步看：我们无论使用那一种笔，<br>
Actual: None <br>

Comment: Again, I think this counts as a supporting evidence (钢笔的笔尖用金属制成，弹性大，硬度高)

#731 (Pred:[9], Actual:[])

Question: 为什么古埃及人要把死人做成木乃伊? <br>
Predicted SP: 是做什么用的呢？ <br>
Actual: None <br>

Comment: I think this counts (古埃及人相信：人死后只要把遗体保存好，就可以在另一个世界得到永生。)

#874 (Pred:[22], Actual:[])

Question: 要如何降低肠病毒的传播风险？
Predicted SP: 今(2019)年累计37例肠病毒并发重症病例，
Actual: None

Comment: No doubt, this is supporting evidence (应加强居家环境、教室及游乐设施等的通风、整洁与消毒，并教导学童落实「湿、搓、冲、捧、擦」正确洗手步骤，及生病在家休息等良好卫生观念，)

## Validation Data Error Analysis

In [52]:
# Attach predicted and gold sentence indices for the dev split.
validation_data_with_performance['dev_pred'] = dev_preds
validation_data_with_performance['dev_obs'] = dev_obs

In [53]:
# Translate the predicted sentence indices of every dev question back into sentence text.
correct_sp = [
    [paragraph[sentence_index] for sentence_index in predicted]
    for paragraph, predicted in zip(validation_data_with_performance['Sentence_List'],
                                    validation_data_with_performance['dev_pred'])
]
validation_data_with_performance['Pred_List'] = correct_sp

# Same lookup for the gold (observed) sentence indices.
correct_sp = [
    [paragraph[sentence_index] for sentence_index in observed]
    for paragraph, observed in zip(validation_data_with_performance['Sentence_List'],
                                   validation_data_with_performance['dev_obs'])
]
validation_data_with_performance['Obs_List'] = correct_sp

In [54]:
# Drop the label helper columns in place (not re-runnable: a second run raises KeyError).
validation_data_with_performance.drop(['SE_Index', 'Label'], axis=1, inplace=True)

In [55]:
# Also drop the raw sentence list and index columns, leaving Question, Length,
# Pred_List and Obs_List (in place; not re-runnable).
validation_data_with_performance.drop(['Sentence_List', 'dev_pred', 'dev_obs'], axis=1, inplace=True)

In [431]:
# Keep only the dev questions whose predicted sentence list differs from the gold list.
dev_mismatch = validation_data_with_performance[validation_data_with_performance['Obs_List'] != validation_data_with_performance['Pred_List']]

In [466]:
# Display the mismatching dev questions.
dev_mismatch

Unnamed: 0,Question,Length,Pred_List,Obs_List
0,苏东坡在中国历史上，是哪一个朝代的人？,36,[苏轼（1037年1月8日－1101年8月24日），],"[苏轼（1037年1月8日－1101年8月24日），, 北宋时著名的文学家、政治家、艺术家、医学家。, 号东坡居士、铁冠道人。]"
1,苏东坡是中国哪个省份的人？,36,"[苏轼（1037年1月8日－1101年8月24日），, 眉州眉山（今四川省眉山市）人，]","[苏轼（1037年1月8日－1101年8月24日），, 眉州眉山（今四川省眉山市）人，, 号东坡居士、铁冠道人。]"
2,苏东坡的爸爸叫什么名字?,36,"[苏轼（1037年1月8日－1101年8月24日），, 字子瞻，一字和仲，, 更与父亲苏洵、弟苏辙合称「三苏」，]","[苏轼（1037年1月8日－1101年8月24日），, 号东坡居士、铁冠道人。, 苏轼的散文为唐宋四家（韩愈、柳宗元、欧苏）之末，, 更与父亲苏洵、弟苏辙合称「三苏」，]"
3,苏文忠公指的是谁?,36,[加赐谥号文忠，],"[苏轼（1037年1月8日－1101年8月24日），, 加赐谥号文忠，]"
4,《苏文忠公全集》是由何人编纂？,36,[编有《苏文忠公全集》。],"[宋人王宗稷收其作品，, 编有《苏文忠公全集》。]"
5,韩愈在中国历史上，是哪一个朝代的人？,36,"[苏轼的散文为唐宋四家（韩愈、柳宗元、欧苏）之末，, 与唐代的古文运动发起者韩愈并称为「韩潮苏海」，]",[与唐代的古文运动发起者韩愈并称为「韩潮苏海」，]
6,苏东坡与韩愈是否为好朋友?,36,[与唐代的古文运动发起者韩愈并称为「韩潮苏海」，],"[苏轼（1037年1月8日－1101年8月24日），, 北宋时著名的文学家、政治家、艺术家、医学家。, 号东坡居士、铁冠道人。, 与唐代的古文运动发起者韩愈并称为「韩潮苏海」，]"
7,苏东坡曾担任过哪些职位?,36,"[苏轼（1037年1月8日－1101年8月24日），, 北宋时著名的文学家、政治家、艺术家、医学家。, 号东坡居士、铁冠道人。, 嘉佑二年进士，, 累官至端明殿学士兼翰林学士，, 礼部尚书。南宋理学方炽时，, 与唐代的古文运动发起者韩愈并称为「韩潮苏海」，, 艺术方面，书法名列「苏、黄、米、蔡」北宋四大书法家（宋四家）之首；]","[苏轼（1037年1月8日－1101年8月24日），, 号东坡居士、铁冠道人。, 累官至端明殿学士兼翰林学士，, 礼部尚书。南宋理学方炽时，]"
9,苏东坡与韩愈是否认识?,36,"[苏轼（1037年1月8日－1101年8月24日），, 与唐代的古文运动发起者韩愈并称为「韩潮苏海」，]","[苏轼（1037年1月8日－1101年8月24日），, 北宋时著名的文学家、政治家、艺术家、医学家。, 与唐代的古文运动发起者韩愈并称为「韩潮苏海」，]"
10,苏东坡为何被后人认为是文学艺术史上的通才?,36,"[苏轼（1037年1月8日－1101年8月24日），, 是文学艺术史上的通才，, 苏轼之诗与黄庭坚并称「苏黄」，, 并在题画文学史上占有举足轻重的地位。]",[]


## Improving Baseline Neural Network

In [445]:
class FGC_LSTM_Network(nn.Module):
    """BERT + BiLSTM supporting-evidence classifier.

    BERT token states are run through a bidirectional LSTM; the final hidden
    states of both directions are concatenated and projected to one logit per
    (question, sentence) pair, the output shape ``loss`` and ``_predict``
    expect.

    NOTE(review): the previous forward() returned BERT's pooled output before
    reaching the LSTM, used fixed random initial states sized for batch 8,
    and projected with Linear(768, 1) although a bidirectional LSTM emits
    2 * 768 features. The pooling below (final hidden state of each
    direction over the unpadded tokens) is one reasonable completion of that
    draft; confirm it matches the intended design.
    """

    def __init__(self):
        super(FGC_LSTM_Network, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-chinese')
        hidden_size = self.bert.config.hidden_size  # 768 for bert-base-chinese
        self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True, bidirectional=True)
        # The two directions are concatenated, hence 2 * hidden_size inputs.
        self.linear = nn.Linear(2 * hidden_size, 1)

    def forward(self, batch):
        """Return logits of shape (batch_size, 1).

        batch['ids'], batch['segment_ids'] and batch['mask_ids'] are all
        (batch_size, sent_len) tensors; mask_ids is 1 on real tokens and 0
        on padding.
        """
        bert_outputs = self.bert(input_ids=batch['ids'],
                                 attention_mask=batch['mask_ids'],
                                 token_type_ids=batch['segment_ids'])
        hidden_state = bert_outputs[0]  # (batch_size, sent_len, hidden_size)

        # Pack by true length so padding never reaches the LSTM and the final
        # states do not depend on how much a sequence was padded. Lengths must
        # be an int64 CPU tensor.
        lengths = batch['mask_ids'].sum(dim=1).to(torch.int64).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            hidden_state, lengths, batch_first=True, enforce_sorted=False)

        # Default (zero) initial states work for any batch size.
        _, (hn, _) = self.lstm(packed)

        # hn is (2, batch_size, hidden): final forward state, final backward state.
        pair_representation = torch.cat((hn[0], hn[1]), dim=1)
        return self.linear(pair_representation)

    def loss(self, batch):
        """Binary cross-entropy between the logits and batch['labels']."""
        output = self(batch)
        target = batch['labels'].float().to(output.device)
        return nn.BCEWithLogitsLoss()(output, target)

    def _predict(self, batch):
        """Return one sigmoid score per instance as a plain Python list."""
        output = self(batch)
        # detach() so this also works when called outside torch.no_grad().
        scores = torch.sigmoid(output).detach().cpu().numpy()[:, 0].tolist()
        return scores

    def predict_fgc(self, batch, threshold=0.5):
        """Select supporting sentences for one question.

        Returns {'sp': indices with score >= threshold, or the single
        best-scoring index when none reaches it, 'sp_scores': all scores}.
        """
        scores = self._predict(batch)

        sp = [i for i, score in enumerate(scores) if score >= threshold]
        if not sp:
            # Always predict at least one sentence. The first maximum wins
            # ties, and index 0 is used for an empty score list.
            sp.append(max(range(len(scores)), key=scores.__getitem__, default=0))

        return {'sp': sp, 'sp_scores': scores}

In [446]:
# Instantiate the LSTM variant (loads pretrained 'bert-base-chinese' again).
new_network = FGC_LSTM_Network()

I0709 07:28:50.307493 139711450031936 configuration_utils.py:152] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json from cache at /root/.cache/torch/transformers/8a3b1cfe5da58286e12a0f5d7d182b8d6eca88c08e26c332ee3817548cf7e60a.f12a4f986e43d8b328f5b067a641064d67b91597567a06c7b122d1ca7dfd9741
I0709 07:28:50.311503 139711450031936 configuration_utils.py:169] Model config {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pad_token_id": 0,
  

In [447]:
# Move the LSTM variant's registered parameters to `device`.
new_network.to(device)

FGC_LSTM_Network(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm(torch.Size([768]), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm(torch.Size([768]), eps=1e-12, elementwise_affine=True)
   

In [448]:
# One padded batch per TRAINING question (despite the `test_` name); each entry is
# a dict of 2-D tensors that collate_3d later stacks.
test_batch = eval_preprocessing(training_data, train_dataset)

In [398]:
def collate_3d(batch):
    """Stack per-question 2-D batches into zero-padded 3-D tensors on ``device``.

    Parameters
    ----------
    batch : list of dict
        Each dict holds 2-D tensors 'ids', 'segment_ids', 'mask_ids'
        (num_sentences, sentence_length) and 'labels' (num_sentences, 1)
        for one question.

    Returns
    -------
    dict
        'ids', 'mask_ids', 'segment_ids' as int64 tensors of shape
        (batch_size, max_num_sentences, max_sentence_length) and 'labels' as a
        float tensor of shape (batch_size, max_num_sentences, 1). Padding
        positions are zero in every field.
    """
    batch_size = len(batch)
    max_num_sentences, max_sentence_length = find_max_dimension(batch)
    padded_shape = (batch_size, max_num_sentences, max_sentence_length)

    # Token ids, segment ids and masks index embeddings / select positions, so
    # they must stay integer; torch.zeros would otherwise default to float32.
    target_ids = torch.zeros(padded_shape, dtype=torch.long, device=device)
    target_segment_ids = torch.zeros(padded_shape, dtype=torch.long, device=device)
    target_mask_ids = torch.zeros(padded_shape, dtype=torch.long, device=device)
    # Labels stay float, as before.
    target_labels = torch.zeros(batch_size, max_num_sentences, 1, device=device)

    for i, question in enumerate(batch):
        # Copy each question into the top-left corner of its padded slot.
        rows, cols = question['ids'].shape
        target_ids[i, :rows, :cols] = question['ids']

        rows, cols = question['segment_ids'].shape
        target_segment_ids[i, :rows, :cols] = question['segment_ids']

        rows, cols = question['mask_ids'].shape
        target_mask_ids[i, :rows, :cols] = question['mask_ids']

        rows, cols = question['labels'].shape
        target_labels[i, :rows, :cols] = question['labels']

    # The previous version overwrote the mask with the segment ids right
    # before returning; the mask is now returned untouched.
    return {'ids': target_ids, 'mask_ids': target_mask_ids, 'segment_ids': target_segment_ids, 'labels': target_labels}

In [399]:
def find_max_dimension(batch):
    """Return (max rows, max columns) over the 2-D 'ids' entries of ``batch``.

    ``batch`` is a non-empty iterable of dicts whose 'ids' value exposes a
    two-element ``shape``. The two maxima are taken independently.
    """
    shapes = [question['ids'].shape for question in batch]
    most_sentences = max(shape[0] for shape in shapes)
    longest_sentence = max(shape[1] for shape in shapes)
    return most_sentences, longest_sentence

In [400]:
# Batches of 8 questions; collate_3d pads them to a common (sentences, tokens) shape.
# NOTE(review): relies on `test_batch`, collate_3d and find_max_dimension having been
# run first - the execution counts in this section are out of order.
dataloader_train_3d = DataLoader(test_batch, batch_size=8, shuffle = True, collate_fn = collate_3d)