## Import

In [1]:
import torch
from tqdm import tqdm_notebook as tqdm
from transformers import BertTokenizer, BertModel, BertForMaskedLM, BertForQuestionAnswering

I0812 07:04:58.167424 140680368949056 file_utils.py:39] PyTorch version 1.2.0+cu92 available.


In [2]:
import pandas as pd
import numpy as np

In [3]:
from torch.nn.utils.rnn import pad_sequence 
import torchvision
import torch.nn as nn

In [4]:
from transformers.optimization import AdamW
from transformers.optimization import AdamW, get_linear_schedule_with_warmup

In [5]:
validation_data = pd.read_json("FGC_release_1.7.13/FGC_release_all_dev.json")
training_data = pd.read_json("FGC_release_1.7.13/FGC_release_all_train.json")
test_data = pd.read_json("FGC_release_1.7.13/FGC_release_all_test.json")

In [6]:
# Remove all the questions where there's no supporting evidence to it
training_data = training_data[training_data['QUESTIONS'].apply(lambda x: len(x[0]['SHINT_']) > 0)]
validation_data = validation_data[validation_data['QUESTIONS'].apply(lambda x: len(x[0]['SHINT_']) > 0)]
test_data = test_data[test_data['QUESTIONS'].apply(lambda x: len(x[0]['SHINT_']) > 0)]

In [7]:
device = torch.device("cuda")

In [8]:
torch.tensor([1,2]).to(device)

tensor([1, 2], device='cuda:0')

## New Data Preprocessing

In [10]:
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

I0812 07:09:56.867145 140680368949056 tokenization_utils_base.py:1254] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt from cache at /root/.cache/torch/transformers/8a0c070123c1f794c42a29c6904beb7c1b8715741e235bee04aca2c7636fc83f.9b42061518a39ca00b8b52059fd2bede8daa613f8a8671500e518a8c29de8c00


In [11]:
def data_preprocessing(data):
    all_instances = []
    questions = data['QUESTIONS'].apply(lambda x: [x[0]['QTEXT_CN'], len(x[0]['SHINT'][1])]).tolist()
    sentences = [sentence['text'] for sentence_dict in data['SENTS'] for sentence in sentence_dict]
    lengths = np.array(questions)[:, 1].astype(int).tolist()
    indices = data['QUESTIONS'].apply(lambda x: x[0]['SHINT_']).tolist()
    labels = [[0] * length for length in lengths]
    all_labels = []
    # Inpute index into labels
    for i in range(len(labels)):
        np_label = np.array(labels[i])
        np_index = np.array(indices[i])
        np_label[np_index] = 1
        label = np_label.tolist()
        labels[i] = label
        all_labels = all_labels + label
    counter = 0
    for question in questions:
        question_text = question[0]
        for j in range(counter, counter + question[1]):
            all_instances.append([question_text, sentences[j]])
    
    all_tokenized = []
    for i in range(len(all_instances)):
        tokenized = tokenizer([all_instances[i]], padding='max_length', truncation=True, max_length=512, return_tensors = 'pt')
        tokenized['input_ids'] = tokenized['input_ids'].to(device)
        tokenized['input_ids'] = tokenized['input_ids'].squeeze(0)
        tokenized['token_type_ids'] = tokenized['token_type_ids'].to(device)
        tokenized['token_type_ids'] = tokenized['token_type_ids'].squeeze(0)
        tokenized['attention_mask'] = tokenized['attention_mask'].to(device)
        tokenized['attention_mask'] = tokenized['attention_mask'].squeeze(0)
        tokenized['label'] = torch.tensor(all_labels[i])
        all_tokenized.append(tokenized)
    return all_tokenized

In [12]:
train_all_instances = data_preprocessing(training_data)
dev_all_instances = data_preprocessing(dev_data)
test_all_instances = data_preprocessing(test_data)

NameError: name 'dev_data' is not defined

## Data Preprocessing

In [8]:
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

I0812 02:15:41.785210 139797494843200 tokenization_utils_base.py:1254] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt from cache at /root/.cache/torch/transformers/8a0c070123c1f794c42a29c6904beb7c1b8715741e235bee04aca2c7636fc83f.9b42061518a39ca00b8b52059fd2bede8daa613f8a8671500e518a8c29de8c00


In [9]:
def datapreprocessing(data, return_df=False):
    
    # Save all the questions, potential supporting evidence and indices in three lists
    textQ_to_be_tokenized = []
    textA_to_be_tokenized = []
    sp_index = []
    max_counter = 0
    for dictionary in data['QUESTIONS']:
        for element in dictionary:
            textQ_to_be_tokenized.append(element['QTEXT_CN'])
            sp_index.append(element['SHINT_'])
    for dictionary in data['SENTS']:
        current_text_sentence = []
        for element in dictionary:
            current_text_sentence.append(element['text'])
        textA_to_be_tokenized.append(current_text_sentence)
    
    QandA_label = pd.DataFrame({'Question': textQ_to_be_tokenized,
                                'Sentence_List': textA_to_be_tokenized,
                                'SE_Index': sp_index,
                                'Label': sp_index})

    QandA_label['Length'] = QandA_label['Sentence_List'].apply(lambda x: len(x))
    QandA_label['SE_Index'] = QandA_label['SE_Index'].apply(lambda x: [0])
    QandA_label['SE_Index'] = QandA_label['SE_Index'] * QandA_label['Length']
    QandA_label['SE_Index'] = list(zip(QandA_label['SE_Index'], QandA_label['Label']))

    # Extract label index
    for row in QandA_label['SE_Index']:
        for index in row[1]:
            row[0][index] = 1
        
    indexed = [i[0] for i in list(QandA_label['SE_Index'])]
    QandA_label['Label'] = indexed

    if return_df:
        return QandA_label
    
    Q_and_Sentence_all_Comb = pd.DataFrame({'Question':np.repeat(QandA_label['Question'].values, QandA_label['Sentence_List'].str.len()),
                        'Sentence':np.concatenate(QandA_label['Sentence_List'].values)})
    Q_and_Sentence_all_Comb['Label'] = QandA_label['Label'].sum()
    
            
    # Put all question and sentence combination into a list 
    All_instances = []
    
        
    for i in range(len(QandA_label)):
        
        for sentence in QandA_label['Sentence_List'][i]:
            question_token = tokenizer.tokenize(QandA_label['Question'][i])
            sentence_token = tokenizer.tokenize(sentence)
            instance = ['[CLS]'] + question_token + ['[SEP]'] + sentence_token + ['[SEP]'] 

            
            if len(instance) > 512:
                instance = instance[:511] + ['[SEP]']
                #max_counter += 1

            #instance = instance[:100]
            All_instances.append(instance)
            
    # Convert ids to segment_ids
    segment_ids = []
    for token in All_instances:
        length_of_zeros = token.index('[SEP]') - token.index('[CLS]') + 1
        length_of_ones = len(token) - length_of_zeros
        zeros_and_ones = [0] * length_of_zeros + [1] * length_of_ones
        segment_ids.append(zeros_and_ones)
        
    ids = []
    for token in All_instances:
        ids.append(tokenizer.convert_tokens_to_ids(token))
        
    mask_ids = []
    for token in All_instances:
        mask_ids.append([1] * len(token))
        
    labels = list(Q_and_Sentence_all_Comb['Label'])
    labels = [[i] for i in labels]
    return All_instances, ids, segment_ids, mask_ids, labels

In [10]:
def data_to_sentence(data, number_of_sentence):
    
    QandA_label = datapreprocessing(training_data, True)
    
    Q_and_Sentence_all_Comb = pd.DataFrame({'Question':np.repeat(QandA_label['Question'].values, QandA_label['Sentence_List'].str.len()),
                        'Sentence':np.concatenate(QandA_label['Sentence_List'].values)})
    Q_and_Sentence_all_Comb['Label'] = QandA_label['Label'].sum()
    
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

    len_array = np.cumsum(np.array(data['SENTS'].apply(lambda x: len(x))))

    # Put all question and sentence combination into a list 
    All_instances = []
    padded_zeros = [0] * 250
    
    for i in range(len(QandA_label)):
        
        for j in range(len(QandA_label['Sentence_List'][i])):
            
            question_token = tokenizer.tokenize(QandA_label['Question'][i])
            q_instance = ['[CLS]'] + question_token + ['[SEP]']
            if len(q_instance) > 250:
                q_instance = q_instance[:249] + ['[SEP]']

            sentences = []
            
            for k in range(j - number_of_sentence//2, j + number_of_sentence//2 + 1):
                if k < 0 or k >= len(QandA_label['Sentence_List'][i]):
                    sentences.append(padded_zeros)
                else:
                    sentence_token = tokenizer.tokenize(QandA_label['Sentence_List'][i][k])
                    s_instance = ['[CLS]'] + sentence_token + ['[SEP]']
                    if len(s_instance) > 250:
                        s_instance = s_instance[:249] + ['[SEP]']
                    sentences.append(s_instance)
            
                
            # Append the target sentence
            #question_token = tokenizer.tokenize(QandA_label['Question'][i])
            #sentence_token = tokenizer.tokenize(QandA_label['Sentence_List'][i][j])
            #q_instance = ['[CLS]'] + question_token + ['[SEP]']
            #s_instance = ['[CLS]'] + sentence_token + ['[SEP]'] 
            
            #if len(s_instance) > 250:
                #s_instance = s_instance[:249] + ['[SEP]']
            #if len(q_instance) > 250:
                #question = question[:249] + ['[SEP]']

            All_instances.append((q_instance, sentences))

    ids = []
    mask_ids = []
    sentence_masks = []
    
    for token in All_instances:
        
        q_tokenized = tokenizer.convert_tokens_to_ids(token[0])
        q_mask = [1] * 250
        
        if len(q_tokenized) < 250:
            q_tokenized = q_tokenized + (250 - len(q_tokenized)) * [0]
            q_mask = q_tokenized.index(0) * [1] + (250 - q_tokenized.index(0)) * [0]
            
        s_tokens = []
        s_masks = []
        sen_mask = []
        for sentence in token[1]:
            
            if sentence == padded_zeros:
                s_tokens.append(padded_zeros)
                s_masks.append(padded_zeros)
                sen_mask.append([0])
                
            else:
                s_tokenized = tokenizer.convert_tokens_to_ids(sentence)
                s_mask = [1] * 250
        
                if len(s_tokenized) < 250:
                    s_tokenized = s_tokenized + (250 - len(s_tokenized)) * [0]
                    s_mask = s_tokenized.index(0) * [1] + (250 - s_tokenized.index(0)) * [0]
            
                s_tokens.append(s_tokenized)
                s_masks.append(s_mask)
                sen_mask.append([1])
                
        ids.append((q_tokenized, s_tokens))
        mask_ids.append((q_mask, s_masks))
        sentence_masks.append(sen_mask)

    labels = list(Q_and_Sentence_all_Comb['Label'])
    labels = [[i] for i in labels]
    return All_instances, ids, mask_ids, sentence_masks, labels


In [11]:
def window_sentence_preprocessing(data, dataset, number_of_sentences):
    
    len_array = np.cumsum(np.array(data['SENTS'].apply(lambda x: len(x))))
    dictionary_lists = []
    batches = []
    
    unit_counter = 0
    
    for count, instance in enumerate(dataset, 1):
        
        dictionary_lists.append(instance)
        unit_counter += 1
        
        if (unit_counter % number_of_sentences == 0) or (count in len_array):
            
            padded_ids = pad_sequence([torch.tensor(instance['ids']) for instance in dictionary_lists], batch_first=True)
            padded_ids = padded_ids.to(device)

            padded_segment_ids = pad_sequence([torch.tensor(instance['segment_ids']) for instance in dictionary_lists], batch_first=True)
            padded_segment_ids = padded_segment_ids.to(device)

            padded_mask_ids = pad_sequence([torch.tensor(instance['mask_ids']) for instance in dictionary_lists], batch_first=True)
            padded_mask_ids = padded_mask_ids.to(device)

            labels = torch.stack([torch.tensor(instance['labels']) for instance in dictionary_lists])
            labels = labels.to(device)

            current_dev_batch = {'ids': padded_ids, 'mask_ids': padded_mask_ids, 'segment_ids': padded_segment_ids, 'labels': labels}

            batches.append(current_dev_batch)
            dictionary_lists = []
            unit_counter = 0
    
    return batches


In [12]:
def new_window_sentence_preprocessing(data, dataset, number_of_sentences):
    
    len_array = np.cumsum(np.array(data['SENTS'].apply(lambda x: len(x))))
    dictionary_lists = []
    batches = []
    
    unit_counter = 0
    
    for i in range(1, len(dataset) + 1):
        
        # Need to pad zeros
        dictionary_lists.append(dataset[i-1])
        dictionary_lists
        unit_counter += 1
        
        if (unit_counter % number_of_sentences == 0) or (i in len_array):
            
            padded_ids = pad_sequence([torch.tensor(instance['ids']) for instance in dictionary_lists], batch_first=True)
            padded_ids = padded_ids.to(device)

            padded_segment_ids = pad_sequence([torch.tensor(instance['segment_ids']) for instance in dictionary_lists], batch_first=True)
            padded_segment_ids = padded_segment_ids.to(device)

            padded_mask_ids = pad_sequence([torch.tensor(instance['mask_ids']) for instance in dictionary_lists], batch_first=True)
            padded_mask_ids = padded_mask_ids.to(device)

            labels = torch.stack([torch.tensor(instance['labels']) for instance in dictionary_lists])
            labels = labels.to(device)

            current_dev_batch = {'ids': padded_ids, 'mask_ids': padded_mask_ids, 'segment_ids': padded_segment_ids, 'labels': labels}

            batches.append(current_dev_batch)
            dictionary_lists = []
            unit_counter = 0
    
    return batches

In [13]:
def eval_preprocessing(data, dataset):
    
    len_array = np.cumsum(np.array(data['SENTS'].apply(lambda x: len(x))))

    dictionary_lists = []
    batches = []
    for i in range(len(dataset.instances)):
        
        instance = dataset.instances[i]
        dictionary_lists.append(instance)
        
        if i in len_array - 1:

            padded_ids = pad_sequence([torch.tensor(instance['ids']) for instance in dictionary_lists], batch_first=True)
            padded_ids = padded_ids.to(device)

            padded_segment_ids = pad_sequence([torch.tensor(instance['segment_ids']) for instance in dictionary_lists], batch_first=True)
            padded_segment_ids = padded_segment_ids.to(device)

            padded_mask_ids = pad_sequence([torch.tensor(instance['mask_ids']) for instance in dictionary_lists], batch_first=True)
            padded_mask_ids = padded_mask_ids.to(device)

            labels = torch.stack([torch.tensor(instance['labels']) for instance in dictionary_lists])
            labels = labels.to(device)
            
            
            current_dev_batch = {'ids': padded_ids, 'mask_ids': padded_mask_ids, 'segment_ids': padded_segment_ids, 'labels': labels}

            batches.append(current_dev_batch)
            dictionary_lists = []

    return batches

In [14]:
def sent_eval_preprocessing(data, dataset): 
    
    len_array = np.cumsum(np.array(data['SENTS'].apply(lambda x: len(x))))

    dictionary_lists = []
    batches = []
    for i in range(len(dataset.instances)):
        
        instance = dataset.instances[i]
        dictionary_lists.append(instance)
        
        if i in len_array - 1:

            
            padded_ids = pad_sequence([torch.tensor(instance['ids']) for instance in dictionary_lists], batch_first=True)
            padded_ids = padded_ids.to(device)

            #padded_segment_ids = pad_sequence([torch.tensor(instance['segment_ids']) for instance in dictionary_lists], batch_first=True)
            #padded_segment_ids = padded_segment_ids.to(device)

            padded_mask_ids = pad_sequence([torch.tensor(instance['mask_ids']) for instance in dictionary_lists], batch_first=True)
            padded_mask_ids = padded_mask_ids.to(device)
            
            padded_sentence_masks = pad_sequence([torch.tensor(instance['sentence_mask']) for instance in dictionary_lists], batch_first=True)
            padded_sentence_masks = padded_sentence_masks.to(device)
            
            padded_q_ids = pad_sequence([torch.tensor(instance['q_ids']) for instance in dictionary_lists], batch_first=True)
            padded_q_ids = padded_q_ids.to(device)
            
            padded_q_mask_ids = pad_sequence([torch.tensor(instance['q_mask_ids']) for instance in dictionary_lists], batch_first=True)
            padded_q_mask_ids = padded_q_mask_ids.to(device)

            labels = torch.stack([torch.tensor(instance['labels']) for instance in dictionary_lists])
            labels = labels.to(device)

            
            current_dev_batch = {'ids': padded_ids, 'mask_ids': padded_mask_ids, 'sentence_mask': padded_sentence_masks,
                                 'labels': labels, 'q_ids': padded_q_ids, "q_mask_ids": padded_q_mask_ids}

            batches.append(current_dev_batch)
            dictionary_lists = []

    return batches

In [15]:
dev_instances, dev_ids, dev_seg_ids, dev_mask_ids, dev_labels = datapreprocessing(validation_data)
train_instances, train_ids, train_seg_ids, train_mask_ids, train_labels = datapreprocessing(training_data)
test_instances, test_ids, test_seg_ids, test_mask_ids, test_labels = datapreprocessing(test_data)

In [15]:
qs_instances, qs_ids, qs_mask_ids, qs_sen_mask, qs_labels = data_to_sentence(training_data, 3)

I0811 02:29:23.051539 139897081689920 tokenization_utils.py:375] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt from cache at /root/.cache/torch/transformers/8a0c070123c1f794c42a29c6904beb7c1b8715741e235bee04aca2c7636fc83f.9b42061518a39ca00b8b52059fd2bede8daa613f8a8671500e518a8c29de8c00


In [16]:
dev_qs_instances, dev_qs_ids, dev_qs_mask_ids, dev_qs_sen_mask, dev_qs_labels = data_to_sentence(validation_data, 3)

I0811 02:29:56.492766 139897081689920 tokenization_utils.py:375] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt from cache at /root/.cache/torch/transformers/8a0c070123c1f794c42a29c6904beb7c1b8715741e235bee04aca2c7636fc83f.9b42061518a39ca00b8b52059fd2bede8daa613f8a8671500e518a8c29de8c00


## Loading Data

In [16]:
from torch.utils.data import Dataset

In [17]:
class SentenceDataset(Dataset):
    
    def __init__(self, ids, segment_ids, mask_ids, labels):
        self.instances = []
        for ids_i, segment_ids_i, mask_ids, label in zip(ids, segment_ids, mask_ids, labels):
            self.instances.append({"ids": ids_i, "segment_ids": segment_ids_i, 
                                   "mask_ids": mask_ids, "labels": label})  
                                   
    def __len__(self):
        return len(self.instances)

    def __getitem__(self, idx):
        sample = self.instances[idx]

        return sample

In [18]:
class QuestionSentenceDataset(Dataset):
    
    def __init__(self, ids, mask_ids, sen_masks, labels):
        self.instances = []
        for ids_i, mask_ids, sen_mask, label in zip(ids, mask_ids, sen_masks, labels):
            self.instances.append({"ids": torch.tensor(ids_i[1]), "mask_ids": torch.tensor(mask_ids[1]), 
                                   "sentence_mask": torch.tensor(sen_mask), "labels": torch.tensor(label), 
                                   "q_ids": torch.tensor(ids_i[0]), "q_mask_ids": torch.tensor(mask_ids[0])})
            
    def __len__(self):
        return len(self.instances)
    
    def __getitem__(self, idx):
        sample = self.instances[idx]
        return sample

In [19]:
train_dataset = SentenceDataset(train_ids, train_seg_ids, train_mask_ids, train_labels)

In [20]:
dev_dataset = SentenceDataset(dev_ids, dev_seg_ids, dev_mask_ids, dev_labels)

In [21]:
test_dataset = SentenceDataset(test_ids, test_seg_ids, test_mask_ids, test_labels)

In [23]:
sent_train_dataset = QuestionSentenceDataset(qs_ids, qs_mask_ids, qs_sen_mask, qs_labels)

In [24]:
sent_dev_dataset = QuestionSentenceDataset(dev_qs_ids, dev_qs_mask_ids, dev_qs_sen_mask, dev_qs_labels)

In [25]:
len(sent_train_dataset)

31422

In [22]:
from torch.utils.data import DataLoader

In [23]:
def collate(batch):
    padded_ids = pad_sequence([torch.tensor(instance['ids']) for instance in batch], batch_first=True)
    padded_ids = padded_ids.to(device)
    
    padded_segment_ids = pad_sequence([torch.tensor(instance['segment_ids']) for instance in batch], batch_first=True)
    padded_segment_ids = padded_segment_ids.to(device)
    
    padded_mask_ids = pad_sequence([torch.tensor(instance['mask_ids']) for instance in batch], batch_first=True)
    padded_mask_ids = padded_mask_ids.to(device)
    
    labels = torch.stack([torch.tensor(instance['labels']) for instance in batch])
    labels = labels.to(device)
    return {'ids': padded_ids, 'mask_ids': padded_mask_ids, 'segment_ids': padded_segment_ids, 'labels': labels}

In [24]:
def collate_3d(batch):
    
    batch_size = len(batch)
    
    max_num_sentences, max_sentence_length = find_max_dimension(batch)
    
    # sentence weight
    sentence_mask = torch.zeros(batch_size, max_num_sentences)
    
    
    target_ids = torch.zeros(batch_size, max_num_sentences, max_sentence_length)
    target_segment_ids = torch.zeros(batch_size, max_num_sentences, max_sentence_length)
    target_mask_ids = torch.zeros(batch_size, max_num_sentences, max_sentence_length)
    target_labels = torch.zeros(batch_size, max_num_sentences, 1)
    for i in range(len(batch)):
        
        source_id_dimension = batch[i]['ids'].shape
        
        sentence_mask[i, :source_id_dimension[0]] = 1
        
        target_ids[i, :source_id_dimension[0], :source_id_dimension[1]] = batch[i]['ids']
        
        source_segment_id_dimension = batch[i]['segment_ids'].shape
        target_segment_ids[i, :source_segment_id_dimension[0], :source_segment_id_dimension[1]] = batch[i]['segment_ids']
        
        source_mask_id_dimension = batch[i]['mask_ids'].shape
        target_mask_ids[i, :source_mask_id_dimension[0], :source_mask_id_dimension[1]] = batch[i]['mask_ids']
        
        source_label_dimension = batch[i]['labels'].shape
        target_labels[i, :source_label_dimension[0], :source_label_dimension[1]] = batch[i]['labels']
    
    target_labels = target_labels.squeeze(-1)
    
    target_ids = target_ids.to(device).to(torch.long)
    target_segment_ids = target_segment_ids.to(device).to(torch.long)
    target_mask_ids = target_segment_ids.to(device).to(torch.long)
    target_labels = target_labels.to(dtype=torch.float, device=device)
    sentence_mask = sentence_mask.to(device).to(torch.float)
    return {'ids': target_ids, 'mask_ids': target_mask_ids, 'segment_ids': target_segment_ids, 'labels': target_labels, 'sentence_mask': sentence_mask}

In [25]:
def collate_2d(batch):
    
    batch_size = len(batch)
    
    max_num_sentences, max_sentence_length = find_max_dimension(batch)
    
    # sentence weight
    #sentence_mask = torch.zeros(batch_size, max_num_sentences)
    
    
    target_ids = torch.zeros(max_num_sentences, max_sentence_length)
    target_segment_ids = torch.zeros(max_num_sentences, max_sentence_length)
    target_mask_ids = torch.zeros(max_num_sentences, max_sentence_length)
    target_labels = torch.zeros(max_num_sentences, 1)
    for i in range(len(batch)):
        
        source_id_dimension = batch[i]['ids'].shape
        
        #sentence_mask[:source_id_dimension[0]] = 1
        
        target_ids[:source_id_dimension[0], :source_id_dimension[1]] = batch[i]['ids']
        
        source_segment_id_dimension = batch[i]['segment_ids'].shape
        target_segment_ids[:source_segment_id_dimension[0], :source_segment_id_dimension[1]] = batch[i]['segment_ids']
        
        source_mask_id_dimension = batch[i]['mask_ids'].shape
        target_mask_ids[:source_mask_id_dimension[0], :source_mask_id_dimension[1]] = batch[i]['mask_ids']
        
        source_label_dimension = batch[i]['labels'].shape
        target_labels[:source_label_dimension[0], :source_label_dimension[1]] = batch[i]['labels']

    #target_labels = target_labels.squeeze(-1)
    
    target_ids = target_ids.to(device).to(torch.long)
    target_segment_ids = target_segment_ids.to(device).to(torch.long)
    target_mask_ids = target_segment_ids.to(device).to(torch.long)
    target_labels = target_labels.to(dtype=torch.float, device=device)
    #sentence_mask = sentence_mask.to(device).to(torch.float)
    return {'ids': target_ids, 'mask_ids': target_mask_ids, 'segment_ids': target_segment_ids, 'labels': target_labels}

In [26]:
def find_max_dimension(batch):
    num_sentences = []
    sentence_lengths = []
    for question in batch:
        num_sentences.append(question['ids'].shape[0])
        sentence_lengths.append(question['ids'].shape[1])
    return max(num_sentences), max(sentence_lengths)

In [27]:
#new_train_batches = window_sentence_preprocessing(training_data, train_dataset, 10)
#new_train_2d_batches = window_sentence_preprocessing(training_data, train_dataset, 10)

In [28]:
#new_train_batches[4]['ids'].shape

In [29]:
dataloader_train = DataLoader(train_dataset, batch_size=4, shuffle = True, collate_fn = collate)
#dataloader_train_3d = DataLoader(new_train_batches, batch_size=2, shuffle = True, collate_fn = collate_3d)
#dataloader_train_2d = DataLoader(new_train_2d_batches, batch_size=8, shuffle = True, collate_fn = collate_2d)
#dataloader_sent_train = DataLoader(sent_train_dataset, batch_size = 4, shuffle = True)
#dataloader_sent_train_eval = DataLoader(sent_train_dataset, batch_size = 1)
#dataloader_sent_dev = DataLoader(sent_dev_dataset, batch_size = 1)

## Data Analysis

In [27]:
train_sentence_lengths = []
dev_sentence_lengths = []
test_sentence_lengths = []

for instance_dict in train_dataset.instances:
    train_sentence_lengths.append(len(instance_dict['ids']))
    
for instance_dict in dev_dataset.instances:
    dev_sentence_lengths.append(len(instance_dict['ids']))
    
for instance_dict in test_dataset.instances:
    test_sentence_lengths.append(len(instance_dict['ids']))

In [28]:
import matplotlib.pyplot as plt

plt.hist(train_sentence_lengths, bins=100)
plt.hist(dev_sentence_lengths, bins=100)
plt.hist(test_sentence_lengths, bins=100)
print('train data >512 percentage:', (np.array(train_sentence_lengths) > 512).sum() / len(train_sentence_lengths))
print('dev data >512 percentage:', (np.array(dev_sentence_lengths) > 512).sum() / len(dev_sentence_lengths))
print('test data >512 percentage:', (np.array(test_sentence_lengths) > 512).sum() / len(test_sentence_lengths))


train data >512 percentage: 0.0
dev data >512 percentage: 0.0
test data >512 percentage: 0.0


## Baseline Model

In [30]:
class baseline_model(nn.Module):

    def __init__(self):
        
        super(baseline_model, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-chinese')
        self.linear = nn.Linear(768, 1)

    def forward(self, batch):
        # batch['ids'] = (batch_size, sent_len)
        # batch['segment_ids'] = (batch_size, sent_len)
        # batch['mask_ids'] = (batch_size, sent_len)
        # pooler_output = (batch_size, 768)
        # output = (batch_size, 1)
        hidden_state, pooler_output = self.bert(batch['ids'], batch['mask_ids'], batch['segment_ids'])
        
        linear_output = self.linear(pooler_output)

        return linear_output

    def loss(self, batch):
        
        loss_fn = nn.BCEWithLogitsLoss()
        output = self.forward(batch)
        target = batch['labels'].float().to(device)
        
        return loss_fn(output, target)
    
    def _predict(self, batch):
        
        output = self.forward(batch)
        scores = torch.sigmoid(output)
        scores = scores.cpu().numpy()[:,0].tolist()
        
        return scores
    
    def predict_fgc(self, batch, threshold=0.5):
        
        scores = self._predict(batch)
        max_i = 0
        max_score = 0
        sp = []
        
        for i, score in enumerate(scores):

            if score > max_score:
                max_i = i
                max_score = score
            if score >= threshold:
                sp.append(i)

        if not sp:
            sp.append(max_i)

        return {'sp': sp, 'sp_scores': scores}

In [31]:
baseline = baseline_model()
baseline.to(device)

I0812 02:16:03.949645 139797494843200 configuration_utils.py:264] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json from cache at /root/.cache/torch/transformers/8a3b1cfe5da58286e12a0f5d7d182b8d6eca88c08e26c332ee3817548cf7e60a.f12a4f986e43d8b328f5b067a641064d67b91597567a06c7b122d1ca7dfd9741
I0812 02:16:03.951986 139797494843200 configuration_utils.py:300] Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_he

baseline_model(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm(torch.Size([768]), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm(torch.Size([768]), eps=1e-12, elementwise_affine=True)
     

## Training Baseline Model & Evaluating Performance

In [32]:
def optim(nn, num_epochs, lr):
    param_optimizer = list(nn.bert.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    num_epochs = num_epochs
    num_train_optimization_steps = len(dataloader_train) * num_epochs
    optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                     num_warmup_steps=int(
                                                         num_train_optimization_steps * 0.1),
                                                     num_training_steps=num_train_optimization_steps)
    return optimizer, scheduler

In [33]:
def _update_sp(metrics, sp_gold, sp_pred):
    tp, fp, fn = 0, 0, 0
        
    for p in sp_pred:
        if p in sp_gold:
            tp += 1
        else:
            fp += 1
    for g in sp_gold:
        if g not in sp_pred:
            fn += 1
            
    precision = 1.0 * tp / (tp + fp) if tp + fp > 0 else 0.0
    recall = 1.0 * tp / (tp + fn) if tp + fn > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
    em = 1.0 if fp + fn == 0 else 0.0
    
    metrics['sp_em'] += em
    metrics['sp_f1'] += f1
    metrics['sp_prec'] += precision
    metrics['sp_recall'] += recall
    
    return precision, recall, f1

In [34]:
def eval_sp_fgc(sp_golds, sp_preds):
    
    metrics = {'sp_em': 0, 'sp_prec': 0, 'sp_recall': 0, 'sp_f1': 0}
    
    assert len(sp_golds) == len(sp_preds)
    
    for sp_gold, sp_pred in zip(sp_golds, sp_preds):
        _update_sp(metrics, sp_gold, sp_pred)
    
    N = len(sp_golds)
    for k in metrics.keys():
        metrics[k] /= N
        metrics[k] = round(metrics[k], 3)
    print(metrics)
    return metrics

In [35]:
def eval_fgc_atype(atype_golds, atype_preds):
    
    pos = 0
    neg = 0
    
    for gold, atype in zip(atype_golds, atype_preds):
        if atype == gold:
            pos += 1
        else:
            neg += 1
    return pos/len(atypes_preds)

In [36]:
def eval(network, dev_batches, current_epoch, sp_golds, avg_loss):
    
    network.eval()
    
    with torch.no_grad():
        sp_preds = []
        for batch in tqdm(dev_batches):
            
            out_dct = network.predict_fgc(batch)
            sp_preds.append(out_dct['sp'])
                
    metrics = eval_sp_fgc(sp_golds, sp_preds)
    print('epoch %d eval_recall: %.3f eval_f1: %.3f loss: %.3f' % (
            current_epoch, metrics['sp_recall'], metrics['sp_f1'], avg_loss))
        
    torch.save(network.state_dict(), "Models_SEs/model_epoch{0}_eval_em:{1:.3f}_precision:{2:.3f}_recall:{3:.3f}_f1:{4:.3f}_loss:{5:.3f}.m".format(current_epoch, metrics['sp_em'], metrics['sp_prec'], metrics['sp_recall'], metrics['sp_f1'], avg_loss))
    
    return sp_preds, sp_golds

In [37]:
def train(network, data, dev_batches, num_epochs, lr):
    
    loss_fn = nn.BCEWithLogitsLoss()
    optimizer, scheduler = optim(network, num_epochs, lr)
    
    sp_golds = validation_data['QUESTIONS'].apply(lambda x: x[0]['SHINT_']).tolist()
    
    for current_epoch in range(num_epochs):
        network.train()
        running_loss = 0.0
        for batch in tqdm(data):
            optimizer.zero_grad()
            current_output = network(batch)
            current_target = batch['labels'].to(dtype=torch.float, device=device)
            current_loss = loss_fn(current_output, current_target)
            current_loss.backward()
            torch.nn.utils.clip_grad_norm_(network.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            running_loss += current_loss.item()
            
        learning_rate_scalar = scheduler.get_lr()[0]
        print('lr = %f' % learning_rate_scalar)
        avg_loss = running_loss/len(data)
        print('epoch %d train_loss: %.3f' % (current_epoch, avg_loss))
        eval(network, dev_batches, current_epoch, sp_golds, avg_loss)

In [38]:
dev_batches = eval_preprocessing(validation_data, dev_dataset)

In [39]:
train(baseline, dataloader_train, dev_batches, 20, 0.00002) # if you want to run this again, rememebr to add the parameter 'batches'

HBox(children=(IntProgress(value=0, max=7767), HTML(value='')))


lr = 0.000010
epoch 0 train_loss: 0.246


HBox(children=(IntProgress(value=0, max=239), HTML(value='')))


{'sp_em': 0.18, 'sp_prec': 0.732, 'sp_recall': 0.454, 'sp_f1': 0.528}
epoch 0 eval_recall: 0.454 eval_f1: 0.528 loss: 0.246


HBox(children=(IntProgress(value=0, max=7767), HTML(value='')))


lr = 0.000020
epoch 1 train_loss: 0.202


HBox(children=(IntProgress(value=0, max=239), HTML(value='')))


{'sp_em': 0.146, 'sp_prec': 0.593, 'sp_recall': 0.551, 'sp_f1': 0.523}
epoch 1 eval_recall: 0.551 eval_f1: 0.523 loss: 0.202


HBox(children=(IntProgress(value=0, max=7767), HTML(value='')))


lr = 0.000019
epoch 2 train_loss: 0.174


HBox(children=(IntProgress(value=0, max=239), HTML(value='')))


{'sp_em': 0.163, 'sp_prec': 0.648, 'sp_recall': 0.562, 'sp_f1': 0.546}
epoch 2 eval_recall: 0.562 eval_f1: 0.546 loss: 0.174


HBox(children=(IntProgress(value=0, max=7767), HTML(value='')))


lr = 0.000018
epoch 3 train_loss: 0.145


HBox(children=(IntProgress(value=0, max=239), HTML(value='')))


{'sp_em': 0.113, 'sp_prec': 0.533, 'sp_recall': 0.689, 'sp_f1': 0.538}
epoch 3 eval_recall: 0.689 eval_f1: 0.538 loss: 0.145


HBox(children=(IntProgress(value=0, max=7767), HTML(value='')))


lr = 0.000017
epoch 4 train_loss: 0.115


HBox(children=(IntProgress(value=0, max=239), HTML(value='')))


{'sp_em': 0.134, 'sp_prec': 0.605, 'sp_recall': 0.551, 'sp_f1': 0.518}
epoch 4 eval_recall: 0.551 eval_f1: 0.518 loss: 0.115


HBox(children=(IntProgress(value=0, max=7767), HTML(value='')))


lr = 0.000016
epoch 5 train_loss: 0.101


HBox(children=(IntProgress(value=0, max=239), HTML(value='')))


{'sp_em': 0.126, 'sp_prec': 0.567, 'sp_recall': 0.637, 'sp_f1': 0.537}
epoch 5 eval_recall: 0.637 eval_f1: 0.537 loss: 0.101


HBox(children=(IntProgress(value=0, max=7767), HTML(value='')))


lr = 0.000014
epoch 6 train_loss: 0.078


HBox(children=(IntProgress(value=0, max=239), HTML(value='')))


{'sp_em': 0.146, 'sp_prec': 0.593, 'sp_recall': 0.605, 'sp_f1': 0.541}
epoch 6 eval_recall: 0.605 eval_f1: 0.541 loss: 0.078


HBox(children=(IntProgress(value=0, max=7767), HTML(value='')))


lr = 0.000013
epoch 7 train_loss: 0.065


HBox(children=(IntProgress(value=0, max=239), HTML(value='')))


{'sp_em': 0.159, 'sp_prec': 0.619, 'sp_recall': 0.564, 'sp_f1': 0.54}
epoch 7 eval_recall: 0.564 eval_f1: 0.540 loss: 0.065


HBox(children=(IntProgress(value=0, max=7767), HTML(value='')))


lr = 0.000012
epoch 8 train_loss: 0.051


HBox(children=(IntProgress(value=0, max=239), HTML(value='')))


{'sp_em': 0.155, 'sp_prec': 0.612, 'sp_recall': 0.562, 'sp_f1': 0.535}
epoch 8 eval_recall: 0.562 eval_f1: 0.535 loss: 0.051


HBox(children=(IntProgress(value=0, max=7767), HTML(value='')))


lr = 0.000011
epoch 9 train_loss: 0.046


HBox(children=(IntProgress(value=0, max=239), HTML(value='')))


{'sp_em': 0.155, 'sp_prec': 0.604, 'sp_recall': 0.534, 'sp_f1': 0.511}
epoch 9 eval_recall: 0.534 eval_f1: 0.511 loss: 0.046


HBox(children=(IntProgress(value=0, max=7767), HTML(value='')))


lr = 0.000010
epoch 10 train_loss: 0.035


HBox(children=(IntProgress(value=0, max=239), HTML(value='')))


{'sp_em': 0.138, 'sp_prec': 0.577, 'sp_recall': 0.646, 'sp_f1': 0.548}
epoch 10 eval_recall: 0.646 eval_f1: 0.548 loss: 0.035


HBox(children=(IntProgress(value=0, max=7767), HTML(value='')))


lr = 0.000009
epoch 11 train_loss: 0.029


HBox(children=(IntProgress(value=0, max=239), HTML(value='')))


{'sp_em': 0.172, 'sp_prec': 0.596, 'sp_recall': 0.556, 'sp_f1': 0.529}
epoch 11 eval_recall: 0.556 eval_f1: 0.529 loss: 0.029


HBox(children=(IntProgress(value=0, max=7767), HTML(value='')))


lr = 0.000008
epoch 12 train_loss: 0.026


HBox(children=(IntProgress(value=0, max=239), HTML(value='')))


{'sp_em': 0.121, 'sp_prec': 0.552, 'sp_recall': 0.589, 'sp_f1': 0.515}
epoch 12 eval_recall: 0.589 eval_f1: 0.515 loss: 0.026


HBox(children=(IntProgress(value=0, max=7767), HTML(value='')))


lr = 0.000007
epoch 13 train_loss: 0.022


HBox(children=(IntProgress(value=0, max=239), HTML(value='')))


{'sp_em': 0.138, 'sp_prec': 0.576, 'sp_recall': 0.537, 'sp_f1': 0.505}
epoch 13 eval_recall: 0.537 eval_f1: 0.505 loss: 0.022


HBox(children=(IntProgress(value=0, max=7767), HTML(value='')))


lr = 0.000006
epoch 14 train_loss: 0.021


HBox(children=(IntProgress(value=0, max=239), HTML(value='')))


{'sp_em': 0.167, 'sp_prec': 0.588, 'sp_recall': 0.557, 'sp_f1': 0.519}
epoch 14 eval_recall: 0.557 eval_f1: 0.519 loss: 0.021


HBox(children=(IntProgress(value=0, max=7767), HTML(value='')))


lr = 0.000004
epoch 15 train_loss: 0.015


HBox(children=(IntProgress(value=0, max=239), HTML(value='')))


{'sp_em': 0.167, 'sp_prec': 0.606, 'sp_recall': 0.578, 'sp_f1': 0.538}
epoch 15 eval_recall: 0.578 eval_f1: 0.538 loss: 0.015


HBox(children=(IntProgress(value=0, max=7767), HTML(value='')))


lr = 0.000003
epoch 16 train_loss: 0.010


HBox(children=(IntProgress(value=0, max=239), HTML(value='')))


{'sp_em': 0.167, 'sp_prec': 0.592, 'sp_recall': 0.604, 'sp_f1': 0.54}
epoch 16 eval_recall: 0.604 eval_f1: 0.540 loss: 0.010


HBox(children=(IntProgress(value=0, max=7767), HTML(value='')))


lr = 0.000002
epoch 17 train_loss: 0.006


HBox(children=(IntProgress(value=0, max=239), HTML(value='')))


{'sp_em': 0.163, 'sp_prec': 0.602, 'sp_recall': 0.587, 'sp_f1': 0.54}
epoch 17 eval_recall: 0.587 eval_f1: 0.540 loss: 0.006


HBox(children=(IntProgress(value=0, max=7767), HTML(value='')))


lr = 0.000001
epoch 18 train_loss: 0.003


HBox(children=(IntProgress(value=0, max=239), HTML(value='')))


{'sp_em': 0.138, 'sp_prec': 0.589, 'sp_recall': 0.61, 'sp_f1': 0.542}
epoch 18 eval_recall: 0.610 eval_f1: 0.542 loss: 0.003


HBox(children=(IntProgress(value=0, max=7767), HTML(value='')))


lr = 0.000000
epoch 19 train_loss: 0.004


HBox(children=(IntProgress(value=0, max=239), HTML(value='')))


{'sp_em': 0.142, 'sp_prec': 0.597, 'sp_recall': 0.608, 'sp_f1': 0.543}
epoch 19 eval_recall: 0.608 eval_f1: 0.543 loss: 0.004


In [38]:
trained_baseline = baseline_model()
trained_baseline.load_state_dict(torch.load("Models/baseline_models_with_scheduler/model_epoch8_eval_em:0.198_precision:0.603_recall:0.588_f1:0.545_loss:0.031.m"))

I0724 08:26:42.599873 140341672732480 configuration_utils.py:152] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json from cache at /root/.cache/torch/transformers/8a3b1cfe5da58286e12a0f5d7d182b8d6eca88c08e26c332ee3817548cf7e60a.f12a4f986e43d8b328f5b067a641064d67b91597567a06c7b122d1ca7dfd9741
I0724 08:26:42.603232 140341672732480 configuration_utils.py:169] Model config {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pad_token_id": 0,
  

IncompatibleKeys(missing_keys=[], unexpected_keys=[])

In [39]:
sp_golds = training_data['QUESTIONS'].apply(lambda x: x[0]['SHINT_']).tolist()
batches = eval_preprocessing(training_data, train_dataset)

trained_network.to("cuda")
train_pred, train_obs = eval(trained_network, batches, 0, sp_golds, 0.001)


HBox(children=(IntProgress(value=0, max=882), HTML(value='')))


{'sp_em': 0.895, 'sp_prec': 0.97, 'sp_recall': 0.955, 'sp_f1': 0.958}
epoch 0 eval_recall: 0.955 eval_f1: 0.958 loss: 0.001


In [40]:
sp_golds = validation_data['QUESTIONS'].apply(lambda x: x[0]['SHINT_']).tolist()
dev_batches = eval_preprocessing(validation_data, dev_dataset)
dev_preds, dev_obs = eval(trained_network, dev_batches, 0, sp_golds, 0.001)

HBox(children=(IntProgress(value=0, max=247), HTML(value='')))


{'sp_em': 0.194, 'sp_prec': 0.599, 'sp_recall': 0.584, 'sp_f1': 0.541}
epoch 0 eval_recall: 0.584 eval_f1: 0.541 loss: 0.001


In [41]:
sp_golds = test_data['QUESTIONS'].apply(lambda x: x[0]['SHINT_']).tolist()
test_batches = eval_preprocessing(test_data, test_dataset)
test_preds, test_obds = eval(trained_network, test_batches, 0, sp_golds, 0.001)

HBox(children=(IntProgress(value=0, max=193), HTML(value='')))


{'sp_em': 0.212, 'sp_prec': 0.643, 'sp_recall': 0.553, 'sp_f1': 0.55}
epoch 0 eval_recall: 0.553 eval_f1: 0.550 loss: 0.001


## Training Data Error Analysis

In [42]:
validation_data_with_performance = datapreprocessing(validation_data, True)
training_data_with_performance = datapreprocessing(training_data, True)
test_data_with_performance = datapreprocessing(test_data, True)

In [43]:
training_data_with_performance['train_pred'] = train_pred
training_data_with_performance['train_obs'] = train_obs

In [44]:
correct_sp = []
for i in range(training_data_with_performance.shape[0]):
    para = training_data_with_performance['Sentence_List'][i]
    sen = []
    for index in training_data_with_performance['train_pred'][i]:
        sen.append(para[index])
    correct_sp.append(sen)
training_data_with_performance['Pred_List'] = correct_sp
correct_sp = []
for i in range(training_data_with_performance.shape[0]):
    para = training_data_with_performance['Sentence_List'][i]
    sen = []
    for index in training_data_with_performance['train_obs'][i]:
        sen.append(para[index])
    correct_sp.append(sen)
training_data_with_performance['Obs_List'] = correct_sp

In [45]:
training_data_with_performance.drop(['SE_Index', 'Label', 'train_pred', 'train_obs'], axis=1, inplace=True)

In [46]:
train_mismatch = training_data_with_performance[training_data_with_performance['Pred_List'] != training_data_with_performance['Obs_List']]
train_mismatch

Unnamed: 0,Question,Sentence_List,Length,Pred_List,Obs_List
1,苏东坡出生于哪一年?,"[嘉佑二年（1057年），, 苏轼才20岁，, 与弟弟苏辙一同进京参加会考，, 苏轼中进士第...",35,[苏轼才20岁，],"[嘉佑二年（1057年），, 苏轼才20岁，]"
2,苏东坡和谁一起进京参加会考?,"[嘉佑二年（1057年），, 苏轼才20岁，, 与弟弟苏辙一同进京参加会考，, 苏轼中进士第...",35,[与弟弟苏辙一同进京参加会考，],"[苏轼才20岁，, 与弟弟苏辙一同进京参加会考，]"
8,在苏东坡被王安石诬陷时，谁为他说话？,"[嘉佑二年（1057年），, 苏轼才20岁，, 与弟弟苏辙一同进京参加会考，, 苏轼中进士第...",35,[范镇极辩苏轼贩盐之诬，],"[范镇极辩苏轼贩盐之诬，, 并愿意退休负责。]"
12,苏东坡哪一年离开海南岛?,"[元祐元年（1086年），, 宋哲宗即位，, 高太皇太后垂帘听政，, 回朝任礼部郎中、中书舍...",32,"[\n绍圣元年（1094年）被哲宗贬谪至惠州、儋州（海南岛）。, 下诏让苏轼北还。]","[\n绍圣元年（1094年）被哲宗贬谪至惠州、儋州（海南岛）。, \n元符三年（1100年）..."
13,苏东坡死于哪一年?,"[元祐元年（1086年），, 宋哲宗即位，, 高太皇太后垂帘听政，, 回朝任礼部郎中、中书舍...",32,[七月二十八日于常州孙氏馆病卒，],"[\n建中靖国元年（1101年），, 七月二十八日于常州孙氏馆病卒，]"
...,...,...,...,...,...
829,青河大学时代在哪一所学校念书?,"[为倾听新住民心声，, 内政部移民署今(18)日下午，, 在台南市举办新住民座谈会，, 有2...",56,"[青河毕业于越南河内国家大学法文系，, 青河得以持续进修，]","[青河毕业于越南河内国家大学法文系，, 青河得以持续进修，, 她不仅取得国立成功大学硕士学位，]"
831,青河的硕士学位在哪一个国家念的?,"[为倾听新住民心声，, 内政部移民署今(18)日下午，, 在台南市举办新住民座谈会，, 有2...",56,"[青河毕业于越南河内国家大学法文系，, 青河得以持续进修，]","[青河毕业于越南河内国家大学法文系，, 青河得以持续进修，, 她不仅取得国立成功大学硕士学位，]"
867,如果我使用了日本买的眼药水却发生过敏红肿，是否可以申请药害救济?,"[「目睭花花，匏仔看做菜瓜」，, 许多民众如果觉得眼睛雾雾时，, 常常会自行购买眼药水来缓解...",39,"[透过网购或国外带回的眼药水，, 并不属于我国合法药物，, 并不适用我国药害救济制度喔!\n...","[透过网购或国外带回的眼药水，, 并不适用我国药害救济制度喔!\n\n 食药署为..."
870,目前在台湾出现的肠病毒有哪几型?,"[国内肠病毒轻症疫情持续上升，, 另新增1例肠病毒71型并发重症病例。, 疾病管制署再次呼吁...",42,"[以感染肠病毒71型为多(28例)，, 其他分别感染肠病毒D68型、克沙奇A6型、A10型(...","[今(2019)年累计37例肠病毒并发重症病例，, 以感染肠病毒71型为多(28例)，, 其..."


#20 (Pred:[9], Actual:[])

Question:「阿拉伯之春」运动中，走上街头的民众的诉求为何? <br>
Predicted SP: 只有突尼西亚成为阿拉伯之春中，<br>
Actual: None

Comment: While the predicted SP is incorrect, I found out that there is supporting evidence in the paragraph. (...要求推翻本国的专制政体的行动)This might be a case of incorrect input.

#70 (Pred:[11], Actual:[])

Question: 第二次签订的北美贸易协定从签署至生效过了几日? <br>
Predicted SP: 美国、墨西哥和加拿大就更新北美自由贸易协定达成一致，<br>
Actual: None

Comment: Same as #20. (美国、加拿大及墨西哥在1992年8月12日签署了关于三国间全面贸易的协议。...，北美自由贸易协议于1994年1月1日正式生效。)

#156 (Pred:[1], Actual:[])

Question: 聊天机器人仰赖哪些方法让回答愈来愈准确? <br>
Predicted SP: 麻省理工学院（MIT）人工智慧实验室早在1966年即研发出名为「Eliza」的机器人， <br>
Actual: None <br>

Comment: Same (聊天机器人的作答准确度要透过程式化的方法改善)

#284 (Pred:[3], Actual:[])

Question: 不可再生能源的意义是什么？ <br>
Predicted SP: 许多这些形式可以很容易转化为另一种的帮助下， <br>
Actual: None <br>

Comment: Same (是无法经过短时间内再生的能源，而且它们的消耗速度远远超过它们再生的速度)

#324 (Pred:[4], Actual:[])

Question: 伊甸基金會成立的宗旨為何? <br>
Predicted SP: 因著上帝的呼召及一颗爱身心障碍者的同理心，<br>
Actual: None <br>

Comment: No SP in the paragraph. But I think SP is pretty close to being a supporting evidence. This 
must be a borderline case.

#370 (Pred:[12,25], Actual:[25])

Question: 三大健康照护体系保险制度中，政府涉入程度低的是哪一种？<br>
Predicted SP: 公医制（政府介入最多）：以英国为代表。 AND 自由市场（政府一般不介入）：以2013年前的美国为代表。<br>
Actual: 公医制（政府介入最多）：以英国为代表。<br>

Comment: I think this is a very reasonable mismatch. As two supporting evidences are very similar
syntax-wise but drastically different in meaning.

#371 (Pred:[12,25], Actual:[12])

Question: 三大健康照護體系保險制度中，政府涉入程度高的是哪一種？ <br>
Predicted SP: 公医制（政府介入最多）：以英国为代表。 AND 自由市场（政府一般不介入）：以2013年前的美国为代表。<br>
Actual: 公医制（政府介入最多）：以英国为代表。<br>

Comment: Same as #370

#395 (Pred:[10], Actual:[])

Question: 熬夜是否能减低得到癌症的风险? <br>
Predicted SP: 皆强烈建议减少或避免动物性食品摄取， <br>
Actual: None <br>

Comment: Another potential case of incorrect input. In the paragraph I found this sentence (所以防癌守则：...，注重睡眠品质)

#449 (Pred:[2], Actual:[])

Question: 高屏地区国庆烟火试放管制时间是从晚上几点开始？ <br>
Predicted SP: 屏东县政府表示24号当天屏东河滨公园将管制不开放， <br>
Actual: None <br>

Comment: Another similar case. This time I strongly believe this is an incorrect input. 
当晚7时并会进行全面清场 <- This is sufficient to be a supporting evidence

#502 (Pred:[7], Actual:[])

Question: 为何圣伯多禄大殿只能重建不能整修就好? <br>
Predicted SP: 教宗犹利二世决定重建圣伯多禄大殿 <br>
Actual: None 

Comment: Another similar case. (无疑再改动有机会让建筑倒塌)

#630 (Pred:[62], Actual:[])

Question: 毛笔、铅笔、钢笔，这三种笔中哪个笔尖的硬度高？ <br>
Predicted SP: 更进一步看：我们无论使用那一种笔，<br>
Actual: None <br>

Comment: Again, I think this counts as a supporting evidence (钢笔的笔尖用金属制成，弹性大，硬度高)

#731 (Pred:[9], Actual:[])

Question: 为什么古埃及人要把死人做成木乃伊? <br>
Predicted SP: 是做什么用的呢？ <br>
Actual: None <br>

Comment: I think this counts (古埃及人相信：人死后只要把遗体保存好，就可以在另一个世界得到永生。)

#874 (Pred:[22], Actual:[])

Question: 要如何降低肠病毒的传播风险？
Predicted SP: 今(2019)年累计37例肠病毒并发重症病例，
Actual: None

Comment: No doubt, these are supporting evidences (应加强居家环境、教室及游乐设施等的通风、整洁与消毒，并教导学童落实「湿、搓、冲、捧、擦」正确洗手步骤，及生病在家休息等良好卫生观念，)

## Validation Data Error Analysis

In [47]:
validation_data_with_performance['dev_pred'] = dev_preds
validation_data_with_performance['dev_obs'] = dev_obs

In [48]:
correct_sp = []
for i in range(validation_data_with_performance.shape[0]):
    para = validation_data_with_performance['Sentence_List'][i]
    sen = []
    for index in validation_data_with_performance['dev_pred'][i]:
        sen.append(para[index])
    correct_sp.append(sen)
validation_data_with_performance['Pred_List'] = correct_sp
correct_sp = []
for i in range(validation_data_with_performance.shape[0]):
    para = validation_data_with_performance['Sentence_List'][i]
    sen = []
    for index in validation_data_with_performance['dev_obs'][i]:
        sen.append(para[index])
    correct_sp.append(sen)
validation_data_with_performance['Obs_List'] = correct_sp

In [49]:
validation_data_with_performance.drop(['SE_Index', 'Label'], axis=1, inplace=True)

In [50]:
validation_data_with_performance.drop(['Sentence_List', 'dev_pred', 'dev_obs'], axis=1, inplace=True)

In [51]:
dev_mismatch = validation_data_with_performance[validation_data_with_performance['Obs_List'] != validation_data_with_performance['Pred_List']]

In [52]:
dev_mismatch

Unnamed: 0,Question,Length,Pred_List,Obs_List
0,苏东坡在中国历史上，是哪一个朝代的人？,36,[苏轼（1037年1月8日－1101年8月24日），],"[苏轼（1037年1月8日－1101年8月24日），, 北宋时著名的文学家、政治家、艺术家、..."
1,苏东坡是中国哪个省份的人？,36,"[苏轼（1037年1月8日－1101年8月24日），, 眉州眉山（今四川省眉山市）人，]","[苏轼（1037年1月8日－1101年8月24日），, 眉州眉山（今四川省眉山市）人，, 号..."
2,苏东坡的爸爸叫什么名字?,36,[苏轼（1037年1月8日－1101年8月24日），],"[苏轼（1037年1月8日－1101年8月24日），, 号东坡居士、铁冠道人。, 苏轼的散文..."
3,苏文忠公指的是谁?,36,[苏轼（1037年1月8日－1101年8月24日），],"[苏轼（1037年1月8日－1101年8月24日），, 加赐谥号文忠，]"
4,《苏文忠公全集》是由何人编纂？,36,"[苏轼（1037年1月8日－1101年8月24日），, 编有《苏文忠公全集》。]","[宋人王宗稷收其作品，, 编有《苏文忠公全集》。]"
...,...,...,...,...
237,内政部消防署提出防范纵火方法「三从四得」，请问是哪「三从」?,33,"[可以依照「三从四得」的方法防范纵火，, 民众可以依照「三从四得」的方法，, 防范居家或社区...","[\n内政部表示，, 民众可以依照「三从四得」的方法，, 三从就是「从消除死角做起」、「从守..."
238,内政部消防署提出防范纵火方法「三从四得」，请问是哪「四得」?,33,"[可以依照「三从四得」的方法防范纵火，, 民众可以依照「三从四得」的方法，, 防范居家或社区...","[\n内政部表示，, 民众可以依照「三从四得」的方法，, 四得就是「可疑状况要认得」、「大门..."
244,这起诈骗案件的受害人是男性还是女性？,39,"[经查系单身女子曾女数月前上网至「异性」交友网站，, 被骗曾姓女子信以为真，, 遂依指示至南...","[南港分局南港派出所于108年11月5日接获民众报案称其友人疑似遭受诈骗案件，, 经查系单身..."
245,这起诈骗案件的诈骗金额是多少？,39,[警方查证后确认系一椿网路交友爱情诈骗案件，],"[南港分局南港派出所于108年11月5日接获民众报案称其友人疑似遭受诈骗案件，, 当日接获该..."


In [53]:
%store dev_mismatch

Stored 'dev_mismatch' (DataFrame)


## BERT+LSTM

In [33]:
class FGC_LSTM_Network(nn.Module):

    def __init__(self, input_size, hidden_size, num_layers):
        
        super(FGC_LSTM_Network, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-chinese')
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.h0 = nn.Parameter(torch.FloatTensor(hidden_size).uniform_(-0.1, 0.1))
        self.c0 = nn.Parameter(torch.FloatTensor(hidden_size).uniform_(-0.1, 0.1))
        self.linear = nn.Linear(hidden_size*2, 1) 
        # (H-state and c_state must be converted into 3d in forward function)
        
    def forward_nn(self, batch):    
        # batch['ids'] = (batch_size*number of sentence, sent_len)
        # batch['segment_ids'] = (batch_size*number of sentence, sent_len)
        # batch['mask_ids'] = (batch_size*number of sentence, sent_len)
        # pooler_output = (batch_size, 768)
        # hidden_state = (batch_size, sent_len, 768)
        # output = (batch_size, 1)
        
        #h0 = torch.zeros(self.num_layers*2, batch['ids'].shape[0], self.hidden_size).to(device)
        #c0 = torch.zeros(self.num_layers*2, batch['ids'].shape[0], self.hidden_size).to(device)
        
        
        batch_size = batch['ids'].shape[0]
        max_sent_size = batch['ids'].shape[2]
        
        #h0 = self.h0.expand(batch_size, self.num_layers*2, -1)
        #c0 = self.c0.expand(batch_size, self.num_layers*2, -1)
        ids = batch['ids'].view(-1, max_sent_size)
        mask_ids = batch['mask_ids'].view(-1, max_sent_size)
        segment_ids = batch['segment_ids'].view(-1, max_sent_size)
        
        hidden_state, pooler_output = self.bert(ids, mask_ids, segment_ids)   
        
        hidden_state = hidden_state[:,0].view(batch_size, -1, 768)
        
        lstm_output, (hn, cn) = self.lstm(hidden_state)
        
        linear_output = self.linear(lstm_output).squeeze(-1)
        
        return linear_output
    
    def forward(self, batch):
        output = self.forward_nn(batch)
        labels = batch['labels']
        loss_fn = nn.BCEWithLogitsLoss(weight=batch['sentence_mask'])
        loss = loss_fn(output, labels)
        return loss
     
    """
    def loss_fn(self, batch):
        
        batch_size = batch['ids'].shape[0]
        
        if torch.zeros(max(size_list) - min(size_list), batch.shape[2]) in batch:
            loss_fn = nn.BCEWithLogitsLoss(w)
            
        else:
            loss_fn = nn.BCEWithLogitsLoss()
        output = self.forward(batch)
        target = batch['labels'].float().to(device)
        
        return loss_fn(output, target)
    """
    
    def _predict(self, batch):
        
        with torch.no_grad():
            output = self.forward_nn(batch)
            scores = torch.sigmoid(output)
            scores = scores.cpu().numpy().tolist()
        
        return scores
    
    def predict_fgc(self, batch, threshold=0.5):
        scores = self._predict(batch)
        max_i = 0
        max_score = 0
        sp = []

        for i, score in enumerate(scores[0]):

            if score > max_score:
                max_i = i
                max_score = score
            if score >= threshold:
                sp.append(i)

        # This is to ensure there's no empty supporting evidences
        if not sp:
            sp.append(max_i)
        return {'sp': sp, 'sp_scores': scores}

## BERT+Aggregation

In [121]:
class bertAgg(nn.Module):
    
    def __init__(self, number_of_sentence, max_sentence_length, baseline):
        
        super(bertAgg, self).__init__()
        self.max_sentence_length = max_sentence_length
        self.number_of_sentence = number_of_sentence
        baseline.load_state_dict(torch.load("Models/baseline_models_with_scheduler/model_epoch8_eval_em:0.198_precision:0.603_recall:0.588_f1:0.545_loss:0.031.m"))
        self.baseline = baseline
        self.bert = BertModel.from_pretrained('bert-base-chinese')
        self.softmax = nn.Softmax(dim=1)
        self.linearAgg = nn.Linear(1536, 1)
        self.qsLinear = nn.Linear(1536, 1)
        
    def forward_nn(self, batch):
        
        # Sent objects into CUDA
        for key in batch:
            if key == 'labels':
                batch[key] = batch[key].to(device).to(torch.float)
            else:
                batch[key] = batch[key].to(device).to(torch.long)

        batch_size = batch['ids'].shape[0]
        padded_zeros = torch.tensor([0] * self.number_of_sentence)
        
        
        # Sent sentences into BERT embeddings
        sentence_ids = batch['ids'].view(-1, self.max_sentence_length)
        sentence_mask_ids = batch['mask_ids'].view(-1, self.max_sentence_length) #(batch, 5)
        sentence_hidden_state, sentence_pooler_output = self.baseline.bert(sentence_ids, sentence_mask_ids)   
        
        # Sent question into BERT embeddings
        question_ids = batch['q_ids']
        question_mask_ids = batch['q_mask_ids']
        question_hidden_state, question_pooler_output = self.baseline.bert(question_ids, question_mask_ids) # (batch, 768)
      
        sentence_mask = batch['sentence_mask']
        sentence_mask = sentence_mask.type(torch.float)
        
        # Aggregate
        sentence_pooler_output = sentence_pooler_output.view(batch_size, -1, 768) # (batch, 5, 768)
        
        target_sentence = sentence_pooler_output[:, self.number_of_sentence // 2, :].unsqueeze(1) # (batch, 1, 768)
        target_sentence = target_sentence.expand(-1, self.number_of_sentence, -1) # (batch, 5, 768)
        
        concatenated = torch.cat((target_sentence, sentence_pooler_output), dim=-1) # (batch, 5, 768*2)
        att_weight = self.linearAgg(concatenated) + (1.0 - sentence_mask) * -10000 #
        #if sentence_pooler_output
        att_weight = self.softmax(att_weight) # (batch, 5)
        aggregated_sentence = torch.matmul(att_weight.transpose(1,2), sentence_pooler_output) # (batch, 1, 768)
        aggregated_sentence = aggregated_sentence.squeeze(1) # (batch, 768)

        # Concatenate question to aggregated sentence
        qs_concatenated = torch.cat((aggregated_sentence, question_pooler_output), dim=-1) # (batch, 1536)
        final_output = self.qsLinear(qs_concatenated) # (batch, 1)
        return final_output, att_weight#final_output, att_weight
        
        
        
#         for i in range(batch_size):
            
#             mini_batch = sentence_pooler_output[i]
#             # Aggregate sentence weights
#             target_sentence = sentence_pooler_output[i, self.number_of_sentence // 2, :]
#             target_sentence_clone = target_sentence.expand(self.number_of_sentence, 768)
#             concatenated = torch.cat((mini_batch, target_sentence_clone), dim=1)
#             target_sentence_aggregated = torch.sum(self.linearAgg(concatenated) * mini_batch, dim=0)
            
#             # Question sentence feedforward
#             question_to_concat = question_pooler_output[i]
#             qs_concatenated = torch.cat((target_sentence_aggregated, question_to_concat))
#             final_output = self.linearAgg(qs_concatenated).squeeze(-1)
#             batch_output.append(final_output)
        
        
#         return torch.tensor(batch_output).reshape(batch_size, 1).to(device)
        
        
    def forward(self, batch):
        
        output, _ = self.forward_nn(batch)
        labels = batch['labels']
        loss_fn = nn.BCEWithLogitsLoss()
        loss = loss_fn(output, labels)
        #print(batch['ids'], output, _)
        #print(tokenizer.convert_ids_to_tokens(batch['ids'][0][2].tolist()))
        #print(tokenizer.convert_ids_to_tokens(batch['ids'][1][2].tolist()))
        #print(batch['ids'])
        #print(output, labels)
        #print(loss)
        return loss
    
    def _predict(self, batch):
        
        with torch.no_grad():
            
            output, att_weight = self.forward_nn(batch)
            scores = torch.sigmoid(output)
            scores = scores.cpu().numpy().tolist()
        
        return scores
    
    def predict_fgc(self, batch, threshold=0.5):
        scores = self._predict(batch)
        max_i = 0
        max_score = 0
        sp = []

        for i, score in enumerate(scores[0]):

            if score > max_score:
                max_i = i
                max_score = score
            if score >= threshold:
                sp.append(i)

        # This is to ensure there's no empty supporting evidences
        if not sp:
            sp.append(max_i)
        return {'sp': sp, 'sp_scores': scores}

## Baseline+Aggregation

In [122]:
class baselineAgg(nn.Module):

    def __init__(self, number_of_sentence, max_sentence_length, BMPATH):
        
        super(bertAgg, self).__init__()
        self.max_sentence_length = max_sentence_length
        self.number_of_sentence = number_of_sentence
        self.baseline = baseline_model().load_state_dict(torch.load(BMPATH))
        self.softmax = nn.Softmax(dim=1)
        
    def forward_nn(self, batch):
        # batch = (batch_size, sentence length)
        # pooler_output = (batch_size, 768)
        # Sent objects into CUDA
        for key in batch:
            if key == 'labels':
                batch[key] = batch[key].to(device).to(torch.float)
            else:
                batch[key] = batch[key].to(device).to(torch.long)

        ids = batch['ids'].view(-1, self.max_sentence_length)
        mask_ids = 
        hidden_state, pooler_output = self.baseline.bert(batch['ids'], batch['mask_ids'], batch['segment_ids'])
        

SyntaxError: invalid syntax (<ipython-input-122-139763ecbec2>, line 22)

In [123]:
new_network = bertAgg(3, 250, baseline)

I0811 07:59:12.381575 139897081689920 configuration_utils.py:152] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json from cache at /root/.cache/torch/transformers/8a3b1cfe5da58286e12a0f5d7d182b8d6eca88c08e26c332ee3817548cf7e60a.f12a4f986e43d8b328f5b067a641064d67b91597567a06c7b122d1ca7dfd9741
I0811 07:59:12.386453 139897081689920 configuration_utils.py:169] Model config {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pad_token_id": 0,
  

In [124]:
new_network.to(device)

bertAgg(
  (baseline): baseline_model(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(21128, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm(torch.Size([768]), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
                (LayerNorm): Layer

In [37]:
test_val_batches = eval_preprocessing(validation_data, dev_dataset)
for data in test_val_batches:
    data['ids'] = data['ids'].unsqueeze(0)
    data['mask_ids'] = data['mask_ids'].unsqueeze(0)
    data['segment_ids'] = data['segment_ids'].unsqueeze(0)
    data['labels'] = data['labels'].unsqueeze(0)

In [37]:
#new_model_eval(new_network, test_val_batches, 0, validation_data['QUESTIONS'].apply(lambda x: x[0]['SHINT_']).tolist(), 0)

## Training Improved Model & Evaluating Performance

In [None]:
def new_model_optim(nn, num_epochs, lr, dataloader):
    param_optimizer = list(nn.bert.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    num_epochs = num_epochs
    num_train_optimization_steps = len(dataloader) * num_epochs
    optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                     num_warmup_steps=int(
                                                         num_train_optimization_steps * 0.1),
                                                     num_training_steps=num_train_optimization_steps)
    return optimizer, scheduler

In [42]:
def _update_sp(metrics, sp_gold, sp_pred):
    tp, fp, fn = 0, 0, 0
        
    for p in sp_pred:
        if p in sp_gold:
            tp += 1
        else:
            fp += 1
    for g in sp_gold:
        if g not in sp_pred:
            fn += 1
            
    precision = 1.0 * tp / (tp + fp) if tp + fp > 0 else 0.0
    recall = 1.0 * tp / (tp + fn) if tp + fn > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
    em = 1.0 if fp + fn == 0 else 0.0
    
    metrics['sp_em'] += em
    metrics['sp_f1'] += f1
    metrics['sp_prec'] += precision
    metrics['sp_recall'] += recall
    
    return precision, recall, f1

In [43]:
def eval_sp_fgc(sp_golds, sp_preds):
    
    metrics = {'sp_em': 0, 'sp_prec': 0, 'sp_recall': 0, 'sp_f1': 0}
    
    assert len(sp_golds) == len(sp_preds)
    
    for sp_gold, sp_pred in zip(sp_golds, sp_preds):
        _update_sp(metrics, sp_gold, sp_pred)
    
    N = len(sp_golds)
    for k in metrics.keys():
        metrics[k] /= N
        metrics[k] = round(metrics[k], 3)
    print(metrics)
    return metrics

In [101]:
def new_model_eval(network, train_batches, dev_batches, current_epoch, train_sp_golds, dev_sp_golds, avg_loss):
    
    network.eval()
    with torch.no_grad():
        dev_sp_preds = []
        train_sp_preds = []
        
        for batch in tqdm(dev_batches):
            out_dct= network.predict_fgc(batch)
    
            dev_sp_preds.append(out_dct['sp'])
            print(out_dct['sp'])
        
        for batch in tqdm(train_batches):
            train_dct = network.predict_fgc(batch)
            train_sp_preds.append(train_dct['sp'])
            print(train_dct['sp'])
            
    dev_metrics = eval_sp_fgc(dev_sp_golds, dev_sp_preds)
    train_metrics = eval_sp_fgc(train_sp_golds, train_sp_preds)
    
    print('epoch %d eval_recall: %.3f eval_f1: %.3f loss: %.3f' % (
            current_epoch, dev_metrics['sp_recall'], dev_metrics['sp_f1'], avg_loss))
    
    print('epoch %d eval_recall: %.3f eval_f1: %.3f loss: %.3f' % (
            current_epoch, train_metrics['sp_recall'], train_metrics['sp_f1'], avg_loss))
    #torch.save(network.state_dict(), "New_Models/model_epoch{0}_eval_em:{1:.3f}_precision:{2:.3f}_recall:{3:.3f}_f1:{4:.3f}_loss:{5:.3f}.m".format(current_epoch, metrics['sp_em'], metrics['sp_prec'], metrics['sp_recall'], metrics['sp_f1'], avg_loss))
    
    return #sp_preds, sp_golds

In [None]:
def new_model_train(network, dataloader, train_batches, dev_batches, num_epochs, lr):
    
    optimizer, scheduler = new_model_optim(network, num_epochs, lr, dataloader)
    
    train_sp_golds = training_data['QUESTIONS'].apply(lambda x: x[0]['SHINT_']).tolist()
    dev_sp_golds = validation_data['QUESTIONS'].apply(lambda x: x[0]['SHINT_']).tolist()
    
    for current_epoch in range(num_epochs):
        network.train()
        running_loss = 0.0
        for batch in tqdm(dataloader):
            optimizer.zero_grad()
            current_loss = network(batch)

            current_loss.backward()
            torch.nn.utils.clip_grad_norm_(network.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            running_loss += current_loss.item()
            
        learning_rate_scalar = scheduler.get_lr()[0]
        print('lr = %f' % learning_rate_scalar)
        avg_loss = running_loss/len(dataloader)
        print('epoch %d train_loss: %.3f' % (current_epoch, avg_loss))
        new_model_eval(network, train_batches, dev_batches, current_epoch, train_sp_golds, dev_sp_golds, avg_loss)

In [82]:
sent_dev_batches = sent_eval_preprocessing(validation_data, sent_dev_dataset)

  from ipykernel import kernelapp as app


In [83]:
sent_train_batches = sent_eval_preprocessing(training_data, sent_train_dataset)

  from ipykernel import kernelapp as app


In [126]:
train_sp_golds = training_data['QUESTIONS'].apply(lambda x: x[0]['SHINT_']).tolist()
dev_sp_golds = validation_data['QUESTIONS'].apply(lambda x: x[0]['SHINT_']).tolist()
new_model_eval(new_network, sent_train_batches, sent_dev_batches, 0, dev_sp_golds, train_sp_golds, 0)

HBox(children=(IntProgress(value=0, max=247), HTML(value='')))

[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]


HBox(children=(IntProgress(value=0, max=882), HTML(value='')))

[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]


KeyboardInterrupt: 

In [70]:
new_model_train(new_network, dataloader_sent_train, sent_train_batches, sent_dev_batches, 20, 0.00002)

HBox(children=(IntProgress(value=0, max=7856), HTML(value='')))

lr = 0.000010
epoch 0 train_loss: 0.267


HBox(children=(IntProgress(value=0, max=247), HTML(value='')))

HBox(children=(IntProgress(value=0, max=882), HTML(value='')))

{'sp_em': 0.04, 'sp_prec': 0.259, 'sp_recall': 0.132, 'sp_f1': 0.168}
{'sp_em': 0.065, 'sp_prec': 0.279, 'sp_recall': 0.151, 'sp_f1': 0.185}
epoch 0 eval_recall: 0.132 eval_f1: 0.168 loss: 0.267
epoch 0 eval_recall: 0.151 eval_f1: 0.185 loss: 0.267


HBox(children=(IntProgress(value=0, max=7856), HTML(value='')))

lr = 0.000020
epoch 1 train_loss: 0.270


HBox(children=(IntProgress(value=0, max=247), HTML(value='')))

HBox(children=(IntProgress(value=0, max=882), HTML(value='')))

{'sp_em': 0.04, 'sp_prec': 0.259, 'sp_recall': 0.132, 'sp_f1': 0.168}
{'sp_em': 0.065, 'sp_prec': 0.279, 'sp_recall': 0.151, 'sp_f1': 0.185}
epoch 1 eval_recall: 0.132 eval_f1: 0.168 loss: 0.270
epoch 1 eval_recall: 0.151 eval_f1: 0.185 loss: 0.270


HBox(children=(IntProgress(value=0, max=7856), HTML(value='')))

KeyboardInterrupt: 

In [None]:
new_dev_batches = window_sentence_preprocessing(validation_data, dev_dataset, 10)
testing_dev_batches = DataLoader(new_dev_batches, batch_size=2, shuffle = False, collate_fn = collate_3d)

In [40]:
# MUST DO THIS BEFORE EVALUATING 
eval_train_batches = eval_preprocessing(training_data, train_dataset)
for data in eval_train_batches:
    data['ids'] = data['ids'].unsqueeze(0)
    data['mask_ids'] = data['mask_ids'].unsqueeze(0)
    data['segment_ids'] = data['segment_ids'].unsqueeze(0)
    data['labels'] = data['labels'].unsqueeze(0)

In [41]:
new_model_eval(new_network, eval_train_batches, 0, training_data['QUESTIONS'].apply(lambda x: x[0]['SHINT_']).tolist(), 0)[1]

HBox(children=(IntProgress(value=0, max=882), HTML(value='')))


{'sp_em': 0.085, 'sp_prec': 0.3, 'sp_recall': 0.172, 'sp_f1': 0.206}
epoch 0 eval_recall: 0.172 eval_f1: 0.206 loss: 0.000


[[18],
 [0, 1],
 [1, 2],
 [5],
 [7, 8, 9, 10, 11, 12, 13, 14],
 [22, 23],
 [27],
 [26, 30],
 [33, 34],
 [1, 3, 4, 5, 6],
 [5, 7],
 [7],
 [7, 8, 11],
 [12, 17],
 [11, 17],
 [11, 19],
 [23],
 [25, 28, 30],
 [25, 28],
 [16, 17],
 [],
 [9, 10],
 [9, 11, 13],
 [0, 2, 6],
 [0],
 [0, 14, 16],
 [0, 14, 16],
 [0, 14, 16, 17, 20, 21, 24, 25, 27, 29],
 [0, 14, 17],
 [0, 14, 20, 21],
 [0, 14, 24, 25],
 [0, 14, 27],
 [0, 14, 29],
 [0, 1],
 [0, 1],
 [1, 2],
 [0, 1, 8, 11],
 [0, 1, 8, 11, 12, 14],
 [15],
 [5, 6],
 [9, 16],
 [11, 16],
 [11, 16],
 [11, 16],
 [0, 2],
 [8, 9],
 [8, 9],
 [0],
 [0, 1],
 [0, 4],
 [0, 6, 7, 8, 9],
 [0, 6, 7, 8, 9],
 [18, 21],
 [14],
 [14, 15],
 [21, 23, 24],
 [0, 2],
 [3],
 [0, 4, 5],
 [0, 4],
 [6],
 [11, 12],
 [11, 12],
 [11, 12],
 [11],
 [0],
 [0, 3],
 [0, 3],
 [10, 11],
 [11, 12, 13, 14],
 [],
 [0, 1],
 [0, 1],
 [0, 4],
 [0, 41],
 [0, 41],
 [0, 25, 41],
 [0, 5, 7],
 [0, 8],
 [0, 7],
 [0, 8],
 [1, 2],
 [1, 2],
 [3, 4],
 [3, 4],
 [5, 6, 7, 8],
 [5, 6, 8],
 [9, 10],
 [9, 10]

In [155]:
train(new_network, dataloader_train_3d, a, 20, 0.00002)

HBox(children=(IntProgress(value=0, max=1807), HTML(value='')))


lr = 0.000005
epoch 0 train_loss: 0.122


HBox(children=(IntProgress(value=0, max=5), HTML(value='')))




TypeError: string indices must be integers

In [98]:
nn.Parameter(torch.FloatTensor(768).uniform_(-0.1, 0.1)).expand(2, 10, -1).shape

torch.Size([2, 10, 768])

In [103]:
a = iter(dataloader_train_3d).next()['ids']
a.shape

torch.Size([2, 7, 68])

In [109]:
a[1]

tensor([[ 101, 4397,  342, 1367, 2548, 1355, 4385, 4342, 4342, 4500,  784,  720,
         3341, 7157, 4635, 6009, 8024, 2130, 2768, 1400, 1315, 5543, 3175,  912,
         4638, 3123, 6822, 5632, 2346, 1673, 2349,  775, 1358, 4635, 6009, 1920,
         7623, 8043,  102, 7479, 5865, 6814,  782, 4638, 2608, 2552,  680, 3675,
         1213, 8024,  102,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0],
        [ 101, 4397,  342, 1367, 2548, 1355, 4385, 4342, 4342, 4500,  784,  720,
         3341, 7157, 4635, 6009, 8024, 2130, 2768, 1400, 1315, 5543, 3175,  912,
         4638, 3123, 6822, 5632, 2346, 1673, 2349,  775, 1358, 4635, 6009, 1920,
         7623, 8043,  102, 4397,  679,  852, 2868, 2245,  749,  782, 5102, 4638,
         4761, 6399, 7566, 1818, 8024,  102,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0],
        [ 101, 4397,  342, 1367, 2548, 1355, 4385, 4342, 4342, 4500,  784,

In [58]:
new_network.loss(iter(dataloader_train_3d).next())[0].shape

torch.Size([2, 10, 1])

In [61]:
new_network.loss(iter(dataloader_train_3d).next())[1].shape

torch.Size([2, 10, 1])

In [67]:
iter(dataloader_train_3d).next()

{'ids': tensor([[[ 101, 1380, 2577,  ...,    0,    0,    0],
          [ 101, 1380, 2577,  ...,    0,    0,    0],
          [ 101, 1380, 2577,  ...,    0,    0,    0],
          ...,
          [ 101, 1380, 2577,  ...,    0,    0,    0],
          [ 101, 1380, 2577,  ...,    0,    0,    0],
          [ 101, 1380, 2577,  ...,    0,    0,    0]],
 
         [[ 101, 5018,  671,  ...,    0,    0,    0],
          [ 101, 5018,  671,  ...,    0,    0,    0],
          [ 101, 5018,  671,  ...,    0,    0,    0],
          ...,
          [ 101, 5018,  671,  ...,    0,    0,    0],
          [ 101, 5018,  671,  ...,    0,    0,    0],
          [ 101, 5018,  671,  ...,    0,    0,    0]]], device='cuda:0'),
 'mask_ids': tensor([[[0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          ...,
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0]],
 
         [[0, 0, 0,  ..., 0, 0, 0],
          

## Evaluate Improved Model

In [100]:
validation_data_with_performance_0 = datapreprocessing(validation_data, True)
training_data_with_performance_0 = datapreprocessing(training_data, True)
test_data_with_performance_0 = datapreprocessing(test_data, True)

In [110]:
train_pred_0, train_obs_0 = new_model_eval(new_network, b, 0, training_data['QUESTIONS'].apply(lambda x: x[0]['SHINT_']).tolist()
, 0.001)


HBox(children=(IntProgress(value=0, max=882), HTML(value='')))


{'sp_em': 0.18, 'sp_prec': 0.573, 'sp_recall': 0.381, 'sp_f1': 0.43}
epoch 0 eval_recall: 0.381 eval_f1: 0.430 loss: 0.001


In [115]:
training_data_with_performance_0['train_pred'] = train_pred_0
training_data_with_performance_0['train_obs'] = train_obs_0

In [116]:
correct_sp = []
for i in range(training_data_with_performance_0.shape[0]):
    para = training_data_with_performance_0['Sentence_List'][i]
    sen = []
    for index in training_data_with_performance_0['train_pred'][i]:
        sen.append(para[index])
    correct_sp.append(sen)
training_data_with_performance_0['Pred_List'] = correct_sp
correct_sp = []
for i in range(training_data_with_performance_0.shape[0]):
    para = training_data_with_performance_0['Sentence_List'][i]
    sen = []
    for index in training_data_with_performance_0['train_obs'][i]:
        sen.append(para[index])
    correct_sp.append(sen)
training_data_with_performance_0['Obs_List'] = correct_sp

In [117]:
training_data_with_performance_0.drop(['SE_Index', 'Label', 'train_pred', 'train_obs'], axis=1, inplace=True)

In [133]:
train_mismatch = training_data_with_performance_0[training_data_with_performance_0['Pred_List'] != training_data_with_performance_0['Obs_List']]
train_mismatch

Unnamed: 0,Question,Sentence_List,Length,Pred_List,Obs_List
0,苏东坡的老家在哪?,"[嘉佑二年（1057年），, 苏轼才20岁，, 与弟弟苏辙一同进京参加会考，, 苏轼中进士第...",35,[苏轼才20岁，],[苏轼回蜀守丧，]
1,苏东坡出生于哪一年?,"[嘉佑二年（1057年），, 苏轼才20岁，, 与弟弟苏辙一同进京参加会考，, 苏轼中进士第...",35,[苏轼才20岁，],"[嘉佑二年（1057年），, 苏轼才20岁，]"
2,苏东坡和谁一起进京参加会考?,"[嘉佑二年（1057年），, 苏轼才20岁，, 与弟弟苏辙一同进京参加会考，, 苏轼中进士第...",35,[苏轼才20岁，],"[苏轼才20岁，, 与弟弟苏辙一同进京参加会考，]"
4,苏东坡与曾巩是否同为欧阳修的学生?,"[嘉佑二年（1057年），, 苏轼才20岁，, 与弟弟苏辙一同进京参加会考，, 苏轼中进士第...",35,[苏轼以一篇《刑赏忠厚之至论》的论文得到考官梅尧臣的青睐，],"[欧阳修亦十分赞赏，, 原本欲拔擢为第一，, 但又怕该文为自己的门生曾巩所作，, 为了避嫌，..."
5,苏东坡与王安石在职场上是否理念相同?,"[嘉佑二年（1057年），, 苏轼才20岁，, 与弟弟苏辙一同进京参加会考，, 苏轼中进士第...",35,[苏轼以一篇《刑赏忠厚之至论》的论文得到考官梅尧臣的青睐，],"[反对王安石变法中的一些作为，, 王安石于是屡次在神宗面前诋毁苏轼，]"
...,...,...,...,...,...
875,小明感染肠病毒后痊瘉一周后是否就不会再传染给别人了?,"[国内肠病毒轻症疫情持续上升，, 另新增1例肠病毒71型并发重症病例。, 疾病管制署再次呼吁...",42,[请尽速送大医院接受治疗。],[痊愈后肠病毒会随著粪便排出达8到12周之久。]
877,这起诈骗案件发生于台北市哪一个行政区？,"[松山分局三民派出所警员白小帆，, 警员张秀秀。, 于一百零八年十月三十一日十五时至十七时，...",32,"[松山分局三民派出所警员白小帆，, 于一百零八年十月三十一日十五时至十七时，, 疑似遭到诈骗...","[松山分局三民派出所警员白小帆，, 疑似遭到诈骗两元，]"
878,这起诈骗案件发生的日期是哪一天？,"[松山分局三民派出所警员白小帆，, 警员张秀秀。, 于一百零八年十月三十一日十五时至十七时，...",32,"[疑似遭到诈骗两元，, 到场后，发现系一名年约八十二岁陈姓妇人\n欲提领新台币三十三万元。]","[于一百零八年十月三十一日十五时至十七时，, 疑似遭到诈骗两元，]"
880,这起诈骗案件的受害人姓氏为何？,"[松山分局三民派出所警员白小帆，, 警员张秀秀。, 于一百零八年十月三十一日十五时至十七时，...",32,[疑似遭到诈骗两元，],"[疑似遭到诈骗两元，, 到场后，发现系一名年约八十二岁陈姓妇人\n欲提领新台币三十三万元。]"


In [None]:
i = 17
print(train_mismatch.loc[i]['Question'])
print(train_mismatch.loc[i]['Pred_List'])
print(train_mismatch.loc[i]['Obs_List'])
train_mismatch.loc[i]['Sentence_List']

In [136]:
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')


I0724 09:52:32.290062 140341672732480 tokenization_utils.py:375] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt from cache at /root/.cache/torch/transformers/8a0c070123c1f794c42a29c6904beb7c1b8715741e235bee04aca2c7636fc83f.9b42061518a39ca00b8b52059fd2bede8daa613f8a8671500e518a8c29de8c00


In [154]:
counter = 0
for a in dataloader_train_3d:
    for b in a['']:
        for c in b:
            print(c)
            #print(tokenizer.convert_ids_to_tokens(c.tolist()))
    counter += 1
    if counter == 5:
        break

tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(1., device='cuda:0')
tensor(0., device='c