In [1]:
from dataset.build_dataset import build_dataset
from readingcomprehension.models.luke import LukeForReadingComprehensionWithLoss
import mindspore.dataset as ds
import os
import numpy as np
from mindspore.mindrecord import FileWriter
import json

# Squad 数据集

In [2]:
# Path to the preprocessed feature array; every entry is decoded with json.loads in the following cell.
FEATURES_FILE = "./data/json_features.npy"
features = np.load(FEATURES_FILE)

In [3]:
# Decode each JSON-encoded feature record into a plain dict, preserving order.
list_dict = [json.loads(record) for record in features]

In [4]:
SQUAD_MINDRECORD_FILE = "./data/squad_features.mindrecord"
# Number of samples buffered in memory before each write_raw_data call.
WRITE_BATCH_SIZE = 10

# Every feature is stored as a variable-length int32 vector; the same names are used
# for the schema and for the per-sample dicts so the two cannot drift apart.
FEATURE_NAMES = (
    "word_ids",
    "word_segment_ids",
    "word_attention_mask",
    "entity_ids",
    "entity_position_ids",
    "entity_segment_ids",
    "entity_attention_mask",
    "start_positions",
    "end_positions",
)

# Remove leftovers from a previous run. Each file is checked on its own so that a
# missing ".db" companion does not raise FileNotFoundError and a stale one is not left behind.
for stale_path in (SQUAD_MINDRECORD_FILE, SQUAD_MINDRECORD_FILE + ".db"):
    if os.path.exists(stale_path):
        os.remove(stale_path)

writer = FileWriter(file_name=SQUAD_MINDRECORD_FILE, shard_num=1)

data_schema = {name: {"type": "int32", "shape": [-1]} for name in FEATURE_NAMES}
writer.add_schema(data_schema, "it is a preprocessed squad dataset")

# Buffer samples and flush them in fixed-size batches.
data = []
for item in list_dict:
    sample = {name: np.array(item[name], dtype=np.int32) for name in FEATURE_NAMES}
    data.append(sample)
    if len(data) == WRITE_BATCH_SIZE:
        writer.write_raw_data(data)
        data = []

# Flush the final, partially filled batch.
if data:
    writer.write_raw_data(data)

writer.commit()

MSRStatus.SUCCESS

In [5]:
# Read the freshly written MindRecord file back and count its samples as a sanity check.
data_set = ds.MindDataset(dataset_file=SQUAD_MINDRECORD_FILE)
count = sum(1 for _ in data_set.create_dict_iterator())
print("Got {} samples".format(count))

Got 269 samples


# model

In [6]:
from readingcomprehension.models.luke import LukeForReadingComprehension
import mindspore.common.dtype as mstype
from model.bert_model import BertConfig
from mindspore import context
from model.luke import LukeModel
import numpy as np
from mindspore import Tensor, context
from mindspore import dtype as mstype
import mindspore.ops as ops
import mindspore.nn as nn
# NOTE(review): graph mode is selected here, yet the demo cells below call `.construct(...)`
# directly on cells whose code uses `Tensor.asnumpy()`. Presumably that only works because the
# direct call bypasses graph compilation - confirm before invoking those cells via `Cell.__call__`.
context.set_context(mode=context.GRAPH_MODE, device_target="CPU")

In [7]:
# Default BertConfig, shared by the model and by the embedding cells built further down.
luke_net_cfg = BertConfig()

In [8]:
# Reading-comprehension model (the variant without the loss wrapper); it is invoked in the last cell.
model = LukeForReadingComprehension(luke_net_cfg)

In [9]:
# Scratch check: slicing a Tensor's shape tuple with [1:] drops its first dimension.
import mindspore
x = Tensor(np.ones([20, 5, 10, 10]), mindspore.float32)
shape1 = x.shape[1:]
shape1

(5, 10, 10)

In [10]:
# Take the first sample from the dataset; the cells below stack it twice to form a small batch.
data_sample = next(data_set.create_dict_iterator())
data_sample

{'end_positions': Tensor(shape=[1], dtype=Int32, value= [48]),
 'entity_attention_mask': Tensor(shape=[2], dtype=Int32, value= [0, 0]),
 'entity_ids': Tensor(shape=[2], dtype=Int32, value= [0, 0]),
 'entity_position_ids': Tensor(shape=[60], dtype=Int32, value= [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]),
 'entity_segment_ids': Tensor(shape=[2], dtype=Int32, value= [0, 0]),
 'start_positions': Tensor(shape=[1], dtype=Int32, value= [44]),
 'word_attention_mask': Tensor(shape=[309], dtype=Int32, value= [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

# RobertaEmbeddings

In [11]:
class RobertaEmbeddings(nn.Cell):
    """Sum of word, token-type and position embeddings, followed by LayerNorm and dropout.

    Args:
        config: configuration object providing ``vocab_size``, ``hidden_size``,
            ``pad_token_id``, ``max_position_embeddings``, ``type_vocab_size``,
            ``layer_norm_eps`` and ``hidden_dropout_prob``.
    """

    def __init__(self, config):
        super(RobertaEmbeddings, self).__init__()
        self.padding_idx = config.pad_token_id
        self.word_embeddings = nn.Embedding(config.vocab_size,
                                            config.hidden_size,
                                            padding_idx=self.padding_idx)
        # Built exactly once. It used to be created twice, the second instance
        # (with padding_idx) silently replacing the first.
        self.position_embeddings = nn.Embedding(config.max_position_embeddings,
                                                config.hidden_size,
                                                padding_idx=self.padding_idx)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size,
                                                  config.hidden_size)

        # self.LayerNorm is not snake-cased so that the parameter names keep matching
        # the checkpoints of the reference implementation.
        self.LayerNorm = nn.LayerNorm([config.hidden_size],
                                      epsilon=config.layer_norm_eps)
        # nn.Dropout's positional argument is the KEEP probability, whereas the config
        # stores the DROP probability, so it has to be inverted here.
        self.dropout = nn.Dropout(1.0 - config.hidden_dropout_prob)
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        # ops.Zeros is a primitive: instantiate it once, then call it with (shape, dtype).
        self.zeros = ops.Zeros()

    def construct(self,
                  input_ids=None,
                  token_type_ids=None,
                  position_ids=None,
                  inputs_embeds=None,
                  past_key_values_length=0):
        """Embed a batch of tokens.

        Args:
            input_ids: token ids; may be None when ``inputs_embeds`` is given.
            token_type_ids: segment ids; all zeros when omitted.
            position_ids: position ids; derived from ``input_ids`` (skipping padding)
                or numbered sequentially for ``inputs_embeds`` when omitted.
            inputs_embeds: precomputed word embeddings used instead of ``input_ids``.
            past_key_values_length: offset added to the derived position ids.

        Returns:
            The normalized embeddings, with dropout applied.

        Raises:
            ValueError: if neither ``input_ids`` nor ``inputs_embeds`` is given.
        """
        if input_ids is not None:
            input_shape = input_ids.shape
        elif inputs_embeds is not None:
            # Everything but the trailing embedding dimension.
            input_shape = inputs_embeds.shape[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if position_ids is None:
            if input_ids is not None:
                position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
            else:
                position_ids = self._create_position_ids_from_inputs_embeds(input_shape)
        if token_type_ids is None:
            token_type_ids = self.zeros(input_shape, mstype.int64)
        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)
        position_embeddings = self.position_embeddings(position_ids)

        embeddings = inputs_embeds + token_type_embeddings + position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

    def _create_position_ids_from_inputs_embeds(self, input_shape):
        """Number positions sequentially from padding_idx + 1; padding cannot be detected from embeddings."""
        first_position = self.padding_idx + 1
        one_row = np.arange(first_position, first_position + input_shape[1], dtype=np.int64)
        return Tensor(np.tile(one_row, (input_shape[0], 1)))

def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    """
    Derive position ids from token ids.

    Non-padding tokens are numbered padding_idx + 1, padding_idx + 2, ... along axis 1
    (shifted by past_key_values_length); padding tokens keep padding_idx as their position.
    Args:
       input_ids: Tensor of token ids.
       padding_idx: id of the padding token.
       past_key_values_length: offset added to the running count of non-padding tokens.
    Returns: Tensor of position ids shaped like input_ids.
    """
    # The padding comparison is done in NumPy; the Tensor-native form
    # `input_ids.ne(padding_idx).int()` was set aside by the author as possibly problematic.
    is_token = input_ids.asnumpy() != np.array(padding_idx)
    mask = Tensor(1 * np.array(is_token))
    running_count = ops.CumSum()(mask, 1)
    # Multiplying by the mask zeroes the count at padding slots before the final shift.
    shifted = (running_count + past_key_values_length) * mask
    return shifted + padding_idx


In [12]:
# Form a batch of two by stacking the same sample twice, then embed it.
# `op_stack`, `word_ids` and `word_segment_ids` are reused by later cells.
op_stack = ops.Stack()
word_ids = op_stack([data_sample["word_ids"], data_sample["word_ids"]])
word_segment_ids = op_stack([data_sample["word_segment_ids"], data_sample["word_segment_ids"]])
embeddings = RobertaEmbeddings(luke_net_cfg)
# `construct` is called directly here instead of going through the cell's __call__.
word_embeddings = embeddings.construct(word_ids, word_segment_ids)
word_embeddings

Tensor(shape=[2, 309, 768], dtype=Float32, value=
[[[-2.32064888e-001, -1.75745404e+000, 8.01867902e-001 ... -8.07331145e-001, -5.34173608e-001, -5.00196517e-001],
  [-4.06036109e-001, -1.10527205e+000, 3.34902287e-001 ... 2.24775463e-001, 1.04961407e+000, 4.50774103e-001],
  [1.26334572e+000, -2.01488876e+000, 1.10330796e+000 ... 1.46480680e+000, 2.38117844e-001, 1.17235945e-003],
  ...
  [-5.23515195e-002, -1.31185919e-001, 9.94298160e-002 ... 6.94247425e-001, -1.05684564e-001, -1.13034435e-001],
  [-1.06446281e-001, -5.20738661e-001, 3.52823734e-001 ... -7.26584852e-001, 1.75867572e-001, -2.12046310e-001],
  [1.73092854e+000, -7.14502096e-001, 9.83837962e-001 ... 6.33667886e-001, -1.98341936e-001, 1.35750985e+000]],
 [[-2.32064888e-001, -1.75745404e+000, 8.01867902e-001 ... -8.07331145e-001, -5.34173608e-001, -5.00196517e-001],
  [-4.06036109e-001, -1.10527205e+000, 3.34902287e-001 ... 2.24775463e-001, 1.04961407e+000, 4.50774103e-001],
  [1.26334572e+000, -2.01488876e+000, 1.103307

# EntityEmbeddings

In [18]:
class EntityEmbeddings(nn.Cell):
    """Entity embeddings for the LUKE model.

    Adds up the entity embedding (projected to ``hidden_size`` when the sizes differ),
    the average position embedding of each entity's mention positions, and the
    token-type embedding.

    Args:
        config: configuration object providing ``entity_vocab_size``, ``entity_emb_size``,
            ``hidden_size``, ``max_position_embeddings``, ``type_vocab_size``,
            ``layer_norm_eps`` and ``hidden_dropout_prob``.
    """

    def __init__(self, config):
        super(EntityEmbeddings, self).__init__()
        self.config = config

        self.entity_embeddings = nn.Embedding(config.entity_vocab_size, config.entity_emb_size, padding_idx=0)

        if config.entity_emb_size != config.hidden_size:
            self.entity_embedding_dense = nn.Dense(config.entity_emb_size, config.hidden_size, has_bias=False)

        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        # TODO: the normalized shape is passed as [config.hidden_size], which differs from the torch call.
        self.layer_norm = nn.LayerNorm([config.hidden_size], epsilon=config.layer_norm_eps)
        # nn.Dropout's positional argument is the KEEP probability, whereas the config
        # stores the DROP probability, so it has to be inverted here.
        self.dropout = nn.Dropout(1.0 - config.hidden_dropout_prob)
        self.unsqueezee = ops.ExpandDims()
        self.cast = ops.Cast()
        self.reshape = ops.Reshape()

    def construct(self, entity_ids, position_ids, token_type_ids=None):
        """Embed a batch of entities.

        Args:
            entity_ids: entity ids, shaped (batch, entity_length).
            position_ids: word positions of every entity mention, with -1 marking unused
                slots. Expected as (batch, entity_length, mention_length); a flattened
                (batch, entity_length * mention_length) tensor is reshaped to that layout.
            token_type_ids: segment ids shaped like ``entity_ids``; zeros when omitted.

        Returns:
            Tensor of shape (batch, entity_length, hidden_size).
        """
        if token_type_ids is None:
            token_type_ids = ops.zeros_like(entity_ids)

        entity_embeddings = self.entity_embeddings(entity_ids)
        if self.config.entity_emb_size != self.config.hidden_size:
            entity_embeddings = self.entity_embedding_dense(entity_embeddings)

        if len(position_ids.shape) == 2:
            # Restore the per-entity axis so positions are averaged per entity rather than
            # over the whole row (which then broadcast against the wrong axis).
            position_ids = self.reshape(position_ids, entity_ids.shape + (-1,))

        # -1 slots are clamped to index 0 so the lookup stays valid; they are masked out right after.
        lookup_ids = self.cast(clamp(position_ids), mstype.int32)
        position_embeddings = self.position_embeddings(lookup_ids)
        position_embedding_mask = 1 * self.unsqueezee((position_ids != -1), -1)
        position_embeddings = position_embeddings * position_embedding_mask
        position_embeddings = ops.reduce_sum(position_embeddings, -2)
        # Average over the used slots; the lower bound avoids dividing by zero when none are used.
        position_embeddings = position_embeddings / clamp(ops.reduce_sum(position_embedding_mask, -2), minimum=1e-7)

        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = entity_embeddings + position_embeddings + token_type_embeddings
        # NOTE(review): self.layer_norm and self.dropout are constructed but were commented out
        # of this forward pass; presumably the finished model should return
        # self.dropout(self.layer_norm(embeddings)) - confirm before reusing this cell.
        return embeddings


def clamp(x, minimum=0.0):
    """Clamp ``x`` from below, element-wise.

    Args:
        x: array-like or Tensor supporting comparison and arithmetic with scalars.
        minimum: lower bound.

    Returns:
        ``x`` where ``x > minimum`` and ``minimum`` everywhere else.
    """
    above = x > minimum
    # Keep entries above the bound and substitute the bound elsewhere. The earlier form
    # `x * mask + minimum` also added the bound to the kept entries, which was only
    # correct for minimum == 0.
    return x * above + minimum * (x <= minimum)

In [19]:
# Stack the sample's entity features twice (reusing op_stack from the earlier cell) and embed them.
# `entity_ids`, `entity_position_ids` and `entity_segment_ids` are reused by the model call below.
net_EntityEmbeddings = EntityEmbeddings(luke_net_cfg)
entity_ids = op_stack([data_sample["entity_ids"],data_sample["entity_ids"]])
entity_position_ids = op_stack([data_sample["entity_position_ids"],data_sample["entity_position_ids"]])
entity_segment_ids = op_stack([data_sample["entity_segment_ids"],data_sample["entity_segment_ids"]])
# `construct` is called directly here instead of going through the cell's __call__.
net_EntityEmbeddings.construct(entity_ids, entity_position_ids, entity_segment_ids)

111


Tensor(shape=[2, 2, 768], dtype=Float32, value=
[[[-4.63429512e-003, -2.22483966e-002, 8.08674004e-003 ... 1.22992527e-002, -9.81160160e-003, 1.63783655e-002],
  [-4.63429512e-003, -2.22483966e-002, 8.08674004e-003 ... 1.22992527e-002, -9.81160160e-003, 1.63783655e-002]],
 [[-4.63429512e-003, -2.22483966e-002, 8.08674004e-003 ... 1.22992527e-002, -9.81160160e-003, 1.63783655e-002],
  [-4.63429512e-003, -2.22483966e-002, 8.08674004e-003 ... 1.22992527e-002, -9.81160160e-003, 1.63783655e-002]]])

# attention_mask

In [None]:
def _compute_extended_attention_mask(word_attention_mask, entity_attention_mask):
    """Turn 0/1 attention masks into an additive mask.

    The word mask and, when given, the entity mask are concatenated along axis 1. Two
    singleton axes are inserted at positions 1 and 2, and the result is converted to
    float32 so that kept positions (1) map to 0.0 and masked positions (0) map to -10000.0.
    """
    if entity_attention_mask is None:
        combined = word_attention_mask
    else:
        combined = ops.Concat(axis=1)((word_attention_mask, entity_attention_mask))

    expand_dims = ops.ExpandDims()
    expanded = expand_dims(expand_dims(combined, 1), 2).astype(mstype.float32)
    return (1.0 - expanded) * -10000.0

In [None]:
# Stack the sample's masks twice to match the batch of two, then build the additive attention mask.
# `word_attention_mask` and `entity_attention_mask` are reused by the model call below.
word_attention_mask = op_stack([data_sample["word_attention_mask"],data_sample["word_attention_mask"]])
entity_attention_mask = op_stack([data_sample["entity_attention_mask"],data_sample["entity_attention_mask"]])
attention_mask = _compute_extended_attention_mask(word_attention_mask, entity_attention_mask)
attention_mask

In [None]:
# Run the model on the stacked two-sample batch assembled in the cells above.
model(word_ids = word_ids,
      word_segment_ids = word_segment_ids,
      word_attention_mask = word_attention_mask,
      entity_ids = entity_ids,
      entity_position_ids = entity_position_ids,
      entity_segment_ids = entity_segment_ids,
      entity_attention_mask = entity_attention_mask
      # The answer-span labels are deliberately left out of this call:
      #start_positions = data_sample["start_positions"],
      #end_positions = data_sample["end_positions"])
     )