In [1]:
from dataset.build_dataset import build_dataset
from readingcomprehension.models.luke import LukeForReadingComprehensionWithLoss
import mindspore.dataset as ds
import os
import numpy as np
from mindspore.mindrecord import FileWriter
import json

# SQuAD dataset (Squad 数据集)

In [2]:
# Path of the NumPy array file holding the serialized features.
FEATURES_FILE = "./data/json_features.npy"
# NOTE(review): every element of `features` is later decoded with json.loads,
# so the array presumably stores one JSON string per sample — confirm against
# the script that produced the .npy file.
features = np.load(FEATURES_FILE)

In [3]:
# Decode every serialized feature record into a Python dict, preserving order.
list_dict = [json.loads(serialized) for serialized in features]

In [4]:
# Convert the decoded feature dicts into a single-shard MindRecord file.
SQUAD_MINDRECORD_FILE = "./data/squad_features.mindrecord"
# Number of samples buffered in memory before each write_raw_data call.
WRITE_BATCH_SIZE = 10

# Remove leftovers from a previous run. The data file and its ".db" companion
# are checked independently: the original removed the ".db" file only when the
# data file existed and without checking it, which raised FileNotFoundError if
# the ".db" file was missing and left an orphaned ".db" file behind otherwise.
for stale_path in (SQUAD_MINDRECORD_FILE, SQUAD_MINDRECORD_FILE + ".db"):
    if os.path.exists(stale_path):
        os.remove(stale_path)

writer = FileWriter(file_name=SQUAD_MINDRECORD_FILE, shard_num=1)

# Every field is stored as a variable-length 1-D int32 array.
data_schema = {
    "word_ids": {"type": "int32", "shape": [-1]},
    "word_segment_ids": {"type": "int32", "shape": [-1]},
    "word_attention_mask": {"type": "int32", "shape": [-1]},
    "entity_ids": {"type": "int32", "shape": [-1]},
    "entity_position_ids": {"type": "int32", "shape": [-1]},
    "entity_segment_ids": {"type": "int32", "shape": [-1]},
    "entity_attention_mask": {"type": "int32", "shape": [-1]},
    "start_positions": {"type": "int32", "shape": [-1]},
    "end_positions": {"type": "int32", "shape": [-1]}
}
writer.add_schema(data_schema, "it is a preprocessed squad dataset")

pending = []
for record in list_dict:
    # Build the sample from the schema keys so the written fields can never
    # drift out of sync with the declared schema.
    pending.append(
        {field: np.array(record[field], dtype=np.int32) for field in data_schema}
    )
    if len(pending) >= WRITE_BATCH_SIZE:
        writer.write_raw_data(pending)
        pending = []

# Flush the final, possibly partial, batch.
if pending:
    writer.write_raw_data(pending)

# Finalize the file; the returned status is shown as the cell output.
writer.commit()

MSRStatus.SUCCESS

In [5]:
# Re-open the freshly written MindRecord file and count its samples as a
# sanity check.
data_set = ds.MindDataset(dataset_file=SQUAD_MINDRECORD_FILE)
count = sum(1 for _ in data_set.create_dict_iterator())
print("Got {} samples".format(count))

Got 269 samples


# model

In [18]:
# NOTE(review): this cell re-imports names mid-notebook; consider moving all
# imports into the first cell so a fresh "Run All" has a single import block.
from readingcomprehension.models.luke import LukeForReadingComprehension
# NOTE(review): `mstype` is bound here and again by `from mindspore import dtype
# as mstype` below; the later binding is the one in effect.
import mindspore.common.dtype as mstype
from model.bert_model import BertConfig
# NOTE(review): `context` is imported here and again together with Tensor below.
from mindspore import context
from model.luke import LukeModel
# NOTE(review): numpy is already imported as `np` in the first cell.
import numpy as np
from mindspore import Tensor, context
from mindspore import dtype as mstype
import mindspore.ops as ops
# Run MindSpore in graph mode on the CPU backend.
context.set_context(mode=context.GRAPH_MODE, device_target="CPU")

In [7]:
# Build the model configuration using BertConfig's default arguments.
luke_net_cfg = BertConfig()
# Display the configured hidden size as the cell output.
luke_net_cfg.hidden_size

768

In [8]:
# Instantiate the reading-comprehension model from the configuration above.
# No checkpoint is loaded in this cell.
model = LukeForReadingComprehension(luke_net_cfg)



In [9]:
# Create a fresh dict iterator over the dataset and take its first item.
data_sample = next(data_set.create_dict_iterator())
# Display the sample (a dict of tensors keyed by field name) as the cell output.
data_sample

{'end_positions': Tensor(shape=[1], dtype=Int32, value= [177]),
 'entity_attention_mask': Tensor(shape=[2], dtype=Int32, value= [0, 0]),
 'entity_ids': Tensor(shape=[2], dtype=Int32, value= [0, 0]),
 'entity_position_ids': Tensor(shape=[60], dtype=Int32, value= [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]),
 'entity_segment_ids': Tensor(shape=[2], dtype=Int32, value= [0, 0]),
 'start_positions': Tensor(shape=[1], dtype=Int32, value= [172]),
 'word_attention_mask': Tensor(shape=[221], dtype=Int32, value= [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [31]:
# Pad the sample's word_ids with zeros up to a fixed length, then duplicate the
# padded sequence to form a batch of two.
MAX_SEQ_LENGTH = 256

sample_word_ids = data_sample["word_ids"]
pad_length = MAX_SEQ_LENGTH - len(sample_word_ids)
if pad_length < 0:
    # np.zeros would also raise ValueError here, but with a message about
    # negative dimensions that hides the real cause.
    raise ValueError(
        "word_ids has {} tokens, which exceeds MAX_SEQ_LENGTH={}".format(
            len(sample_word_ids), MAX_SEQ_LENGTH
        )
    )

# NOTE(review): the padding value is 0 — confirm this matches the pad token id
# the model's vocabulary expects.
padding = Tensor(np.zeros(pad_length, dtype=np.int32))
padded_word_ids = ops.Concat()((sample_word_ids, padding))

# Stack two copies along a new leading axis to get a (2, MAX_SEQ_LENGTH) batch.
word_ids = ops.Stack()([padded_word_ids, padded_word_ids])
word_ids.shape

(2, 256)

In [25]:
# Forward pass on one sample (the label arguments are left commented out).
# NOTE(review): word_ids was padded and stacked to shape (2, 256) in the padding
# cell, while every other input below is passed straight from data_sample,
# unbatched and unpadded (e.g. word_attention_mask has shape [221] in the
# displayed sample) — confirm which shapes the model expects.
# NOTE(review): the recorded run of this cell fails with a TypeError from
# 'Gather' (index type Float32), raised while the entity position embeddings
# are looked up in model/luke_embeddings.py.
model(word_ids = word_ids,
      word_segment_ids = data_sample["word_segment_ids"],
      word_attention_mask = data_sample["word_attention_mask"],
      entity_ids = data_sample["entity_ids"],
      entity_position_ids = data_sample["entity_position_ids"],
      entity_segment_ids = data_sample["entity_segment_ids"],
      entity_attention_mask = data_sample["entity_attention_mask"]
      #start_positions = data_sample["start_positions"],
      #end_positions = data_sample["end_positions"])
     )

TypeError: mindspore\core\utils\check_convert_utils.cc:491 CheckSubClass] For 'Gather', the type of `index_type` should be subclass of Int64,Int8,Int16,Int32, but got Float32.
The function call stack (See file 'analyze_fail_1.dat' for more details):
# 0 In file C:\Users\DM\Desktop\Duda\HUAWEI\LUKE_mindspore\readingcomprehension\models\luke.py(61)
        return start_logits, end_logits
    ^
# 1 In file C:\Users\DM\Desktop\Duda\HUAWEI\LUKE_mindspore\readingcomprehension\models\luke.py(45)
        encoder_outputs = super(LukeForReadingComprehension, self).construct(
# 2 In file C:\Users\DM\Desktop\Duda\HUAWEI\LUKE_mindspore\model\luke.py(93)
        entity_embeddings = self.entity_embeddings(entity_ids, entity_position_ids, entity_segment_ids)
                        ^
# 3 In file C:\Users\DM\Desktop\Duda\HUAWEI\LUKE_mindspore\model\luke_embeddings.py(46)
        if token_type_ids is None:
# 4 In file C:\Users\DM\Desktop\Duda\HUAWEI\LUKE_mindspore\model\luke_embeddings.py(51)
        position_embeddings = self.position_embeddings(clamp(position_ids))
                          ^
# 5 In file C:\Users\DM\anaconda3\envs\MindSpore\lib\site-packages\mindspore\nn\layer\embedding.py(135)
        if self.use_one_hot:
# 6 In file C:\Users\DM\anaconda3\envs\MindSpore\lib\site-packages\mindspore\nn\layer\embedding.py(139)
            output_for_reshape = self.gather(self.embedding_table, flat_ids, 0)
                             ^


In [None]:
class RobertaEmbeddings(nn.Cell):
    """
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.

    Combines word, token-type and (for the "absolute" position embedding type)
    position embeddings, followed by layer normalization and dropout.

    NOTE(review): this class subclasses ``nn.Cell`` but its body uses PyTorch
    APIs (``torch.arange``, ``register_buffer``, a ``forward`` method,
    ``.size()``). ``nn``, ``torch``, ``version`` and
    ``create_position_ids_from_input_ids`` are not imported or defined in the
    visible part of this notebook, and the cell has no execution count — it
    looks like reference code pasted for porting; confirm before running.
    """

    # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__
    def __init__(self, config):
        """
        Build the embedding tables, normalization and dropout layers.

        Args:
            config: configuration object; this method reads ``vocab_size``,
                ``hidden_size``, ``pad_token_id``, ``max_position_embeddings``,
                ``type_vocab_size``, ``layer_norm_eps``, ``hidden_dropout_prob``
                and, optionally, ``position_embedding_type``.
        """
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        # This table is replaced further down by one created with padding_idx.
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        # Falls back to "absolute" when the config does not define the attribute.
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
        # The all-zero token_type_ids buffer is only registered on torch > 1.6.0;
        # forward() checks for it with hasattr and otherwise builds zeros on the fly.
        if version.parse(torch.__version__) > version.parse("1.6.0"):
            self.register_buffer(
                "token_type_ids",
                torch.zeros(self.position_ids.size(), dtype=torch.long, device=self.position_ids.device),
                persistent=False,
            )

        # End copy
        self.padding_idx = config.pad_token_id
        # Overrides the position embedding table created above, this time with padding_idx set.
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
        )

    def forward(
        self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
    ):
        """
        Compute the summed, normalized embeddings for a batch.

        Args:
            input_ids: token ids; used to look up word embeddings when
                ``inputs_embeds`` is None and to derive ``position_ids`` when
                those are not given.
            token_type_ids: segment ids; defaults to zeros when None.
            position_ids: position indices; derived from ``input_ids`` or
                ``inputs_embeds`` when None.
            inputs_embeds: precomputed word embeddings used instead of a
                lookup on ``input_ids``.
            past_key_values_length: forwarded to
                ``create_position_ids_from_input_ids``.

        Returns:
            The embeddings after LayerNorm and dropout.
        """
        if position_ids is None:
            if input_ids is not None:
                # Create the position ids from the input token ids. Any padded tokens remain padded.
                position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
            else:
                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            # Drop the trailing embedding dimension of inputs_embeds.
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs
        # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves
        # issue #5664
        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + token_type_embeddings
        # Position embeddings are added only for the "absolute" embedding type.
        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
        """
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor of sequential ids running from ``padding_idx + 1`` to
            ``sequence_length + padding_idx`` (inclusive), expanded to the shape of
            ``inputs_embeds`` without its last dimension.
        """
        input_shape = inputs_embeds.size()[:-1]
        sequence_length = input_shape[1]

        # Positions start right after padding_idx.
        position_ids = torch.arange(
            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
        )
        return position_ids.unsqueeze(0).expand(input_shape)