In [1]:
from dataset.build_dataset import build_dataset
from readingcomprehension.models.luke import LukeForReadingComprehensionWithLoss
import mindspore.dataset as ds
import os
import numpy as np
from mindspore.mindrecord import FileWriter
import json

# SQuAD dataset

In [2]:
# Path to the NumPy file that holds the preprocessed features.
FEATURES_FILE = "./data/json_features.npy"
# Every entry is decoded with json.loads in the following cell, so the array
# is expected to contain JSON-encoded records.
features = np.load(FEATURES_FILE)

In [3]:
# Decode each JSON-encoded feature record into a Python object, keeping order.
list_dict = [json.loads(record) for record in features]

In [4]:
# Destination of the MindRecord file written by this cell.
SQUAD_MINDRECORD_FILE = "./data/squad_features.mindrecord"
# Number of samples buffered in memory before each write_raw_data() call.
WRITE_BATCH_SIZE = 10

# Remove output left over from a previous run so the cell can be re-executed.
# The data file and its ".db" companion are checked independently: testing only
# the data file and then deleting both raises FileNotFoundError when the
# companion is missing, and leaves a stale companion behind when only it exists.
for stale_path in (SQUAD_MINDRECORD_FILE, SQUAD_MINDRECORD_FILE + ".db"):
    if os.path.exists(stale_path):
        os.remove(stale_path)

writer = FileWriter(file_name=SQUAD_MINDRECORD_FILE, shard_num=1)

# Every field is stored as a variable-length 1-D int32 array.
data_schema = {
    "word_ids": {"type": "int32", "shape": [-1]},
    "word_segment_ids": {"type": "int32", "shape": [-1]},
    "word_attention_mask": {"type": "int32", "shape": [-1]},
    "entity_ids": {"type": "int32", "shape": [-1]},
    "entity_position_ids": {"type": "int32", "shape": [-1]},
    "entity_segment_ids": {"type": "int32", "shape": [-1]},
    "entity_attention_mask": {"type": "int32", "shape": [-1]},
    "start_positions": {"type": "int32", "shape": [-1]},
    "end_positions": {"type": "int32", "shape": [-1]}
}
writer.add_schema(data_schema, "it is a preprocessed squad dataset")

data = []
for item in list_dict:
    # Build the sample from the schema's own keys so the two cannot drift
    # apart; all schema fields are declared int32 above.
    data.append(
        {name: np.array(item[name], dtype=np.int32) for name in data_schema}
    )
    # Flush a full buffer to disk and start a new one.
    if len(data) == WRITE_BATCH_SIZE:
        writer.write_raw_data(data)
        data = []

# Write whatever is left when the sample count is not a multiple of the batch size.
if data:
    writer.write_raw_data(data)

writer.commit()

MSRStatus.SUCCESS

In [5]:
# Re-open the MindRecord file that was just written and count its samples.
data_set = ds.MindDataset(dataset_file=SQUAD_MINDRECORD_FILE)
count = 0  # stays 0 when the dataset yields nothing
for count, item in enumerate(data_set.create_dict_iterator(), start=1):
    pass
print("Got {} samples".format(count))

Got 269 samples


# model

In [6]:
# Imports for the model section.
# NOTE(review): several names below are imported more than once (np, context,
# mstype); consider consolidating all imports into the first cell.
from readingcomprehension.models.luke import LukeForReadingComprehension
import mindspore.common.dtype as mstype
from model.bert_model import BertConfig
from mindspore import context
from model.luke import LukeModel
# numpy is already imported as np in the first cell.
import numpy as np
# context is imported a second time here.
from mindspore import Tensor, context
# Rebinds the name mstype; this later binding is the one in effect afterwards.
from mindspore import dtype as mstype
import mindspore.ops as ops
# Select graph mode and the CPU device target for everything that follows.
context.set_context(mode=context.GRAPH_MODE, device_target="CPU")

In [7]:
# Create a BertConfig with its default arguments; it is passed to the model
# constructor in the next cell.
luke_net_cfg = BertConfig()
# Display the configured hidden size as the cell output.
luke_net_cfg.hidden_size

768

In [8]:
# Instantiate the reading-comprehension model from the configuration object.
model = LukeForReadingComprehension(luke_net_cfg)



In [9]:
# Take the first item yielded by a new dict iterator over the dataset; the
# result maps each field name to a Tensor.
data_sample = next(data_set.create_dict_iterator())
# Display the sample as the cell output.
data_sample

{'end_positions': Tensor(shape=[1], dtype=Int32, value= [172]),
 'entity_attention_mask': Tensor(shape=[2], dtype=Int32, value= [0, 0]),
 'entity_ids': Tensor(shape=[2], dtype=Int32, value= [0, 0]),
 'entity_position_ids': Tensor(shape=[60], dtype=Int32, value= [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 
  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]),
 'entity_segment_ids': Tensor(shape=[2], dtype=Int32, value= [0, 0]),
 'start_positions': Tensor(shape=[1], dtype=Int32, value= [168]),
 'word_attention_mask': Tensor(shape=[180], dtype=Int32, value= [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [10]:
# Fixed length that the word-id sequence is padded to.
MAX_WORD_LENGTH = 256

# Number of padding positions needed to reach MAX_WORD_LENGTH.
pad_length = MAX_WORD_LENGTH - len(data_sample["word_ids"])
if pad_length < 0:
    # np.zeros would also raise ValueError here, but with an unhelpful
    # "negative dimensions" message.
    raise ValueError(
        "word_ids is longer than MAX_WORD_LENGTH ({})".format(MAX_WORD_LENGTH)
    )

# Right-pad the 1-D word_ids tensor with zeros.
# NOTE(review): 0 is used as the padding id — confirm it matches the
# tokenizer's pad token id.
padding = Tensor(np.zeros(pad_length, dtype=np.int32))
word_ids = ops.Concat()((data_sample["word_ids"], padding))

# Stack the padded sequence with itself to form a batch of two identical rows.
word_ids = ops.Stack()([word_ids, word_ids])
word_ids.shape

(2, 256)

In [11]:
word_ids

Tensor(shape=[2, 256], dtype=Int32, value=
[[    0, 35416,   147 ...     0,     0,     0],
 [    0, 35416,   147 ...     0,     0,     0]])

In [12]:
# Run a forward pass of the model on a hand-built input, without labels (the
# start/end position arguments are commented out below).
# NOTE(review): only word_ids has been padded to length 256 and stacked into a
# batch of 2; every other argument is passed exactly as read from data_sample
# (1-D, unpadded, unbatched), so the input shapes likely do not line up —
# confirm the shapes the model expects and pad/batch the remaining inputs.
# NOTE(review): the recorded output of this cell is a TypeError reporting that
# ReduceSum received a Bool tensor; presumably a boolean mask is summed without
# a cast somewhere in the model — verify in the model code.
model(word_ids = word_ids,
      word_segment_ids = data_sample["word_segment_ids"],
      word_attention_mask = data_sample["word_attention_mask"],
      entity_ids = data_sample["entity_ids"],
      entity_position_ids = data_sample["entity_position_ids"],
      entity_segment_ids = data_sample["entity_segment_ids"],
      entity_attention_mask = data_sample["entity_attention_mask"]
      #start_positions = data_sample["start_positions"],
      #end_positions = data_sample["end_positions"])
     )

TypeError: For 'ReduceSum', the type of `input_x` should be subclass of Tensor[Int8], Tensor[Int16], Tensor[Int32], Tensor[Int64], Tensor[UInt8], Tensor[UInt16], Tensor[UInt32], Tensor[UInt64], Tensor[Float16], Tensor[Float32], Tensor[Float64], but got Tensor[Bool] . This message is only for reference. The supported data types depend on the hardware that executes the operator and it is a subset of the data types above.