In [1]:
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.python.data.ops.dataset_ops import AUTOTUNE
from transformers import BertTokenizer, TFBertModel

In [2]:
# Feature schema: tells the Example decoder the type of every stored Feature.
feature_description = dict(
    id=tf.io.FixedLenFeature([], tf.string),            # scalar string
    tag_id=tf.io.VarLenFeature(tf.int64),               # variable-length int64 list
    category_id=tf.io.FixedLenFeature([], tf.int64),    # scalar int64
    title=tf.io.FixedLenFeature([], tf.string),         # scalar string
    asr_text=tf.io.FixedLenFeature([], tf.string),      # scalar string
    frame_feature=tf.io.VarLenFeature(tf.string),       # variable-length string list
)


def read_and_decode(example_string):
    """Parse one serialized training Example read from a TFRecord file.

    The record is decoded against the module-level ``feature_description``
    schema and every field is converted to a NumPy value via ``.numpy()``,
    so this helper must be called eagerly (e.g. while iterating a dataset in
    Python).

    Args:
        example_string: a single serialized Example, as yielded by a
            ``tf.data.TFRecordDataset``.

    Returns:
        A tuple ``(id, tag_id, category_id, frame_feature, title, asr_text)``.
        The variable-length fields (``tag_id`` and ``frame_feature``) are
        densified before conversion.
    """
    parsed = tf.io.parse_single_example(example_string, feature_description)

    def _scalar(key):
        # Fixed-length (scalar) feature -> NumPy value.
        return parsed[key].numpy()

    def _dense(key):
        # Variable-length feature is parsed as a sparse tensor; densify it first.
        return tf.sparse.to_dense(parsed[key]).numpy()

    return (
        _scalar('id'),
        _dense('tag_id'),
        _scalar('category_id'),
        _dense('frame_feature'),
        _scalar('title'),
        _scalar('asr_text'),
    )

In [3]:
filenames = 'data/pairwise/pairwise.tfrecords'
dataset = tf.data.TFRecordDataset(filenames)

# datas maps a record's position in the file -> [title, asr_text], both decoded to str.
# Only integer keys are stored so the dict can be iterated as a plain record index.
datas = {}
for i, data in enumerate(dataset):
    # Only the two text fields are kept. The other decoded fields are discarded
    # rather than bound to names, which also avoids shadowing the builtin `id`.
    _, _, _, _, title, asr_text = read_and_decode(data)
    datas[i] = [title.decode('utf-8'), asr_text.decode('utf-8')]


In [7]:
# Title of the first record: each entry is [title, asr_text], already decoded to str.
datas[0][0]

'英雄联盟：8年未曾拿过五杀，电脑都看不下去，直接接管了！'

In [22]:
def mask_tokens(inputs, mlm_probability, tokenizer, special_tokens_mask):
    """
    Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.

    Args:
        inputs: array-like of integer token ids. It is copied, so the caller's
            object is never modified.
        mlm_probability: probability with which each non-special token is
            selected for masking.
        tokenizer: object providing ``mask_token``, ``convert_tokens_to_ids``
            and ``len()`` (used as the exclusive upper bound for random ids).
        special_tokens_mask: array-like with the same shape as ``inputs``;
            truthy entries mark positions that must never be selected.

    Returns:
        ``(masked_inputs, labels)``: two new arrays shaped like ``inputs``.
        ``labels`` holds the original id at every selected position and -100
        everywhere else.
    """
    # Work on a private copy: the masked ids are returned, so writing into the
    # caller's array as well would be a surprising side effect.
    inputs = np.array(inputs)
    labels = np.copy(inputs)
    # We sample a few tokens in each sequence for MLM training (with probability `mlm_probability`)
    probability_matrix = np.random.random_sample(labels.shape)
    special_tokens_mask = np.asarray(special_tokens_mask).astype(np.bool_)

    # Special tokens get probability 0 so they can never exceed the threshold below.
    probability_matrix[special_tokens_mask] = 0.0
    masked_indices = probability_matrix > (1 - mlm_probability)
    labels[~masked_indices] = -100  # We only compute loss on masked tokens

    # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
    indices_replaced = (np.random.random_sample(labels.shape) < 0.8) & masked_indices
    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    # 10% of the time, we replace masked input tokens with random word
    # (half of the remaining 20% of selected positions).
    indices_random = (np.random.random_sample(labels.shape) < 0.5) & masked_indices & ~indices_replaced
    random_words = np.random.randint(low=0, high=len(tokenizer), size=np.count_nonzero(indices_random), dtype=np.int64)
    inputs[indices_random] = random_words

    # The rest of the time (10% of the time) we keep the masked input tokens unchanged
    return inputs, labels

In [9]:
# Take the first record's title; entries are [title, asr_text] and are already str.
title = datas[0][0]

In [14]:
title

'英雄联盟：8年未曾拿过五杀，电脑都看不下去，直接接管了！'

In [17]:
# Masked-LM preprocessing settings.
max_bert_length = 32    # titles are padded / truncated to this many token ids
mlm_probability = 0.15  # chance that a non-special token is selected for masking

# BERT tokenizer loaded from the local checkpoint directory.
tokenizer = BertTokenizer.from_pretrained('data/chinese-roberta-wwm-ext')

In [12]:
# Tokenize the title to a fixed length and also request the special-tokens mask,
# which is needed later to keep special positions out of the MLM sampling.
encoded_inputs = tokenizer(
    title,
    truncation=True,
    padding='max_length',
    max_length=max_bert_length,
    return_special_tokens_mask=True,
)

In [13]:
encoded_inputs

{'input_ids': [101, 5739, 7413, 5468, 4673, 8038, 129, 2399, 3313, 3295, 2897, 6814, 758, 3324, 8024, 4510, 5554, 6963, 4692, 679, 678, 1343, 8024, 4684, 2970, 2970, 5052, 749, 8013, 102, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'special_tokens_mask': [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]}

In [15]:
# Re-wrap the encoded fields as NumPy arrays so they can be masked with boolean indexing.
example = tokenizer.pad(
    encoded_inputs,
    pad_to_multiple_of=None,
    return_tensors="np",
)

In [16]:
example

{'input_ids': array([ 101, 5739, 7413, 5468, 4673, 8038,  129, 2399, 3313, 3295, 2897,
       6814,  758, 3324, 8024, 4510, 5554, 6963, 4692,  679,  678, 1343,
       8024, 4684, 2970, 2970, 5052,  749, 8013,  102,    0,    0]), 'token_type_ids': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'special_tokens_mask': array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1]), 'attention_mask': array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0])}

In [18]:
# Remove the special-tokens mask from the example (it is only needed for MLM sampling).
# Falls back to None when the key is absent.
special_tokens_mask = example.pop("special_tokens_mask", None)

In [19]:
special_tokens_mask

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1])

In [20]:
example

{'input_ids': array([ 101, 5739, 7413, 5468, 4673, 8038,  129, 2399, 3313, 3295, 2897,
       6814,  758, 3324, 8024, 4510, 5554, 6963, 4692,  679,  678, 1343,
       8024, 4684, 2970, 2970, 5052,  749, 8013,  102,    0,    0]), 'token_type_ids': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'attention_mask': array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0])}

In [25]:
# Apply MLM masking: input_ids is replaced by its masked version and the labels are stored
# next to it. NOTE: this cell reads and overwrites example["input_ids"], so running it more
# than once masks the already-masked ids again; re-create `example` before re-running.
example["input_ids"], example["labels"] = mask_tokens(
    inputs=example["input_ids"],
    mlm_probability=mlm_probability,
    tokenizer=tokenizer,
    special_tokens_mask=special_tokens_mask,
)

In [26]:
example

{'input_ids': array([  101, 10068,  7413,   103,   103,   103,   129,  2399,  3313,
        3295,  2897,  6814,   758,   103,   103,  4510,   103,  6963,
        4692,   679,   678,  1343,  8024,   103,  2970,   103,  5052,
         749,   103,   102,     0,     0]), 'token_type_ids': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'attention_mask': array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0]), 'labels': array([-100, 5739, -100, 5468, 4673, 8038, -100, -100, -100, -100, -100,
       -100, -100, 3324, -100, 4510, -100, -100, -100, -100, -100, -100,
       -100, -100, -100,  103, -100, -100,  103, -100, -100, -100])}

In [28]:
# Make sure padding ids never count towards the loss: any label equal to the pad id is
# reset to the ignore value -100. Skipped when the tokenizer defines no pad token.
if tokenizer.pad_token_id is not None:
    example["labels"][example["labels"] == tokenizer.pad_token_id] = -100

In [29]:
example["labels"]

array([-100, 5739, -100, 5468, 4673, 8038, -100, -100, -100, -100, -100,
       -100, -100, 3324, -100, 4510, -100, -100, -100, -100, -100, -100,
       -100, -100, -100,  103, -100, -100,  103, -100, -100, -100])

In [30]:
# Convert every array in the example into a TensorFlow tensor, keeping the same keys.
example = dict(
    (name, tf.convert_to_tensor(values)) for name, values in example.items()
)

In [31]:
example

{'input_ids': <tf.Tensor: shape=(32,), dtype=int32, numpy=
 array([  101, 10068,  7413,   103,   103,   103,   129,  2399,  3313,
         3295,  2897,  6814,   758,   103,   103,  4510,   103,  6963,
         4692,   679,   678,  1343,  8024,   103,  2970,   103,  5052,
          749,   103,   102,     0,     0])>,
 'token_type_ids': <tf.Tensor: shape=(32,), dtype=int32, numpy=
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0])>,
 'attention_mask': <tf.Tensor: shape=(32,), dtype=int32, numpy=
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 0, 0])>,
 'labels': <tf.Tensor: shape=(32,), dtype=int32, numpy=
 array([-100, 5739, -100, 5468, 4673, 8038, -100, -100, -100, -100, -100,
        -100, -100, 3324, -100, 4510, -100, -100, -100, -100, -100, -100,
        -100, -100, -100,  103, -100, -100,  103, -100, -100, -100])>}

In [None]:
# End-to-end MLM preprocessing of a single title (same steps as the cells above, in one place).

# `title` may be a tf.Tensor (e.g. when coming straight from a dataset), raw UTF-8 bytes,
# or an already-decoded str (as left by the earlier cells); normalise it to str first.
if hasattr(title, 'numpy'):
    title = title.numpy()
if isinstance(title, bytes):
    title = title.decode(encoding='utf-8')

# Tokenize to a fixed length and convert the fields to NumPy arrays.
encoded_inputs = tokenizer(title, max_length=max_bert_length, padding='max_length', truncation=True, return_special_tokens_mask=True)
example = tokenizer.pad(encoded_inputs, return_tensors="np", pad_to_multiple_of=None)

# The special-tokens mask is only used for sampling, so it is removed from the example.
special_tokens_mask = example.pop("special_tokens_mask", None)
example["input_ids"], example["labels"] = mask_tokens(
    example["input_ids"], mlm_probability, tokenizer, special_tokens_mask=special_tokens_mask)

# Labels equal to the pad id are reset to the ignore value -100.
if tokenizer.pad_token_id is not None:
    example["labels"][example["labels"] == tokenizer.pad_token_id] = -100
example = {key: tf.convert_to_tensor(arr) for key, arr in example.items()}

In [33]:
# Load the pretrained TFBertModel from the same local checkpoint directory as the tokenizer.
# NOTE(review): the config printed for this checkpoint lists pad_token_id 1, while the
# tokenizer output earlier in the notebook pads input_ids with 0 - confirm this is harmless.
bert = TFBertModel.from_pretrained('data/chinese-roberta-wwm-ext')

All model checkpoint layers were used when initializing TFBertModel.

All the layers of TFBertModel were initialized from the model checkpoint at data/chinese-roberta-wwm-ext.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [34]:
bert.config

BertConfig {
  "_name_or_path": "data/chinese-roberta-wwm-ext",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "directionality": "bidi",
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.9.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21128
}