In [1]:
#! -*- coding: utf-8 -*-
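# Chinese named-entity recognition with GlobalPointer.
# Reference dataset of the original example: http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz
# NOTE: this notebook actually loads fapiao/train_data.txt and fapiao/dev_data.txt (see load_data calls below).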

import numpy as np
from bert4keras.backend import keras, K
from bert4keras.backend import multilabel_categorical_crossentropy
from bert4keras.layers import GlobalPointer
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open, to_array
from keras.models import Model
from tqdm import tqdm

# Training hyper-parameters.
maxlen = 256  # passed as maxlen to tokenizer.tokenize() in the data generator
epochs = 10
batch_size = 16
learning_rate = 2e-5
# Filled with entity labels by load_data(); later rebound to a sorted list.
categories = set()

# BERT configuration.
# Download the pretrained weights from: https://github.com/google-research/bert
config_path = 'chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = 'chinese_L-12_H-768_A-12/vocab.txt'

'''
数据样例：
客$$$@@@O
服$$$@@@O
：$$$@@@O
2$$$@@@B-ODR
2$$$@@@I-ODR
1$$$@@@I-ODR
3$$$@@@I-ODR
7$$$@@@I-ODR
6$$$@@@I-ODR
4$$$@@@I-ODR
4$$$@@@I-ODR
2$$$@@@I-ODR
3$$$@@@I-ODR
1$$$@@@I-ODR
7$$$@@@I-ODR
加工：
['客服：221376442317', [3, 14, 'ODR']]
'''
def load_data(filename):
    """Load character-level tagged samples from ``filename``.

    File format: one ``<char>$$$@@@<tag>`` pair per line, samples separated
    by a blank line. A tag starting with ``B`` opens an entity, a tag
    starting with ``I`` extends the entity that is currently open, and any
    other tag closes it. An ``I`` tag with no open entity (malformed input)
    starts a new entity instead of crashing or stretching an earlier,
    already closed entity across the gap.

    Blank lines inside a sample (e.g. a trailing newline at end of file)
    are ignored.

    Args:
        filename: path of the UTF-8 encoded data file.

    Returns:
        A list of samples, each ``[text, [start, end, label], ...]``, meaning
        ``text[start:end + 1]`` is an entity of type ``label``.

    Raises:
        ValueError: if a non-blank line does not contain the separator.

    Side effects:
        Every label seen is added to the module-level ``categories`` set.
    """
    separator = '$$$@@@'
    D = []
    with open(filename, encoding='utf-8') as f:
        content = f.read()
    for l in content.split('\n\n'):
        # Drop blank lines first so that the enumerate() index below keeps
        # matching the character offset inside the sample text.
        rows = [c for c in l.split('\n') if c.strip()]
        if not rows:
            continue
        d = ['']
        inside = False  # True while the previous line belonged to an entity
        for i, c in enumerate(rows):
            if separator not in c:
                raise ValueError(
                    'malformed line in %s: %r' % (filename, c)
                )
            char, flag = c.rsplit(separator, 1)
            d[0] += char
            if flag.startswith('B') or (flag.startswith('I') and not inside):
                d.append([i, i, flag[2:]])
                categories.add(flag[2:])
                inside = True
            elif flag.startswith('I'):
                d[-1][1] = i
            else:
                inside = False
        D.append(d)
    return D


# Annotated data; load_data() also collects every entity label into `categories`.
train_data = load_data('fapiao/train_data.txt')
valid_data = load_data('fapiao/dev_data.txt')
# Freeze the label set into a sorted list so that each label has a stable index.
categories = list(sorted(categories))

# Build the tokenizer.
tokenizer = Tokenizer(dict_path, do_lower_case=True)


class data_generator(DataGenerator):
    """Batch generator for GlobalPointer training.

    Yields ``([batch_token_ids, batch_segment_ids], batch_labels)`` where the
    label array of one sample has shape ``(len(categories), n, n)`` with
    ``n = len(token_ids)``, and ``labels[c, s, e] == 1`` marks an entity of
    category ``c`` spanning tokens ``s..e``.
    """
    def __iter__(self, random=False):
        """Iterate over the samples and yield padded batches.

        Entities whose first or last character has no entry in the
        character-to-token mapping are skipped.
        """
        batch_token_ids, batch_segment_ids, batch_labels = [], [], []
        for is_end, d in self.sample(random):
            tokens = tokenizer.tokenize(d[0], maxlen=maxlen)
            mapping = tokenizer.rematch(d[0], tokens)
            # First / last character offset of each token -> token index.
            start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
            end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
            token_ids = tokenizer.tokens_to_ids(tokens)
            segment_ids = [0] * len(token_ids)
            # Allocate exactly the size that is kept, instead of a dense
            # (len(categories), maxlen, maxlen) array that is sliced afterwards.
            # Assumes rematch() returns one entry per token, so every mapped
            # index is < len(token_ids) -- TODO confirm against bert4keras.
            seq_len = len(token_ids)
            labels = np.zeros((len(categories), seq_len, seq_len))
            for start, end, label in d[1:]:
                if start in start_mapping and end in end_mapping:
                    token_start = start_mapping[start]
                    token_end = end_mapping[end]
                    category_id = categories.index(label)
                    labels[category_id, token_start, token_end] = 1
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_labels.append(labels)
            if len(batch_token_ids) == self.batch_size or is_end:
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                batch_labels = sequence_padding(batch_labels, seq_dims=3)
                yield [batch_token_ids, batch_segment_ids], batch_labels
                batch_token_ids, batch_segment_ids, batch_labels = [], [], []


def global_pointer_crossentropy(y_true, y_pred):
    """Cross-entropy loss for GlobalPointer outputs.

    Both tensors are reshaped to two dimensions: the first two axes of
    ``y_pred`` are merged into one and all remaining axes are flattened.
    The result is the mean of ``multilabel_categorical_crossentropy`` over
    the reshaped tensors.
    """
    merged_rows = K.prod(K.shape(y_pred)[:2])
    flat_shape = (merged_rows, -1)
    flat_true = K.reshape(y_true, flat_shape)
    flat_pred = K.reshape(y_pred, flat_shape)
    per_row_loss = multilabel_categorical_crossentropy(flat_true, flat_pred)
    return K.mean(per_row_loss)


def global_pointer_f1_score(y_true, y_pred):
    """F1 metric for GlobalPointer outputs.

    A score greater than 0 counts as a predicted span. The value is
    ``2 * |true & pred| / (|true| + |pred|)``; ``K.epsilon()`` is added to
    the denominator so that a batch with no gold and no predicted spans
    yields 0 instead of NaN (0 / 0).
    """
    y_pred = K.cast(K.greater(y_pred, 0), K.floatx())
    overlap = K.sum(y_true * y_pred)
    total = K.sum(y_true + y_pred)
    return 2 * overlap / (total + K.epsilon())


# Load the pretrained transformer, then put a GlobalPointer layer with one
# head per entity category on top of its output.
# NOTE(review): 64 is GlobalPointer's second positional argument, presumably
# the per-head size -- confirm against the bert4keras layer signature.
model = build_transformer_model(config_path, checkpoint_path)
output = GlobalPointer(len(categories), 64)(model.output)

# Rebind `model` to the full network (same inputs, GlobalPointer output).
model = Model(model.input, output)
model.summary()

model.compile(
    loss=global_pointer_crossentropy,
    optimizer=Adam(learning_rate),
    metrics=[global_pointer_f1_score]
)


class NamedEntityRecognizer(object):
    """Named-entity recognizer backed by the module-level ``model``.
    """
    def recognize(self, text, threshold=0):
        """Return the entities found in ``text``.

        Every (category, start, end) position whose score exceeds
        ``threshold`` is reported as ``(first_char, last_char, label)``,
        using the token-to-character mapping from the tokenizer. The first
        and last positions along both span axes are pushed to -inf before
        thresholding (presumably the special boundary tokens -- confirm).
        """
        tokens = tokenizer.tokenize(text, maxlen=512)
        char_spans = tokenizer.rematch(text, tokens)
        ids = tokenizer.tokens_to_ids(tokens)
        segments = [0] * len(ids)
        ids, segments = to_array([ids], [segments])
        scores = model.predict([ids, segments])[0]
        # Mask the boundary rows and columns so they can never be selected.
        scores[:, [0, -1]] -= np.inf
        scores[:, :, [0, -1]] -= np.inf
        hits = zip(*np.where(scores > threshold))
        return [
            (char_spans[first][0], char_spans[last][-1], categories[cat])
            for cat, first, last in hits
        ]


# Shared recognizer instance; evaluate() calls NER.recognize() on every sample.
NER = NamedEntityRecognizer()


def evaluate(data):
    """Score the recognizer on ``data``.

    Each sample is ``[text, [start, end, label], ...]``. Predicted and gold
    entities are compared as exact ``(start, end, label)`` tuples.

    Returns:
        ``(f1, precision, recall)``; the counters start at 1e-10 so that an
        empty data set does not divide by zero.
    """
    smoothing = 1e-10
    n_correct, n_predicted, n_gold = smoothing, smoothing, smoothing
    for sample in tqdm(data, ncols=100):
        predicted = set(NER.recognize(sample[0]))
        gold = {tuple(span) for span in sample[1:]}
        n_correct += len(predicted & gold)
        n_predicted += len(predicted)
        n_gold += len(gold)
    precision = n_correct / n_predicted
    recall = n_correct / n_gold
    f1 = 2 * n_correct / (n_predicted + n_gold)
    return f1, precision, recall


class Evaluator(keras.callbacks.Callback):
    """Evaluate on the validation set after each epoch and keep the best weights.
    """
    def __init__(self):
        # Initialise the base Callback so the attributes it sets up exist.
        super(Evaluator, self).__init__()
        self.best_val_f1 = 0

    def on_epoch_end(self, epoch, logs=None):
        """Run evaluate(valid_data); save the weights when F1 is not worse
        than the best value seen so far, then print the scores."""
        f1, precision, recall = evaluate(valid_data)
        # Save the best model (ties overwrite the previous checkpoint).
        if f1 >= self.best_val_f1:
            self.best_val_f1 = f1
            model.save_weights('./best_model_peopledaily_globalpointer.weights')
        print(
            'valid:  f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n' %
            (f1, precision, recall, self.best_val_f1)
        )


# Manual switch: True trains from the pretrained checkpoint, False loads the
# weights saved by Evaluator instead.
if True:

    evaluator = Evaluator()
    train_generator = data_generator(train_data, batch_size)

    # Validation scoring happens in the Evaluator callback at each epoch end.
    model.fit(
        train_generator.forfit(),
        steps_per_epoch=len(train_generator),
        epochs=epochs,
        callbacks=[evaluator]
    )

else:

    model.load_weights('./best_model_peopledaily_globalpointer.weights')


Using TensorFlow backend.






Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.






Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        (None, None)         0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
Embedding-Token (Embedding)     (None, None, 768)    16226304    Input-Token[0][0]                
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, None, 768)    1536        I

100%|██████████████████████████████████████████████████████████| 3864/3864 [00:35<00:00, 107.57it/s]


valid:  f1: 0.86120, precision: 0.86411, recall: 0.85832, best f1: 0.86120

Epoch 2/10


100%|██████████████████████████████████████████████████████████| 3864/3864 [00:33<00:00, 114.28it/s]


valid:  f1: 0.87304, precision: 0.86215, recall: 0.88422, best f1: 0.87304

Epoch 3/10


100%|██████████████████████████████████████████████████████████| 3864/3864 [00:34<00:00, 113.07it/s]


valid:  f1: 0.86554, precision: 0.87928, recall: 0.85222, best f1: 0.87304

Epoch 4/10


100%|██████████████████████████████████████████████████████████| 3864/3864 [00:33<00:00, 114.49it/s]


valid:  f1: 0.86746, precision: 0.85020, recall: 0.88544, best f1: 0.87304

Epoch 5/10


100%|██████████████████████████████████████████████████████████| 3864/3864 [00:35<00:00, 109.41it/s]


valid:  f1: 0.86818, precision: 0.86347, recall: 0.87294, best f1: 0.87304

Epoch 6/10


100%|██████████████████████████████████████████████████████████| 3864/3864 [00:34<00:00, 113.46it/s]


valid:  f1: 0.86572, precision: 0.85744, recall: 0.87416, best f1: 0.87304

Epoch 7/10


100%|██████████████████████████████████████████████████████████| 3864/3864 [00:33<00:00, 113.98it/s]


valid:  f1: 0.85823, precision: 0.89163, recall: 0.82724, best f1: 0.87304

Epoch 8/10


100%|██████████████████████████████████████████████████████████| 3864/3864 [00:34<00:00, 111.75it/s]


valid:  f1: 0.86462, precision: 0.86820, recall: 0.86106, best f1: 0.87304

Epoch 9/10


100%|██████████████████████████████████████████████████████████| 3864/3864 [00:33<00:00, 113.97it/s]


valid:  f1: 0.86474, precision: 0.86264, recall: 0.86685, best f1: 0.87304

Epoch 10/10


100%|██████████████████████████████████████████████████████████| 3864/3864 [00:34<00:00, 110.92it/s]

valid:  f1: 0.86661, precision: 0.86424, recall: 0.86898, best f1: 0.87304






In [3]:
# Sanity check on the sample record from the data-format note; expected annotation:
# ['客服：221376442317', [3, 14, 'ODR']]
NER.recognize('客服：221376442317')

[(3, 14, 'ODR')]

# 模型文件格式转换（h5->pb）

In [3]:
from keras import backend as K
import tensorflow as tf
# convert .h5 to .pb
def freeze_session(session, keep_var_names=None, output_names=None, clear_devices=True):
    """Freeze the graph of ``session`` by converting variables to constants.

    Args:
        session: TensorFlow session whose graph is frozen.
        keep_var_names: names of variables that must NOT be frozen.
        output_names: names of the output ops to keep. The caller's list is
            not modified; the names of all global variables are appended to
            a copy.
        clear_devices: when true, the device field of every node in the
            graph definition is cleared.

    Returns:
        The graph definition returned by ``convert_variables_to_constants``.
    """
    from tensorflow.python.framework.graph_util import convert_variables_to_constants
    graph = session.graph
    with graph.as_default():
        global_var_names = [v.op.name for v in tf.global_variables()]
        freeze_var_names = list(set(global_var_names).difference(keep_var_names or []))
        # Build a new list: `+=` on the argument would mutate the caller's list.
        output_names = list(output_names or []) + global_var_names
        input_graph_def = graph.as_graph_def()
        if clear_devices:
            for node in input_graph_def.node:
                node.device = ""
        frozen_graph = convert_variables_to_constants(session, input_graph_def,
                                                      output_names, freeze_var_names)
        return frozen_graph

# Report the tensor names needed to address the frozen graph later.
# NOTE(review): the output recorded in this notebook names
# `conditional_random_field_1/add:0`, which does not match the GlobalPointer
# model built above -- the cell was probably run against another model; re-run
# after Restart & Run All.
print('input is :', (model.input[0].name, model.input[1].name))
print('output is:', model.output.name)
# Fetch the Keras session once and reuse it for freezing.
sess = K.get_session()
frozen_graph = freeze_session(sess, output_names=[model.output.op.name])

input is : ('Input-Token:0', 'Input-Segment:0')
output is: conditional_random_field_1/add:0
Instructions for updating:
Use `tf.compat.v1.graph_util.convert_variables_to_constants`
Instructions for updating:
Use `tf.compat.v1.graph_util.extract_sub_graph`
INFO:tensorflow:Froze 205 variables.
INFO:tensorflow:Converted 205 variables to const ops.


In [4]:
from tensorflow.python.framework import graph_io

# Destination of the frozen graph: <output_path>/<pb_model_name>.
output_path='./pb'
pb_model_name='ner_model.pb'
# Alternative target kept from another experiment:
# pb_model_name='t5/t5_model.pb'
# as_text=False is passed to write_graph (binary rather than text output).
graph_io.write_graph(frozen_graph, output_path, pb_model_name, as_text=False)

'./pb/ner_model.pb'

In [5]:
from tensorflow.python.saved_model import signature_constants
from tensorflow.python.saved_model import tag_constants
# Alternative paths kept from another experiment:
# export_dir = 't5/saved_model'
# # graph_pb = './5_trained_model.pb'
# graph_pb = 't5/t5_model.pb'
export_dir = './saved_model1'
# graph_pb = './5_trained_model.pb'
graph_pb = './pb/ner_model.pb'
# NOTE(review): SavedModelBuilder presumably refuses an export_dir that
# already exists, which would make this cell fail on re-run -- confirm.
builder = tf.saved_model.builder.SavedModelBuilder(export_dir)
# Read the frozen graph written by the previous cell back into a GraphDef.
with tf.gfile.GFile(graph_pb, "rb") as f:
    graph_def = tf.GraphDef()
    graph_def.ParseFromString(f.read())

In [6]:
# Import the frozen graph into a fresh graph/session and export it as a
# SavedModel with a single default serving signature.
sigs = {}
with tf.Session(graph=tf.Graph()) as sess:
    tf.import_graph_def(graph_def, name="")
    g = tf.get_default_graph()
    # Tensor names are taken from the in-memory Keras `model`, so this cell
    # needs that model to still be defined in the kernel.
    inp0 = g.get_tensor_by_name(model.input[0].name)
    inp1 = g.get_tensor_by_name(model.input[1].name)
    out = g.get_tensor_by_name(model.output.name)
    print('inp0',inp0)
    print('inp1',inp1)
    print('out',out)
    
    # Signature keys: inputs "in0"/"in1", output "out".
    sigs[signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY] = tf.saved_model.signature_def_utils.predict_signature_def({"in0":inp0,"in1":inp1},{"out":out})
    builder.add_meta_graph_and_variables(sess,[tag_constants.SERVING],signature_def_map=sigs)
builder.save()

inp0 Tensor("Input-Token:0", shape=(?, ?), dtype=float32)
inp1 Tensor("Input-Segment:0", shape=(?, ?), dtype=float32)
out Tensor("conditional_random_field_1/add:0", shape=(?, ?, 15), dtype=float32)
Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.utils.build_tensor_info or tf.compat.v1.saved_model.build_tensor_info.
INFO:tensorflow:No assets to save.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: ./saved_model1/saved_model.pb


b'./saved_model1/saved_model.pb'