# NER 快速上手示例

In [36]:
# import func
import random as rand
from random import random
import numpy as np
import math
import tensorflow as tf

In [2]:
# Parameters used.
MODEL_PATH = 'model/model.ckpt'

In [3]:
def get_sequence(sequence_length):
    """生成一个句子array，每个元素可以看为一个实体(词向量长度为1的实体)
    sequence_length: int,输入序列长度
    return:
        X: list, 根据输入序列随机生成[0,1]之间，长度为序列长度的数据，array([0.4517164 , 0.17458207,...])
        y：list, 根据输入序列随机累计结果，长度为序列长度的数据，array([0 , 0,...])
    """
    X = np.array([random() for _ in range(sequence_length)])
    limit = sequence_length / 4.0
    y = np.array([0 if x < limit else 1 for x in np.cumsum(X)])
    return X, y

In [4]:
# 数据例子
get_sequence(5)

(array([0.99977239, 0.58233066, 0.71998191, 0.75649066, 0.2671246 ]),
 array([0, 1, 1, 1, 1]))

In [5]:
def get_examples(n):
    '''获取长度范围在[5,15]之间的n句，并返回对应的句子序列，标签序列以及句子长度序列
    return：
        X_list: list,[array(int,int,..),...]
        y_list: list,[array(0,1,...),...]
        sequence_length_list: list,[int, int, ...]
    '''
    X_list = []
    y_list = []
    sequence_length_list = []
    for _ in range(n):
        sequence_length = rand.randrange(start=5, stop=15)
        X, y = get_sequence(sequence_length)
        X_list.append(X)
        y_list.append(y)
        sequence_length_list.append(sequence_length)
    
    return X_list, y_list, sequence_length_list


In [6]:
# 数据例子
X_list, y_list, sequence_length_list = get_examples(3)
print(X_list, y_list, sequence_length_list)

[array([0.124389  , 0.27276434, 0.88607881, 0.79260571, 0.50418437,
       0.33872385, 0.09518358, 0.9513725 , 0.4368882 , 0.45860467,
       0.40931727]), array([0.85111579, 0.88904533, 0.67534032, 0.0208597 , 0.21419562,
       0.75828096, 0.39506107, 0.04431259, 0.43886332, 0.31583752,
       0.23241244, 0.82299346, 0.25791869]), array([0.56751806, 0.54503079, 0.68218183, 0.88030109, 0.78564019,
       0.14985963, 0.28419692, 0.01099985])] [array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]), array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]), array([0, 0, 0, 1, 1, 1, 1, 1])] [11, 13, 8]


### tensorflow  dynamic rnn 要求同一批次中句子必须有相同长度，因此必须将同一个batch中data与label 填充为相同长度

In [7]:
def pad(sentence, max_length):
    '''数据长度填充
    '''
    pad_len = max_length - len(sentence)
    padding = np.zeros(pad_len)
    return np.concatenate((sentence, padding))
    
def batch(data, labels, sequence_lengths, batch_size, input_size):
    '''生成模型批数据
        data：输入数据，list(array(float，float,...),array(float,float,...)),句子序列
        labels：标签序列，list(array(int，int,...),array(int,int,...)),句子序列 int取值为0或1，可以将其理解为ner序列标签
        sequence_lengths：list,[int, int, ...]，每个句子长度序列
        batch_size：int,批大小
        input_size：1，词向量长度
    '''
    n_batch = int(math.ceil(len(data) / batch_size)) #math.ceil 向上取整，将整个数据集切分成n_batch
    index = 0
    for _ in range(n_batch):
        batch_sequence_lengths = np.array(sequence_lengths[index: index + batch_size])
        batch_length = np.array(max(batch_sequence_lengths)) # 批次最长长度
        batch_data = np.array([pad(x, batch_length) for x in data[index: index + batch_size]]) # 按照batch 中最大句子长度填充句子短句子
        batch_labels = np.array([pad(x, batch_length) for x in labels[index: index + batch_size]]) # 按照batch 中最大句子长度填充短句子对应的标签序列
        index += batch_size
        
        # 将数据转换成LSTM 需要传入的数据形式，shape: [batch_size,batch_length,input_size] ，使用dynamic rnn model 故我们只需要保证每批数据序列长度一致即可
        # input_size由data 转换为长度为1的list
        batch_data = batch_data.reshape(-1, batch_length, input_size)
        
        yield batch_data, batch_labels, batch_length, batch_sequence_lengths

In [31]:
# Generate train and test data.
x_train, y_train, sequence_length_train = get_examples(200)
x_test, y_test, sequence_length_test = get_examples(50)

### 搭建Bi_lstm+CRF 模型

In [32]:
lr = 0.001    #学习率
training_epochs = 100 #训练迭代次数
input_size = 1 # 输入数据长度，可理解为ner任务中词语词向量长度为 1
batch_size = 32 # 批大小
num_units = 128 # 隐藏层神经元个数
number_of_classes = 2 #类别数

#定义placeholder
# 序列长度为变化数值，此处设置为任意长度None
# shape: [batchsize,sequencelength,input_size]
input_data = tf.placeholder(tf.float32, [None, None, input_size], name='input_data')
# shape: [batch_size, sequence_length]
labels = tf.placeholder(tf.int32, [None, None], name='labels')
batch_sequence_length = tf.placeholder(tf.int32)# 每批中最大句子长度：max sequence length
original_sequence_lengths = tf.placeholder(tf.int32, [None]) #原始序列长度

# 定义lstm cell 命名使用范围,重复运行此段代码必须使用tf.auto_reuse 否则抛出错误
with tf.variable_scope('forward',reuse=tf.AUTO_REUSE):
    lstm_fw_cell = tf.nn.rnn_cell.LSTMCell(num_units, forget_bias=1.0, state_is_tuple=True)
with tf.variable_scope('backward',reuse=tf.AUTO_REUSE):
    lstm_bw_cell = tf.nn.rnn_cell.LSTMCell(num_units, forget_bias=1.0, state_is_tuple=True)

with tf.variable_scope('bistm', reuse=tf.AUTO_REUSE):
    (output_fw, output_bw), states = tf.nn.bidirectional_dynamic_rnn(cell_fw=lstm_fw_cell, 
                                                                 cell_bw=lstm_bw_cell, 
                                                                 inputs=input_data,
                                                                 sequence_length=original_sequence_lengths, 
                                                                 dtype=tf.float32)     # 研究下bidirectional_dynamic_rnn 的用法
# 拼接双向LSTM 输出结果
outputs = tf.concat([output_fw, output_bw], axis=2)
outputs_flat = tf.reshape(outputs,[-1,2 * num_units])

# 全连接层
with tf.variable_scope('W',reuse=tf.AUTO_REUSE):
    W = tf.get_variable(name='W', shape=[2 * num_units, number_of_classes], dtype=tf.float32)
with tf.variable_scope('b',reuse=tf.AUTO_REUSE):
    b = tf.get_variable(name='b', shape=[number_of_classes], dtype=tf.float32, initializer=tf.zeros_initializer())

pred = tf.matmul(outputs_flat, W) + b
scores = tf.reshape(pred, [-1, batch_sequence_length, number_of_classes])

# Linear-CRF
with tf.variable_scope('crf',reuse=tf.AUTO_REUSE):
    log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(scores, labels, original_sequence_lengths)
    loss = tf.reduce_mean(-log_likelihood)

# 计算viterb序列以及score用于预测
viterb_sequence, viterbi_score = tf.contrib.crf.crf_decode(scores,transition_params,original_sequence_lengths)

# 优化器
with tf.variable_scope('optimizer',reuse=tf.AUTO_REUSE):
    optimizer = tf.train.AdamOptimizer(lr)
    train_op = optimizer.minimize(loss)

# 保存模型变量
saver = tf.train.Saver()

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


### 训练模型过程

In [33]:

with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    
    for i in range(training_epochs):
        batchiter = batch(x_train, y_train, sequence_length_train, batch_size, input_size)
        for per_batch in batchiter:
            batch_data, batch_labels, batch_seq_len, batch_sequence_lengths = per_batch
            tf_viterbi_sequence, _ = session.run([viterb_sequence,train_op],
                                                 feed_dict={input_data:batch_data,
                                                            labels:batch_labels,
                                                            batch_sequence_length:batch_seq_len,
                                                            original_sequence_lengths:batch_sequence_lengths})
        # 显示训练精度
        if i % 10 ==0:
            # 创建mask 用于遮盖人为填充的数据长度
            mask = (np.expand_dims(np.arange(batch_seq_len),axis=0) <
                     np.expand_dims(batch_sequence_lengths, axis=1))
            total_labels = np.sum(batch_sequence_lengths)
            correct_labels= np.sum((batch_labels == tf_viterbi_sequence)*mask)
            accuracy = 100.0 * correct_labels/float(total_labels)
            print('Epoch: %d' % i, "Accuracy:%.2f%%"%accuracy)
    
    # 保存模型变量
    saver.save(session, MODEL_PATH)
            



Epoch: 0 Accuracy:47.30%
Epoch: 10 Accuracy:87.84%
Epoch: 20 Accuracy:91.89%
Epoch: 30 Accuracy:95.95%
Epoch: 40 Accuracy:97.30%
Epoch: 50 Accuracy:97.30%
Epoch: 60 Accuracy:97.30%
Epoch: 70 Accuracy:98.65%
Epoch: 80 Accuracy:100.00%
Epoch: 90 Accuracy:100.00%


### 测试模型效果

In [34]:
# Test model

with tf.Session() as session:
    saver.restore(session, MODEL_PATH)
    
    batchiter = batch(x_test, y_test, sequence_length_train, len(x_test), input_size)
    
    for per_batch in batchiter:
        batch_data, batch_labels, batch_seq_len, batch_sequence_lengths = per_batch
        tf_viterbi_sequence, _ = session.run([viterb_sequence,train_op],
                                             feed_dict={input_data:batch_data,
                                                        labels:batch_labels,
                                                        batch_sequence_length:batch_seq_len,
                                                        original_sequence_lengths:batch_sequence_lengths})

    # 创建mask 用于遮盖人为填充的数据长度
    mask = (np.expand_dims(np.arange(batch_seq_len),axis=0) <
             np.expand_dims(batch_sequence_lengths, axis=1))
    total_labels = np.sum(batch_sequence_lengths)
    correct_labels= np.sum((batch_labels == tf_viterbi_sequence)*mask)
    accuracy = 100.0 * correct_labels/float(total_labels)
    
    print("Test accuracy: %.2f%%" % accuracy)

    print("Label:", batch_labels[0].astype(int))    
    print("Pred.:", tf_viterbi_sequence[0])


INFO:tensorflow:Restoring parameters from model/model.ckpt
Test accuracy: 73.20%
Label: [0 0 0 0 0 0 1 1 1 1 1 1 0 0]
Pred.: [0 0 0 1 1 1 0 0 0 0 0 0 0 0]
