In [10]:
import json, os


def makeVocabulary(originPath='data/train.txt', vocabPath='dataProcess'):
    """Build the token and tag vocabularies from a training file, or load the cached ones.

    The training file is read line by line; every non-empty line is split on a
    single space, the first field is taken as the token and the second as its tag.
    The resulting mappings are written as JSON to ``vocab.txt`` and ``taged.txt``
    inside ``vocabPath``. When both files already exist they are loaded through
    ``getVocabAndTags`` instead of being rebuilt.

    Args:
        originPath: Path of the training file ("token tag" per line).
        vocabPath: Directory holding the cached ``vocab.txt`` / ``taged.txt``;
            it is created when missing.

    Returns:
        Tuple ``(vocab2index, index2vocab, taged2index, index2taged)``. The tag
        ``'O'`` always has index 0. The remaining tags and all tokens are indexed
        in sorted order so that rebuilding the vocabulary yields the same ids.

    Raises:
        IndexError: If a non-empty line has no second space-separated field.
    """
    vocabFile = os.path.join(vocabPath, 'vocab.txt')
    tagedFile = os.path.join(vocabPath, 'taged.txt')
    if os.path.exists(vocabFile) and os.path.exists(tagedFile):
        return getVocabAndTags(vocabPath)

    vocabSet = {'<UNK>', '<PAD>', '<START>', '<END>'}
    tagedSet = {'<START>', '<END>'}
    # The context manager guarantees the training file is closed again.
    with open(originPath, encoding='utf-8') as trainFile:
        for line in trainFile:
            line = line.replace('\n', '')
            if line != '':
                fields = line.split(' ')
                vocabSet.add(fields[0])
                tagedSet.add(fields[1])

    # 'O' is pinned to index 0; discard() tolerates a corpus without any 'O' tag.
    tagedSet.discard('O')
    # Sorting removes the dependence on set iteration order (which varies with
    # string hash randomisation), so ids are stable across interpreter runs.
    tagedList = ['O'] + sorted(tagedSet)
    vocabList = sorted(vocabSet)

    vocab2index = {word: index for index, word in enumerate(vocabList)}
    index2vocab = dict(enumerate(vocabList))
    taged2index = {tag: index for index, tag in enumerate(tagedList)}
    index2taged = dict(enumerate(tagedList))

    vocabDic = {'vocab2index': vocab2index, 'index2vocab': index2vocab}
    tagedDic = {'taged2index': taged2index, 'index2taged': index2taged}
    os.makedirs(vocabPath, exist_ok=True)
    with open(vocabFile, mode='w', encoding='utf-8') as writer:
        json.dump(vocabDic, writer)
    with open(tagedFile, mode='w', encoding='utf-8') as writer:
        json.dump(tagedDic, writer)
    return vocab2index, index2vocab, taged2index, index2taged


def getVocabAndTags(path='dataProcess'):
    """Load the cached vocabulary and tag mappings stored as JSON under ``path``.

    Reads ``vocab.txt`` (keys ``vocab2index`` / ``index2vocab``) and then
    ``taged.txt`` (keys ``taged2index`` / ``index2taged``).

    Args:
        path: Directory containing ``vocab.txt`` and ``taged.txt``.

    Returns:
        Tuple ``(vocab2index, index2vocab, taged2index, index2taged)``; the two
        ``index2*`` mappings have their keys converted to ``int``.
    """

    def _withIntKeys(mapping):
        # JSON object keys are always strings, so turn them back into integers.
        return {int(key): value for key, value in mapping.items()}

    with open(os.path.join(path, 'vocab.txt'), encoding='utf-8') as vocabReader:
        vocabPayload = json.load(vocabReader)
        wordToId = vocabPayload['vocab2index']
        idToWord = _withIntKeys(vocabPayload['index2vocab'])

    with open(os.path.join(path, 'taged.txt'), encoding='utf-8') as tagedReader:
        tagedPayload = json.load(tagedReader)
        tagToId = tagedPayload['taged2index']
        idToTag = _withIntKeys(tagedPayload['index2taged'])

    return wordToId, idToWord, tagToId, idToTag


In [11]:
import numpy as np
import tensorflow as tf


class DataGenerator:
    """Converts "token tag" files into padded index arrays and TF 1.x input tensors.

    Input files hold one space-separated ``token tag`` pair per line; an empty
    line ends a sentence. Every sentence is wrapped in ``<START>``/``<END>``
    markers, clipped to ``maxLength`` positions and right-padded with ``<PAD>``
    tokens (tag ``'O'``).
    """

    def __init__(self, vocab2index, index2vocab, taged2index, index2taged, maxLength):
        """Store the lookup tables and the fixed sequence width.

        Args:
            vocab2index: Mapping token -> id; must contain ``<UNK>``, ``<PAD>``,
                ``<START>`` and ``<END>``.
            index2vocab: Mapping id -> token.
            taged2index: Mapping tag -> id; must contain ``O``, ``<START>`` and
                ``<END>``.
            index2taged: Mapping id -> tag.
            maxLength: Width every encoded sentence is padded or clipped to
                (including the two boundary markers).
        """
        self.__vocab2index = vocab2index
        self.__index2vocab = index2vocab
        self.__taged2index = taged2index
        self.__index2taged = index2taged
        self.__maxLength = maxLength

    def __encodeSentence(self, wordIds, tagIds):
        """Add boundary markers to one sentence, clip it and pad it to maxLength.

        Returns:
            Tuple ``(paddedWordIds, paddedTagIds, length)`` where ``length`` is
            the number of positions before padding.
        """
        # Keep two slots for <START>/<END>; a longer sentence would otherwise
        # yield a negative pad count and rows of unequal width.
        room = max(self.__maxLength - 2, 0)
        wordIds = [self.__vocab2index['<START>']] + wordIds[:room] + [self.__vocab2index['<END>']]
        tagIds = [self.__taged2index['<START>']] + tagIds[:room] + [self.__taged2index['<END>']]
        padCount = self.__maxLength - len(wordIds)
        paddedWords = np.asarray(wordIds + padCount * [self.__vocab2index['<PAD>']])
        paddedTags = np.asarray(tagIds + padCount * [self.__taged2index['O']])
        return paddedWords, paddedTags, len(wordIds)

    def __getData(self, inputFile):
        """Read ``inputFile`` and encode every sentence in it.

        Tokens missing from the vocabulary map to ``<UNK>``; an unknown tag
        raises ``KeyError``. Each empty line closes the current sentence (so
        consecutive empty lines produce marker-only sentences), and a final
        sentence that is not followed by an empty line is still emitted.

        Returns:
            Tuple of numpy arrays ``(sentences, tags, sentenceLengths)``.
        """
        encoded = []
        wordIds, tagIds = [], []
        with open(inputFile, encoding='utf-8') as reader:
            for line in reader:
                line = line.replace('\n', '')
                if line == '':
                    encoded.append(self.__encodeSentence(wordIds, tagIds))
                    wordIds, tagIds = [], []
                else:
                    fields = line.split(' ')
                    wordIds.append(self.__vocab2index.get(fields[0], self.__vocab2index['<UNK>']))
                    tagIds.append(self.__taged2index[fields[1]])
        # Files that do not end with a blank line would otherwise lose their
        # last sentence.
        if wordIds:
            encoded.append(self.__encodeSentence(wordIds, tagIds))

        sentences = [item[0] for item in encoded]
        tages = [item[1] for item in encoded]
        sentenceLengths = [item[2] for item in encoded]
        return np.asarray(sentences), np.asarray(tages), np.asarray(sentenceLengths)

    def input_fn(self, inputFile, batchSize, ifShuffleAndRepeat=True, shuffleBufferSize=1000, repeatCount=25):
        """Build the ``(features, labels)`` tensors consumed by a tf.estimator.

        Args:
            inputFile: Path of the "token tag" file to read.
            batchSize: Number of sentences per batch.
            ifShuffleAndRepeat: When true, shuffle the dataset and repeat it.
            shuffleBufferSize: Buffer size passed to ``Dataset.shuffle``.
            repeatCount: Number of passes passed to ``Dataset.repeat``.

        Returns:
            Tuple ``(features, tags)`` where ``features`` is a dict with the
            keys ``'sentences'`` and ``'sentenceLengths'``.
        """
        sentences, tages, sentenceLengths = self.__getData(inputFile)
        dataset = tf.data.Dataset.from_tensor_slices((sentences, sentenceLengths, tages))
        if ifShuffleAndRepeat:
            dataset = dataset.shuffle(shuffleBufferSize)
            dataset = dataset.repeat(repeatCount)
        dataset = dataset.batch(batchSize)
        iterator = dataset.make_one_shot_iterator()
        sentences, sentenceLengths, tag = iterator.get_next()
        features = {'sentences': sentences, 'sentenceLengths': sentenceLengths}
        return features, tag

    def indexToText(self, sentence, tag):
        """Map a sequence of token ids and a sequence of tag ids back to strings.

        Returns:
            Tuple ``(tokens, tags)`` of two lists.
        """
        sentence = [self.__index2vocab[index] for index in sentence]
        tag = [self.__index2taged[index] for index in tag]
        return sentence, tag


In [12]:
import tensorflow as tf
import tensorflow.contrib.crf as crf
from tf_metrics import precision, recall, f1
from tensorflow.contrib import keras


class BiLSTMCrf(object):
    """TF 1.x graph builder: embedding -> BiLSTM -> dense projection -> optional CRF.

    The constructor builds the embedding and BiLSTM layers; ``getResult`` adds
    the projection layer, the optional CRF layer and the mode-specific outputs.
    """

    def __init__(self, inputX, inputY, sentenceLengths, numClasses, vocabSize, embeddingSize,
                 hiddenSize, learnRate, maxLength, l2_reg_lambda, dropout_keep_prob, crf):
        """Store the hyper-parameters and build the embedding and BiLSTM layers.

        Args:
            inputX: Token-id tensor (presumably [batch, maxLength] -- TODO confirm
                against the input pipeline).
            inputY: Tag-id tensor aligned with ``inputX``, or ``None`` when no
                labels are available (prediction).
            sentenceLengths: Unpadded length of every sequence in the batch.
            numClasses: Number of distinct tags.
            vocabSize: Number of rows of the embedding matrix.
            embeddingSize: Width of each embedding vector.
            hiddenSize: Number of units of each LSTM direction.
            learnRate: Initial learning rate for the optimizer.
            maxLength: Padded sequence width used for the masks.
            l2_reg_lambda: L2 regularisation coefficient.
            dropout_keep_prob: Keep probability applied by all dropout ops.
            crf: Truthy to decode/train with a CRF layer, falsy for softmax.
        """
        self.numClasses = numClasses
        self.vocabSize = vocabSize
        self.embeddingSize = embeddingSize
        self.hiddenSize = hiddenSize
        self.learnRate = learnRate
        self.maxLength = maxLength
        self.l2_reg_lambda = l2_reg_lambda
        self.inputX = inputX
        self.inputY = inputY
        self.sentenceLengths = sentenceLengths
        self.dropout_keep_prob = dropout_keep_prob
        self.crf = crf
        self.__addEmbeddingLayer()
        self.__addBiLSTMLayer()

    def __addEmbeddingLayer(self):
        """Look up the token embeddings and apply dropout to them."""
        with tf.name_scope('embeddingLayer'):
            embedding = tf.get_variable(name='embedding', shape=[self.vocabSize, self.embeddingSize],
                                        initializer=tf.contrib.layers.xavier_initializer())
            embeddingInput = tf.nn.embedding_lookup(embedding, self.inputX)
            self.embeddingInput = tf.nn.dropout(embeddingInput, rate=1 - self.dropout_keep_prob)

    def __addBiLSTMLayer(self):
        """Run a bidirectional LSTM over the embeddings and concatenate both directions."""
        with tf.name_scope('BiLSTMLayer'):
            lstm_fw_cell = tf.nn.rnn_cell.DropoutWrapper(
                cell=tf.nn.rnn_cell.BasicLSTMCell(num_units=self.hiddenSize),
                output_keep_prob=self.dropout_keep_prob)
            lstm_bw_cell = tf.nn.rnn_cell.DropoutWrapper(
                cell=tf.nn.rnn_cell.BasicLSTMCell(num_units=self.hiddenSize),
                output_keep_prob=self.dropout_keep_prob)
            bidOutput, bidCurrent_state = tf.nn.bidirectional_dynamic_rnn(cell_fw=lstm_fw_cell,
                                                                          cell_bw=lstm_bw_cell,
                                                                          sequence_length=self.sentenceLengths,
                                                                          inputs=self.embeddingInput,
                                                                          dtype=tf.float32)
            BiLSTMOutput = tf.concat(bidOutput, axis=-1)
            self.BiLSTMOutput = tf.nn.dropout(BiLSTMOutput, rate=1 - self.dropout_keep_prob)

    def __addBiLSTMOutPutDenseLayer(self):
        """Project the BiLSTM output to per-tag scores; add the softmax loss when labels exist."""
        with tf.name_scope('BiLSTMOutputDenseLayer'):
            # Build the L2 regulariser for the projection kernel.
            l2_regularizer = tf.contrib.layers.l2_regularizer(scale=self.l2_reg_lambda)
            self.bilstmDenseOutput = keras.layers.Dense(units=self.numClasses,
                                                        activation=keras.activations.relu,
                                                        kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                                        kernel_regularizer=l2_regularizer)(self.BiLSTMOutput)
            self.sequence = tf.argmax(self.bilstmDenseOutput, axis=-1)
            if self.inputY is None:
                # No labels (prediction): a loss can neither be built nor is it needed.
                return
            losses = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.inputY, logits=self.bilstmDenseOutput)
            # The mask must span the full padded width. Without maxlen it is only
            # as wide as the longest sentence of the batch, and boolean_mask then
            # picks misaligned positions after flattening.
            mask = tf.sequence_mask(self.sentenceLengths, maxlen=self.maxLength)
            losses = tf.boolean_mask(losses, mask)
            self.loss = tf.reduce_mean(losses)
            # Fetch the total of all registered regularisation losses.
            self.l2_loss = tf.losses.get_regularization_loss()
            # NOTE(review): the regulariser above is already scaled by
            # l2_reg_lambda, so this may apply the coefficient twice -- confirm.
            self.loss += self.l2_reg_lambda * self.l2_loss

    def __addCrfLayer(self):
        """Decode with a CRF over the projected scores; add the CRF loss when labels exist."""
        with tf.name_scope('CRFLayer'):
            self.transitionParams = tf.get_variable("transitions", shape=[self.numClasses, self.numClasses],
                                                    initializer=tf.contrib.layers.xavier_initializer())
            if self.inputY is not None:
                logLikelihood, self.transitionParams = crf.crf_log_likelihood(self.bilstmDenseOutput,
                                                                              self.inputY,
                                                                              self.sentenceLengths,
                                                                              transition_params=self.transitionParams)
                # Replaces the softmax loss set by the projection layer.
                self.loss = tf.reduce_mean(-logLikelihood)
                self.loss += self.l2_reg_lambda * self.l2_loss
            self.sequence, _ = crf.crf_decode(self.bilstmDenseOutput,
                                              self.transitionParams,
                                              self.sentenceLengths)

    def getResult(self, mode):
        """Finish the graph and return the outputs required for ``mode``.

        Args:
            mode: A ``tf.estimator.ModeKeys`` value.

        Returns:
            PREDICT: the predicted tag-id tensor.
            TRAIN: tuple ``(loss, train_op)``.
            otherwise: tuple ``(loss, metrics)`` with the metric ops keyed by
            ``'acc'``, ``'precision'``, ``'recall'`` and ``'f1'``.
        """
        self.__addBiLSTMOutPutDenseLayer()
        if self.crf:
            self.__addCrfLayer()
        if mode == tf.estimator.ModeKeys.PREDICT:
            return self.sequence

        # Padding positions get weight 0 so they do not count in the metrics.
        weights = tf.sequence_mask(self.sentenceLengths, maxlen=self.maxLength, dtype=tf.int32)
        # NOTE(review): hard-coded tag ids treated as positive classes; they
        # must agree with the tag -> id mapping in use -- confirm.
        positiveIndices = [1, 3, 4, 6, 7, 8]
        metrics = {
            'acc': tf.metrics.accuracy(labels=self.inputY, predictions=self.sequence, weights=weights),
            'precision': precision(labels=self.inputY, predictions=self.sequence, num_classes=self.numClasses,
                                   pos_indices=positiveIndices, weights=weights),
            'recall': recall(labels=self.inputY, predictions=self.sequence, num_classes=self.numClasses,
                             pos_indices=positiveIndices, weights=weights),
            'f1': f1(labels=self.inputY, predictions=self.sequence, num_classes=self.numClasses,
                     pos_indices=positiveIndices, weights=weights)
        }
        if mode == tf.estimator.ModeKeys.TRAIN:
            for metric_name, op in metrics.items():
                # op is a (value, update_op) pair; the update op is summarised.
                tf.summary.scalar(metric_name, op[1])
            learnRate = tf.train.exponential_decay(self.learnRate, tf.train.get_global_step(), 500, 0.98,
                                                   staircase=True)
            optimizer = tf.train.AdamOptimizer(learnRate)
            self.train_op = optimizer.minimize(self.loss, global_step=tf.train.get_global_step())
            return self.loss, self.train_op
        return self.loss, metrics


In [16]:
import tensorflow as tf
import numpy as np
import os, functools, argparse

# Set TensorFlow's logging threshold to INFO so INFO-level messages are emitted.
tf.logging.set_verbosity(tf.logging.INFO)

def _strToBool(value):
    """Convert a command-line string such as 'true' / 'False' / '0' to a bool.

    ``type=bool`` cannot be used for this: argparse would call ``bool()`` on the
    raw string, and every non-empty string (including 'False') is truthy.

    Raises:
        argparse.ArgumentTypeError: If the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ('true', 't', 'yes', 'y', '1'):
        return True
    # The empty string stays falsy, as it was under bool('').
    if lowered in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError('expected a boolean value, got %r' % (value,))


# Hyper-parameters of the BiLSTM-CRF tagger; every option has a default.
parser = argparse.ArgumentParser(description='BiLSTM-CRF超参数设置')
parser.add_argument('--maxLength', type=int, default=105, help='序列最大长度')
parser.add_argument("--embeddingSize", type=int, default=64, help='字向量维度')
parser.add_argument("--hiddenSize", type=int, default=128, help='LSTM隐藏层维度')
parser.add_argument('--learnRate', type=float, default=0.1, help='学习率设置')
parser.add_argument("--dropout", type=float, default=0.5, help='dropout keep prob')
parser.add_argument("--dataDir", type=str, default='data', help='数据路径')
parser.add_argument("--batchSize", type=int, default=50, help='batchSize')
parser.add_argument("--crf", type=_strToBool, default=True, help='是否使用crf')
parser.add_argument('--l2_reg_lambda', type=float, default=0.1, help='l2正则项系数')


def model_fn(features, labels, mode, params):
    """Estimator ``model_fn``: build a ``BiLSTMCrf`` graph and wrap it in an EstimatorSpec.

    Hyper-parameters are read from the module-level ``FLAGS``; dropout is only
    active in TRAIN mode (keep probability 1.0 otherwise).

    Args:
        features: Dict with the keys ``'sentences'`` and ``'sentenceLengths'``.
        labels: Tag-id tensor passed through to the model.
        mode: A ``tf.estimator.ModeKeys`` value.
        params: Dict with the keys ``'numClasses'`` and ``'vocabSize'``.

    Returns:
        A ``tf.estimator.EstimatorSpec`` carrying loss and train op (TRAIN),
        loss and metric ops (EVAL) or the predictions dict (any other mode).
    """
    modeKeys = tf.estimator.ModeKeys
    tokenIds = features['sentences']
    lengths = features['sentenceLengths']
    network = BiLSTMCrf(
        inputX=tokenIds,
        inputY=labels,
        sentenceLengths=lengths,
        numClasses=params['numClasses'],
        vocabSize=params['vocabSize'],
        embeddingSize=FLAGS.embeddingSize,
        hiddenSize=FLAGS.hiddenSize,
        learnRate=FLAGS.learnRate,
        maxLength=FLAGS.maxLength,
        l2_reg_lambda=FLAGS.l2_reg_lambda,
        dropout_keep_prob=FLAGS.dropout if mode == modeKeys.TRAIN else 1.0,
        crf=FLAGS.crf)
    outputs = network.getResult(mode)

    if mode == modeKeys.TRAIN:
        loss, train_op = outputs
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

    if mode == modeKeys.EVAL:
        loss, metrics = outputs
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=metrics)

    # Any remaining mode is served as prediction.
    return tf.estimator.EstimatorSpec(mode=mode, predictions={'sentence': tokenIds,
                                                               'tags': outputs})


if __name__ == '__main__':
    # Driver: parse flags, build the vocabularies, then train, evaluate and
    # predict with a tf.estimator wrapped around model_fn.
    # parse_known_args ignores unrecognised arguments instead of failing on them.
    FLAGS = parser.parse_known_args()[0]
    # Build the vocabulary from the same directory the training data is read
    # from, so that --dataDir is honoured here as well.
    vocab2index, index2vocab, taged2index, index2taged = makeVocabulary(
        originPath=os.path.join(FLAGS.dataDir, 'train.txt'))
    params = {'numClasses': len(index2taged),
              'vocabSize': len(index2vocab)}
    dataGenerator = DataGenerator(vocab2index, index2vocab, taged2index, index2taged, FLAGS.maxLength)
    model = tf.estimator.Estimator(model_fn=model_fn, params=params, model_dir="./model/")

    train_inputFun = functools.partial(dataGenerator.input_fn, os.path.join(FLAGS.dataDir, 'train.txt'),
                                       batchSize=FLAGS.batchSize)
    model.train(train_inputFun)

    eval_inputFun = functools.partial(dataGenerator.input_fn, os.path.join(FLAGS.dataDir, 'dev.txt'),
                                      batchSize=FLAGS.batchSize, ifShuffleAndRepeat=False)
    model.evaluate(eval_inputFun)

    test_inputFun = functools.partial(dataGenerator.input_fn, os.path.join(FLAGS.dataDir, 'test.txt'),
                                      batchSize=FLAGS.batchSize,
                                      ifShuffleAndRepeat=False)
    predictions = model.predict(test_inputFun)
    for result in predictions:
        sentence, tags = dataGenerator.indexToText(result['sentence'], result['tags'])
        # Print (token, tag) pairs in order. A dict keyed by token would keep
        # only one tag per distinct token and so drop repeated characters.
        # Padding positions are left out.
        print([(word, tag) for word, tag in zip(sentence, tags) if word != '<PAD>'])

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './model/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0xb37cd0da0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
Instructions for updating:
Colocations handled automatically by placer.
INFO:tensorflow:Calling model_fn.
Instructions for updating:
This class is equiv

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from ./model/model.ckpt-0
Instructions for updating:
Use standard file utilities to get mtimes.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into ./model/model.ckpt.


KeyboardInterrupt: 