# References
- https://guillaumegenthial.github.io/introduction-tensorflow-estimator.html
- [NER with keras and tf](https://towardsdatascience.com/named-entity-recognition-ner-meeting-industrys-requirement-by-applying-state-of-the-art-deep-698d2b3b4ede)


# TODO
- evaluate result

In [1]:
def test_input():
    """Print the first element produced by ``input_fn`` as a quick smoke check.

    Reads the module-level ``params`` mapping for the ``"texts"`` and
    ``"tags"`` entries and passes them to ``input_fn``. Nothing is printed
    when the returned object yields no elements.
    """
    dataset = input_fn(params["texts"], params["tags"], params)
    batches = iter(dataset)
    try:
        first = next(batches)
    except StopIteration:
        return
    print(first)
# test_input()

In [2]:
## Early stopping
# train_input = functools.partial(input_layer, '')

# # 1. Define our input_fn
# train_inpf = functools.partial(input_fn, 'words.train.txt', 'tags.train.txt',
#                                params, shuffle_and_repeat=True)
# eval_inpf = functools.partial(input_fn, 'words.testa.txt', 'tags.testa.txt',
#                               params)

# # 2. Create a hook
# Path(estimator.eval_dir()).mkdir(parents=True, exist_ok=True)
# hook = tf.contrib.estimator.stop_if_no_increase_hook(
#     estimator, 'f1', 500, min_steps=8000, run_every_secs=120)
# train_spec = tf.estimator.TrainSpec(input_fn=train_inpf, hooks=[hook])
# eval_spec = tf.estimator.EvalSpec(input_fn=eval_inpf, throttle_secs=120)

# # 3. Train with early stopping
# tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

# main

In [3]:
"""GloVe Embeddings + bi-LSTM + CRF"""
__author__ = "Guillaume Genthial"

import functools
import json
import logging
from pathlib import Path
from datetime import datetime
import sys

import numpy as np
import tensorflow as tf
from tf_metrics import precision, recall, f1

# Input locations, relative to the working directory.
DATA_DIR = '../Data/4ner/'
OBJECTS_DIR = '../objects/'

# Logging
# Every run writes into its own timestamped directory under results/.
now = datetime.now()
RESULT_DIR = Path('results/') / now.strftime("%m%d_%H%M%S")
# parents=True: without it mkdir raises FileNotFoundError whenever the
# 'results/' parent directory has not been created yet.
RESULT_DIR.mkdir(parents=True, exist_ok=True)
tf.compat.v1.logging.set_verbosity(logging.INFO)
handlers = [
    logging.FileHandler(RESULT_DIR / 'main.log'),
    # logging.StreamHandler(sys.stdout)
]
# Replace the 'tensorflow' logger's handler list with the file handler above.
logging.getLogger('tensorflow').handlers = handlers

In [4]:
def parse_fn(line_words, line_tags):
    """Parse one aligned pair of lines into model features and labels.

    Each line is split on whitespace and its first two tokens are discarded
    (NOTE(review): presumably an id/prefix carried by the data files --
    confirm against the corpus format). The remaining tokens are encoded to
    ``bytes`` because TF needs strings encoded as bytes.

    Args:
        line_words: One line of the words file.
        line_tags: The corresponding line of the tags file.

    Returns:
        ``((words, n_words), tags)`` where ``words`` and ``tags`` are lists
        of ``bytes`` of equal length and ``n_words`` is that length.

    Raises:
        ValueError: If the two lines hold a different number of tokens.
    """
    # Encode in bytes for TF
    words = [w.encode() for w in line_words.strip().split()[2:]]
    tags = [t.encode() for t in line_tags.strip().split()[2:]]

    # Explicit raise rather than `assert`: asserts are stripped under
    # `python -O`, which would let misaligned data through silently.
    if len(words) != len(tags):
        raise ValueError(
            "Words and tags lengths don't match: "
            f"{len(words)} words vs {len(tags)} tags")
    return (words, len(words)), tags


def generator_fn(words, tags):
    """Yield one parsed example per pair of corresponding lines in two files.

    Args:
        words: Path of the words file.
        tags: Path of the tags file.

    Yields:
        The value returned by ``parse_fn`` for each (words line, tags line)
        pair. Iteration stops at the end of the shorter file.
    """
    words_path, tags_path = Path(words), Path(tags)
    with words_path.open('r') as words_file, tags_path.open('r') as tags_file:
        line_pairs = zip(words_file, tags_file)
        for pair in line_pairs:
            yield parse_fn(*pair)

def input_fn(words, tags, params=None, shuffle_and_repeat=False):
    """Build the padded, batched ``tf.data.Dataset`` fed to the estimator.

    Elements come from ``generator_fn(words, tags)`` and have the structure
    ``((words, nwords), tags)``. Within a batch, words are padded with
    ``'<pad>'`` and tags with ``'O'``.

    Args:
        words: Path of the words file.
        tags: Path of the tags file.
        params: Optional dict. ``'batch_size'`` is read with a default of 20;
            ``'buffer'`` and ``'epochs'`` are required when
            ``shuffle_and_repeat`` is true.
        shuffle_and_repeat: When true, shuffle with a buffer of
            ``params['buffer']`` and repeat ``params['epochs']`` times.

    Returns:
        The resulting ``tf.data.Dataset``.

    Raises:
        KeyError: If ``shuffle_and_repeat`` is true and ``params`` lacks
            ``'buffer'`` or ``'epochs'``.
    """
    # A None sentinel avoids sharing one mutable default dict across calls.
    params = {} if params is None else params

    shapes = (([None], ()), [None])
    types = ((tf.string, tf.int32), tf.string)
    defaults = (('<pad>', 0), 'O')

    dataset = tf.data.Dataset.from_generator(
        functools.partial(generator_fn, words, tags),
        output_shapes=shapes, output_types=types)

    if shuffle_and_repeat:
        dataset = dataset.shuffle(params['buffer']).repeat(params['epochs'])

    # prefetch: ensure a batch of data is pre-loaded on the computing device
    #           to avoid starvation (=wasting compute resources)
    dataset = (dataset
               .padded_batch(params.get('batch_size', 20), shapes, defaults)
               .prefetch(1))
    return dataset

In [5]:
def model_fn(features, labels, mode, params):
    """Estimator ``model_fn``: frozen embeddings -> (bi-)LSTM -> dense -> optional CRF.

    Args:
        features: Either a ``(words, nwords)`` tuple or a dict with the keys
            ``'words'`` and ``'nwords'`` (the dict form is used for serving).
        labels: String tensor of gold tags; only read outside PREDICT mode.
        mode: A ``tf.estimator.ModeKeys`` value.
        params: Dict read for the keys ``'dropout'``, ``'words'``,
            ``'num_oov_buckets'``, ``'tags'``, ``'seprator'``, ``'glove'``,
            ``'dim'``, ``'lstm_size'``, ``'bi_lstm'`` and ``'crf'``.

    Returns:
        A ``tf.estimator.EstimatorSpec`` for PREDICT, EVAL or TRAIN mode
        (``None`` for any other mode).
    """
    # For serving, features are a bit different
    if isinstance(features, dict):
        features = features['words'], features['nwords']

    # Read vocabs and inputs
    dropout = params['dropout']
    words, nwords = features
    training = (mode == tf.estimator.ModeKeys.TRAIN)
    vocab_words = tf.contrib.lookup.index_table_from_file(
        params['words'], num_oov_buckets=params['num_oov_buckets'])

    # Lines read from the file keep their trailing newline, so the newline
    # must be stripped together with the separator; otherwise 'O\n' != 'O'
    # and the 'O' tag is wrongly counted among the positive classes.
    separator = params['seprator']
    strip_chars = None if separator is None else separator + '\r\n'
    with Path(params['tags']).open() as f:
        indices = [idx for idx, tag in enumerate(f)
                   if tag.strip(strip_chars) != 'O']
    # +1 accounts for the 'O' tag excluded from `indices` above.
    num_tags = len(indices) + 1

    # Word Embeddings
    word_ids = vocab_words.lookup(words)
    with np.load(params['glove']) as data:  # close the .npz handle after reading
        glove = data['embeddings']  # np.array
    # One zero row per OOV bucket, so every id the lookup table can emit has
    # a row (at least one row, as before).
    oov_rows = np.zeros((max(1, params['num_oov_buckets']), params['dim']))
    variable = np.vstack([glove, oov_rows])
    variable = tf.Variable(variable, dtype=tf.float32, trainable=False)
    embeddings = tf.nn.embedding_lookup(variable, word_ids)
    embeddings = tf.layers.dropout(embeddings, rate=dropout, training=training)

    # LSTM
    t = tf.transpose(embeddings, perm=[1, 0, 2])  # make time-major
    lstm_cell_fw = tf.contrib.rnn.LSTMBlockFusedCell(params['lstm_size'])
    output_fw, _ = lstm_cell_fw(t, dtype=tf.float32, sequence_length=nwords)

    if params['bi_lstm']:
        lstm_cell_bw = tf.contrib.rnn.LSTMBlockFusedCell(params['lstm_size'])
        lstm_cell_bw = tf.contrib.rnn.TimeReversedFusedRNN(lstm_cell_bw)
        output_bw, _ = lstm_cell_bw(t, dtype=tf.float32, sequence_length=nwords)
        output = tf.concat([output_fw, output_bw], axis=-1)
    else:
        output = output_fw

    output = tf.transpose(output, perm=[1, 0, 2])  # make batch-major
    output = tf.layers.dropout(output, rate=dropout, training=training)

    # CRF
    logits = tf.layers.dense(output, num_tags)  # dim: [batch_size, max_seq_len, num_tags]
    if params['crf']:
        crf_params = tf.get_variable("crf", [num_tags, num_tags], dtype=tf.float32)
        pred_ids, _ = tf.contrib.crf.crf_decode(logits, crf_params, nwords)
    else:
        # Without a CRF the prediction is the highest-scoring tag per token
        # (the raw logits are not tag ids).
        pred_ids = tf.argmax(logits, axis=-1, output_type=tf.int32)

    if mode == tf.estimator.ModeKeys.PREDICT:
        # Predictions
        reverse_vocab_tags = tf.contrib.lookup.index_to_string_table_from_file(
            params['tags'])

        pred_strings = reverse_vocab_tags.lookup(tf.cast(pred_ids, tf.int64))
        predictions = {
            'pred_ids': pred_ids,
            'tags': pred_strings
        }
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    # Loss
    vocab_tags = tf.contrib.lookup.index_table_from_file(params['tags'])
    tags = vocab_tags.lookup(labels)  # dim: [batch_size, max_seq_len]

    if params['crf']:
        log_likelihood, _ = tf.contrib.crf.crf_log_likelihood(
            logits, tags, nwords, crf_params)
    else:
        # Cross-entropy is already a negative log-likelihood per token: mask
        # out padding, sum over each sequence and flip the sign so that the
        # shared `-log_likelihood` below minimises it.
        token_xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=tags,
            logits=logits)
        token_mask = tf.sequence_mask(
            nwords, maxlen=tf.shape(logits)[1], dtype=tf.float32)
        log_likelihood = -tf.reduce_sum(token_xent * token_mask, axis=-1)

    loss = tf.reduce_mean(-log_likelihood)

    # Metrics
    weights = tf.sequence_mask(nwords)
    metrics = {
        'acc': tf.metrics.accuracy(tags, pred_ids, weights),
        'precision': precision(tags, pred_ids, num_tags, indices, weights),
        'recall': recall(tags, pred_ids, num_tags, indices, weights),
        'f1': f1(tags, pred_ids, num_tags, indices, weights),
    }
    for metric_name, op in metrics.items():
        # op is a (value, update_op) pair; the update op is what gets logged.
        tf.compat.v1.summary.scalar(metric_name, op[1])

    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(
            mode, loss=loss, eval_metric_ops=metrics)

    elif mode == tf.estimator.ModeKeys.TRAIN:
        train_op = tf.train.AdamOptimizer().minimize(
            loss, global_step=tf.train.get_or_create_global_step())
        return tf.estimator.EstimatorSpec(
            mode, loss=loss, train_op=train_op)



In [6]:
# Script entry point: dump the run parameters, train and evaluate the
# estimator with early stopping, then write per-token predictions to disk.
if __name__ == '__main__':
    # Params
    # Read by input_fn and model_fn; also saved alongside the run results.
    params = {
        'dim': 300,
        'dropout': 0.5,
        'num_oov_buckets': 1,
        'epochs': 20,
        'batch_size': 20,
        'buffer': 2500,
        'bi_lstm': True,
        'lstm_size': 100,
        'crf': True, # False doesn't work yet
        # NOTE(review): 'texts' is only read by test_input(), which hands it
        # to input_fn as a line-oriented words file although it is a .pkl --
        # confirm this is intended.
        'texts': str(Path(OBJECTS_DIR, 'train_corpus.pkl')),
        #'tags': str(Path(OBJECTS_DIR, 'train_corpus_labels.pkl')),
        'tags': str(Path(DATA_DIR, 'vocab.tags.txt')),
        'words': str(Path(DATA_DIR, 'vocab.words.txt')),
        'chars': str(Path(DATA_DIR, 'vocab.chars.txt')),
        'glove': str(Path(DATA_DIR, 'glove.npz')),
        # Key spelling ('seprator') is what model_fn reads; keep in sync.
        'seprator': ' '
    }

    # Persist the parameters of this run next to its logs.
    with (RESULT_DIR/'params.json').open('w') as f:
        json.dump(params, f, indent=4, sort_keys=True)

    def fwords(name):
        """Return the path of the ``<name>.words.txt`` file in DATA_DIR."""
        return str(Path(DATA_DIR, '{}.words.txt'.format(name)))

    def ftags(name):
        """Return the path of the ``<name>.tags.txt`` file in DATA_DIR."""
        return str(Path(DATA_DIR, '{}.tags.txt'.format(name)))

    # Estimator, train and evaluate
    train_inpf = functools.partial(input_fn, fwords('train_doc'), ftags('train_doc'),
                                   params, shuffle_and_repeat=True)
    # No params are passed here, so input_fn falls back to its default
    # batch size of 20 and neither shuffles nor repeats.
    eval_inpf = functools.partial(input_fn, fwords('valid_doc'), ftags('valid_doc'))

    # serialize weights to disk every 2 mins
    cfg = tf.estimator.RunConfig(save_checkpoints_secs=120)
    estimator = tf.estimator.Estimator(model_fn, RESULT_DIR/'model', cfg, params)
    # The eval directory is created up front, before the early-stopping hook
    # is built.
    Path(estimator.eval_dir()).mkdir(parents=True, exist_ok=True)
    
    # Early stopping keyed on the 'f1' eval metric (500 / min_steps=100 /
    # run_every_secs=120 are passed straight to the hook).
    hook = tf.compat.v2.estimator.experimental.stop_if_no_increase_hook(
        estimator, 'f1', 500, min_steps=100, run_every_secs=120)
    # NOTE(review): merge_all() is evaluated here, outside model_fn --
    # confirm it picks up the summaries defined inside model_fn.
    summary_hook = tf.train.SummarySaverHook(
        save_secs=2,
        output_dir=RESULT_DIR,
        scaffold=tf.train.Scaffold(summary_op=tf.summary.merge_all()))
    train_spec = tf.estimator.TrainSpec(input_fn=train_inpf, hooks=[hook, summary_hook])
    eval_spec = tf.estimator.EvalSpec(input_fn=eval_inpf, throttle_secs=120)
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

    # Write predictions to file
    def write_predictions(name):
        """Write ``word gold_tag predicted_tag`` lines for dataset *name*.

        Output goes to ``RESULT_DIR/score/<name>.preds.txt``; a blank line
        is written after each example.
        """
        (RESULT_DIR/'score').mkdir(parents=True, exist_ok=True)
        with (RESULT_DIR/f'score/{name}.preds.txt').open('wb') as f:
            test_inpf = functools.partial(input_fn, fwords(name), ftags(name))
            # Gold examples are re-read from the same files and paired with
            # the predictions in order.
            golds_gen = generator_fn(fwords(name), ftags(name))
            preds_gen = estimator.predict(test_inpf)
            for golds, preds in zip(golds_gen, preds_gen):
                ((words, _), tags) = golds
                # zip stops at the shortest sequence, so predicted tags
                # beyond the gold word count are not written.
                for word, tag, tag_pred in zip(words, tags, preds['tags']):
                    f.write(b' '.join([word, tag, tag_pred]) + b'\n')
                f.write(b'\n')

    for name in ['train_doc', 'valid_doc', 'test']:
        write_predictions(name)

I1021 23:52:22.835493 140239895029504 estimator.py:209] Using config: {'_model_dir': 'results/1021_235222/model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 120, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f8baf74d0b8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
W1021 23:52:22.838355 140239895029504 module_wrapper.py:136] From /home/texuanw/softw

I1021 23:53:10.842099 140239895029504 basic_session_run_hooks.py:692] global_step/sec: 21.3369
I1021 23:53:10.843842 140239895029504 basic_session_run_hooks.py:260] loss = 0.96171105, step = 801 (4.687 sec)
I1021 23:53:15.372062 140239895029504 basic_session_run_hooks.py:692] global_step/sec: 22.075
I1021 23:53:15.373757 140239895029504 basic_session_run_hooks.py:260] loss = 1.2888892, step = 901 (4.530 sec)
I1021 23:53:20.095606 140239895029504 basic_session_run_hooks.py:692] global_step/sec: 21.1707
I1021 23:53:20.097572 140239895029504 basic_session_run_hooks.py:260] loss = 1.4906082, step = 1001 (4.724 sec)
I1021 23:53:24.768089 140239895029504 basic_session_run_hooks.py:692] global_step/sec: 21.4019
I1021 23:53:24.771737 140239895029504 basic_session_run_hooks.py:260] loss = 0.46519774, step = 1101 (4.674 sec)
I1021 23:53:29.441246 140239895029504 basic_session_run_hooks.py:692] global_step/sec: 21.3986
I1021 23:53:29.444833 140239895029504 basic_session_run_hooks.py:260] loss = 1