### Initialization

https://python-3-patterns-idioms-test.readthedocs.io/en/latest/Singleton.html

In [1]:
import tensorflow as tf
import functools
from pathlib import Path
import json
import math
from model.metrics import precision, recall, f1
from model.cnn import masked_conv1d_and_max
# tf.enable_eager_execution()

# Dataset directory and the column of each token line that holds the gold tag.
DATADIR = 'data/conll2003'
LABEL_COL = 3

# Hyper-parameters shared by the input pipeline and the model.
params = dict(
    dim_chars=100,
    dim=300,
    dropout=0.5,
    num_oov_buckets=1,
    epochs=25,
    batch_size=20,
    buffer=15000,
    filters=50,
    kernel_size=3,
    lstm_size=100,
)

# Vocabulary and embedding resources are stored inside DATADIR.
params.update(
    (key, str(Path(DATADIR, resource)))
    for key, resource in (
        ('words', 'vocab.words.txt'),
        ('chars', 'vocab.chars.txt'),
        ('tags', 'vocab.tags.txt'),
        ('glove', 'glove.npz'),
    )
)

# Keep a JSON copy of the configuration next to the data.
with Path(DATADIR, 'params.json').open('w') as params_file:
    json.dump(params, params_file, indent=4, sort_keys=True)

In [2]:
import re

# Not referenced by any active code in the visible part of this notebook.
MAX_TOKEN_LENGTH = 10
# Referenced only by the commented-out get_minibatches helper further down.
MINIBATCH_SIZE = 10

def get_sentences(filename):
  """Read a CoNLL-style file from DATADIR and return its sentences.

  Each non-blank line is one token and is split on whitespace into its
  columns. Sentences are separated by one or more blank (or whitespace-only)
  lines. Sentences whose first column starts with '-DOCSTART-' are dropped.

  Args:
    filename: name of the file inside DATADIR.

  Returns:
    A list of sentences; each sentence is a non-empty list of token rows and
    each token row is a non-empty list of column strings.
  """
  with Path(DATADIR, filename).open('r', encoding="utf-8") as f:
    text = f.read()

  sentences = []
  current = []
  for line in text.split('\n'):
    columns = line.split()
    if columns:
      current.append(columns)
    elif current:
      # Any blank line closes the running sentence. Runs of several blank
      # lines therefore never yield empty token rows, which would otherwise
      # break the row[0] access in parse_fn.
      sentences.append(current)
      current = []
  if current:
    sentences.append(current)

  return [s for s in sentences if not s[0][0].startswith('-DOCSTART-')]

# By grouping same-length sentences we can make better use of the GPU.
# NOTE: this bucketing code is currently disabled (commented out).
# def group_by_len(arr): 
#   groups = {}
#   for i in range(len(arr)):
#     key = len(arr[i])
#     if not key in groups:
#       groups[key] = []  
#     groups[key].append(i) 
#   return groups
#
# def get_minibatches(sentences):
#   groups = group_by_len(sentences)
#   batches = [] 
#        
#   for key in groups:
#     group_size = len(groups[key])
#     bins = int(math.ceil(group_size / float(MINIBATCH_SIZE)))
#     bin_size = int(math.ceil(group_size / float(bins)))
#    
#     for i in range(0, group_size, MINIBATCH_SIZE):
#       bin = []
#       for id in groups[key][i:i+MINIBATCH_SIZE]:
#         bin.append(id)
#       batches.append((key, bin))
#   return batches

def parse_fn(sentence, label_col=3):
    """Turn one tokenised sentence into byte-encoded features and tags.

    Args:
        sentence: list of token rows; row[0] is the word and row[label_col]
            is its tag.
        label_col: index of the tag column in each row.

    Returns:
        A pair (features, tags) where features is
        ((words, nwords), (chars, nchars)):
          * words: list of UTF-8 encoded words, nwords: len(words)
          * chars: per-word lists of encoded characters, right-padded with
            b'<pad>' to the longest word of the sentence
          * nchars: unpadded character count of each word
        and tags is the list of UTF-8 encoded tags.
        An empty sentence yields ((([], 0), ([], [])), []).
    """
    tokens = [row[0] for row in sentence]
    tags = [row[label_col].encode() for row in sentence]

    # Characters are encoded one by one, so lengths count characters, not bytes.
    chars = [[ch.encode() for ch in token] for token in tokens]
    lengths = [len(word_chars) for word_chars in chars]
    # default=0 keeps an empty sentence from raising ValueError in max().
    max_len = max(lengths, default=0)
    chars = [word_chars + [b'<pad>'] * (max_len - n)
             for word_chars, n in zip(chars, lengths)]

    # Encode in bytes for TensorFlow.
    words = [token.encode() for token in tokens]
    return ((words, len(words)), (chars, lengths)), tags
    
def generator_fn(filename, label_col=3):
    """Lazily yield parse_fn(sentence, label_col) for every sentence of `filename`.

    The file is read through get_sentences when the generator is first advanced.
    """
    yield from (parse_fn(tokens, label_col) for tokens in get_sentences(filename))
            
def input_fn(filename, params=None, shuffle_and_repeat=False):
  """Build the padded, batched tf.data pipeline for one data split.

  Args:
    filename: name of the split file, forwarded to generator_fn.
    params: optional dict; reads 'buffer' (shuffle buffer size), 'epochs'
      (repeat count) and 'batch_size'. Missing keys fall back to 15000, 25
      and 20 respectively, so the function also works with params=None.
    shuffle_and_repeat: when True, shuffle the examples and repeat them for
      'epochs' passes before batching.

  Returns:
    A tf.data.Dataset of padded batches structured as
    (((words, nwords), (chars, nchars)), tags).
  """
  params = params if params is not None else {}

  # Per-example (unbatched) shapes; None marks a variable-length dimension.
  shapes = (
     (([None], ()),           # (words, nwords)
     ([None, None], [None])), # (chars, nchars)
     [None]                   # tags
  )

  types = (
    ((tf.string, tf.int32),
    (tf.string, tf.int32)),
    tf.string
  )

  # Padding values used by padded_batch, one per component above.
  defaults = (
    (('<pad>', 0),
    ('<pad>', 0)),
    'O'
  )

  dataset = tf.data.Dataset.from_generator(
    functools.partial(generator_fn, filename, label_col=LABEL_COL),
    output_types=types, output_shapes=shapes
  )

  if shuffle_and_repeat:
    # Use .get with defaults (as batch_size already does) so a missing key
    # no longer raises KeyError when params is omitted.
    dataset = dataset.shuffle(params.get('buffer', 15000))
    dataset = dataset.repeat(params.get('epochs', 25))

  dataset = dataset.padded_batch(params.get('batch_size', 20), shapes, defaults)
  return dataset

# Input functions for the Estimator, built as partial applications of input_fn.
# Training: 'train' split with the notebook-level params, shuffled and repeated.
train_inpf = functools.partial(input_fn, 'train', params, shuffle_and_repeat=True)
# Evaluation: only the 'valid' split is bound; params and shuffle_and_repeat
# are left unbound here, so input_fn's defaults apply unless the caller passes them.
eval_inpf  = functools.partial(input_fn, 'valid')

# Marker output confirming the cell finished executing.
print('Done')

Done


### Model

In [3]:
import numpy as np 

from six.moves import reduce
import tensorflow as tf

def model_fn(features, labels, mode, params):
    """Estimator model function: char CNN + word embeddings -> BiLSTM -> CRF.

    Args:
        features: either the nested tuple ((words, nwords), (chars, nchars))
            or a dict with the keys 'words', 'nwords', 'chars' and 'nchars',
            which is converted to the tuple form.
        labels: tags, looked up in the tag vocabulary file; only used when
            mode is not PREDICT.
        mode: a tf.estimator.ModeKeys value.
        params: dict; reads 'dropout', 'words', 'chars', 'tags',
            'num_oov_buckets', 'dim_chars', 'filters', 'kernel_size',
            'glove', 'dim' and 'lstm_size'.

    Returns:
        A tf.estimator.EstimatorSpec:
          * PREDICT: predictions 'pred_ids' and 'tags' (decoded tag strings)
          * EVAL: loss and eval_metric_ops (acc, precision, recall, f1)
          * TRAIN: loss and an Adam train_op
        Any other mode falls through and returns None implicitly.
    """
    # For serving features are a bit different
    if isinstance(features, dict):
        features = ((features['words'], features['nwords']),
                    (features['chars'], features['nchars']))  
        
    # Read vocabs and inputs
    dropout = params['dropout']
    (words, nwords), (chars, nchars) = features
    # Dropout layers below are only active in TRAIN mode.
    training = (mode == tf.estimator.ModeKeys.TRAIN)
    vocab_words = tf.contrib.lookup.index_table_from_file(
        params['words'], num_oov_buckets=params['num_oov_buckets'])
    vocab_chars = tf.contrib.lookup.index_table_from_file(
        params['chars'], num_oov_buckets=params['num_oov_buckets'])
    with Path(params['tags']).open() as f:
        # Line indices of every tag other than 'O'; passed to the
        # precision/recall/f1 helpers (presumably as the positive classes;
        # confirm against model.metrics).
        indices = [idx for idx, tag in enumerate(f) if tag.strip() != 'O']
        # Assumes the tag file contains exactly one 'O' line.
        num_tags = len(indices) + 1
    with Path(params['chars']).open() as f:
        # Character vocabulary size including the OOV buckets.
        num_chars = sum(1 for _ in f) + params['num_oov_buckets']

    # Char Embeddings
    char_ids = vocab_chars.lookup(chars)
    # NOTE(review): the table has num_chars + 1 rows although num_chars
    # already includes the OOV buckets; the extra row looks unused. Confirm
    # before changing, since the shape is baked into existing checkpoints.
    variable = tf.get_variable(
        'chars_embeddings', [num_chars + 1, params['dim_chars']], tf.float32)
    char_embeddings = tf.nn.embedding_lookup(variable, char_ids)
    char_embeddings = tf.layers.dropout(char_embeddings, rate=dropout,
                                        training=training)

    # Char 1d convolution
    # Mask built from the per-word character counts; handed to the project
    # helper masked_conv1d_and_max (defined in model.cnn, not shown here).
    weights = tf.sequence_mask(nchars)
    char_embeddings = masked_conv1d_and_max(
        char_embeddings, weights, params['filters'], params['kernel_size'])

    # Word Embeddings
    word_ids = vocab_words.lookup(words)
    glove = np.load(params['glove'])['embeddings']  # np.array
    # Append a single all-zero row of width params['dim'] to the GloVe matrix.
    # NOTE(review): only one row is added regardless of num_oov_buckets;
    # presumably this relies on num_oov_buckets == 1. Confirm.
    variable = np.vstack([glove, [[0.] * params['dim']]])
    # Word embeddings are frozen (trainable=False).
    variable = tf.Variable(variable, dtype=tf.float32, trainable=False)
    word_embeddings = tf.nn.embedding_lookup(variable, word_ids)

    # Concatenate Word and Char Embeddings
    embeddings = tf.concat([word_embeddings, char_embeddings], axis=-1)
    embeddings = tf.layers.dropout(embeddings, rate=dropout, training=training)

    # LSTM
    t = tf.transpose(embeddings, perm=[1, 0, 2])  # Need time-major
    lstm_cell_fw = tf.contrib.rnn.LSTMBlockFusedCell(params['lstm_size'])
    lstm_cell_bw = tf.contrib.rnn.LSTMBlockFusedCell(params['lstm_size'])
    # The backward direction is the same fused cell wrapped in TimeReversedFusedRNN.
    lstm_cell_bw = tf.contrib.rnn.TimeReversedFusedRNN(lstm_cell_bw)
    output_fw, _ = lstm_cell_fw(t, dtype=tf.float32, sequence_length=nwords)
    output_bw, _ = lstm_cell_bw(t, dtype=tf.float32, sequence_length=nwords)
    output = tf.concat([output_fw, output_bw], axis=-1)
    # Undo the time-major transpose applied above.
    output = tf.transpose(output, perm=[1, 0, 2])
    output = tf.layers.dropout(output, rate=dropout, training=training)

    # CRF
    logits = tf.layers.dense(output, num_tags)
    # Square [num_tags, num_tags] variable shared by decoding and the
    # log-likelihood loss below.
    crf_params = tf.get_variable("crf", [num_tags, num_tags], dtype=tf.float32)
    pred_ids, _ = tf.contrib.crf.crf_decode(logits, crf_params, nwords)

    if mode == tf.estimator.ModeKeys.PREDICT:
        # Predictions
        # Map predicted ids back to tag strings using the same tag file.
        reverse_vocab_tags = tf.contrib.lookup.index_to_string_table_from_file(
            params['tags'])
        pred_strings = reverse_vocab_tags.lookup(tf.to_int64(pred_ids))
        predictions = {
            'pred_ids': pred_ids,
            'tags': pred_strings
        }
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)
    else:
        # Loss
        vocab_tags = tf.contrib.lookup.index_table_from_file(params['tags'])
        tags = vocab_tags.lookup(labels)
        log_likelihood, _ = tf.contrib.crf.crf_log_likelihood(
            logits, tags, nwords, crf_params)
        # Mean negative log-likelihood over the batch.
        loss = tf.reduce_mean(-log_likelihood)

        # Metrics
        # Mask derived from the sentence lengths and passed as metric weights
        # (reuses the name `weights` from the char-CNN mask above).
        weights = tf.sequence_mask(nwords)
        metrics = {
            'acc': tf.metrics.accuracy(tags, pred_ids, weights),
            'precision': precision(tags, pred_ids, num_tags, indices, weights),
            'recall': recall(tags, pred_ids, num_tags, indices, weights),
            'f1': f1(tags, pred_ids, num_tags, indices, weights),
        }
        # Log the second element of each metric tuple as a scalar summary
        # (presumably the update op of a (value, update_op) pair; confirm
        # for the project-defined metrics).
        for metric_name, op in metrics.items():
            tf.summary.scalar(metric_name, op[1])

        if mode == tf.estimator.ModeKeys.EVAL:
            return tf.estimator.EstimatorSpec(
                mode, loss=loss, eval_metric_ops=metrics)

        # No final else branch: an unrecognised mode returns None.
        elif mode == tf.estimator.ModeKeys.TRAIN:
            train_op = tf.train.AdamOptimizer().minimize(
                loss, global_step=tf.train.get_or_create_global_step())
            return tf.estimator.EstimatorSpec(
                mode, loss=loss, train_op=train_op)
    
# Marker output confirming the cell finished executing.
print('Done')

Done


In [5]:
# Checkpoint every 120 seconds; model_fn receives the notebook-level params.
cfg = tf.estimator.RunConfig(save_checkpoints_secs=120)
estimator = tf.estimator.Estimator(
    model_fn=model_fn,
    model_dir='results/model',
    config=cfg,
    params=params)
# Create the evaluation directory up front.
Path(estimator.eval_dir()).mkdir(parents=True, exist_ok=True)

# Early stop if F1 does not increase.
hook = tf.contrib.estimator.stop_if_no_increase_hook(
    estimator,
    metric_name='f1',
    max_steps_without_increase=500,
    min_steps=8000,
    run_every_secs=120)

train_spec = tf.estimator.TrainSpec(input_fn=train_inpf, hooks=[hook])
eval_spec = tf.estimator.EvalSpec(input_fn=eval_inpf, throttle_secs=120)
tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

INFO:tensorflow:Using config: {'_is_chief': True, '_master': '', '_keep_checkpoint_max': 5, '_train_distribute': None, '_save_summary_steps': 100, '_log_step_count_steps': 100, '_task_id': 0, '_save_checkpoints_steps': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_protocol': None, '_tf_random_seed': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f37b014c6a0>, '_global_id_in_cluster': 0, '_save_checkpoints_secs': 120, '_experimental_distribute': None, '_evaluation_master': '', '_device_fn': None, '_num_worker_replicas': 1, '_service': None, '_eval_distribute': None, '_keep_checkpoint_every_n_hours': 10000, '_task_type': 'worker', '_model_dir': 'results/model', '_num_ps_replicas': 0}
INFO:tensorflow:Not using Distribute Coordinator.
INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evalua

INFO:tensorflow:global_step/sec: 6.72893
INFO:tensorflow:loss = 0.36960164, step = 3601 (14.862 sec)
INFO:tensorflow:global_step/sec: 8.91275
INFO:tensorflow:loss = 0.3851058, step = 3701 (11.220 sec)
INFO:tensorflow:global_step/sec: 8.95681
INFO:tensorflow:loss = 0.73071414, step = 3801 (11.165 sec)
INFO:tensorflow:Saving checkpoints for 3841 into results/model/model.ckpt.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-12-13-19:30:02
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from results/model/model.ckpt-3841
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [10/100]
INFO:tensorflow:Evaluation [20/100]
INFO:tensorflow:Evaluation [30/100]
INFO:tensorflow:Evaluation [40/100]
INFO:tensorflow:Evaluation [50/100]
INFO:tensorflow:Evaluation [60/100]
INFO:tensorflow:Evaluation [70/100]
INFO:tensorflow:Evaluation [80/100]
INFO:tensor

INFO:tensorflow:loss = 0.5414947, step = 7401 (16.493 sec)
INFO:tensorflow:Saving checkpoints for 7460 into results/model/model.ckpt.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-12-13-19:38:03
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from results/model/model.ckpt-7460
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [10/100]
INFO:tensorflow:Evaluation [20/100]
INFO:tensorflow:Evaluation [30/100]
INFO:tensorflow:Evaluation [40/100]
INFO:tensorflow:Evaluation [50/100]
INFO:tensorflow:Evaluation [60/100]
INFO:tensorflow:Evaluation [70/100]
INFO:tensorflow:Evaluation [80/100]
INFO:tensorflow:Evaluation [90/100]
INFO:tensorflow:Evaluation [100/100]
INFO:tensorflow:Finished evaluation at 2018-12-13-19:38:08
INFO:tensorflow:Saving dict for global step 7460: acc = 0.98896974, f1 = 0.9466362, global_step = 7460, loss = 0.506277, p

({'acc': 0.98984313,
  'f1': 0.95153743,
  'global_step': 9900,
  'loss': 0.4733488,
  'precision': 0.9531499,
  'recall': 0.9499304},
 [])

In [6]:
# Write predictions to file.
def write_predictions(name):
  """Write 'word gold_tag predicted_tag' lines for split `name`.

  The output goes to results/score/<name>.preds.txt, one token per line with
  an empty line after every sentence. Gold tokens come from generator_fn and
  predictions from the notebook-level `estimator`.
  """
  Path('results/score').mkdir(parents=True, exist_ok=True)
  with Path('results/score/{}.preds.txt'.format(name)).open('wb') as out:
    gold_iter = generator_fn(name, label_col=LABEL_COL)
    pred_iter = estimator.predict(functools.partial(input_fn, name))
    for gold, pred in zip(gold_iter, pred_iter):
      ((tokens, _), _), gold_tags = gold
      # zip stops at the sentence length, so any extra entries in
      # pred['tags'] beyond the gold tokens are ignored.
      out.writelines(
          b' '.join(triple) + b'\n'
          for triple in zip(tokens, gold_tags, pred['tags']))
      out.write(b'\n')

# Produce one prediction file per data split.
for name in ('train', 'valid', 'test'):
  write_predictions(name)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from results/model/model.ckpt-9900
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from results/model/model.ckpt-9900
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from results/model/model.ckpt-9900
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [7]:
# Score each split's prediction file with the conlleval script located in the
# working directory; each prediction file is fed to it on stdin.
!./conlleval < results/score/train.preds.txt
!./conlleval < results/score/valid.preds.txt
!./conlleval < results/score/test.preds.txt

processed 203621 tokens with 23499 phrases; found: 23444 phrases; correct: 22969.
accuracy:  99.69%; precision:  97.97%; recall:  97.74%; FB1:  97.86
              LOC: precision:  98.52%; recall:  99.03%; FB1:  98.78  7177
             MISC: precision:  96.94%; recall:  94.10%; FB1:  95.50  3337
              ORG: precision:  97.89%; recall:  96.66%; FB1:  97.27  6242
              PER: precision:  97.98%; recall:  99.29%; FB1:  98.63  6688
processed 51362 tokens with 5942 phrases; found: 5942 phrases; correct: 5540.
accuracy:  98.83%; precision:  93.23%; recall:  93.23%; FB1:  93.23
              LOC: precision:  94.84%; recall:  97.11%; FB1:  95.97  1881
             MISC: precision:  90.83%; recall:  85.90%; FB1:  88.29  872
              ORG: precision:  91.51%; recall:  87.62%; FB1:  89.52  1284
              PER: precision:  93.91%; recall:  97.12%; FB1:  95.49  1905
processed 46435 tokens with 5648 phrases; found: 5653 phrases; correct: 5080.
accuracy:  97.93%; precision:  89.8

CONLL-2003

processed 204567 tokens with 23499 phrases; found: 23432 phrases; correct: 23014.
accuracy:  99.72%; precision:  98.22%; recall:  97.94%; FB1:  98.08
              LOC: precision:  98.74%; recall:  98.89%; FB1:  98.82  7151
             MISC: precision:  97.61%; recall:  94.04%; FB1:  95.79  3312
              ORG: precision:  97.38%; recall:  97.52%; FB1:  97.45  6330
              PER: precision:  98.75%; recall:  99.33%; FB1:  99.04  6639
              
processed 51578 tokens with 5942 phrases; found: 5928 phrases; correct: 5529.
accuracy:  98.83%; precision:  93.27%; recall:  93.05%; FB1:  93.16
              LOC: precision:  95.33%; recall:  96.68%; FB1:  96.00  1863
             MISC: precision:  90.15%; recall:  83.41%; FB1:  86.65  853
              ORG: precision:  89.77%; recall:  89.04%; FB1:  89.40  1330
              PER: precision:  95.11%; recall:  97.18%; FB1:  96.13  1882
              
processed 46666 tokens with 5648 phrases; found: 5649 phrases; correct: 5081.
accuracy:  97.93%; precision:  89.95%; recall:  89.96%; FB1:  89.95
              LOC: precision:  91.69%; recall:  92.63%; FB1:  92.16  1685
             MISC: precision:  80.78%; recall:  76.64%; FB1:  78.65  666
              ORG: precision:  86.46%; recall:  87.24%; FB1:  86.84  1676
              PER: precision:  95.50%; recall:  95.79%; FB1:  95.65  1622
              
processed 203621 tokens with 23499 phrases; found: 23444 phrases; correct: 22969.
accuracy:  99.69%; precision:  97.97%; recall:  97.74%; FB1:  97.86
              LOC: precision:  98.52%; recall:  99.03%; FB1:  98.78  7177
             MISC: precision:  96.94%; recall:  94.10%; FB1:  95.50  3337
              ORG: precision:  97.89%; recall:  96.66%; FB1:  97.27  6242
              PER: precision:  97.98%; recall:  99.29%; FB1:  98.63  6688
processed 51362 tokens with 5942 phrases; found: 5942 phrases; correct: 5540.
accuracy:  98.83%; precision:  93.23%; recall:  93.23%; FB1:  93.23
              LOC: precision:  94.84%; recall:  97.11%; FB1:  95.97  1881
             MISC: precision:  90.83%; recall:  85.90%; FB1:  88.29  872
              ORG: precision:  91.51%; recall:  87.62%; FB1:  89.52  1284
              PER: precision:  93.91%; recall:  97.12%; FB1:  95.49  1905
processed 46435 tokens with 5648 phrases; found: 5653 phrases; correct: 5080.
accuracy:  97.93%; precision:  89.86%; recall:  89.94%; FB1:  89.90
              LOC: precision:  90.89%; recall:  93.35%; FB1:  92.10  1713
             MISC: precision:  80.18%; recall:  77.21%; FB1:  78.66  676
              ORG: precision:  88.56%; recall:  85.79%; FB1:  87.16  1609
              PER: precision:  94.02%; recall:  96.23%; FB1:  95.11  1655

NER-HTML

processed 110269 tokens with 5822 phrases; found: 5855 phrases; correct: 5798.
accuracy:  99.74%; precision:  99.03%; recall:  99.59%; FB1:  99.31
              PER: precision:  99.03%; recall:  99.59%; FB1:  99.31  5855
processed 36757 tokens with 1788 phrases; found: 1869 phrases; correct: 1689.
accuracy:  98.34%; precision:  90.37%; recall:  94.46%; FB1:  92.37
              PER: precision:  90.37%; recall:  94.46%; FB1:  92.37  1869
processed 44795 tokens with 2708 phrases; found: 2604 phrases; correct: 2308.
accuracy:  96.66%; precision:  88.63%; recall:  85.23%; FB1:  86.90
              PER: precision:  88.63%; recall:  85.23%; FB1:  86.90  2604