In [1]:
import tensorflow as tf
import functools
from pathlib import Path
import json
import math
from model.metrics import precision, recall, f1
from model.cnn import masked_conv1d_and_max
# tf.enable_eager_execution()

# Directory that holds the dataset splits, vocabulary files, GloVe archive and
# the params.json written below.
DATADIR = 'data/conll2003'
# Index of the whitespace-separated column that holds the tag to predict
# (forwarded to generator_fn / parse_fn as label_col).
LABEL_COL = 3

# Params
# Hyper-parameters and resource paths shared by input_fn and model_fn.
params = {
    'dim_chars': 100,       # width of the trainable character embedding table
    'dim': 300,             # width of the zero row stacked onto the GloVe matrix
    'dropout': 0.5,         # rate used by every tf.layers.dropout call in model_fn
    'num_oov_buckets': 1,   # OOV buckets for the word and char lookup tables
    'epochs': 25,           # repeat count of the training dataset
    'batch_size': 20,       # examples per padded batch
    'buffer': 15000,        # shuffle buffer size of the training dataset
    'filters': 50,          # forwarded to masked_conv1d_and_max
    'kernel_size': 3,       # forwarded to masked_conv1d_and_max
    'lstm_size': 100,       # units of each LSTMBlockFusedCell (forward and backward)
    'words': str(Path(DATADIR, 'vocab.words.txt')),
    'chars': str(Path(DATADIR, 'vocab.chars.txt')),
    'tags': str(Path(DATADIR, 'vocab.tags.txt')),
    'glove': str(Path(DATADIR, 'glove.npz'))
}

# Persist the configuration next to the data (DATADIR must already exist).
with Path(DATADIR, 'params.json').open('w') as f:
    json.dump(params, f, indent=4, sort_keys=True)

In [2]:
def split_array(arr, separator_fn):
    """Split ``arr`` into chunks at every element matched by ``separator_fn``.

    Separator elements are dropped and empty chunks (leading, trailing or
    between consecutive separators) are omitted from the result.

    Args:
        arr: Iterable of elements.
        separator_fn: Predicate called once per element; a truthy result marks
            the element as a separator.

    Returns:
        A list of non-empty lists, in input order.
    """
    chunks = []
    current = []
    for item in arr:
        if not separator_fn(item):
            current.append(item)
            continue
        # Separator reached: close the running chunk if it holds anything.
        if current:
            chunks.append(current)
        current = []
    if current:
        chunks.append(current)
    return chunks

def join_arrays(arr, separator):
    """Flatten a list of lists, appending ``separator`` after every sub-list.

    The separator also follows the last sub-list, and the very same
    ``separator`` object is inserted each time (it is not copied).

    Args:
        arr: Iterable of lists.
        separator: Element inserted after the contents of each sub-list.

    Returns:
        A new flat list.
    """
    return [item for chunk in arr for item in chunk + [separator]]

In [3]:
import re
from pathlib import Path
import functools

def get_sentences(filename):
    """Read a column-formatted (CoNLL-style) file from DATADIR as sentences.

    Every non-blank line is split on whitespace into its columns, and each run
    of consecutive non-blank lines forms one sentence.  Blank or
    whitespace-only lines act purely as sentence boundaries, so repeated or
    whitespace-only separator lines never produce empty sentences or empty
    tokens.

    Args:
        filename: Name of the file inside DATADIR.

    Returns:
        A list of sentences.  Each sentence is a non-empty list of tokens and
        each token is a non-empty list of column strings.  An empty file
        yields an empty list.
    """
    with Path(DATADIR, filename).open('r', encoding="utf-8") as f:
        text = f.read()

    sentences = []
    current = []
    for line in text.split('\n'):
        columns = line.split()
        if columns:
            current.append(columns)
        elif current:
            # Blank line: close the sentence collected so far.
            sentences.append(current)
            current = []
    if current:
        # The file did not end with a blank line.
        sentences.append(current)
    return sentences
    
def parse_fn(sentence, label_col=3):
    """Convert a token sequence into the nested feature/label structure.

    Args:
        sentence: List of tokens; each token is a list of column strings whose
            first column is the word.
        label_col: Index of the column that holds the tag.

    Returns:
        ``((words, nwords), (chars, nchars)), tags`` where
        * ``words`` is the list of UTF-8 encoded words,
        * ``nwords`` is ``len(words)``,
        * ``chars`` holds, per word, its characters encoded individually and
          right-padded with ``b'<pad>'`` to the longest word in the sequence,
        * ``nchars`` is the unpadded character count of every word,
        * ``tags`` is the list of UTF-8 encoded tags.
        An empty ``sentence`` yields ``(([], 0), ([], [])), []``.

    Raises:
        IndexError: If a token has no column at index 0 or ``label_col``.
    """
    raw_words = [token[0] for token in sentence]

    # Encode in Bytes for Tensorflow.
    words = [w.encode() for w in raw_words]
    tags = [token[label_col].encode() for token in sentence]

    # Chars.
    chars = [[c.encode() for c in w] for w in raw_words]
    lengths = [len(c) for c in chars]
    # default=0 keeps an empty sequence from raising inside max().
    max_len = max(lengths, default=0)
    chars = [c + [b'<pad>'] * (max_len - l) for c, l in zip(chars, lengths)]

    return ((words, len(words)), (chars, lengths)), tags
    
def generator_fn(filename, label_col=3):
    """Yield one parsed example per document of ``filename``.

    Sentences whose first word is ``'-DOCSTART-'`` act as document separators
    and are dropped.  The remaining sentences of a document are concatenated,
    each followed by an ``'EOS'`` pseudo-token whose last column is ``'O'``,
    and the resulting token sequence is converted with ``parse_fn``.

    Args:
        filename: Name of the data file, resolved by ``get_sentences``.
        label_col: Column index of the tag, forwarded to ``parse_fn``.

    Yields:
        ``((words, nwords), (chars, nchars)), tags`` for every document.
    """
    def starts_document(sentence):
        # The word of the sentence's first token marks a document boundary.
        return sentence[0][0] == '-DOCSTART-'

    eos_token = ['EOS', '-X-', '-X-', 'O']
    for doc_sentences in split_array(get_sentences(filename), starts_document):
        yield parse_fn(join_arrays(doc_sentences, eos_token), label_col)
            
def input_fn(filename, params=None, shuffle_and_repeat=False):
    """Build the padded, batched ``tf.data.Dataset`` for one data file.

    Examples come from ``generator_fn(filename, label_col=LABEL_COL)`` and have
    the structure ``((words, nwords), (chars, nchars)), tags``.

    Args:
        filename: Name of the data file handed to ``generator_fn``.
        params: Optional dict.  ``'batch_size'`` defaults to 20 when missing;
            ``'buffer'`` and ``'epochs'`` are required when
            ``shuffle_and_repeat`` is true.
        shuffle_and_repeat: When true, shuffle with a buffer of
            ``params['buffer']`` and repeat ``params['epochs']`` times.

    Returns:
        The batched dataset.

    Raises:
        KeyError: If ``shuffle_and_repeat`` is true and ``params`` lacks
            ``'buffer'`` or ``'epochs'``.
    """
    if params is None:
        params = {}

    output_types = (
        ((tf.string, tf.int32), (tf.string, tf.int32)),
        tf.string,
    )
    output_shapes = (
        (
            ([None], ()),            # (words, nwords)
            ([None, None], [None]),  # (chars, nchars)
        ),
        [None],                      # tags
    )
    # Values used to pad each component up to the longest example of a batch.
    padding_values = (
        (('<pad>', 0), ('<pad>', 0)),
        'O',
    )

    example_source = functools.partial(
        generator_fn, filename, label_col=LABEL_COL)
    dataset = tf.data.Dataset.from_generator(
        example_source, output_types=output_types, output_shapes=output_shapes)

    if shuffle_and_repeat:
        dataset = dataset.shuffle(params['buffer'])
        dataset = dataset.repeat(params['epochs'])

    batch_size = params.get('batch_size', 20)
    return dataset.padded_batch(batch_size, output_shapes, padding_values)

# Estimator, train and evaluate
# Zero-argument input functions for the Estimator API.  The training one reads
# the 'train' file, shuffling and repeating according to `params`; the
# evaluation one reads 'valid' and passes no params, so input_fn falls back to
# its default batch size.
train_inpf = functools.partial(input_fn, 'train', params, shuffle_and_repeat=True)
eval_inpf  = functools.partial(input_fn, 'valid')

# Visual confirmation that the cell ran to completion.
print('Done')

Done


In [4]:
import numpy as np 

from six.moves import reduce
import tensorflow as tf

def model_fn(features, labels, mode, params):
    """Estimator model function: char CNN + word embeddings -> BiLSTM -> CRF.

    Args:
        features: Either the nested tuple ``((words, nwords), (chars, nchars))``
            produced by the input pipeline, or (for serving) a dict with the
            keys ``'words'``, ``'nwords'``, ``'chars'`` and ``'nchars'``.
        labels: String tensor of gold tags; only read outside PREDICT mode.
        mode: A ``tf.estimator.ModeKeys`` value.
        params: Dict providing ``dropout``, ``num_oov_buckets``, ``dim_chars``,
            ``dim``, ``filters``, ``kernel_size``, ``lstm_size`` and the file
            paths ``words``, ``chars``, ``tags`` and ``glove``.

    Returns:
        A ``tf.estimator.EstimatorSpec``: predictions (``pred_ids``, ``tags``)
        in PREDICT mode, loss plus metric ops in EVAL mode, loss plus train op
        in TRAIN mode.  Any other mode falls through and returns ``None``.
    """
    # For serving features are a bit different
    if isinstance(features, dict):
        features = ((features['words'], features['nwords']),
                    (features['chars'], features['nchars']))

    # Read vocabs and inputs
    dropout = params['dropout']
    (words, nwords), (chars, nchars) = features
    training = (mode == tf.estimator.ModeKeys.TRAIN)
    vocab_words = tf.contrib.lookup.index_table_from_file(
        params['words'], num_oov_buckets=params['num_oov_buckets'])
    vocab_chars = tf.contrib.lookup.index_table_from_file(
        params['chars'], num_oov_buckets=params['num_oov_buckets'])
    with Path(params['tags']).open() as f:
        # Line indices of every tag except 'O'; handed to the metric helpers
        # below (presumably as the set of positive classes — confirm against
        # model.metrics).
        indices = [idx for idx, tag in enumerate(f) if tag.strip() != 'O']
        # Assumes 'O' occurs exactly once in the tag vocabulary.
        num_tags = len(indices) + 1
    with Path(params['chars']).open() as f:
        num_chars = sum(1 for _ in f) + params['num_oov_buckets']

    # Char Embeddings
    char_ids = vocab_chars.lookup(chars)
    # NOTE(review): the table has one row more than num_chars (which already
    # counts the OOV buckets); kept as-is so existing checkpoints still load.
    variable = tf.get_variable(
        'chars_embeddings', [num_chars + 1, params['dim_chars']], tf.float32)
    char_embeddings = tf.nn.embedding_lookup(variable, char_ids)
    char_embeddings = tf.layers.dropout(char_embeddings, rate=dropout,
                                        training=training)

    # Char 1d convolution
    # Boolean mask built from the per-word character counts; forwarded to the
    # project helper together with the filter count and kernel size.
    weights = tf.sequence_mask(nchars)
    char_embeddings = masked_conv1d_and_max(
        char_embeddings, weights, params['filters'], params['kernel_size'])

    # Word Embeddings
    word_ids = vocab_words.lookup(words)
    # Close the .npz archive once the array has been read; a bare
    # np.load(...)[...] leaves the file handle open on every model_fn call.
    with np.load(params['glove']) as glove_archive:
        glove = glove_archive['embeddings']  # np.array
    # Extra all-zero row appended after the GloVe rows (presumably the vector
    # of the single OOV bucket — assumes num_oov_buckets == 1).
    variable = np.vstack([glove, [[0.] * params['dim']]])
    variable = tf.Variable(variable, dtype=tf.float32, trainable=False)
    word_embeddings = tf.nn.embedding_lookup(variable, word_ids)

    # Concatenate Word and Char Embeddings
    embeddings = tf.concat([word_embeddings, char_embeddings], axis=-1)
    embeddings = tf.layers.dropout(embeddings, rate=dropout, training=training)

    # LSTM
    t = tf.transpose(embeddings, perm=[1, 0, 2])  # Need time-major
    lstm_cell_fw = tf.contrib.rnn.LSTMBlockFusedCell(params['lstm_size'])
    lstm_cell_bw = tf.contrib.rnn.LSTMBlockFusedCell(params['lstm_size'])
    lstm_cell_bw = tf.contrib.rnn.TimeReversedFusedRNN(lstm_cell_bw)
    output_fw, _ = lstm_cell_fw(t, dtype=tf.float32, sequence_length=nwords)
    output_bw, _ = lstm_cell_bw(t, dtype=tf.float32, sequence_length=nwords)
    output = tf.concat([output_fw, output_bw], axis=-1)
    # Undo the transpose applied before the LSTM.
    output = tf.transpose(output, perm=[1, 0, 2])
    output = tf.layers.dropout(output, rate=dropout, training=training)

    # CRF
    logits = tf.layers.dense(output, num_tags)
    crf_params = tf.get_variable("crf", [num_tags, num_tags], dtype=tf.float32)
    pred_ids, _ = tf.contrib.crf.crf_decode(logits, crf_params, nwords)

    if mode == tf.estimator.ModeKeys.PREDICT:
        # Predictions
        reverse_vocab_tags = tf.contrib.lookup.index_to_string_table_from_file(
            params['tags'])
        pred_strings = reverse_vocab_tags.lookup(tf.to_int64(pred_ids))
        predictions = {
            'pred_ids': pred_ids,
            'tags': pred_strings
        }
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)
    else:
        # Loss
        vocab_tags = tf.contrib.lookup.index_table_from_file(params['tags'])
        tags = vocab_tags.lookup(labels)
        log_likelihood, _ = tf.contrib.crf.crf_log_likelihood(
            logits, tags, nwords, crf_params)
        loss = tf.reduce_mean(-log_likelihood)

        # Metrics
        # Mask built from the sequence lengths; passed as the metric weights.
        weights = tf.sequence_mask(nwords)
        metrics = {
            'acc': tf.metrics.accuracy(tags, pred_ids, weights),
            'precision': precision(tags, pred_ids, num_tags, indices, weights),
            'recall': recall(tags, pred_ids, num_tags, indices, weights),
            'f1': f1(tags, pred_ids, num_tags, indices, weights),
        }
        for metric_name, op in metrics.items():
            # Element 1 of each metric pair is what gets logged as a summary.
            tf.summary.scalar(metric_name, op[1])

        if mode == tf.estimator.ModeKeys.EVAL:
            return tf.estimator.EstimatorSpec(
                mode, loss=loss, eval_metric_ops=metrics)

        elif mode == tf.estimator.ModeKeys.TRAIN:
            train_op = tf.train.AdamOptimizer().minimize(
                loss, global_step=tf.train.get_or_create_global_step())
            return tf.estimator.EstimatorSpec(
                mode, loss=loss, train_op=train_op)
    
# Visual confirmation that the cell ran and model_fn is now defined.
print('Done')

Done


In [5]:
# Checkpoint every 120 seconds; model files are written to results/model.
cfg = tf.estimator.RunConfig(save_checkpoints_secs=120)
estimator = tf.estimator.Estimator(
    model_fn=model_fn,
    model_dir='results/model',
    config=cfg,
    params=params,
)
# Create the evaluation directory up front, before the hook below is built.
Path(estimator.eval_dir()).mkdir(parents=True, exist_ok=True)

# Early stop if F1 does not increase.
hook = tf.contrib.estimator.stop_if_no_increase_hook(
    estimator,
    metric_name='f1',
    max_steps_without_increase=500,
    min_steps=8000,
    run_every_secs=120,
)

train_spec = tf.estimator.TrainSpec(input_fn=train_inpf, hooks=[hook])
eval_spec = tf.estimator.EvalSpec(input_fn=eval_inpf, throttle_secs=120)
tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

INFO:tensorflow:Using config: {'_train_distribute': None, '_log_step_count_steps': 100, '_device_fn': None, '_tf_random_seed': None, '_global_id_in_cluster': 0, '_num_ps_replicas': 0, '_experimental_distribute': None, '_evaluation_master': '', '_save_summary_steps': 100, '_protocol': None, '_master': '', '_task_id': 0, '_model_dir': 'results/model', '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7ff095d85710>, '_num_worker_replicas': 1, '_save_checkpoints_secs': 120, '_eval_distribute': None, '_keep_checkpoint_max': 5, '_service': None, '_save_checkpoints_steps': None, '_task_type': 'worker', '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_every_n_hours': 10000, '_is_chief': True}
INFO:tensorflow:Not using Distribute Coordinator.
INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evalua

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from results/model/model.ckpt-4605
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 4605 into results/model/model.ckpt.
INFO:tensorflow:loss = 70.743935, step = 4606
INFO:tensorflow:Saving checkpoints for 4650 into results/model/model.ckpt.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-12-17-21:31:10
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from results/model/model.ckpt-4650
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [10/100]
INFO:tensorflow:Finished evaluation at 2018-12-17-21:31:19
INFO:tensorflow:Saving dict for global step 4650: acc = 0.9833187, f1 = 0.92188144, global_step = 4650, loss =

INFO:tensorflow:Saving 'checkpoint_path' summary for global step 5062: results/model/model.ckpt-5062
INFO:tensorflow:Saving checkpoints for 5106 into results/model/model.ckpt.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-12-17-21:51:24
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from results/model/model.ckpt-5106
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [10/100]
INFO:tensorflow:Finished evaluation at 2018-12-17-21:51:32
INFO:tensorflow:Saving dict for global step 5106: acc = 0.98923314, f1 = 0.94847655, global_step = 5106, loss = 8.010061, precision = 0.9544492, recall = 0.9425782
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 5106: results/model/model.ckpt-5106
INFO:tensorflow:global_step/sec: 0.365332
INFO:tensorflow:loss = 6.9861007, step = 5106 (273.866 sec)
INFO:tensorflow:Saving checkpoints for

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-12-17-22:11:38
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from results/model/model.ckpt-5566
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [10/100]
INFO:tensorflow:Finished evaluation at 2018-12-17-22:11:45
INFO:tensorflow:Saving dict for global step 5566: acc = 0.9902952, f1 = 0.9541606, global_step = 5566, loss = 7.7406726, precision = 0.9586951, recall = 0.9496687
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 5566: results/model/model.ckpt-5566
INFO:tensorflow:global_step/sec: 0.391375
INFO:tensorflow:loss = 8.65643, step = 5606 (255.510 sec)
INFO:tensorflow:Saving checkpoints for 5614 into results/model/model.ckpt.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-12-17-22:13:39
INFO:tenso

({'acc': 0.99033177,
  'f1': 0.9538767,
  'global_step': 5788,
  'loss': 7.879855,
  'precision': 0.95824045,
  'recall': 0.9495525},
 [])

In [None]:
# Write predictions to file.
def write_predictions(name):
    """Write gold and predicted tags of one data file to results/score.

    For every example yielded by ``generator_fn(name)`` the matching output of
    ``estimator.predict`` is consumed and one line ``word gold_tag pred_tag``
    is written per word, followed by a blank line after each example.
    Predicted tags beyond the example's own word count are ignored.

    Args:
        name: Name of the data file; also used in the output file name
            ``results/score/<name>.preds.txt``.
    """
    Path('results/score').mkdir(parents=True, exist_ok=True)
    out_path = Path('results/score/{}.preds.txt'.format(name))
    with out_path.open('wb') as out_file:
        predict_inpf = functools.partial(input_fn, name)
        gold_examples = generator_fn(name, label_col=LABEL_COL)
        predictions = estimator.predict(predict_inpf)
        for gold, pred in zip(gold_examples, predictions):
            ((words, _), (_, _)), tags = gold
            # zip stops at the shortest input, i.e. the unpadded word list.
            for row in zip(words, tags, pred['tags']):
                out_file.write(b' '.join(row) + b'\n')
            out_file.write(b'\n')

# Produce a predictions file for each of the three data files.
for name in ['train', 'valid', 'test']:
  write_predictions(name)

In [None]:
# Feed each predictions file to the conlleval script located in the working directory.
!./conlleval < results/score/train.preds.txt
!./conlleval < results/score/valid.preds.txt
!./conlleval < results/score/test.preds.txt