In [2]:
import os
import time
from pathlib import Path

# Silence TensorFlow's native (C++) logging. The variable is set *before*
# TensorFlow is imported so that it is already in the environment when the
# library is loaded.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import numpy as np
import tensorflow as tf

from models.estimator import Estimator
from models.hmm import HiddenMarkov, load_raw_dataset

# Only let ERROR-level messages through TensorFlow's Python logger.
tf.logging.set_verbosity(tf.logging.ERROR)

### Hidden Markov Models

In [4]:
# Fit a Hidden Markov Model on the training split, then predict on every split
# and write one results file per split.
start_time = time.time()

# Tag names, indexed by the integer tag ids found in the data and predictions.
LABELS = ('O', 'B-PER', 'I-PER')
RESULTS_DIR = Path('results')

X, Y, _ = load_raw_dataset('data/train')
hmm = HiddenMarkov(
    timesteps=1,
    use_features=True,
    self_train=True,
)
hmm.fit(X, Y)

# The output directory does not depend on the split, so create it once
# instead of on every loop iteration.
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# Write results.
for name in ['train', 'valid', 'test']:
    print('Predicting ' + name)
    inputs, tag_ids, tokens = load_raw_dataset('data/' + name)
    pred_ids = hmm.predict(inputs)

    # Map tag ids to their names; separate variables keep the id lists intact.
    tag_names = [[LABELS[tag_id] for tag_id in sentence] for sentence in tag_ids]
    pred_names = [[LABELS[tag_id] for tag_id in sentence] for sentence in pred_ids]

    with (RESULTS_DIR / '{}.preds.txt'.format(name)).open('wb') as f:
        for words, preds, tags in zip(tokens, pred_names, tag_names):
            # A blank line precedes each sentence.
            f.write(b'\n')
            for word, pred, tag in zip(words, preds, tags):
                # One token per line: word, dataset tag, predicted tag.
                f.write(' '.join([word, tag, pred]).encode() + b'\n')

print('Elapsed time: %.4f' % (time.time() - start_time))

Predicting train
Predicting valid
Predicting test
Elapsed time: 24.5647


### CRF / Bi-LSTM-CRF

In [7]:
# Configure the estimator, then run its training and test passes.
estimator_params = {
    # Model choices noted for this setting: bi_lstm_crf, crf
    'model': 'bi_lstm_crf',
    'char_representation': 'cnn',
    'use_features': False,
}

estimator = Estimator()
estimator.set_params(estimator_params)
estimator.train()
estimator.test()

Restoring model...
Loss: 0.0057, Acc: 1.0000, Time: 7.3136, Step: 300
Loss: 0.0097, Acc: 1.0000, Time: 12.8564, Step: 600
Loss: 0.0015, Acc: 1.0000, Time: 18.3857, Step: 900
Loss: 0.0000, Acc: 1.0000, Time: 29.4128, Step: 1200
Loss: 0.0082, Acc: 1.0000, Time: 34.8828, Step: 1500
Loss: 0.0042, Acc: 1.0000, Time: 41.3882, Step: 1800
Loss: 0.0014, Acc: 1.0000, Time: 47.0916, Step: 2100
Loss: 0.0017, Acc: 1.0000, Time: 52.3027, Step: 2400
Loss: 0.0095, Acc: 1.0000, Time: 53.8915, Step: 2474
train - Epoch 0, Precision: 0.9676, Recall: 0.9902, F1: 0.9788
Loss: 0.0046, Acc: 1.0000, Time: 8.2811, Step: 300
Loss: 0.0770, Acc: 1.0000, Time: 15.8911, Step: 600
Loss: 0.0000, Acc: 1.0000, Time: 22.7086, Step: 876
valid - Epoch 0, Precision: 0.8734, Recall: 0.9569, F1: 0.9133
Loss: 1.0898, Acc: 0.9667, Time: 7.6243, Step: 300
Loss: 0.0327, Acc: 1.0000, Time: 13.3096, Step: 600
Loss: 0.1234, Acc: 0.9933, Time: 19.8172, Step: 900
Loss: 0.0050, Acc: 1.0000, Time: 22.1869, Step: 1041
test - Epoch 0, Pre

### Evaluate results

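Score the prediction files in `results/` with the `conlleval` script. Note that the scores below match the CRF / Bi-LSTM-CRF run above (same train and valid precision, recall and F1), not the Hidden Markov Model.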

In [8]:
! ./conlleval < results/train.preds.txt
! ./conlleval < results/valid.preds.txt
! ./conlleval < results/test.preds.txt

processed 110269 tokens with 5822 phrases; found: 5958 phrases; correct: 5765.
accuracy:  99.58%; precision:  96.76%; recall:  99.02%; FB1:  97.88
              PER: precision:  96.76%; recall:  99.02%; FB1:  97.88  5958
processed 36757 tokens with 1788 phrases; found: 1959 phrases; correct: 1711.
accuracy:  98.80%; precision:  87.34%; recall:  95.69%; FB1:  91.33
              PER: precision:  87.34%; recall:  95.69%; FB1:  91.33  1959
processed 44795 tokens with 2723 phrases; found: 2728 phrases; correct: 2394.
accuracy:  98.12%; precision:  87.76%; recall:  87.92%; FB1:  87.84
              PER: precision:  87.76%; recall:  87.92%; FB1:  87.84  2728
