In [1]:
import os

# TensorFlow's native (C++) runtime takes its log threshold from the
# TF_CPP_MIN_LOG_LEVEL environment variable. Set it *before* TensorFlow is
# imported so the value is already in place when the library is loaded;
# assigning it after the import (as this cell used to do) may be too late.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # Disable debug logs Tensorflow.

import sys
import time
from pathlib import Path

import numpy as np
import tensorflow as tf

from models.estimator import Estimator
from models.hmm import HiddenMarkov, load_raw_dataset

# Python-side TensorFlow logger: only report messages at ERROR level or above.
tf.logging.set_verbosity(tf.logging.ERROR)

### Bi-LSTM-CRF

In [7]:
# Evaluation-only cell: the estimator is created with its default constructor
# arguments and train() is intentionally not called here.
estimator = Estimator()

# Only the model type is overridden. Choices listed for 'model' in the
# original cell: crf, lstm_crf, html_attention, self_attention.
# Other keys that were left commented out (presumably also accepted by
# set_dataset_params -- TODO confirm against models/estimator.py):
#   lstm_size (200), decoder (crf | logits), char_representation (cnn),
#   word_embeddings (glove | elmo; bert is still a TODO),
#   use_features (False), f_score_alpha (0.5)
estimator.set_dataset_params(dict(model='crf'))

estimator.test()


Restoring model...
Loss: 0.1675, Acc: 1.0000, Time: 3.2803, Step: 300
Loss: 0.1649, Acc: 1.0000, Time: 4.8178, Step: 600
Loss: 0.0199, Acc: 1.0000, Time: 6.3680, Step: 900
Loss: 0.0026, Acc: 1.0000, Time: 8.3494, Step: 1200
Loss: 0.1104, Acc: 1.0000, Time: 9.8134, Step: 1500
Loss: 0.0226, Acc: 1.0000, Time: 11.4722, Step: 1800
Loss: 0.1084, Acc: 1.0000, Time: 13.0408, Step: 2100
Loss: 0.0482, Acc: 1.0000, Time: 14.5014, Step: 2400
Loss: 0.1039, Acc: 1.0000, Time: 14.9431, Step: 2474
train - Epoch 0, Precision: 0.7887, Recall: 0.7595, F1: 0.7738
Loss: 0.3397, Acc: 0.9692, Time: 2.1385, Step: 300
Loss: 0.2148, Acc: 0.9800, Time: 3.7365, Step: 600
Loss: 0.0018, Acc: 1.0000, Time: 5.0893, Step: 876
valid - Epoch 0, Precision: 0.7718, Recall: 0.7981, F1: 0.7847
Loss: 1.0599, Acc: 0.9667, Time: 2.3508, Step: 300
Loss: 0.0883, Acc: 1.0000, Time: 4.1859, Step: 600
Loss: 0.7030, Acc: 0.9733, Time: 5.9270, Step: 900
Loss: 0.1643, Acc: 1.0000, Time: 6.6686, Step: 1041
test - Epoch 0, Precision: 0

### Hidden Markov Model

In [3]:
# Reference point for the elapsed-time report printed at the end of this cell.
start_time = time.time()

# Setting timesteps to 0 switches on the naive-Bayes flag; in that case the
# model is still constructed with timesteps=1.
timesteps = 1
naive_bayes = (timesteps == 0)
timesteps = 1 if naive_bayes else timesteps

# Fit on the training split (the third value returned by the loader is unused here).
X, Y, _ = load_raw_dataset('data/train')
hmm = HiddenMarkov(
    timesteps,
    naive_bayes=naive_bayes,
    use_gazetteer=True,
    use_features=True,
    self_train=True,
)
hmm.fit(X, Y)

# Label table used to turn integer class ids into tag strings.
tag_names = ['O', 'B-PER', 'I-PER']

for split in ('train', 'valid', 'test'):
    print('Predicting ' + split)
    features, gold_ids, sentences = load_raw_dataset('data/' + split)
    pred_ids = hmm.predict(features)

    # Map every id to its tag string, sentence by sentence.
    gold_tags = [[tag_names[i] for i in sent] for sent in gold_ids]
    pred_tags = [[tag_names[i] for i in sent] for sent in pred_ids]

    # One "<word> <gold> <predicted>" line per token, with an empty line
    # written in front of every sentence; the files are later fed to conlleval.
    with Path('{}.preds.txt'.format(split)).open('wb') as out_file:
        for sent_words, sent_pred, sent_gold in zip(sentences, pred_tags, gold_tags):
            out_file.write(b'\n')
            out_file.writelines(
                ' '.join([word, gold, pred]).encode() + b'\n'
                for word, pred, gold in zip(sent_words, sent_pred, sent_gold)
            )

print('Elapsed time: %.4f' % (time.time() - start_time))

[[9.1350e+04 1.0000e+00 5.8230e+03]
 [1.0000e+00 1.0000e+00 1.0000e+00]
 [3.0240e+03 1.0000e+00 1.0076e+04]]
[[9.40066273e-01 1.02908185e-05 5.99234363e-02]
 [3.33333333e-01 3.33333333e-01 3.33333333e-01]
 [2.30822075e-01 7.63300511e-05 7.69101595e-01]]
Predicting train
Predicting valid
Predicting test
Elapsed time: 27.2895


In [5]:
# Score each prediction file with the conlleval script, in the same order as
# the files were written (train, valid, test).
for split_name in ['train', 'valid', 'test']:
    ! ./conlleval < {split_name}.preds.txt

processed 110269 tokens with 5822 phrases; found: 5991 phrases; correct: 5033.
accuracy:  97.57%; precision:  84.01%; recall:  86.45%; FB1:  85.21
              PER: precision:  84.01%; recall:  86.45%; FB1:  85.21  5991
processed 36757 tokens with 1788 phrases; found: 2167 phrases; correct: 1619.
accuracy:  96.36%; precision:  74.71%; recall:  90.55%; FB1:  81.87
              PER: precision:  74.71%; recall:  90.55%; FB1:  81.87  2167
processed 44795 tokens with 2723 phrases; found: 2725 phrases; correct: 2360.
accuracy:  96.50%; precision:  86.61%; recall:  86.67%; FB1:  86.64
              PER: precision:  86.61%; recall:  86.67%; FB1:  86.64  2725
