# BiLSTM-CRF (manual build)
Build the BiLSTM-CRF model by hand, without `train_utils`, to show where you can plug in custom losses and metrics.

In [None]:
import os

# Choose a default backend only when the caller has not already set one.
# Keras reads KERAS_BACKEND at import time, so this has to run before the
# `import keras` further down.
os.environ.setdefault('KERAS_BACKEND', 'tensorflow')

import pathlib
import sys

# Put the repository root at the front of sys.path; presumably this is what
# lets the `keras_crf` and `examples` imports below resolve.
try:
    root = pathlib.Path(__file__).resolve().parents[2]
except NameError:
    # `__file__` is not defined when this code runs as a notebook cell.
    # NOTE(review): this fallback assumes the kernel's working directory is
    # the directory containing the notebook (so the root is two levels up
    # from the notebook file, i.e. one `parents` step further from cwd) --
    # confirm against how the notebook is launched.
    root = pathlib.Path.cwd().resolve().parents[1]
sys.path.insert(0, str(root))
import numpy as np
import keras
from keras import layers, ops as K
from keras_crf.layers import CRF
from keras_crf.crf_ops import crf_log_likelihood, crf_marginals
from examples.utils.data import make_varlen_dataset
from examples.utils.metrics import MaskedTokenAccuracy


In [None]:
# Sizes used by the embedding table and the CRF head.
# NOTE(review): the literals 200 and 6 passed to make_varlen_dataset below
# presumably mirror these two values -- confirm against its signature.
vocab_size = 200
num_tags = 6

# Synthetic train / validation splits (different seeds); the third return
# value is not used here.
X_train, Y_train, _ = make_varlen_dataset(1000, 40, 200, 6, seed=1)
X_val, Y_val, _ = make_varlen_dataset(200, 40, 200, 6, seed=2)

# Token ids -> embeddings (id 0 is masked) -> BiLSTM -> CRF.
tokens = keras.Input(shape=(None,), dtype='int32', name='tokens')
embedding = layers.Embedding(
    input_dim=vocab_size + 1,
    output_dim=64,
    mask_zero=True,
)
x = embedding(tokens)
encoder = layers.Bidirectional(layers.LSTM(units=64, return_sequences=True))
x = encoder(x)

# The CRF layer yields four tensors; all of them feed the loss layers below.
crf = CRF(num_tags)
decoded, potentials, lens, trans = crf(x)

# Gold labels enter as a second model input so the loss can be computed
# inside the graph.
labels = keras.Input(shape=(None,), dtype='int32', name='labels')

class NLL(keras.layers.Layer):
    """Layer that outputs the negated CRF log-likelihood."""

    def call(self, inputs):
        """Negate ``crf_log_likelihood`` evaluated on the four inputs.

        ``inputs`` is unpacked into four tensors which are forwarded, in
        order, to ``crf_log_likelihood``. The layer is invoked below with
        ``[potentials, labels, lens, trans]``.
        """
        pot_t, tags_t, len_t, trans_t = inputs
        log_likelihood = crf_log_likelihood(pot_t, tags_t, len_t, trans_t)
        return -log_likelihood

class Dice(keras.layers.Layer):
    """Layer that outputs a smoothed Dice loss built from CRF marginals.

    Args:
        n: Depth used when one-hot encoding the label tensor.
        s: Smoothing constant added to both the numerator and the
            denominator of the Dice ratio.
        **kw: Forwarded unchanged to ``keras.layers.Layer``.
    """

    def __init__(self, n, s=1.0, **kw):
        super().__init__(**kw)
        self.n = n
        self.s = s

    def call(self, inputs):
        """Return ``1 - dice`` with the last two axes summed out.

        ``inputs`` is unpacked into four tensors; the layer is invoked below
        with ``[potentials, labels, lens, trans]``. Axis 1 of the first
        tensor is used as the number of sequence positions.
        """
        pot_t, tags_t, len_t, trans_t = inputs
        marginals = crf_marginals(pot_t, len_t, trans_t)
        float_dtype = marginals.dtype

        # 1 where position index < sequence length, else 0; the trailing
        # axis lets the mask broadcast against the one-hot/marginal tensors.
        positions = K.arange(K.shape(pot_t)[1])[None, :]
        limits = K.cast(len_t[:, None], 'int32')
        valid = K.expand_dims(K.cast(positions < limits, float_dtype), -1)

        gold = K.cast(K.one_hot(K.cast(tags_t, 'int32'), self.n), float_dtype)

        overlap = K.sum(gold * marginals * valid, axis=(1, 2))
        gold_mass = K.sum(gold * valid, axis=(1, 2))
        pred_mass = K.sum(marginals * valid, axis=(1, 2))
        total = gold_mass + pred_mass

        score = (2 * overlap + self.s) / (total + self.s)
        return 1.0 - score

# Both loss layers consume the same four tensors.
loss_inputs = [potentials, labels, lens, trans]
nll = NLL()(loss_inputs)
dice = Dice(num_tags)(loss_inputs)

# Fixed-weight blend of the two terms: 0.2 * NLL + 0.8 * Dice.
combo = keras.layers.Lambda(
    lambda zs: 0.2 * zs[0] + 0.8 * zs[1]
)([nll, dice])

class _Id(keras.layers.Layer):
    """Pass-through layer that declares mask support."""

    def __init__(self, **kw):
        super().__init__(**kw)
        self.supports_masking = True

    def call(self, x):
        """Return ``x`` unchanged."""
        return x

# Give each head the explicit name that the compile/fit dictionaries key on.
decoded_out = _Id(name='decoded_output')(decoded)
loss_out = keras.layers.Lambda(
    lambda z: z,
    name='crf_log_likelihood_output',
)(combo)

model_inputs = {'tokens': tokens, 'labels': labels}
model_outputs = {
    'decoded_output': decoded_out,
    'crf_log_likelihood_output': loss_out,
}
model = keras.Model(model_inputs, model_outputs)

def zero_loss(y_true, y_pred):
    """Return the mean of a zero tensor shaped like ``y_pred[..., :1]``.

    ``y_true`` is ignored. The result does not depend on the values in
    ``y_pred``, only on its shape and dtype.
    """
    leading_slice = y_pred[..., :1]
    zeros = K.zeros_like(leading_slice)
    return K.mean(zeros)

# Per-output losses: the decoded head uses zero_loss, while the loss head's
# value (the in-graph blended loss) is simply averaged.
output_losses = {
    'decoded_output': zero_loss,
    'crf_log_likelihood_output': lambda yt, yp: K.mean(yp),
}
# Token accuracy is tracked on the decoded head only.
output_metrics = {'decoded_output': [MaskedTokenAccuracy()]}

model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss=output_losses,
    metrics=output_metrics,
)

def _targets_and_weights(token_ids, tag_ids):
    """Build the per-output target and sample-weight dicts for one split.

    The decoded head gets the tag array as target and a float mask that is 1
    wherever the token id is non-zero. The loss head gets an all-zeros target
    and all-ones weights, one entry per row of ``token_ids``.
    """
    n_rows = token_ids.shape[0]
    targets = {
        'decoded_output': tag_ids,
        'crf_log_likelihood_output': np.zeros((n_rows,), np.float32),
    }
    weights = {
        'decoded_output': (token_ids != 0).astype(np.float32),
        'crf_log_likelihood_output': np.ones((n_rows,), np.float32),
    }
    return targets, weights


y_train, sw_train = _targets_and_weights(X_train, Y_train)
y_val, sw_val = _targets_and_weights(X_val, Y_val)

# The gold labels are fed twice: as a model input (for the in-graph loss)
# and, inside y_train / y_val, as the target of the decoded head.
train_inputs = {'tokens': X_train, 'labels': Y_train}
val_inputs = {'tokens': X_val, 'labels': Y_val}

model.fit(
    train_inputs,
    y_train,
    sample_weight=sw_train,
    validation_data=(val_inputs, y_val, sw_val),
    epochs=2,
    batch_size=64,
    verbose=2,
)


## CleanCoNLL and MultiCoNER via train_lib (importable)

In [None]:
from examples import train_lib as tl
# CleanCoNLL (after preparing splits)
# X_tr, Y_tr, X_va, Y_va, X_te, Y_te, V, C, id2tag = tl.load_conll('examples/data/cleanconll/train.txt', 'examples/data/cleanconll/valid.txt', 'examples/data/cleanconll/test.txt')
# model2, pred2 = tl.build_bilstm_crf_models(V, C, loss='dice+nll', joint_nll_weight=0.2)
# tl.train_and_evaluate(model2, pred2, X_tr, Y_tr, X_va, Y_va, X_te, Y_te, epochs=2, batch_size=64, id2tag=id2tag)
# MultiCoNER EN-English
# X_tr, Y_tr, X_va, Y_va, X_te, Y_te, V, C, id2tag = tl.load_multiconer_en('examples/data/multiconer/EN-English')
# model3, pred3 = tl.build_bilstm_crf_models(V, C, loss='dice+nll', joint_nll_weight=0.2)
# tl.train_and_evaluate(model3, pred3, X_tr, Y_tr, X_va, Y_va, X_te, Y_te, epochs=2, batch_size=64, id2tag=id2tag)
