# BiLSTM-CRF: Variable-Length Sequences and CoNLL-style Data

This advanced notebook extends the basic BiLSTM-CRF example with:

- Variable-length sequences using padding and masking (mask_zero).
- CoNLL-style data loading helpers (sentence-per-block, token/tag columns).

It demonstrates training and evaluation with the standalone `keras_crf` package.

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from keras_crf import CRF, text as kcrf
# Show the TensorFlow and Keras versions this notebook is running against.
print('TF', tf.__version__, 'Keras', keras.__version__)

## Part 1: Variable-length synthetic sequences
We create random-length sequences padded with 0. The Embedding uses `mask_zero=True`, so the CRF receives a boolean mask.

In [None]:
def make_varlen_dataset(num_samples=2000, max_len=40, vocab_size=200, num_tags=4, seed=7):
    """Build a synthetic, zero-padded sequence-labelling dataset.

    Every sample gets a random length in ``[max_len // 3, max_len]`` and random
    token ids in ``[1, vocab_size)``; id 0 is never drawn, so it can serve as
    padding. A token's tag follows from the last decimal digit of its id
    (0-1 -> 0, 2-3 -> 1, 4-6 -> 2, 7-9 -> 3). Afterwards each tag is, with
    probability 0.03, replaced by a tag drawn uniformly from ``[0, num_tags)``.

    Returns:
        ``(X, Y, lens)``: two int32 arrays of shape ``(num_samples, max_len)``
        with token ids and tags (zeros past each sequence's end) and the int32
        vector of sequence lengths.
    """
    # Tag assigned to each possible last digit of a token id.
    digit_to_tag = np.array([0, 0, 1, 1, 2, 2, 2, 3, 3, 3], dtype=np.int32)

    rng = np.random.default_rng(seed)
    lens = rng.integers(low=max_len // 3, high=max_len + 1, size=num_samples, dtype=np.int32)
    shape = (num_samples, max_len)
    X = np.zeros(shape, dtype=np.int32)
    Y = np.zeros(shape, dtype=np.int32)

    for row, length in enumerate(lens):
        token_ids = rng.integers(1, vocab_size, size=length, dtype=np.int32)
        # Indexing with an array returns a new array, so the table is never modified.
        labels = digit_to_tag[token_ids % 10]
        # Label noise. The order of the two draws is part of the seeded output.
        noisy = rng.random(length) < 0.03
        labels[noisy] = rng.integers(0, num_tags, size=noisy.sum())
        X[row, :length] = token_ids
        Y[row, :length] = labels
    return X, Y, lens

# Problem dimensions shared by the data generator and the model.
num_tags = 4
vocab_size = 300
max_len = 50

# Train / validation / test splits, each generated from its own seed.
X_train, Y_train, L_train = make_varlen_dataset(
    num_samples=3000, max_len=max_len, vocab_size=vocab_size, num_tags=num_tags, seed=1
)
X_val, Y_val, L_val = make_varlen_dataset(
    num_samples=600, max_len=max_len, vocab_size=vocab_size, num_tags=num_tags, seed=2
)
X_test, Y_test, L_test = make_varlen_dataset(
    num_samples=600, max_len=max_len, vocab_size=vocab_size, num_tags=num_tags, seed=3
)
X_train.shape, Y_train.shape

### BiLSTM-CRF model with masking
The Embedding generates a mask. We pass it into CRF so sequence lengths are derived correctly. We override train_step and test_step to compute the CRF negative log-likelihood.

In [None]:
# Layer sizes handed to BiLstmCrfModel when the model is instantiated.
embedding_dim = 64  # becomes the Embedding layer's output_dim
lstm_units = 64  # units of the LSTM wrapped by Bidirectional

class BiLstmCrfModel(keras.Model):
    """Embedding -> bidirectional LSTM -> CRF sequence tagger.

    Token id 0 is treated as padding: the embedding is created with
    ``mask_zero=True`` and the mask it computes is passed to the CRF layer.
    """

    def __init__(self, vocab_size, num_tags, embedding_dim=64, lstm_units=64):
        super().__init__()
        # input_dim is vocab_size + 1, so ids 0..vocab_size are all valid rows.
        self.embedding = layers.Embedding(
            input_dim=vocab_size + 1,
            output_dim=embedding_dim,
            mask_zero=True,
        )
        recurrent = layers.LSTM(lstm_units, return_sequences=True)
        self.bilstm = layers.Bidirectional(recurrent)
        self.crf = CRF(units=num_tags)

    def call(self, inputs, training=False):
        """Run a batch of padded token ids through the tagger.

        Returns the CRF layer's output unchanged; callers in this notebook
        unpack it as ``(decoded, potentials, seq_len, kernel)``.
        """
        embedded = self.embedding(inputs)
        features = self.bilstm(embedded)
        padding_mask = self.embedding.compute_mask(inputs)
        return self.crf(features, mask=padding_mask)

class ModelWithCRFLoss(keras.Model):
    """Wrap a CRF tagger so ``fit``/``evaluate`` minimise the CRF negative log-likelihood.

    ``core`` is called on the inputs and must return
    ``(decoded, potentials, seq_len, kernel)``. The ``loss`` reported by
    ``train_step``/``test_step`` is a running mean over the batches seen so
    far, kept in a ``Mean`` metric exposed through ``metrics`` so Keras resets
    it automatically.
    """

    def __init__(self, core):
        super().__init__()
        self.core = core
        # Returning the raw per-batch loss makes the logged ``loss`` and
        # ``val_loss`` reflect only the last batch processed; accumulate it in
        # a metric so the logged value is the mean over batches.
        self.loss_tracker = keras.metrics.Mean(name="loss")

    @property
    def metrics(self):
        """Metrics whose state Keras resets at the start of each epoch/evaluation."""
        return [self.loss_tracker]

    def call(self, inputs, training=False):
        """Delegate to the wrapped model and return its outputs unchanged."""
        return self.core(inputs, training=training)

    def _loss_from_batch(self, data, training=False):
        """Return the mean (optionally sample-weighted) negative log-likelihood of a batch.

        ``data`` is unpacked into ``(x, y, sample_weight)``. A scalar sample
        weight is broadcast to the shape of the log-likelihood; any other
        weight is multiplied with it directly (presumably one weight per
        sequence; confirm against ``crf_log_likelihood``).
        """
        x, y, sw = keras.utils.unpack_x_y_sample_weight(data)
        _, potentials, seq_len, kernel = self(x, training=training)
        ll, _ = kcrf.crf_log_likelihood(potentials, y, seq_len, kernel)
        nll = -ll
        if sw is None:
            return tf.reduce_mean(nll)
        sw = tf.cast(sw, nll.dtype)
        if sw.shape.rank == 0:
            sw = tf.fill(tf.shape(nll), sw)
        return tf.reduce_mean(sw * nll)

    def train_step(self, data):
        """Run one optimisation step and return the running mean training loss."""
        with tf.GradientTape() as tape:
            loss = self._loss_from_batch(data, training=True)
        variables = self.trainable_variables
        grads = tape.gradient(loss, variables)
        self.optimizer.apply_gradients(zip(grads, variables))
        self.loss_tracker.update_state(loss)
        return {"loss": self.loss_tracker.result()}

    def test_step(self, data):
        """Evaluate one batch and return the running mean evaluation loss."""
        loss = self._loss_from_batch(data, training=False)
        self.loss_tracker.update_state(loss)
        return {"loss": self.loss_tracker.result()}

core = BiLstmCrfModel(vocab_size=vocab_size, num_tags=num_tags, embedding_dim=embedding_dim, lstm_units=lstm_units)
model = ModelWithCRFLoss(core)
model.compile(optimizer=keras.optimizers.Adam(1e-3))
# A subclassed model creates its weights only when it first sees data. Run a
# tiny batch through it so summary() has layers and parameter counts to show
# (Keras 2 raises ValueError when summary() is called on an unbuilt model).
_ = model(X_train[:2])
model.summary()

### Train & evaluate (masked)

In [None]:
# Train for a few epochs, validating on the held-out split after each one.
history = model.fit(
    x=X_train,
    y=Y_train,
    batch_size=64,
    epochs=3,
    validation_data=(X_val, Y_val),
)
history.history

### Masked accuracy
We compute token-level accuracy over real tokens only, ignoring padded positions (those where the input id is 0).

In [None]:
# predict() yields the model's four outputs; only the decoded tags are scored here.
decoded, potentials, seq_len, kernel = model.predict(X_test, batch_size=64, verbose=0)
mask = X_test != 0  # True on real tokens; padded positions hold id 0
num = np.sum(decoded[mask] == Y_test[mask])  # correctly tagged real tokens
den = np.sum(mask)  # number of real tokens
acc = num / den
print(f'Masked token accuracy: {acc:.4f}')

## Part 2: CoNLL-style loader
This part provides utilities for parsing a CoNLL-like file: one token per line with whitespace-separated columns (the token and tag columns are selected by index), and a blank line between sentences.

In [None]:
from typing import List, Tuple, Dict, Optional

def read_conll(path: str, token_col: int = 0, tag_col: int = -1, lowercase: bool = False) -> Tuple[List[List[str]], List[List[str]]]:
    """Read a CoNLL-style file into parallel token and tag sequences.

    Every non-blank line is split on whitespace into columns. A blank line
    closes the current sentence, and lines starting with ``-DOCSTART-`` are
    skipped.

    Args:
        path: File to read. It is decoded as UTF-8; a leading byte-order mark,
            if present, is dropped.
        token_col: Index of the column holding the token.
        tag_col: Index of the column holding the tag (negative indices count
            from the last column).
        lowercase: If true, tokens are lowercased; tags are never modified.

    Returns:
        ``(sentences, tags)``: two lists of equal length, where
        ``sentences[i]`` and ``tags[i]`` are the tokens and tags of the i-th
        sentence.

    Raises:
        IndexError: If a line has too few columns for ``token_col`` or ``tag_col``.
    """
    sentences: List[List[str]] = []
    tags: List[List[str]] = []
    cur_toks: List[str] = []
    cur_tags: List[str] = []
    # 'utf-8-sig' reads plain UTF-8 unchanged but strips a leading BOM, which
    # would otherwise be glued to the first token and hide a '-DOCSTART-' line.
    with open(path, 'r', encoding='utf-8-sig') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Sentence boundary; consecutive blank lines add nothing.
                if cur_toks:
                    sentences.append(cur_toks)
                    tags.append(cur_tags)
                    cur_toks, cur_tags = [], []
                continue
            if line.startswith('-DOCSTART-'):
                continue
            parts = line.split()
            tok = parts[token_col].lower() if lowercase else parts[token_col]
            cur_toks.append(tok)
            cur_tags.append(parts[tag_col])
    # The file may end without a trailing blank line.
    if cur_toks:
        sentences.append(cur_toks)
        tags.append(cur_tags)
    return sentences, tags

def build_maps(sentences: List[List[str]], tags: List[List[str]], min_freq: int = 1) -> Tuple[Dict[str,int], Dict[str,int]]:
    """Create token-to-id and tag-to-id lookup tables from a tagged corpus.

    The token map always starts with ``"<PAD>"`` -> 0 and ``"<UNK>"`` -> 1.
    Tokens occurring at least ``min_freq`` times then receive consecutive ids
    in order of first appearance. Tags are numbered from 0 in sorted order.

    Returns:
        ``(tok2id, tag2id)``.
    """
    from collections import Counter

    frequencies = Counter(token for sentence in sentences for token in sentence)
    tok2id = {"<PAD>": 0, "<UNK>": 1}
    for token, count in frequencies.items():
        # Skip rare tokens and never reassign an id that is already taken
        # (this also protects the two reserved entries).
        if count < min_freq or token in tok2id:
            continue
        tok2id[token] = len(tok2id)

    unique_tags = {tag for tag_seq in tags for tag in tag_seq}
    tag2id = dict(zip(sorted(unique_tags), range(len(unique_tags))))
    return tok2id, tag2id

def encode_and_pad(sentences: List[List[str]], tags: List[List[str]], tok2id: Dict[str,int], tag2id: Dict[str,int], max_len: Optional[int] = None) -> Tuple[np.ndarray, np.ndarray]:
    """Convert tokenised sentences and their tags into zero-padded id matrices.

    Args:
        sentences: Token lists, one per sentence.
        tags: Tag lists, parallel to ``sentences``.
        tok2id: Token-to-id map; tokens missing from it are encoded as
            ``tok2id["<UNK>"]``.
        tag2id: Tag-to-id map; every tag must be present.
        max_len: Number of columns of the output. Longer sentences are
            truncated. Defaults to the longest sentence (0 for an empty corpus).

    Returns:
        ``(X, Y)``: int32 arrays of shape ``(len(sentences), max_len)`` with
        token ids and tag ids. Unused positions are 0 in both arrays.

    Raises:
        KeyError: If a tag is not in ``tag2id``.
    """
    if max_len is None:
        # default=0 lets an empty corpus yield empty arrays instead of making
        # max() raise ValueError.
        max_len = max((len(s) for s in sentences), default=0)
    X = np.zeros((len(sentences), max_len), dtype=np.int32)
    Y = np.zeros((len(sentences), max_len), dtype=np.int32)
    for i, (s, t) in enumerate(zip(sentences, tags)):
        ids = [tok2id.get(w, tok2id["<UNK>"]) for w in s][:max_len]
        tg = [tag2id[u] for u in t][:max_len]
        X[i, :len(ids)] = ids
        Y[i, :len(tg)] = tg
    return X, Y

# Example usage (uncomment and set your path):
# train_sents, train_tags = read_conll('/path/to/train.conll', token_col=0, tag_col=-1)
# val_sents,   val_tags   = read_conll('/path/to/valid.conll', token_col=0, tag_col=-1)
# test_sents,  test_tags  = read_conll('/path/to/test.conll',  token_col=0, tag_col=-1)
# tok2id, tag2id = build_maps(train_sents, train_tags, min_freq=1)
# X_train, Y_train = encode_and_pad(train_sents, train_tags, tok2id, tag2id)
# X_val,   Y_val   = encode_and_pad(val_sents,   val_tags,   tok2id, tag2id, max_len=X_train.shape[1])
# X_test,  Y_test  = encode_and_pad(test_sents,  test_tags,  tok2id, tag2id, max_len=X_train.shape[1])
# num_tags = len(tag2id); vocab_size = len(tok2id) - 1  # exclude PAD index from token count if desired
# core = BiLstmCrfModel(vocab_size=vocab_size, num_tags=num_tags, embedding_dim=128, lstm_units=128)
# model = ModelWithCRFLoss(core)
# model.compile(optimizer=keras.optimizers.Adam(2e-3))
# model.fit(X_train, Y_train, validation_data=(X_val, Y_val), epochs=5, batch_size=64)
# decoded, _, _, _ = model.predict(X_test, batch_size=64)
# mask = (X_test != 0)
# acc = (decoded[mask] == Y_test[mask]).mean()
# print('CoNLL masked token acc:', acc)
