# Token Classification with Hugging Face
Based on the tutorial
https://huggingface.co/docs/transformers/tasks/token_classification

In [1]:
import numpy as np
import tensorflow as tf
from datasets import load_dataset, Dataset, DatasetDict

from transformers import (AutoTokenizer,
                          DataCollatorForTokenClassification,
                          create_optimizer,
                          TFAutoModelForTokenClassification,
                          pipeline,
                         )

from transformers.keras_callbacks import KerasMetricCallback

import evaluate
from nltk import word_tokenize

import sys, os
currentdir = os.path.abspath(os.path.curdir)
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir) 
sys.path.insert(0,parentdir+'/embed') 

%load_ext autoreload
%autoreload 2
import train_ner as tn
import ner
import ner.llm_utils as llu

from nltk.chunk import conlltags2tree, ChunkScore

import toml

seqeval = evaluate.load('seqeval')

In [2]:
cfg = tn.gen_cfg()
text_lst = tn.get_wiki_pm_stacks_data(cfg)
sent_tok, trainer_params = tn.gen_sent_tokzer(text_lst, cfg)
tokens_lst, ner_tags_lst, title_lst = ner.bio_tag.put_ner_tags(text_lst, sent_tok)
def_lst = ner.bio_tag.put_pos_ner_tags(text_lst, sent_tok)

The name of the problematic article is: examples.xml
The name of the problematic article is: coding.xml
The name of the problematic article is: spaces-pushouts.xml
The name of the problematic article is: guide.xml
The name of the problematic article is: moduli.xml
The name of the problematic article is: more-groupoids.xml
The name of the problematic article is: chapters.xml
The name of the problematic article is: sets.xml
The name of the problematic article is: obsolete.xml
The name of the problematic article is: examples-defos.xml
The name of the problematic article is: spaces-more-cohomology.xml
The name of the problematic article is: bibliography.xml
The name of the problematic article is: fdl.xml
The name of the problematic article is: limits.xml
The name of the problematic article is: conventions.xml
The name of the problematic article is: introduction.xml
The name of the problematic article is: quot.xml
The name of the problematic article is: desirables.xml


In [3]:
pos_lst = [[d[0][1] for d in tree_lst['ner']] for tree_lst in def_lst]
data_dict = {
    'id': list(range(len(tokens_lst))),
    'tokens': tokens_lst,
    'ner_tags': ner_tags_lst,
    'title': title_lst,
    'pos': pos_lst,
}
ds = Dataset.from_dict(data_dict)
temp1_dd = ds.train_test_split(test_size=0.1, shuffle=True)
temp2_dd = temp1_dd['train'].train_test_split(test_size=0.1, shuffle=True)

ds = DatasetDict({
    'train': temp2_dd['train'],
    'test': temp1_dd['test'],
    'valid': temp2_dd['test'],
})
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'title', 'pos'],
        num_rows: 21141
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'title', 'pos'],
        num_rows: 2611
    })
    valid: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'title', 'pos'],
        num_rows: 2350
    })
})

In [4]:
checkpoint = 'distilbert-base-cased'
checkpoint = 'bert-base-cased'
#checkpoint = 'bert-large-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
label_list = ['O', 'B-defndum', 'I-defndum']

##checkpoint = 'gpt2'
##tokenizer = AutoTokenizer.from_pretrained(checkpoint, add_prefix_space=True)

In [5]:
#example = wnut['train'][0]
example = ds['train'][10]

tok_input = tokenizer(example['tokens'], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tok_input['input_ids'])
tokens

['[CLS]',
 'In',
 'an',
 'overview',
 'presented',
 'on',
 'its',
 'website',
 'in',
 '2008',
 'the',
 'Su',
 '##sta',
 '##ina',
 '##bility',
 'Science',
 'Program',
 'at',
 'Harvard',
 'University',
 'described',
 'the',
 'field',
 'in',
 'the',
 'following',
 'way',
 ',',
 'stress',
 '##ing',
 'its',
 'inter',
 '##dis',
 '##ci',
 '##p',
 '##lina',
 '##rity',
 ':',
 "'",
 'Su',
 '##sta',
 '##ina',
 '##bility',
 'science',
 "'",
 'is',
 'problem',
 '-',
 'driven',
 ',',
 'interdisciplinary',
 'scholarship',
 'that',
 'seeks',
 'to',
 'facilitate',
 'the',
 'design',
 ',',
 'implementation',
 ',',
 'and',
 'evaluation',
 'of',
 'effective',
 'interventions',
 'that',
 'foster',
 'shared',
 'prosperity',
 'and',
 'reduced',
 'poverty',
 'while',
 'protecting',
 'the',
 'environment',
 '.',
 '[SEP]']

In [6]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples['tokens'],
                                truncation=True,
                                is_split_into_words=True)
    
    labels=[]
    for i, label in enumerate(examples['ner_tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
        
    tokenized_inputs['labels'] = labels
    return tokenized_inputs

#tokenized_wnut = wnut.map(tokenize_and_align_labels, batched=True)
tokenized_ds = ds.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/22 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [7]:
#labels = [label_list[i] for i in example['ner_tags']]


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    
    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    
    return {
        'precision': results['overall_precision'],
        'recall': results['overall_recall'],
        'f1': results['overall_f1'],
        'accuracy': results['overall_accuracy'],
    }
    

In [8]:
id2label = {
...     0: "O",
...     1: "B-DFNDUM",
...     2: "I-DFNDUM",
... }
>>> label2id = {
...     "O": 0,
...     "B-DFNDUM": 1,
...     "I-DFNDUM": 2,
... }

In [9]:
batch_size = 16
num_train_epochs = 3
num_train_steps = (len(tokenized_ds['train']) // batch_size * num_train_epochs)
optimizer, lr_schedule = create_optimizer(
    init_lr = 2e-5,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
    num_warmup_steps=0,
)

model = TFAutoModelForTokenClassification.from_pretrained(
    checkpoint,
    num_labels=3,
    id2label=id2label,
    label2id=label2id,
)
    

2023-07-27 13:42:57.882085: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-07-27 13:42:57.887266: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-07-27 13:42:57.887412: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-07-27 13:42:57.888157: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [10]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf")

tf_train_set = model.prepare_tf_dataset(
    tokenized_ds["train"],
    shuffle=True,
    batch_size=4,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    tokenized_ds["valid"],
    shuffle=False,
    batch_size=4,
    collate_fn=data_collator,
)

tf_test_set = model.prepare_tf_dataset(
    tokenized_ds["test"],
    shuffle=False,
    batch_size=4,
    collate_fn=data_collator,
)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  tensor = as_tensor(value)


In [11]:
model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [12]:
metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)
callbacks = [metric_callback,]
model.fit(x=tf_train_set,
          validation_data=tf_validation_set,
          epochs=3,
          callbacks=callbacks)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7ff958763ca0>

In [13]:
text = """We define a Banach space as a complete vector normed space."""
text = ''
j = 19
for t in ds['test'][j]['tokens']:
    text += t + ' '
print(f'{text=}')
print(f'The real title is: ', ' '.join([ds['test'][j]['tokens'][k]
                                        for k, n in enumerate(ds['test'][j]['ner_tags']) if n != 0]))
classifier = pipeline('ner', model=model, tokenizer=tokenizer)
print('The pipeline result is: ', classifier(text))

inputs = tokenizer(text, return_tensors='tf')
logits = model(**inputs).logits
predicted_ids = tf.math.argmax(logits, axis=-1)
predicted_token_class = [model.config.id2label[t] for t in predicted_ids[0].numpy().tolist()]

for i in range(len(predicted_token_class)):
    print(inputs.tokens()[i], predicted_token_class[i])

tt = tokenizer(ds['test'][j]['tokens'], return_tensors='tf', is_split_into_words=True)
logits = model(**tt).logits

# Grouping entities
predicted_ids = tf.math.argmax(logits, axis=-1)[0]
predictions = predicted_ids.numpy().tolist()
results = []
inputs_with_offsets = tokenizer(text, return_offsets_mapping=True)
tokens = inputs_with_offsets.tokens()
offsets = inputs_with_offsets['offset_mapping']

probs = tf.math.softmax(logits, axis=-1)[0]
probs = probs.numpy().tolist()

#start, end = inputs.word_to_chars(10)
end = 0

idx = 0
while idx < len(predictions):
    pred = predictions[idx]
    label = model.config.id2label[pred]
    if label != 'O':
        label = label[2:]
        start, end = offsets[idx] # 2nd output is the end of word
        #idx += 1
        
        # Grab all tokens labeled with an I-label
        all_scores = []
        while (
            idx < len(predictions)
            and model.config.id2label[predictions[idx]][2:] == label
               ):
            all_scores.append(probs[idx][pred])
            _, end = offsets[idx]
            idx += 1
            
        score = np.mean(all_scores).item()
        word = text[start:end]
        results.append(
            {'entity': label, 
             'score': score,
             'word': word,
            'start': start,
            'end': end,}
        )
    idx += 1
print(results)

text='Formally , a transition system is a pair ( S , - ) where S is a set of states and - is a set of state transitions ( i.e. , a subset of S x S ) . '
The real title is:  transition system
The pipeline result is:  [{'entity': 'B-DFNDUM', 'score': 0.9698083, 'index': 5, 'word': 'transition', 'start': 13, 'end': 23}, {'entity': 'I-DFNDUM', 'score': 0.9639087, 'index': 6, 'word': 'system', 'start': 24, 'end': 30}]
[CLS] O
Form O
##ally O
, O
a O
transition B-DFNDUM
system I-DFNDUM
is O
a O
pair O
( O
S O
, O
- O
) O
where O
S O
is O
a O
set O
of O
states O
and O
- O
is O
a O
set O
of O
state O
transitions O
( O
i O
. O
e O
. O
, O
a O
subset O
of O
S O
x O
S O
) O
. O
[SEP] O
[{'entity': 'DFNDUM', 'score': 0.48646090587135404, 'word': 'transition system', 'start': 13, 'end': 30}]


In [14]:
#j=780
chunkscore = ChunkScore()

spec_toks = list(tokenizer.special_tokens_map.values())
spec_toks.remove('[UNK]')


dif_len_lst = []
for j in range(len(ds['test'])):
    tt = tokenizer(ds['test'][j]['tokens'], return_tensors='tf', is_split_into_words=True)
    logits = model(**tt).logits

    # Grouping entities
    predicted_ids = tf.math.argmax(logits, axis=-1)[0]
    predictions = predicted_ids.numpy().tolist()
    pp = [model.config.id2label[t] for t in predictions]

    wl, il = llu.get_words_back(tt.tokens(),
                          preds=pp, special_tokens=spec_toks)
    try:
        wl, il = llu.join_by_example(wl, ds['test'][j]['tokens'], preds=il)
    except AssertionError:
        print(f'Index {j=} caused the error')

    tree_pred = conlltags2tree([(tok, 'Upa', pred) for tok, pred in zip(wl, il)])

    jdict = ds['test'][j]
    bio_tagged = tn.tf_bio_tagger(jdict['ner_tags'])
    tree_gold = conlltags2tree([(jdict['tokens'][i], 
                                 'Upa', 
                                 bio_tagged[i])
                                for i in range(len(jdict['tokens']))])

    chunkscore.score(tree_pred, tree_gold)
    
    if len(wl) != len(jdict['tokens']):
        dif_len_lst.append(j)
#for i in range(len(il)):
    #print(f"{wl[i]:<15} {il[i]:<10} {jdict['tokens'][i]:<15} {bio_tagged[i]}")
    #print(f"{wl[i]:<20} {il[i]:<10}") 
print(chunkscore)

ChunkParse score:
    IOB Accuracy:  97.5%%
    Precision:     68.5%%
    Recall:        67.7%%
    F-Measure:     68.1%%


In [15]:
for j in range(len(ds['test'])):
    if ['There', 'are', 'two', 'approaches'] == ds['test'][j]['tokens'][:4]:
        print(j)

In [16]:
j=142
tt = tokenizer(ds['test'][j]['tokens'], return_tensors='tf', is_split_into_words=True)
logits = model(**tt).logits

# Grouping entities
predicted_ids = tf.math.argmax(logits, axis=-1)[0]
predictions = predicted_ids.numpy().tolist()
pp = [model.config.id2label[t] for t in predictions]

wl, il = ner.llm_utils.get_words_back(tt.tokens(),
                      preds=pp, special_tokens=tokenizer.special_tokens_map.values())
#wl, il = ner.llm_utils.join_math_tokens(wl, il)

tree_pred = conlltags2tree([(tok, 'Upa', pred) for tok, pred in zip(wl, il)])

jdict = ds['test'][j]
bio_tagged = tn.tf_bio_tagger(jdict['ner_tags'])
tree_gold = conlltags2tree([(jdict['tokens'][i], 
                             'Upa', 
                             bio_tagged[i])
                            for i in range(len(jdict['tokens']))])

#for i in range(len(il)):
#    print(f"{wl[i]:<15} {il[i]:<10} {jdict['tokens'][i]:<15} {bio_tagged[i]}")
    #print(f"{wl[i]:<20} {il[i]:<10}") 

joined_toks, joined_preds = llu.join_by_example(wl, jdict['tokens'], preds=il)
short_gold = ['1', '\\in', 'c', 'a', 'be', 'fin']
short_pred = ['1', '\\', 'in', 'c', 'a', 'b', 'e', 'fin']
#joined_preds = join_by_example(short_pred, short_gold)
for i in range(len(joined_toks)):
    print(f"{i}  {joined_toks[i]:<15} {joined_preds[i]:<10} {jdict['tokens'][i]:<15} {bio_tagged[i]}")

0  We              O          We              O
1  say             O          say             O
2  that            O          that            O
3  the             O          the             O
4  tensor          O          tensor          O
5  _inline_math_   O          _inline_math_   O
6  is              O          is              O
7  a               O          a               O
8  Codazzi         B-DFNDUM   Codazzi         B-DFNDUM
9  Tensor          I-DFNDUM   Tensor          I-DFNDUM
10  if              O          if              O
11  _inline_math_   O          _inline_math_   O
12  .               O          .               O


In [17]:
'\\in'.startswith('\\')

True

## Previous Results
bert-base-cased
```text
ChunkParse score:
    IOB Accuracy:  97.7%%
    Precision:     74.3%%
    Recall:        69.9%%
    F-Measure:     72.0%%
```

```text
ChunkParse score:
    IOB Accuracy:  97.9%%
    Precision:     77.9%%
    Recall:        72.3%%
    F-Measure:     75.0%%
```

tf_train_set=

<_PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(8, None), dtype=tf.int64, name=None), 'token_type_ids': TensorSpec(shape=(8, None), dtype=tf.int6
4, name=None), 'attention_mask': TensorSpec(shape=(8, None), dtype=tf.int64, name=None)}, TensorSpec(shape=(8, None), dtype=tf.int64, name=None))>                            

tf_validation_set=

<_PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'token_type_ids': TensorSpec(shape=(None, None), dt
ype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(None, None), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, None), dtype=tf.int64, name=None))>