# Token Classification with Hugging Face
Based on the tutorial
https://huggingface.co/docs/transformers/tasks/token_classification

In [1]:
import numpy as np
import tensorflow as tf
from datasets import load_dataset, Dataset, DatasetDict

from transformers import (AutoTokenizer,
                          DataCollatorForTokenClassification,
                          create_optimizer,
                          TFAutoModelForTokenClassification,
                          pipeline,
                         )

from transformers.keras_callbacks import KerasMetricCallback

import evaluate
from nltk import word_tokenize
from lxml import etree

import sys, os
currentdir = os.path.abspath(os.path.curdir)
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir) 
sys.path.insert(0,parentdir+'/embed') 

%load_ext autoreload
%autoreload 2
import train_ner as tn
import ner
import ner.llm_utils as llu
import embed.inference_ner as iner

from nltk.chunk import conlltags2tree, ChunkScore

import toml

seqeval = evaluate.load('seqeval')

2023-08-19 10:37:08.548327: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-08-19 10:37:08.553180: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-08-19 10:37:08.553325: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero


In [2]:
cfg = tn.gen_cfg()
text_lst = tn.get_wiki_pm_stacks_data(cfg)
sent_tok, trainer_params = tn.gen_sent_tokzer(text_lst, cfg)
tokens_lst, ner_tags_lst, title_lst = ner.bio_tag.put_ner_tags(text_lst, sent_tok)
def_lst = ner.bio_tag.put_pos_ner_tags(text_lst, sent_tok)

The name of the problematic article is: examples.xml
The name of the problematic article is: coding.xml
The name of the problematic article is: spaces-pushouts.xml
The name of the problematic article is: guide.xml
The name of the problematic article is: moduli.xml
The name of the problematic article is: more-groupoids.xml
The name of the problematic article is: chapters.xml
The name of the problematic article is: sets.xml
The name of the problematic article is: obsolete.xml
The name of the problematic article is: examples-defos.xml
The name of the problematic article is: spaces-more-cohomology.xml
The name of the problematic article is: bibliography.xml
The name of the problematic article is: fdl.xml
The name of the problematic article is: limits.xml
The name of the problematic article is: conventions.xml
The name of the problematic article is: introduction.xml
The name of the problematic article is: quot.xml
The name of the problematic article is: desirables.xml


In [3]:
pos_lst = [[d[0][1] for d in tree_lst['ner']] for tree_lst in def_lst]
data_dict = {
    'id': list(range(len(tokens_lst))),
    'tokens': tokens_lst,
    'ner_tags': ner_tags_lst,
    'title': title_lst,
    'pos': pos_lst,
}
ds = Dataset.from_dict(data_dict)
temp1_dd = ds.train_test_split(test_size=0.1, shuffle=True)
temp2_dd = temp1_dd['train'].train_test_split(test_size=0.1, shuffle=True)

ds = DatasetDict({
    'train': temp2_dd['train'],
    'test': temp1_dd['test'],
    'valid': temp2_dd['test'],
})
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'title', 'pos'],
        num_rows: 21142
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'title', 'pos'],
        num_rows: 2611
    })
    valid: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'title', 'pos'],
        num_rows: 2350
    })
})

In [13]:
#checkpoint = 'distilbert-base-cased'
#checkpoint = 'xlm-roberta-base'
checkpoint = 'bert-base-cased'
#checkpoint = 'bert-large-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
label_list = ['O', 'B-defndum', 'I-defndum']

##checkpoint = 'gpt2'
tokenizer = AutoTokenizer.from_pretrained(checkpoint, add_prefix_space=True)
model_path = '/media/hd1/trained_models/ner_model/HFtransformers/ner-2023-08-01_1627/trainer/trans_HF_ner/ner_Aug-01_16-28/'
model = TFAutoModelForTokenClassification.from_pretrained(model_path)

Some layers from the model checkpoint at /media/hd1/trained_models/ner_model/HFtransformers/ner-2023-08-01_1627/trainer/trans_HF_ner/ner_Aug-01_16-28/ were not used when initializing TFBertForTokenClassification: ['dropout_37']
- This IS expected if you are initializing TFBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForTokenClassification were initialized from the model checkpoint at /media/hd1/trained_models/ner_model/HFtransformers/ner-2023-08-01_1627/trainer/trans_HF_ner/ner_Aug-01_16-28/.
If your task is similar to the task the model of the checkpoint was trained o

In [5]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples['tokens'],
                                truncation=True,
                                is_split_into_words=True)
    
    labels=[]
    for i, label in enumerate(examples['ner_tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
        
    tokenized_inputs['labels'] = labels
    return tokenized_inputs

#tokenized_wnut = wnut.map(tokenize_and_align_labels, batched=True)
tokenized_ds = ds.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/22 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [6]:
#labels = [label_list[i] for i in example['ner_tags']]


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    
    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    
    return {
        'precision': results['overall_precision'],
        'recall': results['overall_recall'],
        'f1': results['overall_f1'],
        'accuracy': results['overall_accuracy'],
    }
    

In [7]:
id2label = {
...     0: "O",
...     1: "B-DFNDUM",
...     2: "I-DFNDUM",
... }
>>> label2id = {
...     "O": 0,
...     "B-DFNDUM": 1,
...     "I-DFNDUM": 2,
... }

In [8]:
batch_size = 16
num_train_epochs = 3
num_train_steps = (len(tokenized_ds['train']) // batch_size * num_train_epochs)
optimizer, lr_schedule = create_optimizer(
    init_lr = 2e-5,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
    num_warmup_steps=0,
)

model = TFAutoModelForTokenClassification.from_pretrained(
    checkpoint,
    num_labels=3,
    id2label=id2label,
    label2id=label2id,
)
    

All model checkpoint layers were used when initializing TFBertForTokenClassification.

Some layers of TFBertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
tokenizer.special_tokens_map.values()

dict_values(['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]'])

In [10]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf")

tf_train_set = model.prepare_tf_dataset(
    tokenized_ds["train"],
    shuffle=True,
    batch_size=1,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    tokenized_ds["valid"],
    shuffle=False,
    batch_size=1,
    collate_fn=data_collator,
)

tf_test_set = model.prepare_tf_dataset(
    tokenized_ds["test"],
    shuffle=False,
    batch_size=1,
    collate_fn=data_collator,
)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  tensor = as_tensor(value)


In [13]:
model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [18]:
metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)
callbacks = [metric_callback,]
model.fit(x=tf_train_set,
          validation_data=tf_validation_set,
          epochs=3,
          callbacks=callbacks)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f0d0c2b1480>

In [14]:
text = """We define a Banach space as a complete vector normed space."""
#text = ''
#j = 19
#for t in ds['test'][j]['tokens']:
#    text += t + ' '
print(f'{text=}')
#print(f'The real title is: ', ' '.join([ds['test'][j]['tokens'][k]
#                         for k, n in enumerate(ds['test'][j]['ner_tags']) if n != 0]))
classifier = pipeline('ner', model=model, tokenizer=tokenizer)
print('The pipeline result is: ', classifier(text))

inputs = tokenizer(text, return_tensors='tf')
logits = model(**inputs).logits
predicted_ids = tf.math.argmax(logits, axis=-1)
predicted_token_class = [model.config.id2label[t] for t in predicted_ids[0].numpy().tolist()]

for i in range(len(predicted_token_class)):
    print(inputs.tokens()[i], predicted_token_class[i])

#tt = tokenizer(ds['test'][j]['tokens'], return_tensors='tf', is_split_into_words=True)
tt = tokenizer(text, return_tensors='tf', is_split_into_words=False)
logits = model(**tt).logits
print(f"{logits=}")

# Grouping entities
predicted_ids = tf.math.argmax(logits, axis=-1)[0]
predictions = predicted_ids.numpy().tolist()
results = []
inputs_with_offsets = tokenizer(text, return_offsets_mapping=True)
tokens = inputs_with_offsets.tokens()
offsets = inputs_with_offsets['offset_mapping']

probs = tf.math.softmax(logits, axis=-1)[0]
probs = probs.numpy().tolist()

#start, end = inputs.word_to_chars(10)
end = 0

idx = 0
while idx < len(predictions):
    pred = predictions[idx]
    label = model.config.id2label[pred]
    if label != 'O':
        label = label[2:]
        start, end = offsets[idx] # 2nd output is the end of word
        #idx += 1
        
        # Grab all tokens labeled with an I-label
        all_scores = []
        while (
            idx < len(predictions)
            and model.config.id2label[predictions[idx]][2:] == label
               ):
            all_scores.append(probs[idx][pred])
            _, end = offsets[idx]
            idx += 1
            
        score = np.mean(all_scores).item()
        word = text[start:end]
        results.append(
            {'entity': label, 
             'score': score,
             'word': word,
            'start': start,
            'end': end,}
        )
    idx += 1
print(results)

text='We define a Banach space as a complete vector normed space.'
The pipeline result is:  [{'entity': 'B-DFNDUM', 'score': 0.99795747, 'index': 4, 'word': 'Ban', 'start': 12, 'end': 15}, {'entity': 'I-DFNDUM', 'score': 0.87701976, 'index': 5, 'word': '##ach', 'start': 15, 'end': 18}, {'entity': 'I-DFNDUM', 'score': 0.9935475, 'index': 6, 'word': 'space', 'start': 19, 'end': 24}]
[CLS] O
We O
define O
a O
Ban B-DFNDUM
##ach I-DFNDUM
space I-DFNDUM
as O
a O
complete O
vector O
norm O
##ed O
space O
. O
[SEP] O
logits=<tf.Tensor: shape=(1, 16, 3), dtype=float32, numpy=
array([[[ 3.030289  , -2.0339897 , -2.0566025 ],
        [ 7.8431334 , -4.7557993 , -4.332306  ],
        [ 7.9283433 , -4.4622803 , -4.4607487 ],
        [ 7.735637  , -4.489064  , -3.966631  ],
        [-0.56327933,  5.646585  , -4.5539656 ],
        [-2.3025973 ,  0.08733091,  2.1395137 ],
        [-0.1091496 , -5.2520733 ,  4.9334927 ],
        [ 7.901957  , -4.716515  , -4.1931643 ],
        [ 7.9019723 , -4.7961826 

In [18]:
xml_path = '/media/hd1/glossary/inference_class_all/math00/0006_001.xml.gz'
model_dir = '/media/hd1/trained_models/ner_model/lstm_ner/ner_Sep-29_15-37/exp_041/'
sent_tok = iner.read_sent_tok(os.path.join(model_dir, 'punkt_params.pickle'))
pars = etree.XMLParser(recover=True)                                           
xml_tree = etree.parse(xml_path, parser=pars)                                  
root = xml_tree.getroot()
Defs = root.findall('.//definition')   

#sent_lst = iner.str_tok_pos_tags(Defs[12].find('stmnt').text, sent_tok )
sent_lst = sent_tok.tokenize(Defs[70].find('stmnt').text)


empty = ''

tt = tokenizer(sent_lst, return_tensors='tf', is_split_into_words=False, padding=True, truncation=True)
logits = model(**tt)['logits']
predicted_ids = tf.math.argmax(logits, axis=-1)
predictions = predicted_ids.numpy().tolist()
for j,p in enumerate(predictions):
    label_lst = [model.config.id2label[t] for t in p]
    for k,lab in enumerate(label_lst):
        print(f"('{tt.tokens(j)[k]}', {empty:>20} '{lab}'),")
#join1 = llu.get_words_back(tt.tokens(1))[0]
#llu.join_math_tokens(join1)[0]
#join1

('[CLS]',                      'O'),
('A',                      'O'),
('mon',                      'O'),
('##oid',                      'O'),
('##al',                      'O'),
('category',                      'O'),
('is',                      'O'),
('symmetric',                      'B-DFNDUM'),
('mon',                      'I-DFNDUM'),
('##oid',                      'I-DFNDUM'),
('##al',                      'I-DFNDUM'),
('if',                      'O'),
('it',                      'O'),
('has',                      'O'),
('the',                      'O'),
('special',                      'O'),
('arrow',                      'O'),
('_',                      'O'),
('display',                      'O'),
('_',                      'O'),
('math',                      'O'),
('_',                      'O'),
('for',                      'O'),
('every',                      'O'),
('pair',                      'O'),
('_',                      'O'),
('in',                      'O'),
('##line

In [11]:
#j=780
chunkscore = ChunkScore()

spec_toks = list(tokenizer.special_tokens_map.values())
spec_toks.remove('[UNK]')
#spec_toks.remove('<unk>')


dif_len_lst = []
for j in range(len(ds['test'])):
    tt = tokenizer(ds['test'][j]['tokens'], return_tensors='tf', is_split_into_words=True)
    logits = model(**tt).logits

    # Grouping entities
    predicted_ids = tf.math.argmax(logits, axis=-1)[0]
    predictions = predicted_ids.numpy().tolist()
    pp = [model.config.id2label[t] for t in predictions]

    wl, il = llu.get_words_back(tt.tokens(),
                          preds=pp, special_tokens=spec_toks)
    try:
        wl, il = llu.join_by_example(wl, ds['test'][j]['tokens'], preds=il)
    except AssertionError:
        print(f'Index {j=} caused the error')

    tree_pred = conlltags2tree([(tok, 'Upa', pred) for tok, pred in zip(wl, il)])

    jdict = ds['test'][j]
    bio_tagged = tn.tf_bio_tagger(jdict['ner_tags'])
    tree_gold = conlltags2tree([(jdict['tokens'][i], 
                                 'Upa', 
                                 bio_tagged[i])
                                for i in range(len(jdict['tokens']))])

    chunkscore.score(tree_pred, tree_gold)
    
    if len(wl) != len(jdict['tokens']):
        dif_len_lst.append(j)
#for i in range(len(il)):
    #print(f"{wl[i]:<15} {il[i]:<10} {jdict['tokens'][i]:<15} {bio_tagged[i]}")
    #print(f"{wl[i]:<20} {il[i]:<10}") 
print(chunkscore)

ChunkParse score:
    IOB Accuracy:  99.5%%
    Precision:     94.8%%
    Recall:        91.6%%
    F-Measure:     93.2%%


In [None]:
tokenizer.special_tokens_map

In [None]:
for j in range(len(ds['test'])):
    if ['There', 'are', 'two', 'approaches'] == ds['test'][j]['tokens'][:4]:
        print(j)

In [None]:
j=142
tt = tokenizer(ds['test'][j]['tokens'], return_tensors='tf', is_split_into_words=True)
logits = model(**tt).logits

# Grouping entities
predicted_ids = tf.math.argmax(logits, axis=-1)[0]
predictions = predicted_ids.numpy().tolist()
pp = [model.config.id2label[t] for t in predictions]

wl, il = ner.llm_utils.get_words_back(tt.tokens(),
                      preds=pp, special_tokens=tokenizer.special_tokens_map.values())
#wl, il = ner.llm_utils.join_math_tokens(wl, il)

tree_pred = conlltags2tree([(tok, 'Upa', pred) for tok, pred in zip(wl, il)])

jdict = ds['test'][j]
bio_tagged = tn.tf_bio_tagger(jdict['ner_tags'])
tree_gold = conlltags2tree([(jdict['tokens'][i], 
                             'Upa', 
                             bio_tagged[i])
                            for i in range(len(jdict['tokens']))])

#for i in range(len(il)):
#    print(f"{wl[i]:<15} {il[i]:<10} {jdict['tokens'][i]:<15} {bio_tagged[i]}")
    #print(f"{wl[i]:<20} {il[i]:<10}") 

joined_toks, joined_preds = llu.join_by_example(wl, jdict['tokens'], preds=il)
short_gold = ['1', '\\in', 'c', 'a', 'be', 'fin']
short_pred = ['1', '\\', 'in', 'c', 'a', 'b', 'e', 'fin']
#joined_preds = join_by_example(short_pred, short_gold)
for i in range(len(joined_toks)):
    print(f"{i}  {joined_toks[i]:<15} {joined_preds[i]:<10} {jdict['tokens'][i]:<15} {bio_tagged[i]}")

## Previous Results
**bert-base-cased**
```text
ChunkParse score:
    IOB Accuracy:  97.7%%
    Precision:     74.3%%
    Recall:        69.9%%
    F-Measure:     72.0%%
```

```text
ChunkParse score:
    IOB Accuracy:  97.9%%
    Precision:     77.9%%
    Recall:        72.3%%
    F-Measure:     75.0%%
```

```text
ChunkParse score:
    IOB Accuracy:  98.2%%
    Precision:     80.1%%
    Recall:        76.7%%
    F-Measure:     78.4%%
```

**bert-large-cased** (Bridges)

5 Epochs
```text
ChunkParse score:
    IOB Accuracy:  98.3%%
    Precision:     82.7%%
    Recall:        79.4%%
    F-Measure:     81.0%%
```