In [82]:
from random import randint
from huggingface_hub import login
from transformers import AdamWeightDecay, TFAutoModelForTokenClassification, AutoTokenizer

import os 
import sys

parent_dir = os.path.abspath('..')
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from src.preprocess import PrepSystem
from src.tag import TagInfo
from src.train import check_gpus, get_token

from evaluate import evaluator
from datasets import load_dataset, DatasetInfo, DatasetDict
from evaluate import utils, enable_progress_bar, is_progress_bar_enabled, push_to_hub
import pandas as pd

This notebook uses the [`evaluate` library from huggingface](https://huggingface.co/docs/evaluate/v0.4.0/en/base_evaluator#evaluate-models-on-the-hub) which defaults using the first GPU detected. 

In [2]:
# Check for GPUs
check_gpus()

2023-12-04 11:24:34.867448: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-12-04 11:24:34 INFO     [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
2023-12-04 11:24:34.885483: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-12-04 11:24:34.885731: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA n

Login to huggingface 
create a .env file with a huggingface access token
Example:


```shell
# .env in root dir
HF_TOKEN=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
```

In [3]:
login(get_token())

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/snek/.cache/huggingface/token
Login successful


To evaluate the model, the tokenizer and dataset need to be loaded. 
The `PrepSystem` class can load, filter, and pre-process the dataset and load the tokenizer. This is useful particularly for experiment B which has a modified dataset since it trains on a limited number of tags. 

In [4]:
pretrained_model_checkpoint = "distilbert-base-uncased"
learning_rate = 2e-5
optimizer = AdamWeightDecay(learning_rate=learning_rate, weight_decay_rate=0.0)
filter_by_lang = "en"

In [5]:
# uses the entire MultiNERD tagset
A = PrepSystem(labels=TagInfo.full_tagset,
                pretrained_model_checkpoint=pretrained_model_checkpoint,
                dataset_batch_size=16,
                filter_tagset=False,
                language=filter_by_lang,
                split_filter="test"
                )

# uses only PER, ORG, LOC, DIS, ANIM
B = PrepSystem(labels=TagInfo.main_five, 
                pretrained_model_checkpoint=pretrained_model_checkpoint,
                dataset_batch_size=16,
                filter_tagset=True,
                language=filter_by_lang,
                split_filter="test"
                )

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

2023-12-04 11:24:52 INFO     Filtered language by en. 
DatasetDict({
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 32908
    })
})
2023-12-04 11:24:52 INFO     Using the full tagset
2023-12-04 11:24:52 INFO     Making sure all labels have sequential IDs. This can happen if a reduced tagset is chosen
2023-12-04 11:24:52 INFO     All label ids are sequential, nothing to swap.
2023-12-04 11:24:52 INFO     Adding Sequence(ClassLabel) feature to dataset to make it usable with the `TokenClassificationEvaluator` from `evaluation`.
Read more: https://huggingface.co/docs/evaluate/v0.4.0/en/package_reference/evaluator_classes


Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

2023-12-04 11:24:55 INFO     Filtered language by en. 
DatasetDict({
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 32908
    })
})
2023-12-04 11:24:55 INFO     Keeping these tags only: ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-ANIM', 'I-ANIM', 'B-DIS', 'I-DIS']. All other tags will be set to '0'
2023-12-04 11:24:55 INFO     Making sure all labels have sequential IDs. This can happen if a reduced tagset is chosen
2023-12-04 11:24:55 INFO     Swapping these labels: {14: 10, 13: 9}
2023-12-04 11:24:55 INFO     Modified label to ID: {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-ANIM': 7, 'I-ANIM': 8, 'B-DIS': 9, 'I-DIS': 10}
2023-12-04 11:24:55 INFO     Modified ID to label: {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-ANIM', 8: 'I-ANIM', 9: 'B-DIS', 10: 'I-DIS'}
2023-12-04 11:24:55 INFO     Adding Sequence(ClassLabel) feature to dataset to make it usable w

In order to use the `evaluator`, the dataset features need to contain a `ClassLabel`. These have been added by the `PrepSystem` class

In [6]:
# System A has the full tagset 
A.dataset["test"].features

{'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-ANIM', 'I-ANIM', 'B-BIO', 'I-BIO', 'B-CEL', 'I-CEL', 'B-DIS', 'I-DIS', 'B-EVE', 'I-EVE', 'B-FOOD', 'I-FOOD', 'B-INST', 'I-INST', 'B-MEDIA', 'I-MEDIA', 'B-MYTH', 'I-MYTH', 'B-PLANT', 'I-PLANT', 'B-TIME', 'I-TIME', 'B-VEHI', 'I-VEHI'], id=None), length=-1, id=None)}

In [7]:
# System B has only five + the '0' tag
B.dataset["test"].features

{'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-ANIM', 'I-ANIM', 'B-DIS', 'I-DIS'], id=None), length=-1, id=None)}

[`evaluate.TokenClassificationEvaluator`](https://huggingface.co/docs/evaluate/v0.4.0/en/package_reference/evaluator_classes#evaluate.TokenClassificationEvaluator) can compute metrics for this specific task.

The metric used here is [seqeval](https://huggingface.co/spaces/evaluate-metric/seqeval) to calculate precision, recall, and f1. 

In [8]:
utils.logging.set_verbosity(10)
enable_progress_bar()
print("Is tqdm progress bar enabled?", is_progress_bar_enabled())

task_evaluator = evaluator("token-classification")
metric = "seqeval"

Is tqdm progress bar enabled? True


The finetuned models can be loaded from hugginface.

It's also possible to load the model if stored locally

```python
locally_stored_model = TFAutoModelForTokenClassification.from_pretrained("./model_dir")
```

In [15]:
exp_A = "i-be-snek/distilbert-base-uncased-finetuned-ner-exp_A"
exp_B = "i-be-snek/distilbert-base-uncased-finetuned-ner-exp_B"

In [9]:
finetuned_model_A = TFAutoModelForTokenClassification.from_pretrained(exp_A)
finetuned_model_B = TFAutoModelForTokenClassification.from_pretrained(exp_B)

2023-12-04 11:24:56.074698: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-12-04 11:24:56.074870: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-12-04 11:24:56.074944: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

The tokenizer can also be loaded from hugginface. 

Alternatively, the tokenizer could be loaded from `PrepSystem`

In [10]:
tokenizer_A = AutoTokenizer.from_pretrained("i-be-snek/distilbert-base-uncased-finetuned-ner-exp_A")
# tokenizer_B = AutoTokenizer.from_pretrained("i-be-snek/distilbert-base-uncased-finetuned-ner-exp_B")

# A.load_tokenizer()
# tokenizer_A = A.tokenizer
B.load_tokenizer()
tokenizer_B = B.tokenizer

Evaluation may take up to 20-30 minutes to evaluate on GPU on the full test set.

To evaluate on a small subset of the test data, use `.select`

```python
results = task_evaluator.compute(
    model_or_pipeline=my_finetuned_model,
    data=my_dataset["test"].select(range(100)),
    tokenizer=my_tokenizer,
    metric=my_metric,
    device=0,
)
```

In [11]:
results_A = task_evaluator.compute(
    model_or_pipeline=finetuned_model_A,
    data=A.dataset["test"],
    tokenizer=tokenizer_A,
    metric=metric,
    device=0,
)
results_A

2023-12-04 11:25:01 DEBUG    Checking /home/snek/.cache/huggingface/evaluate/downloads/ce3d470a80c053c9717f2e2f5afecd57e33d25206350f19ddc705fff68aabbe8.39fd158e256d7438039bca37be07c68d2db98a59b944148d8fb5bd3d080432bc.py for additional imports.
2023-12-04 11:25:01 DEBUG    Created importable dataset file at /home/snek/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--seqeval/541ae017dc683f85116597d48f621abc7b21b88dc42ec937c71af5415f0af63c/seqeval.py
2023-12-04 11:50:00 INFO     Removing /home/snek/.cache/huggingface/metrics/seqeval/default/default_experiment-1-0.arrow


{'ANIM': {'precision': 0.6672619047619047,
  'recall': 0.6988778054862843,
  'f1': 0.6827040194884287,
  'number': 3208},
 'BIO': {'precision': 0.6666666666666666,
  'recall': 0.75,
  'f1': 0.7058823529411765,
  'number': 16},
 'CEL': {'precision': 0.5081967213114754,
  'recall': 0.7560975609756098,
  'f1': 0.6078431372549019,
  'number': 82},
 'DIS': {'precision': 0.6623235613463626,
  'recall': 0.8036890645586298,
  'f1': 0.726190476190476,
  'number': 1518},
 'EVE': {'precision': 0.8962765957446809,
  'recall': 0.9573863636363636,
  'f1': 0.9258241758241759,
  'number': 704},
 'FOOD': {'precision': 0.6378091872791519,
  'recall': 0.6378091872791519,
  'f1': 0.6378091872791519,
  'number': 1132},
 'INST': {'precision': 0.6428571428571429,
  'recall': 0.75,
  'f1': 0.6923076923076924,
  'number': 24},
 'LOC': {'precision': 0.9641371276418705,
  'recall': 0.9636560212907518,
  'f1': 0.9638965144330754,
  'number': 24048},
 'MEDIA': {'precision': 0.9319148936170213,
  'recall': 0.956331

In [12]:
results_B = task_evaluator.compute(
    model_or_pipeline=finetuned_model_B,
    data=B.dataset["test"],
    tokenizer=tokenizer_B,
    metric=metric,
    device=0,
)
results_B

2023-12-04 11:50:07 DEBUG    Checking /home/snek/.cache/huggingface/evaluate/downloads/ce3d470a80c053c9717f2e2f5afecd57e33d25206350f19ddc705fff68aabbe8.39fd158e256d7438039bca37be07c68d2db98a59b944148d8fb5bd3d080432bc.py for additional imports.
2023-12-04 11:50:07 DEBUG    Created importable dataset file at /home/snek/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--seqeval/541ae017dc683f85116597d48f621abc7b21b88dc42ec937c71af5415f0af63c/seqeval.py
2023-12-04 12:14:48 INFO     Removing /home/snek/.cache/huggingface/metrics/seqeval/default/default_experiment-1-0.arrow


{'ANIM': {'precision': 0.6746031746031746,
  'recall': 0.7948877805486284,
  'f1': 0.7298225529479108,
  'number': 3208},
 'DIS': {'precision': 0.695303550973654,
  'recall': 0.7997364953886693,
  'f1': 0.7438725490196079,
  'number': 1518},
 'LOC': {'precision': 0.9666694372870086,
  'recall': 0.9672322022621423,
  'f1': 0.96695073789233,
  'number': 24048},
 'ORG': {'precision': 0.9547123623011016,
  'recall': 0.942883046237534,
  'f1': 0.9487608332066291,
  'number': 6618},
 'PER': {'precision': 0.9890483383685801,
  'recall': 0.9948717948717949,
  'f1': 0.9919515197424487,
  'number': 10530},
 'overall_precision': 0.9362959157462112,
 'overall_recall': 0.9524846478811898,
 'overall_f1': 0.9443209050281742,
 'overall_accuracy': 0.9913435631438657,
 'total_time_in_seconds': 1472.2920382579998,
 'samples_per_second': 22.351543813913707,
 'latency_in_seconds': 0.044739638940622335}

#### Experiment A evaluation results

In [83]:
print(pd.DataFrame(results_A).iloc[:, :15])

                  ANIM        BIO        CEL          DIS         EVE  \
precision     0.667262   0.666667   0.508197     0.662324    0.896277   
recall        0.698878   0.750000   0.756098     0.803689    0.957386   
f1            0.682704   0.705882   0.607843     0.726190    0.925824   
number     3208.000000  16.000000  82.000000  1518.000000  704.000000   

                  FOOD       INST           LOC       MEDIA       MYTH  \
precision     0.637809   0.642857      0.964137    0.931915   0.638889   
recall        0.637809   0.750000      0.963656    0.956332   0.718750   
f1            0.637809   0.692308      0.963897    0.943966   0.676471   
number     1132.000000  24.000000  24048.000000  916.000000  64.000000   

                   ORG           PER        PLANT        TIME       VEHI  
precision     0.941176      0.990330     0.558043    0.756579   0.735294  
recall        0.962224      0.992023     0.752796    0.795848   0.781250  
f1            0.951584      0.991176  

In [86]:
results_A_overall = {key.split("_")[1]: value for key, value in results_A.items() if key.startswith("overall_")}


print(pd.DataFrame(list(results_A_overall.items()), columns=["metric", "value"]))

      metric     value
0  precision  0.905358
1     recall  0.930318
2         f1  0.917668
3   accuracy  0.986355


#### Experiment B evaluation results

In [91]:
print(pd.DataFrame(results_B).iloc[:, :5])

                  ANIM          DIS           LOC          ORG           PER
precision     0.674603     0.695304      0.966669     0.954712      0.989048
recall        0.794888     0.799736      0.967232     0.942883      0.994872
f1            0.729823     0.743873      0.966951     0.948761      0.991952
number     3208.000000  1518.000000  24048.000000  6618.000000  10530.000000


In [92]:
results_B_overall = {key.split("_")[1]: value for key, value in results_B.items() if key.startswith("overall_")}
print(pd.DataFrame(list(results_A_overall.items()), columns=["metric", "value"]))

      metric     value
0  precision  0.905358
1     recall  0.930318
2         f1  0.917668
3   accuracy  0.986355


The results can be pushed to the huggingface hub using the `evaluate` library

In [93]:
for k, v in results_A_overall.items():
    push_to_hub(model_id=exp_A,
                task_type="token-classification",
                dataset_type="Babelscape/multinerd",
                dataset_name="Babelscape/multinerd",
                metric_type=metric,
                metric_name=k,
                metric_value=float(v),
                dataset_split="test",
                task_name="ner",
                overwrite=True
                )

README.md:   0%|          | 0.00/7.96k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.03k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.10k [00:00<?, ?B/s]

In [94]:
for k, v in results_B_overall.items():
    push_to_hub(model_id=exp_B,
                task_type="token-classification",
                dataset_type="Babelscape/multinerd",
                dataset_name="Babelscape/multinerd with only 5 tags",
                metric_type=metric,
                metric_name=k,
                metric_value=float(v),
                dataset_split="test",
                task_name="ner",
                overwrite=True
                )

README.md:   0%|          | 0.00/4.30k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.55k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.62k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.69k [00:00<?, ?B/s]