# Fine-tune an [`EntityRecognizer`](https://spacy.io/api/entityrecognizer)
Goal: achieve better performance than the off-the-shelf pre-trained models.

In [1]:
import json
from pathlib import Path

import spacy

## Data loading

In [2]:
import json
def load_dataset(path_to_json: str) -> dict[str, tuple[str, list[tuple[int, int, str]]]]:
    """Read the annotated corpus from a UTF-8 encoded JSON file.

    Args:
        path_to_json: Path of the JSON file to read.

    Returns:
        The decoded JSON content. Per the annotation, a mapping from document ID
        to a ``(text, entities)`` pair where each entity is
        ``(start_char, end_char, label)``. Note that the JSON decoder yields
        lists, not tuples, for these nested sequences.
    """
    with open(path_to_json, mode="r", encoding="utf8") as json_file:
        raw_content = json_file.read()
    return json.loads(raw_content)

# Load the whole corpus once; later cells look documents up in `dataset` by ID.
# The path is relative to the notebook's working directory.
dataset = load_dataset("../dataset/French_ELTEC_NER_Open_Dataset.json")
print(f"Loaded a dataset with {len(dataset)} documents.")

Loaded a dataset with 100 documents.


## Train / val separation

We will use 70 documents to train, and 30 to validate (measure) the effective performance.

In [3]:
# All objects are lists with aligned content,
# i.e. train_ids[i] is the document ID of the i-th training element,
# train_texts[i] contains the text of that document,
# and train_entities[i] contains the target entities of that document.
# test_* variants are structured the same way.
train_ids, train_texts, train_entities = [], [], []
test_ids, test_texts, test_entities = [], [], []

# Percentage of the documents assigned to the training set; the rest goes to test.
train_percent = 70
n_documents = len(dataset)

# The loop variable is named `doc_id` (not `id`) so that the builtin `id` is not
# overwritten in the notebook's global namespace.
for count, (doc_id, (text, entities)) in enumerate(dataset.items()):
    # dicts preserve insertion order, so the split is deterministic and follows
    # the order of the documents in the JSON file (no RNG involved).
    # NOTE(review): if the file is sorted (e.g. by author or date) the split is
    # not a random sample; consider shuffling with a fixed seed. TODO confirm.
    # Integer cross-multiplication avoids any float rounding at the threshold.
    if 100 * count < train_percent * n_documents:
        dst_ids, dst_texts, dst_entities = train_ids, train_texts, train_entities
    else:
        dst_ids, dst_texts, dst_entities = test_ids, test_texts, test_entities
    dst_ids.append(doc_id)
    dst_texts.append(text)
    dst_entities.append(entities)
print(f"Produced a training set with {len(train_ids)} elements, and a test set with {len(test_ids)} elements.")

Produced a training set with 70 elements, and a test set with 30 elements.


## Evaluate the performance of a pre-trained network

We can compare the performance of two pre-trained networks and a fine-tuned one:
- `fr_core_news_sm`: small 
- `fr_core_news_lg`: large
- `fr_core_news_lg+finetune`: large + fine-tuning (performed by us)

In [11]:
from spacy.scorer import Scorer
from spacy.training.example import Example

def evaluate(ner_model, dataset_dict, debug=False):
    """Score the entities predicted by a pipeline against the target annotations.

    Args:
        ner_model: Callable applied to each raw text; it must return a spaCy
            ``Doc`` whose ``ents`` hold the predicted entities (e.g. a pipeline
            returned by ``spacy.load``).
        dataset_dict: Mapping from document ID to a ``(text, target_entities)``
            pair, where each target entity is indexable as
            ``(start_char, end_char, label)``.
        debug: When true, print the predicted and target entities of every
            document before scoring.

    Returns:
        The dictionary produced by ``Scorer.score_spans(examples, "ents")``.

    Raises:
        ValueError: If ``Example.from_dict`` rejects the annotations of a
            document. The message is prefixed with the offending document ID
            and the original exception is chained as the cause.
    """
    examples = []
    for doc_id, (text, target_entities) in dataset_dict.items():
        pred_doc = ner_model(text)
        if debug:
            print(
                "Pred.:",
                [(ent.text, ent.label_) for ent in pred_doc.ents],
                " ↔ Targ.:",
                [(text[e[0]:e[1]], e[2]) for e in target_entities],
            )
        try:
            # The predicted doc carries the predictions; the reference side of
            # the example is built from the target character offsets.
            examples.append(Example.from_dict(pred_doc, {"entities": target_entities}))
        except ValueError as err:
            err_msg = f"Error parsing document '{doc_id}': "
            err_msg += getattr(err, "msg", str(err))
            print(err_msg)
            # Chain explicitly so the original traceback stays attached.
            raise ValueError(err_msg) from err

    # `score_spans` is a static method: no Scorer instance is needed.
    return Scorer.score_spans(examples, "ents")

In [12]:
# Rebuild an {id: (text, entities)} mapping restricted to the held-out documents,
# which is the input shape expected by `evaluate`.
test_dict = {doc_id: dataset[doc_id] for doc_id in test_ids}
len(test_dict)

30

In [13]:
# Baseline 1: small pre-trained French pipeline, scored on the held-out documents.
# NOTE(review): `spacy` must already be imported when this cell runs, and the
# `fr_core_news_sm` package must be installed in the kernel's environment.
nlp_model = spacy.load("fr_core_news_sm")
evaluate(nlp_model, test_dict)

Dans la grande salle des fêtes de 1' « ..." with entities "[[1003, 1014, 'PER'], [1246, 1252, 'PER'], [1254, ...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.


{'ents_p': 0.4423305588585018,
 'ents_r': 0.6876155268022182,
 'ents_f': 0.5383502170767005,
 'ents_per_type': {'LOC': {'p': 0.4674922600619195,
   'r': 0.7259615384615384,
   'f': 0.568738229755179},
  'ORG': {'p': 0.0, 'r': 0.0, 'f': 0.0},
  'MISC': {'p': 0.0, 'r': 0.0, 'f': 0.0},
  'PER': {'p': 0.5846560846560847,
   'r': 0.6636636636636637,
   'f': 0.6216596343178622}}}

In [16]:
# Baseline 2: large pre-trained French pipeline.
# Download it only when it is missing: the wheel is several hundred MB, so an
# unconditional download makes every re-run of the notebook needlessly slow.
# `spacy.cli.download` installs into the environment of the running kernel,
# whereas a bare `!python` may resolve to a different interpreter.
if not spacy.util.is_package("fr_core_news_lg"):
    spacy.cli.download("fr_core_news_lg")
nlp_model = spacy.load("fr_core_news_lg")
evaluate(nlp_model, test_dict)

Collecting fr-core-news-lg==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_lg-3.7.0/fr_core_news_lg-3.7.0-py3-none-any.whl (571.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m571.8/571.8 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Installing collected packages: fr-core-news-lg
Successfully installed fr-core-news-lg-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_lg')


Dans la grande salle des fêtes de 1' « ..." with entities "[[1003, 1014, 'PER'], [1246, 1252, 'PER'], [1254, ...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.


{'ents_p': 0.48466257668711654,
 'ents_r': 0.7301293900184843,
 'ents_f': 0.5825958702064897,
 'ents_per_type': {'LOC': {'p': 0.6037735849056604,
   'r': 0.7692307692307693,
   'f': 0.6765327695560254},
  'ORG': {'p': 0.0, 'r': 0.0, 'f': 0.0},
  'MISC': {'p': 0.0, 'r': 0.0, 'f': 0.0},
  'PER': {'p': 0.649171270718232,
   'r': 0.7057057057057057,
   'f': 0.6762589928057553}}}

Here we can see that the large model performs better than the small one (overall F-score of about 0.58 vs. 0.54 on the test documents).

## Convert training data to Spacy format

In [None]:
import spacy
from spacy.tokens import DocBin

def prepare_save_dataset(ids, texts, entities, dst_path):
    """Convert aligned annotations to spaCy docs and save them as a ``DocBin``.

    Each text is tokenized with a blank French pipeline and every annotation is
    mapped to a token span. When the character offsets do not fall on token
    boundaries, the span is first contracted, then expanded, to the nearest
    boundaries; if neither works the entity is skipped with a message.

    Args:
        ids: Document IDs, used only in the diagnostic messages.
        texts: Raw texts, aligned with ``ids``.
        entities: For each document, an iterable of
            ``(start_char, end_char, label)`` annotations, aligned with ``ids``.
        dst_path: Destination file of the serialized ``DocBin``. Missing parent
            directories are created.
    """
    nlp = spacy.blank("fr")
    # the DocBin will store the example documents
    db = DocBin()
    for doc_id, text, annotations in zip(ids, texts, entities):
        doc = nlp(text)
        ents = []
        for start, end, label in annotations:
            span = doc.char_span(start, end, label=label)
            if span is None:
                print(f"Warning: in document '{doc_id}', could not align entity '{text[start:end]}' to computed tokens. ")
                # Fall back to snapping the offsets to token boundaries,
                # trying the more conservative mode first.
                for mode, action in (("contract", "contracting"), ("expand", "expanding")):
                    span = doc.char_span(start, end, label=label, alignment_mode=mode)
                    print(f"\t Try to fix by {action} to span '{span}'.")
                    if span is not None:
                        break
                else:
                    print("\tCannot recover: skipping.")
                    continue
            ents.append(span)
        # NOTE(review): assigning overlapping spans to `doc.ents` is expected to
        # raise; an expanded span could overlap a neighbour. TODO confirm whether
        # the data can trigger this.
        doc.ents = ents
        db.add(doc)
    # Make sure the destination directory exists before serializing.
    Path(dst_path).parent.mkdir(parents=True, exist_ok=True)
    db.to_disk(dst_path)

In [None]:
# Serialize both splits to spaCy's binary format.
# Alignment warnings printed below come from `prepare_save_dataset`.
print("Prepare training set…")
prepare_save_dataset(train_ids, train_texts, train_entities, "../tmp/train.spacy")
print("Prepare test set…")
prepare_save_dataset(test_ids, test_texts, test_entities, "../tmp/test.spacy")

Prepare training set…
	 Try to fix by contracting to span 'roi Philippe'.
Prepare test set…
	 Try to fix by contracting to span 'None'.
	 Try to fix by expanding to span 'Bordeaux-'.


In [14]:
from spacy import displacy
def display_doc(id, ids, text, entities):
    """Render the annotated entities of one document with displaCy in the notebook.

    Args:
        id: ID of the document to display; it must be present in ``ids``.
        ids: Sequence of document IDs (must support ``.index``), aligned with
            ``text`` and ``entities``.
        text: Sequence of document texts, one per ID (despite the singular name).
        entities: Sequence holding, for each document, its entities; each entity
            is indexable as ``(start_char, end_char, label)``.

    Raises:
        ValueError: If ``id`` is not found in ``ids``.
    """
    position = ids.index(id)
    # displaCy's "manual" mode takes pre-computed entity offsets instead of a Doc.
    manual_content = {
        "text": text[position],
        "ents": [
            {"start": ent[0], "end": ent[1], "label": ent[2]}
            for ent in entities[position]
        ],
        "title": id,
    }
    displacy.render(manual_content, manual=True, style="ent", jupyter=True)

In [18]:
# Example usage (uncomment to inspect the target annotations of one test document):
# display_doc("FRA01002_DelarueMardrus", test_ids, test_texts, test_entities)

## Prepare training scripts and launch the training

TODO use <https://spacy.io/usage/training#quickstart>