This notebook was run in Google Colab because training the model requires a GPU.

In [1]:
!pip install spacy tqdm scikit-learn
!python -m spacy download en_core_web_sm 

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import json
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# Load the annotated examples. The entity offsets are character offsets into
# each text, so decode the file explicitly as UTF-8 (the JSON default) instead
# of relying on the platform's default encoding.
# Presumably a list of (text, {'entities': [(start, end, label), ...]}) pairs,
# which is how create_training_data unpacks it -- verify against the file.
with open('ner_training_data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Blank English pipeline; create_training_data only uses it (via
# nlp.make_doc) to turn raw text into Doc objects.
nlp = spacy.blank('en')

def create_training_data(data):
    """Pack annotated examples into a spaCy ``DocBin``.

    Args:
        data: iterable of ``(text, annot)`` pairs, where ``annot['entities']``
            holds ``(start, end, label)`` triples whose offsets are passed to
            ``doc.char_span`` as character positions in ``text``.

    Returns:
        A ``DocBin`` holding one ``Doc`` per example whose entities could be
        assigned. Examples for which assigning ``doc.ents`` or adding to the
        bin raises ``ValueError`` are reported with ``print`` and left out.

    Entities are visited in order of start offset (the longer one first when
    starts are equal). An entity is dropped when any of its character
    positions is already covered by an accepted entity, or when
    ``doc.char_span`` (called with ``alignment_mode='strict'``) returns
    ``None``. Documents are created with the module-level ``nlp`` pipeline.
    """
    doc_bin = DocBin()
    for text, annot in tqdm(data):
        doc = nlp.make_doc(text)
        accepted_spans = []
        claimed_chars = set()

        # Earliest start first; on a tie the longest entity wins the slot.
        ordered = sorted(annot['entities'], key=lambda ent: (ent[0], ent[0] - ent[1]))

        for start, end, label in ordered:
            char_positions = range(start, end)
            # Drop anything that shares a character with an accepted entity.
            if not claimed_chars.isdisjoint(char_positions):
                continue

            span = doc.char_span(start, end, label=label, alignment_mode='strict')
            if span is None:
                continue

            accepted_spans.append(span)
            claimed_chars.update(char_positions)

        try:
            doc.ents = accepted_spans
            doc_bin.add(doc)
        except ValueError as e:
            print(f"Skipping problematic example: {text}")
            print(f"Error: {e}")

    return doc_bin

# Hold out 10% of the examples; the fixed random_state makes the split
# repeatable across runs.
train_data, test_data = train_test_split(data, random_state=42, test_size=0.1)

# Convert both splits (train first, then test) before anything is written.
train_db, test_db = map(create_training_data, (train_data, test_data))

# Serialise the two DocBins to the paths the training cell reads.
train_db.to_disk("./train.spacy")
test_db.to_disk("./test.spacy")

In [4]:
# Run spaCy's `debug config` command against config.cfg.
# NOTE(review): the recorded output of this cell is a (truncated) traceback,
# and its execution count (4) is higher than that of the fill-config cell
# below (3), so the cells are not stored in the order they were run.
!python -m spacy debug config config.cfg

[1m
[1m
[1m
Traceback (most recent call last):
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.10/dist-packages/spacy/__main__.py", line 4, in <module>
    setup_cli()
  File "/usr/local/lib/python3.10/dist-packages/spacy/cli/_util.py", line 87, in setup_cli
    command(prog_name=COMMAND)
  File "/usr/local/lib/python3.10/dist-packages/click/core.py", line 1157, in __call__
    return self.main(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/typer/core.py", line 723, in main
    return _main(
  File "/usr/local/lib/python3.10/dist-packages/typer/core.py", line 193, in _main
    rv = self.invoke(ctx)
  File "/usr/local/lib/python3.10/dist-packages/click/core.py", line 1688, in invoke
    return _process_result(sub_ctx.command.invoke(sub_ctx))
  File "/usr/local/lib/python3.10/dis

In [3]:
# Auto-fill config.cfg; the input and output paths are the same file, so the
# config is rewritten in place.
!python -m spacy init fill-config config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [5]:
# Train the pipeline described by config.cfg on GPU 0, writing results to
# ./output. train.spacy is the training set and test.spacy is passed as the
# dev set.
# NOTE(review): test.spacy is also the file given to the later
# `benchmark accuracy` cell, so that score is measured on data already used
# for evaluation during training -- consider a separate dev split.
!python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./test.spacy --gpu-id 0

[38;5;2m✔ Created output directory: output[0m
[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00    130.85    0.00    0.00    0.00    0.00
  0     200        403.91   4665.73    5.70   16.88    3.43    0.06
  0     400        601.05   3663.32    0.00    0.00    0.00    0.00
  0     600        850.69   3060.25    0.23   37.50    0.11    0.00
  0     800       5402.67   5325.35   15.38   24.98   11.12    0.15
  0    1000        276.99   2732.89   18.97   18.39   19.59    0.19
  0    1200       7464.71   4235.20   10.21   25.68    6.37    0.10
  0    1400       1667.15   2509.60    4.15   40.28    2.19    0.04
  0    1600       1163.99   3019.94    6.75   39.04    3.69    

In [9]:
from spacy import displacy

# Load the pipeline saved under ./output/model-last. This rebinds `nlp`,
# which earlier in the notebook referred to the blank English pipeline.
nlp = spacy.load("./output/model-last")

# Sample input: a news headline followed by text from a Cloudflare block page.
text = "CVS wins bidding war for Signify Health, will acquire company in $8B deal This website is using a security service to protect itself from online attacks. The action you just performed triggered the security solution. There are several actions that could trigger this block including submitting a certain word or phrase, a SQL command or malformed data. You can email the site owner to let them know you were blocked. Please include what you were doing when this page came up and the Cloudflare Ray ID found at the bottom of this page. Cloudflare Ray ID: 8a20e4435d706a0d      Your IP:      Click to reveal34.106.165.23Performance & security by Cloudflare"

doc = nlp(text)

# List every predicted entity, or say so when the model found none.
if doc.ents:
    for ent in doc.ents:
        print(f"Entity: {ent.text}, Label: {ent.label_}")
else:
    print("No entities found.")

# Render the same predictions inline as highlighted text.
displacy.render(doc, style="ent", jupyter=True)


Entity: CVS, Label: ACQUIRER


In [18]:
# Score ./output/model-last against test.spacy and write the metrics to
# results.json. The recorded output shows this run used the CPU.
!python -m spacy benchmark accuracy ./output/model-last test.spacy --output results.json --gold-preproc --displacy-limit 10 --per-component --spans-key spans

[38;5;4mℹ Using CPU[0m
[38;5;4mℹ Per-component scores will be saved to output JSON file.[0m
[38;5;2m✔ Saved results to results.json[0m


In [11]:
import shutil
from google.colab import files

# Directory to archive: the folder the training cell writes to
# (`spacy train ... --output ./output`). It was previously never defined in
# the notebook, so this cell raised NameError on a fresh kernel.
folder_to_download = './output'

# Zip the folder's contents into output.zip in the working directory.
shutil.make_archive('output', 'zip', folder_to_download)

# Trigger a browser download of the archive from the Colab runtime.
files.download('output.zip')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>