# Spacy3 NER - Model Building (Training, Prediction Example)


A working example of building, training, and using a custom spaCy NER model.

In [1]:
from __future__ import unicode_literals, print_function

import random
from pathlib import Path
import spacy
from tqdm import tqdm

import datetime

# Stamp the notebook output with the date of this run (ISO format, YYYY-MM-DD).
print(datetime.date.today().isoformat())

2021-05-04


## Spacy3 Sample Data Model

In [2]:
# Each sample is a (text, annotations) pair. `annotations['entities']` lists
# (start, end, label) triples where text[start:end] is the entity span.
TRAIN_DATA = [
    ('Who is Nishanth?', {'entities': [(7, 15, 'PERSON')]}),
    ('Who is Kamal Khumar?', {'entities': [(7, 19, 'PERSON')]}),
    ('I like London and Berlin.', {'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]}),
]

## Spacy3 Model Building

In [18]:
# Model & training configuration
n_iter = 100   # number of passes over TRAIN_DATA in the training loop
model = None   # name/path handed to spacy.load(); None starts from a blank pipeline

In [4]:
# Build the pipeline: a blank English one when no `model` is configured,
# otherwise whatever `model` names.
if model is None:
    nlp = spacy.blank('en')
    print("Created blank 'en' model")
else:
    nlp = spacy.load(model)
    print("Loaded model '%s'" % model)

Created blank 'en' model


In [17]:
# Obtain the pipeline's NER component, adding one at the end if it is missing.
# `add_pipe` returns the component instance that was actually inserted, so
# `ner` always refers to the object that the pipeline trains and runs.
# (`create_pipe` would build a separate component that is not in the pipeline,
# and labels added to it would never reach the pipeline's NER.)
if 'ner' not in nlp.pipe_names:
    ner = nlp.add_pipe('ner', last=True)
else:
    ner = nlp.get_pipe('ner')

__Note2:__ As of spaCy 3, the format of the inputs to `nlp.update` has changed.

In spaCy 2 we passed raw texts and annotations, e.g. `nlp.update([text], [annotations], ...)`. In spaCy 3, `nlp.update` only accepts a batch of `spacy.training.example.Example` objects, so each `(text, annotations)` pair must be converted first (for example with `Example.from_dict`).

In [7]:
from spacy.training import Example

# Register every entity label found in the training annotations.
# A sample without an 'entities' key simply contributes no labels.
for _, annotations in TRAIN_DATA:
    for ent in annotations.get('entities', []):
        ner.add_label(ent[2])  # ent is (start, end, label)

# Shuffle a copy so the module-level TRAIN_DATA keeps its original order and
# re-running this cell starts from the same data every time.
train_samples = list(TRAIN_DATA)

other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.select_pipes(disable=other_pipes):  # only train NER
    # Initialize the NER model from the training examples (spaCy 3 replacement
    # for the deprecated `begin_training`); this also lets spaCy infer labels.
    # NOTE(review): initialize() (re)initializes component weights, so a
    # pretrained pipeline loaded via `model` would start from scratch here -
    # confirm that is intended before training on top of a loaded model.
    optimizer = nlp.initialize(
        lambda: [
            Example.from_dict(nlp.make_doc(sample_text), sample_annotations)
            for sample_text, sample_annotations in train_samples
        ]
    )
    for itn in range(n_iter):
        random.shuffle(train_samples)
        losses = {}  # accumulated by nlp.update across this pass
        for text, annotations in tqdm(train_samples):
            # spaCy 3 expects Example objects rather than raw (text, annotations).
            example = Example.from_dict(nlp.make_doc(text), annotations)
            nlp.update(
                [example],
                drop=0.5,  # dropout rate
                sgd=optimizer,
                losses=losses)
        print(losses)

100%|██████████| 3/3 [00:00<00:00, 27.35it/s]
100%|██████████| 3/3 [00:00<00:00, 34.67it/s]
100%|██████████| 3/3 [00:00<00:00, 33.68it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 12.980319321155548}
{'ner': 12.311058759689331}
{'ner': 11.008314788341522}


100%|██████████| 3/3 [00:00<00:00, 27.52it/s]
100%|██████████| 3/3 [00:00<00:00, 20.97it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 9.964548081159592}
{'ner': 8.560982674360275}


100%|██████████| 3/3 [00:00<00:00, 21.73it/s]
100%|██████████| 3/3 [00:00<00:00, 24.90it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 7.494104325771332}
{'ner': 7.147747904062271}


100%|██████████| 3/3 [00:00<00:00, 26.93it/s]
100%|██████████| 3/3 [00:00<00:00, 22.05it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 6.809496946632862}
{'ner': 6.252179758623242}


100%|██████████| 3/3 [00:00<00:00, 17.74it/s]
100%|██████████| 3/3 [00:00<00:00, 22.43it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 5.5997209921479225}
{'ner': 5.3316211918136105}


100%|██████████| 3/3 [00:00<00:00, 23.73it/s]
100%|██████████| 3/3 [00:00<00:00, 26.63it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 4.666502970037982}
{'ner': 5.111178790699341}


100%|██████████| 3/3 [00:00<00:00, 28.33it/s]
100%|██████████| 3/3 [00:00<00:00, 32.09it/s]
100%|██████████| 3/3 [00:00<00:00, 37.04it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 4.643602845200803}
{'ner': 6.334257873240858}
{'ner': 4.517644856576226}


100%|██████████| 3/3 [00:00<00:00, 26.21it/s]
100%|██████████| 3/3 [00:00<00:00, 30.86it/s]
100%|██████████| 3/3 [00:00<00:00, 34.68it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 4.356501877948176}
{'ner': 4.984810533127529}
{'ner': 6.241194950256613}


100%|██████████| 3/3 [00:00<00:00, 29.19it/s]
100%|██████████| 3/3 [00:00<00:00, 30.37it/s]
100%|██████████| 3/3 [00:00<00:00, 35.23it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 6.801824269219651}
{'ner': 8.814300565165468}
{'ner': 5.863554886549537}


100%|██████████| 3/3 [00:00<00:00, 27.84it/s]
100%|██████████| 3/3 [00:00<00:00, 23.03it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 4.626041115086991}
{'ner': 3.1144582917622756}


100%|██████████| 3/3 [00:00<00:00, 25.81it/s]
100%|██████████| 3/3 [00:00<00:00, 29.23it/s]
100%|██████████| 3/3 [00:00<00:00, 40.23it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 2.559470534979482}
{'ner': 2.7828347058261897}
{'ner': 2.284569873926557}


100%|██████████| 3/3 [00:00<00:00, 31.69it/s]
100%|██████████| 3/3 [00:00<00:00, 24.49it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 2.189214519490804}
{'ner': 2.269616372891772}


100%|██████████| 3/3 [00:00<00:00, 17.75it/s]
100%|██████████| 3/3 [00:00<00:00, 32.32it/s]
100%|██████████| 3/3 [00:00<00:00, 31.41it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 1.8250188302519064}
{'ner': 2.8328332858671956}
{'ner': 1.5807744569509055}


100%|██████████| 3/3 [00:00<00:00, 26.65it/s]
100%|██████████| 3/3 [00:00<00:00, 29.55it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 2.93997597710564}
{'ner': 2.2311021183966275}


100%|██████████| 3/3 [00:00<00:00, 23.65it/s]
100%|██████████| 3/3 [00:00<00:00, 17.69it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 0.715067291781424}
{'ner': 2.4039388072904444}


100%|██████████| 3/3 [00:00<00:00, 34.02it/s]
100%|██████████| 3/3 [00:00<00:00, 37.84it/s]
100%|██████████| 3/3 [00:00<00:00, 26.75it/s]


{'ner': 0.624455678371771}
{'ner': 0.5759262654274656}
{'ner': 0.5674940006518892}


100%|██████████| 3/3 [00:00<00:00, 25.39it/s]
100%|██████████| 3/3 [00:00<00:00, 41.45it/s]
100%|██████████| 3/3 [00:00<00:00, 39.08it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 0.36080906980659555}
{'ner': 0.16868381214995257}
{'ner': 1.2240315327007816}


100%|██████████| 3/3 [00:00<00:00, 28.51it/s]
100%|██████████| 3/3 [00:00<00:00, 36.31it/s]
100%|██████████| 3/3 [00:00<00:00, 27.71it/s]

{'ner': 0.011906150036428295}
{'ner': 0.05341847198153671}
{'ner': 0.00505619317753235}


100%|██████████| 3/3 [00:00<00:00, 36.79it/s]
100%|██████████| 3/3 [00:00<00:00, 44.42it/s]
100%|██████████| 3/3 [00:00<00:00, 42.36it/s]
  0%|          | 0/3 [00:00<?, ?it/s]


{'ner': 0.0008243863311860845}
{'ner': 0.02013278041547363}
{'ner': 0.00013035071366598828}


100%|██████████| 3/3 [00:00<00:00, 35.14it/s]
100%|██████████| 3/3 [00:00<00:00, 31.44it/s]
100%|██████████| 3/3 [00:00<00:00, 37.87it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 0.0014708780336125812}
{'ner': 0.0003686893623645816}
{'ner': 0.1817735408101359}


100%|██████████| 3/3 [00:00<00:00, 31.81it/s]
100%|██████████| 3/3 [00:00<00:00, 40.15it/s]
100%|██████████| 3/3 [00:00<00:00, 39.54it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 0.017029728022777}
{'ner': 6.746001942516204e-06}
{'ner': 0.00014853442456995525}


100%|██████████| 3/3 [00:00<00:00, 30.68it/s]
100%|██████████| 3/3 [00:00<00:00, 40.55it/s]
100%|██████████| 3/3 [00:00<00:00, 32.06it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 0.007244845739106269}
{'ner': 0.0027772387732367815}
{'ner': 0.02159830658570109}


100%|██████████| 3/3 [00:00<00:00, 33.82it/s]
100%|██████████| 3/3 [00:00<00:00, 35.45it/s]
100%|██████████| 3/3 [00:00<00:00, 38.24it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 8.378032120811538e-06}
{'ner': 0.004593457567891715}
{'ner': 0.0001913059083154695}


100%|██████████| 3/3 [00:00<00:00, 35.83it/s]
100%|██████████| 3/3 [00:00<00:00, 23.52it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 0.0018499321844585837}
{'ner': 0.013717168741703638}


100%|██████████| 3/3 [00:00<00:00, 17.39it/s]
100%|██████████| 3/3 [00:00<00:00, 35.17it/s]
100%|██████████| 3/3 [00:00<00:00, 34.45it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 1.064371856127766e-06}
{'ner': 2.246477471741537e-08}
{'ner': 1.5878341230069328e-06}


100%|██████████| 3/3 [00:00<00:00, 36.21it/s]
100%|██████████| 3/3 [00:00<00:00, 30.43it/s]
100%|██████████| 3/3 [00:00<00:00, 36.44it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 0.01741482633795052}
{'ner': 1.4317723344501601e-05}
{'ner': 4.1163054968047617e-07}


100%|██████████| 3/3 [00:00<00:00, 30.94it/s]
100%|██████████| 3/3 [00:00<00:00, 33.84it/s]
100%|██████████| 3/3 [00:00<00:00, 34.35it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 0.00010805964748961546}
{'ner': 3.890004747748672e-08}
{'ner': 0.00011721707497961082}


100%|██████████| 3/3 [00:00<00:00, 31.09it/s]
100%|██████████| 3/3 [00:00<00:00, 33.36it/s]
100%|██████████| 3/3 [00:00<00:00, 40.91it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 0.00030704612121467323}
{'ner': 0.012633368691881175}
{'ner': 1.5329991876744552e-07}


100%|██████████| 3/3 [00:00<00:00, 27.73it/s]
100%|██████████| 3/3 [00:00<00:00, 42.18it/s]
100%|██████████| 3/3 [00:00<00:00, 39.14it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 3.8254045383562295e-05}
{'ner': 1.5513627853737408e-07}
{'ner': 7.684096087987159e-05}


100%|██████████| 3/3 [00:00<00:00, 31.28it/s]
100%|██████████| 3/3 [00:00<00:00, 34.26it/s]
100%|██████████| 3/3 [00:00<00:00, 40.13it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 0.00048543954100789283}
{'ner': 3.3275541617166604e-08}
{'ner': 0.00040133387768896776}


100%|██████████| 3/3 [00:00<00:00, 28.33it/s]
100%|██████████| 3/3 [00:00<00:00, 25.93it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 1.5138306144453272e-07}
{'ner': 2.6300833138873905e-08}


100%|██████████| 3/3 [00:00<00:00, 29.10it/s]
100%|██████████| 3/3 [00:00<00:00, 29.42it/s]
100%|██████████| 3/3 [00:00<00:00, 35.57it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 3.5794416048165813e-07}
{'ner': 8.581952126705328e-05}
{'ner': 6.639954419894849e-09}


100%|██████████| 3/3 [00:00<00:00, 29.57it/s]
100%|██████████| 3/3 [00:00<00:00, 34.21it/s]
100%|██████████| 3/3 [00:00<00:00, 40.87it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 1.1909877976503998e-06}
{'ner': 1.586881272015997e-05}
{'ner': 7.920333079243761e-08}


100%|██████████| 3/3 [00:00<00:00, 32.73it/s]
100%|██████████| 3/3 [00:00<00:00, 29.96it/s]
100%|██████████| 3/3 [00:00<00:00, 35.21it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 1.0171083705036309e-07}
{'ner': 2.9472733673791743e-08}
{'ner': 2.049207301247394e-06}


100%|██████████| 3/3 [00:00<00:00, 39.17it/s]
100%|██████████| 3/3 [00:00<00:00, 35.71it/s]
100%|██████████| 3/3 [00:00<00:00, 37.99it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 0.0005363296435380145}
{'ner': 1.607732755738595e-06}
{'ner': 4.2108962484672143e-07}


100%|██████████| 3/3 [00:00<00:00, 35.84it/s]
100%|██████████| 3/3 [00:00<00:00, 38.27it/s]
100%|██████████| 3/3 [00:00<00:00, 34.25it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 6.36253916429815e-08}
{'ner': 7.755486084172884e-08}
{'ner': 1.3739288210257049e-08}


100%|██████████| 3/3 [00:00<00:00, 28.19it/s]
100%|██████████| 3/3 [00:00<00:00, 32.92it/s]
100%|██████████| 3/3 [00:00<00:00, 34.32it/s]

{'ner': 7.086135590781128e-10}
{'ner': 2.515308585839245e-08}
{'ner': 3.6175018446075242e-06}





## Spacy3 Prediction

In [8]:
# Run the trained pipeline over each training sentence and show what it tags.
for text, _ in TRAIN_DATA:
    doc = nlp(text)
    labelled_spans = [(span.text, span.label_) for span in doc.ents]
    print('Entities', labelled_spans)

Entities [('Nishanth', 'PERSON')]
Entities [('Kamal Khumar', 'PERSON')]
Entities [('London', 'LOC'), ('Berlin', 'LOC')]


## Spacy3 Save Model

In [10]:
# Write the trained pipeline to disk under this relative path.
output_dir = Path('sample_ner_traning_model')
nlp.to_disk(output_dir)

## Spacy3 Load Model

In [16]:
# Reload the pipeline saved to disk and check it on a known sentence.
# NOTE: this rebinds `model` (previously the None/str configuration value
# passed to spacy.load) to a loaded pipeline object; re-running the
# pipeline-creation cell afterwards would hand that object to spacy.load.
model = spacy.load('sample_ner_traning_model')

# Use an explicit input instead of the `text` variable left over from an
# earlier loop, whose value depends on the last shuffle order.
sample_text = 'I like London and Berlin.'
print('Entities', [(ent.text, ent.label_) for ent in model(sample_text).ents])

Entities [('London', 'LOC'), ('Berlin', 'LOC')]
