In [1]:
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path
import spacy
from tqdm import tqdm

In [2]:
nlp1 = spacy.load('en')

In [3]:
doc1 = nlp1("Who is Samuel")

In [4]:
for token in doc1.ents:
  print(token.text, token.label_)


Samuel PERSON


In [5]:
#training Data
TRAIN_DATA = [
    ('Who is Kofi Annan?', {
        'entities': [(8, 18, 'PERSON')]
    }),
     ('Who is Steve Jobs?', {
        'entities': [(7, 17, 'PERSON')]
    }),
    ('I like London and Berlin.', {
        'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]
    })
]

In [6]:
@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    output_dir=("/content/drive/My Drive", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int))

SyntaxError: ignored

In [7]:
model = None
output_dir=Path("/content/drive/My Drive")
n_iter = 100

In [8]:
if model is not None :
  nlp = spacy.load(model)
else :
  nlp = spacy.blank('en')


In [9]:
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner, last=True)
# otherwise, get it so we can add labels
else:
    ner = nlp.get_pipe('ner')

In [10]:
for _, annotations in TRAIN_DATA:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])

    # get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):  # only train NER
    optimizer = nlp.begin_training()
    for itn in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in tqdm(TRAIN_DATA):
            nlp.update(
                [text],  # batch of texts
                [annotations],  # batch of annotations
                drop=0.5,  # dropout - make it harder to memorise data
                sgd=optimizer,  # callable to update weights
                losses=losses)
        print(losses)

100%|██████████| 3/3 [00:00<00:00, 26.70it/s]
100%|██████████| 3/3 [00:00<00:00, 34.24it/s]
100%|██████████| 3/3 [00:00<00:00, 36.45it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 11.91785515844822}
{'ner': 10.945472121238708}
{'ner': 9.540121402591467}


100%|██████████| 3/3 [00:00<00:00, 35.02it/s]
100%|██████████| 3/3 [00:00<00:00, 32.32it/s]
100%|██████████| 3/3 [00:00<00:00, 31.61it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 8.591119404882193}
{'ner': 7.274728715419769}
{'ner': 6.842073037289083}


100%|██████████| 3/3 [00:00<00:00, 35.24it/s]
100%|██████████| 3/3 [00:00<00:00, 35.06it/s]
100%|██████████| 3/3 [00:00<00:00, 36.63it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 5.768677848856896}
{'ner': 4.1798099535517395}
{'ner': 5.422159171663225}


100%|██████████| 3/3 [00:00<00:00, 34.89it/s]
100%|██████████| 3/3 [00:00<00:00, 36.74it/s]
100%|██████████| 3/3 [00:00<00:00, 36.28it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 5.115231205333657}
{'ner': 5.412354227803007}
{'ner': 4.263489391581203}


100%|██████████| 3/3 [00:00<00:00, 33.96it/s]
100%|██████████| 3/3 [00:00<00:00, 35.74it/s]
100%|██████████| 3/3 [00:00<00:00, 37.42it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 5.296270006863779}
{'ner': 4.383512935248518}
{'ner': 4.656680647278117}


100%|██████████| 3/3 [00:00<00:00, 34.66it/s]
100%|██████████| 3/3 [00:00<00:00, 33.70it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 3.437699576291557}
{'ner': 3.6810257303964136}


100%|██████████| 3/3 [00:00<00:00, 26.27it/s]
100%|██████████| 3/3 [00:00<00:00, 26.37it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 4.653713840447338}
{'ner': 4.341912288131222}


100%|██████████| 3/3 [00:00<00:00, 25.30it/s]
100%|██████████| 3/3 [00:00<00:00, 22.21it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 3.8852722499052486}
{'ner': 3.2027875216072204}


100%|██████████| 3/3 [00:00<00:00, 23.49it/s]
100%|██████████| 3/3 [00:00<00:00, 25.40it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 2.5022863067538523}
{'ner': 1.904804986182894}


100%|██████████| 3/3 [00:00<00:00, 27.43it/s]
100%|██████████| 3/3 [00:00<00:00, 38.21it/s]
100%|██████████| 3/3 [00:00<00:00, 35.16it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 3.3384506850664284}
{'ner': 1.4784373181341175}
{'ner': 2.5021034076606967}


100%|██████████| 3/3 [00:00<00:00, 35.22it/s]
100%|██████████| 3/3 [00:00<00:00, 34.43it/s]
100%|██████████| 3/3 [00:00<00:00, 36.47it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 1.6574372986007146}
{'ner': 1.4766399072842502}
{'ner': 1.3233451354873424}


100%|██████████| 3/3 [00:00<00:00, 35.80it/s]
100%|██████████| 3/3 [00:00<00:00, 34.87it/s]
100%|██████████| 3/3 [00:00<00:00, 36.49it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 0.4978394468094808}
{'ner': 0.49220223649555106}
{'ner': 0.9113647922868633}


100%|██████████| 3/3 [00:00<00:00, 34.08it/s]
100%|██████████| 3/3 [00:00<00:00, 37.99it/s]
100%|██████████| 3/3 [00:00<00:00, 36.85it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 0.010471738712380638}
{'ner': 0.1567010276169626}
{'ner': 2.205806021147323}


100%|██████████| 3/3 [00:00<00:00, 35.60it/s]
100%|██████████| 3/3 [00:00<00:00, 37.03it/s]
100%|██████████| 3/3 [00:00<00:00, 35.74it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 0.010466038637463898}
{'ner': 0.0014397645172559358}
{'ner': 0.5141761592247136}


100%|██████████| 3/3 [00:00<00:00, 31.69it/s]
100%|██████████| 3/3 [00:00<00:00, 36.99it/s]
100%|██████████| 3/3 [00:00<00:00, 36.72it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 1.4983074708340614}
{'ner': 0.7267384934724365}
{'ner': 0.4091935022800164}


100%|██████████| 3/3 [00:00<00:00, 35.05it/s]
100%|██████████| 3/3 [00:00<00:00, 34.66it/s]
100%|██████████| 3/3 [00:00<00:00, 37.63it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 0.008978294175306736}
{'ner': 0.24507012081403912}
{'ner': 0.05205747801863969}


100%|██████████| 3/3 [00:00<00:00, 33.07it/s]
100%|██████████| 3/3 [00:00<00:00, 37.08it/s]
100%|██████████| 3/3 [00:00<00:00, 29.91it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 0.13397564462107803}
{'ner': 2.0832080107815765}
{'ner': 0.00025078954224590276}


100%|██████████| 3/3 [00:00<00:00, 27.39it/s]
100%|██████████| 3/3 [00:00<00:00, 27.83it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 4.544756027607773e-05}
{'ner': 0.006985236687565013}


100%|██████████| 3/3 [00:00<00:00, 25.37it/s]
100%|██████████| 3/3 [00:00<00:00, 28.67it/s]
100%|██████████| 3/3 [00:00<00:00, 36.86it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 0.0027227264286237302}
{'ner': 1.253766632646218}
{'ner': 0.02401618897528379}


100%|██████████| 3/3 [00:00<00:00, 35.67it/s]
100%|██████████| 3/3 [00:00<00:00, 36.70it/s]
100%|██████████| 3/3 [00:00<00:00, 34.34it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 6.851898208061698e-07}
{'ner': 0.058988099195153715}
{'ner': 2.521309159050488e-06}


100%|██████████| 3/3 [00:00<00:00, 33.36it/s]
100%|██████████| 3/3 [00:00<00:00, 37.37it/s]
100%|██████████| 3/3 [00:00<00:00, 36.29it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 6.564435068046595e-05}
{'ner': 0.0028958663040766037}
{'ner': 0.01832555564123911}


100%|██████████| 3/3 [00:00<00:00, 34.40it/s]
100%|██████████| 3/3 [00:00<00:00, 35.61it/s]
100%|██████████| 3/3 [00:00<00:00, 36.82it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 0.0002921605411772739}
{'ner': 1.812192938990024e-07}
{'ner': 5.739523166599295e-06}


100%|██████████| 3/3 [00:00<00:00, 35.72it/s]
100%|██████████| 3/3 [00:00<00:00, 36.89it/s]
100%|██████████| 3/3 [00:00<00:00, 35.96it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 3.0329576741431163e-07}
{'ner': 9.499395859547354e-08}
{'ner': 8.097997918318989e-06}


100%|██████████| 3/3 [00:00<00:00, 35.18it/s]
100%|██████████| 3/3 [00:00<00:00, 36.01it/s]
100%|██████████| 3/3 [00:00<00:00, 37.72it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 1.6183399107057003e-09}
{'ner': 2.4188461981881215e-06}
{'ner': 1.937782474777386e-07}


100%|██████████| 3/3 [00:00<00:00, 35.72it/s]
100%|██████████| 3/3 [00:00<00:00, 36.10it/s]
100%|██████████| 3/3 [00:00<00:00, 35.90it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 3.0278219357873463e-05}
{'ner': 4.468522932703251e-06}
{'ner': 0.02775237487318944}


100%|██████████| 3/3 [00:00<00:00, 35.75it/s]
100%|██████████| 3/3 [00:00<00:00, 36.02it/s]
100%|██████████| 3/3 [00:00<00:00, 34.86it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 7.106485908229072e-08}
{'ner': 3.986304280777657e-05}
{'ner': 8.313423262369347e-08}


100%|██████████| 3/3 [00:00<00:00, 36.51it/s]
100%|██████████| 3/3 [00:00<00:00, 36.73it/s]
100%|██████████| 3/3 [00:00<00:00, 34.58it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 1.0766952251233512e-06}
{'ner': 7.69084443603085e-09}
{'ner': 0.00062574504991779}


100%|██████████| 3/3 [00:00<00:00, 32.10it/s]
100%|██████████| 3/3 [00:00<00:00, 26.19it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 1.5149090879322875e-05}
{'ner': 4.587905030572008e-05}


100%|██████████| 3/3 [00:00<00:00, 23.90it/s]
100%|██████████| 3/3 [00:00<00:00, 25.26it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 0.00010334462523816046}
{'ner': 4.337936800132482e-07}


100%|██████████| 3/3 [00:00<00:00, 25.40it/s]
100%|██████████| 3/3 [00:00<00:00, 34.53it/s]
100%|██████████| 3/3 [00:00<00:00, 34.17it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 7.648565079334642e-07}
{'ner': 3.577735046074634e-08}
{'ner': 8.410060451695264e-07}


100%|██████████| 3/3 [00:00<00:00, 33.31it/s]
100%|██████████| 3/3 [00:00<00:00, 33.72it/s]
100%|██████████| 3/3 [00:00<00:00, 35.62it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 1.9319593104543754e-07}
{'ner': 3.983573744251012e-08}
{'ner': 2.3281625870046322e-08}


100%|██████████| 3/3 [00:00<00:00, 35.45it/s]
100%|██████████| 3/3 [00:00<00:00, 34.02it/s]
100%|██████████| 3/3 [00:00<00:00, 35.44it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 5.531335441795522e-06}
{'ner': 1.3948138062950927e-09}
{'ner': 5.073013324319905e-09}


100%|██████████| 3/3 [00:00<00:00, 33.85it/s]
100%|██████████| 3/3 [00:00<00:00, 34.65it/s]
100%|██████████| 3/3 [00:00<00:00, 33.50it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 1.4260432139014588e-06}
{'ner': 1.6809166100959244e-09}
{'ner': 0.0005285270768159518}


100%|██████████| 3/3 [00:00<00:00, 34.11it/s]
100%|██████████| 3/3 [00:00<00:00, 35.25it/s]
100%|██████████| 3/3 [00:00<00:00, 35.03it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 7.472641400722603e-05}
{'ner': 2.1840886591911642e-07}
{'ner': 2.3009777017497414e-06}


100%|██████████| 3/3 [00:00<00:00, 31.82it/s]
100%|██████████| 3/3 [00:00<00:00, 37.23it/s]
100%|██████████| 3/3 [00:00<00:00, 36.92it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

{'ner': 1.1845418622159137e-07}
{'ner': 2.1397593650617247e-08}
{'ner': 5.940370736715911e-07}


100%|██████████| 3/3 [00:00<00:00, 34.07it/s]
100%|██████████| 3/3 [00:00<00:00, 34.88it/s]

{'ner': 3.513950062234614e-09}
{'ner': 4.9131790536712405e-06}





In [11]:

# test the trained model
for text, _ in TRAIN_DATA:
    doc = nlp(text)
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
    print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])

Entities [('Kofi Annan', 'PERSON')]
Tokens [('Who', '', 2), ('is', '', 2), ('Kofi', 'PERSON', 3), ('Annan', 'PERSON', 1), ('?', '', 2)]
Entities [('London', 'LOC'), ('Berlin', 'LOC')]
Tokens [('I', '', 2), ('like', '', 2), ('London', 'LOC', 3), ('and', '', 2), ('Berlin', 'LOC', 3), ('.', '', 2)]
Entities [('Steve Jobs', 'PERSON')]
Tokens [('Who', '', 2), ('is', '', 2), ('Steve', 'PERSON', 3), ('Jobs', 'PERSON', 1), ('?', '', 2)]


In [None]:
if output_dir is not None:
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

Saved model to /content/drive/My Drive


In [None]:
print("Loading from", output_dir)
nlp2 = spacy.load(output_dir)
for text, _ in TRAIN_DATA:
    doc = nlp2(text)
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
    print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])

Loading from /content/drive/My Drive
Entities [('London', 'LOC'), ('Berlin', 'LOC')]
Tokens [('I', '', 2), ('like', '', 2), ('London', 'LOC', 3), ('and', '', 2), ('Berlin', 'LOC', 3), ('.', '', 2)]
Entities [('Kofi Annan', 'PERSON')]
Tokens [('Who', '', 2), ('is', '', 2), ('Kofi', 'PERSON', 3), ('Annan', 'PERSON', 1), ('?', '', 2)]
Entities [('Steve Jobs', 'PERSON')]
Tokens [('Who', '', 2), ('is', '', 2), ('Steve', 'PERSON', 3), ('Jobs', 'PERSON', 1), ('?', '', 2)]


In [None]:
LABEL = 'ANIMAL'

In [None]:
TRAIN_DATA = [
    ("Horses are too tall and they pretend to care about your feelings", {
        'entities': [(0, 6, 'ANIMAL')]
    }),

    ("Do they bite?", {
        'entities': []
    }),

    ("horses are too tall and they pretend to care about your feelings", {
        'entities': [(0, 6, 'ANIMAL')]
    }),

    ("horses pretend to care about your feelings", {
        'entities': [(0, 6, 'ANIMAL')]
    }),

    ("they pretend to care about your feelings, those horses", {
        'entities': [(48, 54, 'ANIMAL')]
    }),

    ("horses?", {
        'entities': [(0, 6, 'ANIMAL')]
    })
]

In [None]:
@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    new_model_name=("New model name for model meta.", "option", "nm", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int))


def main(model=None, new_model_name='animal', output_dir=None, n_iter=20):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe('ner')

    ner.add_label(LABEL)   # add new entity label to entity recognizer
    if model is None:
        optimizer = nlp.begin_training()
    else:
        # Note that 'begin_training' initializes the models, so it'll zero out
        # existing entity types.
        optimizer = nlp.entity.create_optimizer()

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in tqdm(TRAIN_DATA):
                nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
                           losses=losses)
            print(losses)

    # test the trained model
    test_text = 'Do you like horses?'
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)

In [None]:
# Run our Function
main()

  0%|          | 0/6 [00:00<?, ?it/s]

Created blank 'en' model


100%|██████████| 6/6 [00:00<00:00, 28.57it/s]
100%|██████████| 6/6 [00:00<00:00, 31.96it/s]
  0%|          | 0/6 [00:00<?, ?it/s]

{'ner': 30.936285078525543}
{'ner': 14.958685375750065}


100%|██████████| 6/6 [00:00<00:00, 31.14it/s]
100%|██████████| 6/6 [00:00<00:00, 31.77it/s]
  0%|          | 0/6 [00:00<?, ?it/s]

{'ner': 8.703778005625281}
{'ner': 7.313736651587533}


100%|██████████| 6/6 [00:00<00:00, 31.15it/s]
100%|██████████| 6/6 [00:00<00:00, 32.20it/s]
  0%|          | 0/6 [00:00<?, ?it/s]

{'ner': 4.294245195498661}
{'ner': 3.286863576651456}


100%|██████████| 6/6 [00:00<00:00, 29.98it/s]
100%|██████████| 6/6 [00:00<00:00, 32.52it/s]
  0%|          | 0/6 [00:00<?, ?it/s]

{'ner': 1.6006127730270183}
{'ner': 0.1744719205945813}


100%|██████████| 6/6 [00:00<00:00, 31.28it/s]
100%|██████████| 6/6 [00:00<00:00, 32.06it/s]
  0%|          | 0/6 [00:00<?, ?it/s]

{'ner': 0.0036385594575921}
{'ner': 0.09023403022481838}


100%|██████████| 6/6 [00:00<00:00, 31.22it/s]
100%|██████████| 6/6 [00:00<00:00, 32.56it/s]
  0%|          | 0/6 [00:00<?, ?it/s]

{'ner': 0.0002729902213913422}
{'ner': 0.00014057699411105796}


100%|██████████| 6/6 [00:00<00:00, 30.97it/s]
100%|██████████| 6/6 [00:00<00:00, 32.06it/s]
  0%|          | 0/6 [00:00<?, ?it/s]

{'ner': 2.1613790391953023e-05}
{'ner': 3.358945930230211e-05}


100%|██████████| 6/6 [00:00<00:00, 31.88it/s]
100%|██████████| 6/6 [00:00<00:00, 32.54it/s]
  0%|          | 0/6 [00:00<?, ?it/s]

{'ner': 3.8468130030645687e-05}
{'ner': 5.80819253799542e-07}


100%|██████████| 6/6 [00:00<00:00, 31.06it/s]
100%|██████████| 6/6 [00:00<00:00, 32.45it/s]


{'ner': 4.2597351938256785e-06}
{'ner': 9.65229727203219e-06}


100%|██████████| 6/6 [00:00<00:00, 31.12it/s]
100%|██████████| 6/6 [00:00<00:00, 31.31it/s]


{'ner': 1.2685182280842513e-06}
{'ner': 5.223596049518888e-09}
Entities in 'Do you like horses?'
ANIMAL horses
