In [None]:
!pip install spacy==2.3.1
import os
import random

import pandas as pd
import spacy
from spacy.util import minibatch, compounding

# Load the training data

## Training data format:
A list of tuples, where each tuple holds one data point: a text and a dictionary of its entity annotations, as shown below.

The numbers are the starting and ending character positions of each entity in the text. For example, 'STREET' starts at position 210 and ends at position 229 of the string.

In [None]:
# One example data point: (text, annotations). The text shown here ends with
# a literal '...'; each entity is a (start, end, label) triple.
SAMPLE_TRAIN_DATA = [
    (
        'BE SUBORDINATED UPON THE REFINANCING OF ANY PRIOR MORTGAGE\nTHIS DEED OF TRUST...',
        {
            'entities': [
                (210, 229, 'STREET'),
                (231, 239, 'CITY'),
                (241, 243, 'STATE'),
                (244, 249, 'ZIP'),
            ],
        },
    ),
]

### When labeling entities, there are some special cases to be aware of. If random characters are attached to your entity, you need to include them as part of the entity.

Examples:

The text between the @ markers is the entity. The @ characters are not actually in the string; they only show where the entity is.

*   Here is a sample @entity@.
*   Here is a sample @!!#entity*@.
*   Here is a sample &*( @entity entity@.
*   Here is a sample @^Washington D.C.@, blah blah.



## Load your training data here.

In [None]:
# Placeholder: replace the Ellipsis (...) with your own training data, using the
# same format as SAMPLE_TRAIN_DATA above, i.e. a list of
# (text, {'entities': [(start, end, label), ...]}) tuples.
TRAIN_DATA = ...

# Create a new model. (Just run the cell)

In [None]:
# Start from an empty English pipeline.
nlp = spacy.blank("en")

# Build an entity-recognizer component and append it at the end of the
# pipeline; `ner` is kept as a handle so labels can be registered on it later.
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner, last=True)

# Register the entity label

In [None]:
# Entity labels the recognizer should learn.
entity_labels = ['label']  # Change the label.

# Register each label on the NER component.
for entity_label in entity_labels:
    ner.add_label(entity_label)

# Train the NER model (Just run the cell)

In [None]:
# --- Training configuration (tune as needed) ---
N_ITERATIONS = 50  # number of passes over the training data
DROPOUT = 0.35     # dropout rate passed to nlp.update
SEED = 0           # fixed seed so repeated runs are comparable

# Fail early with a clear message if the TRAIN_DATA placeholder was never
# replaced, instead of an obscure error from deep inside the loop.
if TRAIN_DATA is Ellipsis:
    raise ValueError("TRAIN_DATA is still the '...' placeholder; load your training data first.")

# Train on a copy so shuffling never reorders the caller's TRAIN_DATA.
train_examples = list(TRAIN_DATA)
if not train_examples:
    raise ValueError("TRAIN_DATA is empty; nothing to train on.")

# Fix the random seeds before the model is initialised so that weight
# initialisation and shuffle order are repeatable between runs.
spacy.util.fix_random_seed(SEED)

optimizer = nlp.begin_training()

# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]

with nlp.disable_pipes(*other_pipes):  # only train NER
    # The batch-size generator is created once and shared by every iteration,
    # so batch sizes keep compounding (from 1.0 towards 4.0) across epochs.
    sizes = compounding(1.0, 4.0, 1.001)
    for itn in range(N_ITERATIONS):
        random.shuffle(train_examples)
        losses = {}
        # batch up the examples using spaCy's minibatch
        for batch in minibatch(train_examples, size=sizes):
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=DROPOUT, losses=losses)
        print("Losses", losses)

# Save the model

In [None]:
# Directory the trained pipeline is written to.
folder_path = 'path/model_name' #Change this

# Create the target directory, including any missing parent directories,
# before saving, so a nested path does not fail on a fresh machine.
os.makedirs(folder_path, exist_ok=True)
nlp.to_disk(folder_path)

# How to use the model?

## Load the model

In [None]:
# Load the pipeline from the same directory it was saved to (folder_path),
# so the save and load locations cannot drift apart. Point folder_path at a
# different directory if you want to load another saved model.
nlp = spacy.load(folder_path)

## Extract the entities

In [None]:
# Run the pipeline on a sample sentence and print each detected entity
# as "<label> <text>".
sample_text = 'This is a test sentence.'
doc = nlp(sample_text)
for ent in doc.ents:
    print(ent.label_, ent.text)