In [1]:
# Use the %pip magic (not !pip) so the package is installed into the
# environment of the running kernel rather than whichever pip is first on PATH.
%pip install spacy -Uqqq

## Importing Libraries

In [2]:
import json
import pandas as pd
from tqdm import tqdm
import spacy
from spacy.tokens import DocBin

In [10]:
# Load the annotation export. The encoding is given explicitly instead of
# relying on the platform default (e.g. cp1252 on Windows), which could corrupt
# non-ASCII text and thereby shift the character offsets of the entities.
with open('./annotations.json', 'r', encoding='utf-8') as json_file:
  data = json.load(json_file)

annot_data = data['annotations']

# Positional 75/25 split (no shuffling): the first 75% of the examples are used
# for training and the remainder for evaluation.
train_size = int(len(annot_data) * 0.75)

TRAIN_DATA = annot_data[:train_size]
EVAL_DATA = annot_data[train_size:]

## Convert JSON file into DocBin object

In [12]:
def json_to_docbin(data, path):
  """Convert annotated examples into a spaCy ``DocBin`` and write it to ``path``.

  Args:
    data: Iterable of ``(text, annot)`` pairs, where ``annot["entities"]`` is
      an iterable of ``(start, end, label)`` character-offset triples.
    path: Destination of the serialized ``DocBin`` (e.g. ``"./train.spacy"``).

  Entities whose character offsets cannot be mapped onto tokens are skipped
  with a printed notice. When entities overlap, only a non-overlapping subset
  is kept (longest span preferred), because ``doc.ents`` rejects overlapping
  spans with a ``ValueError``.
  """
  nlp = spacy.blank("en")
  db = DocBin()

  for text, annot in tqdm(data):
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
      # "contract" shrinks misaligned offsets to the token boundaries inside
      # them; char_span returns None when nothing is left.
      span = doc.char_span(start, end, label=label, alignment_mode="contract")
      if span is None:
        print("Skipping entity")
      else:
        ents.append(span)
    # Drop duplicate/overlapping spans up front instead of letting the
    # assignment below raise and abort the whole conversion.
    doc.ents = spacy.util.filter_spans(ents)
    db.add(doc)

  db.to_disk(path)

In [13]:
# Serialize each split to the .spacy file that is later passed to `spacy train`.
for split, out_path in ((TRAIN_DATA, "./train.spacy"), (EVAL_DATA, "./dev.spacy")):
  json_to_docbin(split, out_path)

100%|██████████| 44/44 [00:00<00:00, 1375.55it/s]


Skipping entity


100%|██████████| 15/15 [00:00<00:00, 1331.92it/s]


## Train Custom NER Model

In [15]:
# Download and install the spaCy package `en_core_web_lg`.
# NOTE(review): presumably referenced by base_config.cfg as the base model
# (e.g. for its word vectors) — confirm against the config, which is not shown here.
!python -m spacy download en_core_web_lg -qqq

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [16]:
# base_config.cfg was generated with the widget at https://spacy.io/usage/training.
# `init fill-config` auto-fills every remaining setting and saves the complete
# configuration as config.cfg; it does not train or initialize a model itself.

!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [19]:
# Train the pipeline described by config.cfg; results are saved under ./output.
# If no separate eval/test data is available, train.spacy can be passed after
# '--paths.dev' instead, but the reported scores are then measured on the
# training data itself and will overstate how well the model generalizes.

!python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./dev.spacy

[38;5;2m✔ Created output directory: output[0m
[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     37.33    0.00    0.00    0.00    0.00
  4     200        244.31   1659.76   25.60   28.07   23.53    0.26
  9     400         29.75    470.65   35.09   43.48   29.41    0.35
 18     600        529.58    526.26   49.66   45.68   54.41    0.50
 27     800         27.47     43.64   48.28   45.45   51.47    0.48
 40    1000          8.79     14.47   51.47   51.47   51.47    0.51
 56    1200         32.41     45.22   48.72   43.18   55.88    0.49
 76    1400         10.07     16.52   51.85   52.24   51.47    0.52
100 

## Test Custom NER Model

In [22]:
# Sentence used as a quick smoke test of the trained pipeline.
sample_text = 'Vijay Shekhar Sharma is the CEO of Paytm.'

# Load the pipeline saved in the 'output/model-best' directory and run it on the sample.
test_nlp = spacy.load('output/model-best')
doc = test_nlp(sample_text)

# Render the predicted entities inline in the notebook.
spacy.displacy.render(doc, style="ent", jupyter=True)