# Named entity recognition with spaCy


## Import libraries

In [1]:
import numpy as np
import pandas as pd
import spacy
import json
import tqdm
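# References:
# - Custom NER training walkthrough (spaCy v3): https://www.newscatcherapi.com/blog/train-custom-named-entity-recognition-ner-model-with-spacy-v3
# - spaCy training quickstart: https://spacy.io/usage/training#quickstart
# - NER Annotator (annotation tool): https://tecoholic.github.io/ner-annotator/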

## Load training data 

In [91]:
# Read the annotation export. The encoding is given explicitly so the text is
# decoded the same way on every platform instead of depending on the locale
# default picked by open().
with open("annotations.json", "r", encoding="utf-8") as rf:
    trainingData = json.load(rf)
# Show the text of the second annotated example as a quick sanity check.
trainingData['annotations'][1]['text']

'Last fall Oracle released Blockchain App Builder for Oracle Blockchain platform - a low-code development toolset for Oracle Blockchain Platform to increase the speed of innovation. The App Builder can automatically generate chaincode for basic CRUD methods and persistence APIs from a declarative specification and empowers a blockchain application developer to develop chaincodes faster, deploy and test chaincodes very easily and substantially reduces the number of code defects. Figure 1: Low Code Chaincode Lifecycle Supported by Blockchain App Builder for Oracle Blockchain Platform Then we asked ourselves, what other common building blocks we can help to generate automatically so developers can move even faster? It didn???t take long to recognize that use of tokenization was accelerating and it was becoming a core feature in many innovative blockchain solutions. We???ve decided to expand the App Builder capabilities to support tokenization and released it as part of the version 21.2.3 

## Create a blank pipeline and build the training DocBin

In [92]:
from spacy.tokens import DocBin
from tqdm import tqdm

# Blank English pipeline; used below only to turn raw text into Doc objects.
nlp = spacy.blank("en") # load a new spacy model
# Container that collects the annotated Doc objects before they are written to disk.
doc_bin = DocBin() # create a DocBin object

In [93]:
from spacy.util import filter_spans

In [99]:
# Convert every annotated example into a spaCy Doc and collect the results in
# a DocBin that is written to "trainingData.spacy".
#
# The DocBin is created in this cell on purpose: if it were reused from an
# earlier cell, every re-run of this cell would append the same documents
# again and the saved training file would contain duplicates.
doc_bin = DocBin()

for training_example in tqdm(trainingData['annotations']):
    text = training_example['text']
    labels = training_example['entities']
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in labels:
        # Character offsets that cannot be aligned to token boundaries give
        # None; those entities are reported and left out of the Doc.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
            print(text[start:end] + ":" + label)
        else:
            ents.append(span)
    # Drop duplicate/overlapping spans before assigning them as entities.
    doc.ents = filter_spans(ents)
    doc_bin.add(doc)

doc_bin.to_disk("trainingData.spacy")  # save the DocBin object

100%|██████████| 4/4 [00:00<00:00,  8.99it/s]


Skipping entity
TLC:ORG
Skipping entity
cryptotrading:BLOCKCHAIN_TECH


## Train the model from the command line

In [118]:
# Fill in base_config.cfg with all remaining settings and save the result as config.cfg.
!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [101]:
import torch

# Report whether PyTorch can see a CUDA device in this environment.
gpu_available = torch.cuda.is_available()
message = "The code is running on GPU" if gpu_available else "The code is running on CPU"
print(message)

The code is running on GPU


In [121]:
# Train the pipeline described by config.cfg and save it to the current directory.
# NOTE(review): --paths.train and --paths.dev point to the same file, so the
# ENTS_* scores printed during training are measured on the training data
# itself, not on held-out examples.
# NOTE(review): no --gpu-id is passed and the log below reports "Using CPU" —
# confirm whether GPU training was intended.
!python -m spacy train config.cfg --output ./ --paths.train ./trainingData.spacy --paths.dev ./trainingData.spacy --training.max_epochs 50 --training.dropout 0.15 --training.max_steps 1200

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00   2795.50    0.00    0.00    0.00    0.00
 12     200       7703.25  109428.00   90.71   91.58   89.86    0.91
 25     400       4891.52  13264.02   99.06   98.80   99.33    0.99
 37     600       3558.79   4407.48   99.60   99.27   99.93    1.00
[38;5;2m✔ Saved pipeline to output directory[0m
model-last


## Test the new model

In [122]:
# Load the trained pipeline from the "model-best" directory (path is relative
# to the current working directory).
nlp_ner = spacy.load("model-best")

In [125]:
from spacy import displacy

# Read one line of text from the user and run the trained NER pipeline on it.
test1 = input()
doc = nlp_ner(test1)

# Highlight the predicted entities inline in the notebook.
displacy.render(doc, style="ent", jupyter=True)