Tutorial link : https://www.newscatcherapi.com/blog/train-custom-named-entity-recognition-ner-model-with-spacy-v3

In [1]:
!pip install spacy



# Loading data

In [2]:
import json

# Read the annotated dataset. The encoding is given explicitly so the result
# does not depend on the platform's default locale encoding.
with open("data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Preview the first example to inspect its structure.
data["examples"][0]

{'id': '18c2f619-f102-452f-ab81-d26f7e283ffe',
 'content': "While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.[92]\n\nDiosmectite, a natural aluminomagnesium silicate clay, is effective in alleviating symptoms of acute diarrhea in children,[93] and also has some effects in chronic functional diarrhea, radiation-induced diarrhea, and chemotherapy-induced diarrhea.[45] Another absorbent agent used for the treatment of mild diarrhea is kaopectate.\n\nRacecadotril an antisecretory medication may be used to treat diarrhea in children and adults.[86] It has better tolerability than loperamide, as it causes less constipation and flatulence.[94]",
 'metadata': {},
 'annotations': [{'id': '0825a1

In [3]:
# Reshape the raw examples into one record per text, each carrying its
# (start, end, LABEL) entity tuples with the tag name upper-cased.
training_data = {
    "classes": ["MEDICINE", "MEDICALCONDITION", "PATHOGEN"],
    "annotations": [
        {
            "text": record["content"],
            "entities": [
                (ann["start"], ann["end"], ann["tag_name"].upper())
                for ann in record["annotations"]
            ],
        }
        for record in data["examples"]
    ],
}

In [4]:
# Show the first converted record: its text plus the (start, end, label) entity tuples.
training_data["annotations"][0]

{'text': "While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.[92]\n\nDiosmectite, a natural aluminomagnesium silicate clay, is effective in alleviating symptoms of acute diarrhea in children,[93] and also has some effects in chronic functional diarrhea, radiation-induced diarrhea, and chemotherapy-induced diarrhea.[45] Another absorbent agent used for the treatment of mild diarrhea is kaopectate.\n\nRacecadotril an antisecretory medication may be used to treat diarrhea in children and adults.[86] It has better tolerability than loperamide, as it causes less constipation and flatulence.[94]",
 'entities': [(360, 371, 'MEDICINE'),
  (383, 408, 'MEDICINE'),
  (104, 112, 'MEDICALCONDITION'),


# Creating DocBin object and handling overlaps

In [5]:
# DocBin objects are used to collect annotated docs and save them as .spacy files

In [6]:
import spacy
from spacy.tokens import DocBin

# Blank "en" pipeline; used below through nlp.make_doc to turn raw text into Doc objects.
nlp = spacy.blank("en")     #creates a blank model
# Container that collects the annotated docs before they are written to disk.
doc_bin = DocBin()

In [7]:
# Since some annotated spans overlap, we need to handle those before
# assigning them to doc.ents.
from spacy.util import filter_spans

# Start from an empty DocBin so that re-running this cell does not append the
# same documents a second time and duplicate the saved training data.
doc_bin = DocBin()

for training_eg in training_data["annotations"]:
    text = training_eg["text"]
    labels = training_eg["entities"]
    doc = nlp.make_doc(text)
    ents = []

    for start, end, label in labels:
        # char_span yields None when no span can be built from these
        # character offsets; such annotations are reported and dropped.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")

        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)

    # Resolve overlapping spans before assigning them as the doc's entities.
    filtered_ents = filter_spans(ents)
    doc.ents = filtered_ents
    doc_bin.add(doc)

# Persist all annotated docs in spaCy's binary training format.
doc_bin.to_disk("training_data.spacy")

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity


# Config files

In [9]:
#We’ll be working with a base config file created using the quickstart page. 
#This is an incomplete file with only our custom options, so we’ll have to fill in the rest with the default values.

In [1]:
# Remember to set the training and validation (dev) data paths to your own .spacy files
# in both base_config.cfg and config.cfg.

# Since we don't have a separate validation dataset, use training_data.spacy for dev as well.

In [18]:
#run this in terminal, not here

# python -m spacy init fill-config base_config.cfg config.cfg   
#           creates config file, using base_config.cfg
#           select only NER when making base_config on website

# Train model

In [19]:
# python -m spacy train config.cfg --output ./outputs
#           saves outputs to ./outputs

# Load and test model

In [21]:
# Load the trained pipeline from the model-best directory under ./outputs
# (the --output location passed to `spacy train` above).
nlp_ner = spacy.load("./outputs/model-best")

In [22]:
# Sample passage to run through the trained NER pipeline.
# Adjacent string literals are used instead of backslash continuations inside
# a single literal: the latter joined lines without a space and fused words
# (e.g. "HIV-infectedindividuals", "thebenefits") in the text given to the model.
sample_text = (
    "Antiretroviral therapy (ART) is recommended for all HIV-infected "
    "individuals to reduce the risk of disease progression.\nART also is recommended "
    "for HIV-infected individuals for the prevention of transmission of HIV.\nPatients "
    "starting ART should be willing and able to commit to treatment and understand the "
    "benefits and risks of therapy and the importance of adherence. Patients may choose "
    "to postpone therapy, and providers, on a case-by-case basis, may elect to defer "
    "therapy on the basis of clinical and/or psychosocial factors."
)
doc = nlp_ner(sample_text)

In [23]:
# Hex highlight colour for each entity label.
colors = dict(
    PATHOGEN="#F67DE3",
    MEDICINE="#7DF6D9",
    MEDICALCONDITION="#FFFFFF",
)
# Rendering options that carry the per-label colours under the "colors" key.
options = dict(colors=colors)

In [24]:
from spacy import displacy

In [25]:
# Render the predicted entities inline in the notebook with the custom colours.
# Uses the explicitly imported `displacy` name rather than reaching through the
# `spacy` package attribute.
displacy.render(doc, style="ent", options=options, jupyter=True)