In [1]:
# Example code for fine-tuning a SpaCy model (simplified)
from spacy.training import Example
import spacy

# Load the pretrained English pipeline "en_core_web_lg" (this is NOT a blank
# model) to see which entities it recognises before any custom training.
nlp = spacy.load("en_core_web_lg")

# Run the pipeline on a short clinical-style phrase.
doc = nlp("Caucasian female postmenopausal patients")

# Print the tuple of entity spans found in the phrase.
print(doc.ents)



(Caucasian,)


In [2]:
import json

from spacy import displacy

# Render the entities of `doc` inline in the notebook.
displacy.render(doc, style="ent", jupyter=True)

# Annotated medical NER dataset:
# https://www.kaggle.com/datasets/finalepoch/medical-ner
# Pass the encoding explicitly so decoding does not depend on the platform's
# default locale encoding (e.g. cp1252 on Windows), which can fail or garble
# non-ASCII characters in the texts.
with open('Corona2.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [12]:
# Convert each annotated example into {'text': ..., 'entities': [(start, end, LABEL), ...]}.
training_data = []
for record in data['examples']:
    content = record['content']
    # One (start, end, upper-cased tag name) tuple per annotation.
    spans = [
        (ann['start'], ann['end'], ann['tag_name'].upper())
        for ann in record['annotations']
    ]
    training_data.append({'text': content, 'entities': spans})

[{'text': "While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.[92]\n\nDiosmectite, a natural aluminomagnesium silicate clay, is effective in alleviating symptoms of acute diarrhea in children,[93] and also has some effects in chronic functional diarrhea, radiation-induced diarrhea, and chemotherapy-induced diarrhea.[45] Another absorbent agent used for the treatment of mild diarrhea is kaopectate.\n\nRacecadotril an antisecretory medication may be used to treat diarrhea in children and adults.[86] It has better tolerability than loperamide, as it causes less constipation and flatulence.[94]",
  'entities': [(360, 371, 'MEDICINE'),
   (383, 408, 'MEDICINE'),
   (104, 112, 'MEDICALCONDITION

In [4]:
from spacy.tokens import DocBin
from spacy.util import filter_spans
from tqdm import tqdm

# Blank English pipeline; used below to turn raw text into Doc objects.
nlp = spacy.blank("en")
# Container that collects the annotated docs for serialization.
doc_bin = DocBin()

In [15]:

# Start from a fresh DocBin every time this cell runs. Re-using the instance
# created in an earlier cell would append the same docs again on each re-run
# and silently duplicate the training data written to disk.
doc_bin = DocBin()

for training_example in tqdm(training_data):
    text = training_example['text']
    labels = training_example['entities']
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in labels:
        # "contract" shrinks the character range to the token boundaries that
        # lie inside it; char_span returns None when no token fits.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            # Report which annotation was dropped so it can be inspected.
            print(f"Skipping entity {label} ({start}, {end}): {text[start:end]!r}")
        else:
            ents.append(span)
    # doc.ents must not contain overlapping spans; filter_spans removes overlaps.
    doc.ents = filter_spans(ents)
    doc_bin.add(doc)

# Serialize the annotated docs in the binary format read by `spacy train`.
doc_bin.to_disk("train.spacy")

100%|██████████| 31/31 [00:00<00:00, 646.53it/s]

While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.[92]

Diosmectite, a natural aluminomagnesium silicate clay, is effective in alleviating symptoms of acute diarrhea in children,[93] and also has some effects in chronic functional diarrhea, radiation-induced diarrhea, and chemotherapy-induced diarrhea.[45] Another absorbent agent used for the treatment of mild diarrhea is kaopectate.

Racecadotril an antisecretory medication may be used to treat diarrhea in children and adults.[86] It has better tolerability than loperamide, as it causes less constipation and flatulence.[94]
Skipping entity
Skipping entity
[Diosmectite, aluminomagnesium silicate, diarrhea, kaopectate, bismuth compounds, 




In [7]:
# Fill base_config.cfg with all remaining default settings and save the
# complete training config as config.cfg (base_config.cfg must already exist
# in the working directory).
!python3 -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [8]:
# Train the pipeline described by config.cfg and write the results to the
# current directory.
# NOTE(review): --paths.dev points at the same train.spacy file used for
# training, so the ENTS_F / ENTS_P / ENTS_R scores printed during training
# measure fit to the training data, not generalisation. Use a held-out
# dev.spacy for a meaningful evaluation.
!python3 -m spacy train config.cfg --output ./ --paths.train ./train.spacy --paths.dev ./train.spacy 

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00    153.29    1.10    1.80    0.79    0.01
  7     200        590.89   3582.73   68.90   65.37   72.83    0.69
 14     400        107.66   1186.42   88.31   90.50   86.22    0.88
 22     600         56.79    411.59   94.57   93.13   96.06    0.95
 30     800         93.01    381.11   98.22   98.42   98.03    0.98
 40    1000         76.05    243.43   97.46   96.89   98.03    0.97
 51    1200        147.55    273.18   98.43   98.43   98.43    0.98
^C


In [9]:
# Load the best-scoring pipeline saved by the training run
# (`spacy train --output ./` writes it to ./model-best).
nlp_ner = spacy.load("model-best")

In [19]:
# Apply the trained NER pipeline to a block of clinical-trial exclusion criteria.
doc = nlp_ner(
    "Suspicious lymphogenic metastases (cN1-3) "
    "Acute or chronic hepatic diseases "
    "Manifest renal diseases with renal dysfunction "
    "Relevant cardiac disease "
    "Preceding therapy of breast tumour under investigation "
    "Patients with multiple attempts of hook-wire placement in preparation of surgery "
    "Dementia or psychic condition that might interfere with the ability to understand "
    "the study and thus give a written informed consent "
    "Simultaneous participation in another clinical study or participation in another "
    "clinical study in the 30 days directly preceding treatment"
)

# One highlight colour per entity label.
colors = {
    "PATHOGEN": "#F67DE3",
    "MEDICINE": "#7DF6D9",
    "MEDICALCONDITION": "#a6e22d",
}
options = {"colors": colors}

# Show the recognised entities inline in the notebook.
displacy.render(doc, style="ent", options=options, jupyter=True)