In [None]:
## Colab only: mount Google Drive under /content/drive (the dataset is read from that mount later on)
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import spacy
print(spacy.__version__)

3.4.4


In [None]:
## NOTE(review): spacy was already imported in an earlier cell, so a version installed here is
## only picked up after the runtime is restarted; the version is also not pinned.
!pip install -U spacy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


#### Practice Spacy NER

In [None]:
## Download spacy model large
!python -m spacy download en_core_web_lg

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-lg==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.4.1/en_core_web_lg-3.4.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.4.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [None]:
## Create model object
nlp = spacy.load('en_core_web_lg')

In [None]:
## Silence every Python warning from here on (keeps cell output short, but also hides deprecation notices)
import warnings
warnings.filterwarnings("ignore")


In [None]:
doc = nlp('Donald Trump was the President of USA')

In [None]:
## One line per detected entity: text | label | spaCy's description of the label
for entity in doc.ents:
  tag = entity.label_
  print(entity.text, "|", tag, '|', spacy.explain(tag))

Donald Trump | PERSON | People, including fictional
USA | GPE | Countries, cities, states


In [None]:
## Render the document inline in the notebook with its named entities highlighted
from spacy import displacy

displacy.render(doc, style = 'ent', jupyter= True)

In [None]:
## One (token, IOB tag, entity type) triple for every token that belongs to an entity
[(token, token.ent_iob_, token.ent_type_) for token in doc if token.ent_type_]

[(Donald, 'B', 'PERSON'), (Trump, 'I', 'PERSON'), (USA, 'B', 'GPE')]

## Medical Custom NER

In [None]:
## Import data
import json
## Decode explicitly as UTF-8 (the JSON interchange encoding) so that loading does not
## depend on the default encoding of the machine the notebook runs on
with open('/content/drive/MyDrive/Datasets/NLP_NER/Corona2.json', 'r', encoding='utf-8') as f:
  data = json.load(f)

In [None]:
data

{'examples': [{'id': '18c2f619-f102-452f-ab81-d26f7e283ffe',
   'content': "While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.[92]\n\nDiosmectite, a natural aluminomagnesium silicate clay, is effective in alleviating symptoms of acute diarrhea in children,[93] and also has some effects in chronic functional diarrhea, radiation-induced diarrhea, and chemotherapy-induced diarrhea.[45] Another absorbent agent used for the treatment of mild diarrhea is kaopectate.\n\nRacecadotril an antisecretory medication may be used to treat diarrhea in children and adults.[86] It has better tolerability than loperamide, as it causes less constipation and flatulence.[94]",
   'metadata': {},
   'annotatio

In [None]:
type(data)

dict

In [None]:
data.keys()
## There is only one key

dict_keys(['examples'])

### Obs:
- Inside the data dictionary there is only one key: examples
- The value of examples is a list with one element per annotated text
- Each element is a dictionary with the keys id, content, metadata, annotations, classifications

In [None]:
## printing the keys of the dictionary
data['examples'][0].keys()

dict_keys(['id', 'content', 'metadata', 'annotations', 'classifications'])

In [None]:
# print(data['examples'][1])
data['examples'][1].keys()

dict_keys(['id', 'content', 'metadata', 'annotations', 'classifications'])

In [None]:
data['examples'][0]['content']

"While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.[92]\n\nDiosmectite, a natural aluminomagnesium silicate clay, is effective in alleviating symptoms of acute diarrhea in children,[93] and also has some effects in chronic functional diarrhea, radiation-induced diarrhea, and chemotherapy-induced diarrhea.[45] Another absorbent agent used for the treatment of mild diarrhea is kaopectate.\n\nRacecadotril an antisecretory medication may be used to treat diarrhea in children and adults.[86] It has better tolerability than loperamide, as it causes less constipation and flatulence.[94]"

In [None]:
data['examples'][0]['annotations'][0]

{'id': '0825a1bf-6a6e-4fa2-be77-8d104701eaed',
 'tag_id': 'c06bd022-6ded-44a5-8d90-f17685bb85a1',
 'end': 371,
 'start': 360,
 'example_id': '18c2f619-f102-452f-ab81-d26f7e283ffe',
 'tag_name': 'Medicine',
 'value': 'Diosmectite',
 'correct': None,
 'human_annotations': [{'timestamp': '2020-03-21T00:24:32.098000Z',
   'annotator_id': 1,
   'tagged_token_id': '0825a1bf-6a6e-4fa2-be77-8d104701eaed',
   'name': 'Ashpat123',
   'reason': 'exploration'}],
 'model_annotations': []}

## Note:
- annotations is also a list of dictionaries (one dictionary per tagged span)
- We are interested in three things here:
content, and for each annotation its tag_name, start and end

In [None]:
## Now iterate over the whole data set and collect these fields into one list of training data

def data_training(data_json):
  '''
  Flatten the annotation export into a list of training records.

  Every item of data_json['examples'] yields one dictionary of the form
    {'text': <the example's 'content' string>,
     'entities': [(start, end, LABEL), (.. , .. , ..), ...]}
  with one tuple per item of the example's 'annotations', kept in their original order.
  LABEL is the annotation's 'tag_name' converted to upper case.
  The records are returned as a list, in the order of the examples.
  '''
  return [
    {
      'text': record['content'],
      'entities': [
        (mark['start'], mark['end'], mark['tag_name'].upper())
        for mark in record['annotations']
      ],
    }
    for record in data_json['examples']
  ]


In [None]:
training_data = data_training(data)

In [None]:
training_data

[{'text': "While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.[92]\n\nDiosmectite, a natural aluminomagnesium silicate clay, is effective in alleviating symptoms of acute diarrhea in children,[93] and also has some effects in chronic functional diarrhea, radiation-induced diarrhea, and chemotherapy-induced diarrhea.[45] Another absorbent agent used for the treatment of mild diarrhea is kaopectate.\n\nRacecadotril an antisecretory medication may be used to treat diarrhea in children and adults.[86] It has better tolerability than loperamide, as it causes less constipation and flatulence.[94]",
  'entities': [(360, 371, 'MEDICINE'),
   (383, 408, 'MEDICINE'),
   (104, 112, 'MEDICALCONDITION

In [None]:
## checking the type of training data
type(training_data)  

list

In [None]:
## type of each element of the training data
type(training_data[0])

dict

In [None]:
## printing the keys
training_data[0].keys()

dict_keys(['text', 'entities'])

In [None]:
## 1st element of the list
training_data[0]

{'text': "While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.[92]\n\nDiosmectite, a natural aluminomagnesium silicate clay, is effective in alleviating symptoms of acute diarrhea in children,[93] and also has some effects in chronic functional diarrhea, radiation-induced diarrhea, and chemotherapy-induced diarrhea.[45] Another absorbent agent used for the treatment of mild diarrhea is kaopectate.\n\nRacecadotril an antisecretory medication may be used to treat diarrhea in children and adults.[86] It has better tolerability than loperamide, as it causes less constipation and flatulence.[94]",
 'entities': [(360, 371, 'MEDICINE'),
  (383, 408, 'MEDICINE'),
  (104, 112, 'MEDICALCONDITION'),


In [None]:
## Accesing first index 1st key value
training_data[0]['text']

"While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.[92]\n\nDiosmectite, a natural aluminomagnesium silicate clay, is effective in alleviating symptoms of acute diarrhea in children,[93] and also has some effects in chronic functional diarrhea, radiation-induced diarrhea, and chemotherapy-induced diarrhea.[45] Another absorbent agent used for the treatment of mild diarrhea is kaopectate.\n\nRacecadotril an antisecretory medication may be used to treat diarrhea in children and adults.[86] It has better tolerability than loperamide, as it causes less constipation and flatulence.[94]"

In [None]:
## Accesing first index 2nd key value
training_data[0]['entities']

[(360, 371, 'MEDICINE'),
 (383, 408, 'MEDICINE'),
 (104, 112, 'MEDICALCONDITION'),
 (679, 689, 'MEDICINE'),
 (6, 23, 'MEDICINE'),
 (25, 37, 'MEDICINE'),
 (461, 470, 'MEDICALCONDITION'),
 (577, 589, 'MEDICINE'),
 (853, 865, 'MEDICALCONDITION'),
 (188, 198, 'MEDICINE'),
 (754, 762, 'MEDICALCONDITION'),
 (870, 880, 'MEDICALCONDITION'),
 (823, 833, 'MEDICINE'),
 (852, 853, 'MEDICALCONDITION'),
 (461, 469, 'MEDICALCONDITION'),
 (535, 543, 'MEDICALCONDITION'),
 (692, 704, 'MEDICINE'),
 (563, 571, 'MEDICALCONDITION')]

In [None]:
training_data[0]['text'][0:10]   ## slicing each characterwise as it is string

'While bism'

In [None]:
training_data[0]['entities'][0]

(360, 371, 'MEDICINE')

## Notes: spaCy requires the training data in DocBin format
- Target: convert our training data into Doc objects whose entities are Span objects

In [None]:
from spacy.tokens import DocBin
from tqdm import tqdm
from spacy.util import filter_spans

#### Note:
- The `filter_spans` utility was introduced in spaCy v2.1.4. It filters a sequence of Span objects and removes duplicates and overlaps; when spans overlap, the (first) longest span is kept. This is very useful when creating named entities, because `doc.ents` cannot hold overlapping spans.
- https://www.tutorialspoint.com/spacy/spacy_util_filter_spans.htm
- It accepts any iterable of Span objects and always returns the filtered spans as a list

In [None]:
## Create our own DocBin; any training or test list in the format above can be passed in
def doc_bin(data_list):
  '''
  Convert annotated examples into a spaCy DocBin (the format `spacy train` reads).

  Parameters
  ----------
  data_list : list of dict
      Each item has a 'text' string and an 'entities' list of
      (start_char, end_char, label) tuples, as produced by data_training().

  Returns
  -------
  DocBin
      Contains exactly one Doc per item of data_list, in the same order,
      including texts that end up without any entity.

  Notes
  -----
  - An annotation whose character offsets cannot be mapped onto tokens yields no
    span; it is reported with 'Skipping Entities' and left out.
  - Duplicate and overlapping spans are reduced with filter_spans before being
    assigned, because doc.ents does not accept overlapping spans.
  '''
  ## Blank English pipeline: only its tokenizer is needed to build the Docs
  nlp_en = spacy.blank("en")
  collected = DocBin()  ## container that is returned (and later saved to disk by the caller)
  for exmpl in tqdm(data_list):
    doc = nlp_en.make_doc(exmpl['text'])  ## tokenize only, no pipeline components
    ents = []
    for start, end, label in exmpl['entities']:
      ## 'contract' keeps only the tokens lying completely inside [start, end);
      ## char_span returns None when no token fits
      span = doc.char_span(start, end, label = label, alignment_mode = 'contract')
      if span is None:
        print('Skipping Entities')
      else:
        ents.append(span)

    ## Finalise the document once, after ALL of its annotations were collected.
    ## Doing this inside the loop above would add the same text once per annotation,
    ## each copy carrying only part of the entities, and would drop texts without
    ## annotations altogether.
    doc.ents = filter_spans(ents)
    collected.add(doc)

  return collected


In [None]:
## Build the DocBin from the training records and save it to disk for the `spacy train` command
training_docBin = doc_bin(training_data)
training_docBin.to_disk('train_corona_docbin.spacy')

100%|██████████| 31/31 [00:00<00:00, 152.19it/s]

Skipping Entities
Skipping Entities
Skipping Entities
Skipping Entities
Skipping Entities
Skipping Entities
Skipping Entities
Skipping Entities
Skipping Entities
Skipping Entities
Skipping Entities
Skipping Entities
Skipping Entities
Skipping Entities
Skipping Entities
Skipping Entities





### Obs:
- Now we have our data as a DocBin, i.e. a serialized collection of Doc objects in spaCy's binary format

### *** Create base config.cfg from
https://spacy.io/usage/training

In [None]:
## Build the final training config from a base config
## Step 1: create base_config.cfg with the quickstart widget at https://spacy.io/usage/training
## Step 2: let spaCy auto-fill all remaining values and save the result as config.cfg (edit it afterwards if needed)

!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [None]:
## Train with config.cfg on GPU 0 and write the output into the current directory (--output ./).
## NOTE(review): --paths.dev points at the same file as --paths.train, so the scores reported
## during training are measured on the training data itself; pass a held-out DocBin as dev set
## to get a meaningful evaluation.
!python -m spacy train config.cfg --output ./ --paths.train ./train_corona_docbin.spacy --paths.dev ./train_corona_docbin.spacy --gpu-id 0

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[2023-01-14 14:39:09,485] [INFO] Set up nlp object from config
INFO:spacy:Set up nlp object from config
[2023-01-14 14:39:09,495] [INFO] Pipeline: ['tok2vec', 'ner']
INFO:spacy:Pipeline: ['tok2vec', 'ner']
[2023-01-14 14:39:09,499] [INFO] Created vocabulary
INFO:spacy:Created vocabulary
[2023-01-14 14:39:12,361] [INFO] Added vectors: en_core_web_lg
INFO:spacy:Added vectors: en_core_web_lg
tcmalloc: large alloc 1233977344 bytes == 0x9eb7c000 @  0x7f32e14e72a4 0x7f32d5e23e09 0x7f32d5e22cdf 0x7f32d5e1f675 0x7f32d5e1fe2e 0x4f750a 0x4997a2 0x55cd91 0x5d8941 0x49abe4 0x7f319e523d78 0x7f319e52670e 0x7f319e52e5a7 0x7f319e531e85 0x5d8d8c 0x55dc1e 0x5d8868 0x4990ca 0x7f319e523d78 0x7f319e52670e 0x7f319e52de4b 0x5d80be 0x5d8d8c 0x55ea20 0x55d078 0x5d8941 0x49abe4 0x4fe253 0x49abe4 0x55d078 0x5d8941
tcmalloc: large alloc 1248116736 bytes == 0x79f14000 @  0x7f32e14e72a4 0x7f32d5e213a2 0x7f32d5e22cdf 0x7f32d5e1f675 0x7f32d

In [None]:
## creating function for test sentence
def evaluateDisplayNer():
  '''
  Prompt for a text, tag it with the trained pipeline and show the entities.

  Loads the pipeline stored under 'model-best', reads one line from standard input,
  prints every predicted entity as "text | label | spacy.explain(label)" and finally
  renders the document with displacy, using a fixed colour per custom label.
  Returns nothing.
  '''
  ## Highlight colour for each custom entity label
  label_palette = {"PATHOGEN": "#F67DE3", "MEDICINE": "#7DF6D9", "MEDICALCONDITION":"#85C1E9"}

  ## Load the newly created model-best
  custom_pipeline = spacy.load('model-best')
  user_text = input('Put your input string:')
  print('')
  tagged = custom_pipeline(user_text)
  print('')
  for found in tagged.ents:
    tag = found.label_
    print(found.text, "|", tag, '|', spacy.explain(tag))

  print('-------DISPLACY------')
  displacy.render(tagged, style = 'ent', options = {"colors": label_palette}, jupyter = True)
  

  

In [None]:
evaluateDisplayNer()

Put your input string:While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.[92]\n\nDiosmectite, a natural aluminomagnesium silicate clay, is effective in alleviating symptoms of acute diarrhea in children,[93] and also has some effects in chronic functional diarrhea, radiation-induced diarrhea, and chemotherapy-induced diarrhea.[45] Another absorbent agent used for the treatment of mild diarrhea is kaopectate.\n\nRacecadotril an antisecretory medication may be used to treat diarrhea in children and adults.[86] It has better tolerability than loperamide, as it causes less constipation and flatulence.[94]


bismuth compounds | MEDICINE | None
Pepto-Bismol | MEDICINE | None
diarrhea | MEDICALC

## Sample Test doc:
While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.[92]\n\nDiosmectite, a natural aluminomagnesium silicate clay, is effective in alleviating symptoms of acute diarrhea in children,[93] and also has some effects in chronic functional diarrhea, radiation-induced diarrhea, and chemotherapy-induced diarrhea.[45] Another absorbent agent used for the treatment of mild diarrhea is kaopectate.\n\nRacecadotril an antisecretory medication may be used to treat diarrhea in children and adults.[86] It has better tolerability than loperamide, as it causes less constipation and flatulence.[94]

In [None]:
# !zip -r /content/file.zip /content/model-best

  adding: content/model-best/ (stored 0%)
  adding: content/model-best/meta.json (deflated 57%)
  adding: content/model-best/vocab/ (stored 0%)
  adding: content/model-best/vocab/strings.json (deflated 77%)
  adding: content/model-best/vocab/vectors (deflated 8%)
  adding: content/model-best/vocab/lookups.bin (stored 0%)
  adding: content/model-best/vocab/key2row (deflated 16%)
  adding: content/model-best/vocab/vectors.cfg (stored 0%)
  adding: content/model-best/ner/ (stored 0%)
  adding: content/model-best/ner/model (deflated 7%)
  adding: content/model-best/ner/cfg (deflated 33%)
  adding: content/model-best/ner/moves (deflated 62%)
  adding: content/model-best/tok2vec/ (stored 0%)
  adding: content/model-best/tok2vec/model (deflated 8%)
  adding: content/model-best/tok2vec/cfg (stored 0%)
  adding: content/model-best/config.cfg (deflated 60%)
  adding: content/model-best/tokenizer (deflated 81%)


In [None]:
# !pip freeze > requirements.txt