Spacy for Entity Recognition

References :
- https://spacy.io/usage/visualizers
- https://explosion.ai/blog/deep-learning-formula-nlp
- https://spacy.io/usage/training#ner

- video 
https://www.youtube.com/watch?v=l4scwf8KeIA
- training 
https://towardsdatascience.com/a-review-of-named-entity-recognition-ner-using-automatic-summarization-of-resumes-5248a75de175

## Creating training dataset

- format
    - ("Uber blew through $1 million a week", {'entities':[(0, 4, 'ORG')]}), 

### retrieve sample table data

In [1]:
import pandas as pd

In [2]:
titles = pd.read_csv('./titles_condition.tsv', sep='\t', header=None)
titles.columns = ['id', 'title']

In [3]:
titles.head()

Unnamed: 0,id,title
0,4106,Analysis of efficacy
1,4107,Comparisons of postoperative CA19-9 levels on ...
2,4108,Pattern of disease relapse
3,4109,Grade 15 adverse events with gemcitabine alone...
4,4112,Treatment with zoledronic acid


In [21]:
titles.isna().sum()

id       0
title    0
dtype: int64

In [20]:
titles = titles[titles.title.notna()]

## retrieve all drug names

## retrieve all Side effects

In [4]:
entity = pd.read_csv('./entities_sample.csv', sep=',')
entity = entity[['entity_term', 'entity_name']]
entity.head()

Unnamed: 0,entity_term,entity_name
0,11-beta-hydroxylase deficiency,adverse
1,17 ketosteroids urine decreased,adverse
2,17 ketosteroids urine low,adverse
3,17 ketosteroids urine high,adverse
4,17 ketosteroids urine increased,adverse


## find drug names

In [5]:
from flashtext import KeywordProcessor

In [None]:
key_pros = {}
for e, r in entity.groupby('entity_name'):
    key_pro = KeywordProcessor(case_sensitive=False)
    for v in r.entity_term.values.tolist():
        key_pro.add_keyword(v.strip())
    key_pros[e] = key_pro

In [22]:
c_entities = []
no_entities = []
for ind, title in titles.iterrows():
    ti = title['title']
    
    ents = [] 
    for ent, processor in key_pros.items():
        found = processor.extract_keywords(ti)
        for f in found:
            if ti.find(f)>=0:
                ents.append((ti.find(f), ti.find(f)+len(f), ent))
            
    if len(ents)>0:
        c_entities.append((ti, {'entities':ents}))
    else:
        no_entities.append((ti, {'entities':[]}))

In [23]:
len(c_entities)

3486

In [24]:
len(no_entities)

24441

In [25]:
c_entities[:5]

[('Grade 15 adverse events with gemcitabine alone and gemcitabine plus capecitabine',
  {'entities': [(29, 40, 'drug'), (29, 40, 'drug'), (68, 80, 'drug')]}),
 ('Treatment with zoledronic acid', {'entities': [(15, 30, 'drug')]}),
 ('Treatment with docetaxel', {'entities': [(15, 24, 'drug')]}),
 ('Antibiotic-associated diarrhoea and morbidity in the first 3 weeks after recruitment',
  {'entities': [(0, 31, 'adverse')]}),
 ('Cancer incidence and cancer mortality by site',
  {'entities': [(0, 6, 'adverse'), (0, 6, 'adverse')]})]

In [26]:
com_list = c_entities + no_entities[:len(c_entities)]

Reference : https://spacy.io/usage/training#train-entity  

- prediction is based on the examples the model has seen during training.
- not memorizing -> why the training data should be representative of the data we want to process

In [27]:
from __future__ import unicode_literals, print_function

import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding

In [28]:
import spacy
import random
nlp = spacy.load('en')

In [29]:
# nlp = spacy.load(model)  # load existing spaCy model
# print("Loaded model '%s'" % model)

In [30]:
# Add entity recognizer to model if it's not in the pipeline
# nlp.create_pipe works for built-ins that are registered with spaCy
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner)
# otherwise, get it, so we can add labels to it
else:
    ner = nlp.get_pipe('ner')

In [31]:
for k in key_pros.keys():
    ner.add_label(k) 
# add new entity label to entity recognizer

In [32]:
optimizer = nlp.entity.create_optimizer()

In [33]:
# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

In [None]:
with nlp.disable_pipes(*other_pipes):  # only train NER
    for itn in range(10):
        random.shuffle(com_list)
        losses = {}
        # batch up the examples using spaCy's minibatch
        batches = minibatch(com_list, size=compounding(4., 32., 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.1,#0.35,
                       losses=losses)
        print('Losses', losses)

Losses {'ner': 278.89288702670655}


In [None]:
# test the trained model
no_entities[3500]

In [None]:
# test_text = 'comparisons of responses to graded dobutamine infusion, y versus o'
test_text = 'Cosentyx, trade name Secukinumab, is a human antibody that binds to the protein interleukin (IL)-17A, and is marketed by Novartis for the treatment of psoriasis, ankylosing spondylitis, and psoriatic arthritis.'
# test_text = no_entities[3500][0]
doc = nlp(test_text)
print("Entities in '%s'" % test_text)
for ent in doc.ents:
    print(ent.label_, ent.text)

In [None]:
# save model to output directory

# output_dir = Path('./')
# if not output_dir.exists():
#     output_dir.mkdir()
# nlp.meta['name'] = new_model_name  # rename model
# nlp.to_disk(output_dir)
# print("Saved model to", output_dir)

# # test the saved model
# print("Loading from", output_dir)
# nlp2 = spacy.load(output_dir)
# doc2 = nlp2(test_text)
# for ent in doc2.ents:
#     print(ent.label_, ent.text)