spaCy for Named Entity Recognition

References :
- https://spacy.io/usage/visualizers
- https://explosion.ai/blog/deep-learning-formula-nlp
- https://spacy.io/usage/training#ner

- video 
https://www.youtube.com/watch?v=l4scwf8KeIA
- training 
https://towardsdatascience.com/a-review-of-named-entity-recognition-ner-using-automatic-summarization-of-resumes-5248a75de175

## Creating training dataset

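- format: each training example is a `(text, annotations)` tuple whose entities are `(start_char, end_char, label)` character offsets into the text, e.g.
    - `("Uber blew through $1 million a week", {'entities': [(0, 4, 'ORG')]})`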

### retrieve sample table data

In [None]:
import db_conn
import pandas as pd

In [None]:
# Open one database connection through the project-local db_conn helper and
# keep a single cursor; the query cells below all reuse `cur`.
conn = db_conn.get_connection()
cur = conn.cursor()

In [None]:
# Fetch every table title, lower-cased by the database, as raw result rows.
title_query = """
select lower(table_title) from article_tables;
"""
cur.execute(title_query)
titles = cur.fetchall()

In [None]:
# Each fetched row holds a single column; keep just that value.
# NOTE: this rebinds `titles`, so run it once per fetch — running it again
# would index into the already-unwrapped values.
titles = [row[0] for row in titles]
len(titles)

In [None]:
# Preview the first five titles.
titles[:5]

## retrieve all drug names

In [None]:
# Distinct, lower-cased names from the cui1_str column of the drug dictionary.
first_name_query = """
SELECT distinct(lower(cui1_str)) FROM dict_collapsed_final;
"""
cur.execute(first_name_query)
d1 = pd.DataFrame(cur.fetchall())
d1.head()

In [None]:
# Distinct, lower-cased names from the cui2_str column of the drug dictionary.
second_name_query = """
SELECT distinct(lower(cui2_str)) FROM dict_collapsed_final;
"""
cur.execute(second_name_query)
d2 = pd.DataFrame(cur.fetchall())
d2.head()

In [None]:
# Merge both name columns into one de-duplicated drug dictionary.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
drugs = pd.concat([d1, d2], ignore_index=True)
drugs.columns = ['name']
# drop_duplicates/reset_index return new frames; the result has to be bound,
# otherwise `drugs` keeps the gappy index left behind by the removed rows.
drugs = drugs.drop_duplicates().reset_index(drop=True)
drugs.head()

In [9]:
# Persist the drug names as a header-less, comma-separated file next to the notebook.
# NOTE(review): `index` is not passed, so the frame's index is presumably written
# as the first column — confirm that downstream readers expect it.
drugs.to_csv('./drug_dictionary.csv', sep=',', header=None)

## retrieve all side effects

In [10]:
# Distinct, lower-cased MedDRA lowest-level-term names, used as the side-effect dictionary.
side_effect_query = """
SELECT distinct(lower(llt_name)) from meddra_llt_181022;
"""
cur.execute(side_effect_query)
side_effect_rows = cur.fetchall()
sd = pd.DataFrame(side_effect_rows)
sd.head()

Unnamed: 0,0
0,"""ventilation"" pneumonitis"
1,11-beta-hydroxylase deficiency
2,11-oxysteroid activity incr
3,11-oxysteroid activity increased
4,17 ketosteroids urine abnormal


In [11]:
# Name the single column, then de-duplicate and re-number the rows.
sd.columns = ['name']
# reset_index returns a new frame; bind it so `sd` itself carries the clean index
# instead of only the displayed copy.
sd = sd.drop_duplicates().reset_index(drop=True)
sd.head(10)

Unnamed: 0,name
0,"""ventilation"" pneumonitis"
1,11-beta-hydroxylase deficiency
2,11-oxysteroid activity incr
3,11-oxysteroid activity increased
4,17 ketosteroids urine abnormal
5,17 ketosteroids urine abnormal nos
6,17 ketosteroids urine decreased
7,17 ketosteroids urine high
8,17 ketosteroids urine increased
9,17 ketosteroids urine low


## find drug names

In [12]:
from flashtext import KeywordProcessor

In [14]:
# Case-insensitive flashtext matcher loaded with every drug name.
drug_keyword_processor = KeywordProcessor(case_sensitive=False)
for drug_name in drugs['name']:
    drug_keyword_processor.add_keyword(drug_name.strip())

In [None]:
# Case-insensitive flashtext matcher loaded with every side-effect (adverse-event) name.
ad_keyword_processor = KeywordProcessor(case_sensitive=False)
# The loop variable must not be called `sd`: that would replace the DataFrame
# with its last row and make this cell fail when it is run a second time.
for effect_name in sd['name']:
    ad_keyword_processor.add_keyword(effect_name.strip())

In [15]:
def _drop_overlapping(spans):
    """Return `spans` with overlaps removed, preferring longer spans.

    Args:
        spans: iterable of (start, end, label) character-offset triples.

    Returns:
        List of triples, sorted by offset, whose character ranges do not intersect.
        When two spans overlap the longer one wins; equal-length ties go to the
        earlier start, then to the span listed first.
    """
    kept = []
    # (start - end) is the negative length, so the longest spans are visited first.
    for start, end, label in sorted(spans, key=lambda s: (s[0] - s[1], s[0])):
        if all(end <= k_start or start >= k_end for k_start, k_end, _ in kept):
            kept.append((start, end, label))
    return sorted(kept)


# Weakly label every title with the two dictionaries, producing spaCy-style
# (text, {'entities': [(start, end, label), ...]}) training examples.
c_entities = []   # titles containing at least one dictionary match
no_entities = []  # titles without any match
for ti in titles:
    # span_info=True makes flashtext return (keyword, start, end) for each match,
    # so the offsets point at the real, word-bounded occurrence in `ti`.
    # Searching again with str.find() would return the first substring hit,
    # which can sit inside a longer word or be a different occurrence.
    ents = [
        (start, end, 'DRUG')
        for _, start, end in drug_keyword_processor.extract_keywords(ti, span_info=True)
    ]
    ents += [
        (start, end, 'SIDE_EFFECT')
        for _, start, end in ad_keyword_processor.extract_keywords(ti, span_info=True)
    ]
    # A drug match and a side-effect match may cover the same characters;
    # keep a single, non-overlapping set of spans per title.
    ents = _drop_overlapping(ents)

    if ents:
        c_entities.append((ti, {'entities': ents}))
    else:
        no_entities.append((ti, {'entities': []}))

In [None]:
# Number of titles that received at least one entity annotation.
len(c_entities)

In [None]:
# Only the last expression of a cell is rendered, so the count of unmatched
# titles is printed explicitly before the annotated sample is displayed.
print(len(no_entities))
c_entities[:5]

In [16]:
# Training set: every annotated title plus an equally sized slice of
# unannotated titles taken from the front of no_entities.
n_annotated = len(c_entities)
com_list = c_entities + no_entities[:n_annotated]

[('table 3 comparisons of responses to graded dobutamine infusion, y versus o',
  {'entities': [(43, 53, 'DRUG')]}),
 ('table univariate and multivariate analyses of baseline factors associated with progression-free survival and overall survival in patients treated with dabrafenib and trametinib',
  {'entities': [(151, 161, 'DRUG'), (166, 176, 'DRUG')]})]

Reference : https://spacy.io/usage/training#train-entity  

- The model's predictions are based on the examples it has seen during training.
- The model generalizes from those examples rather than memorizing them, which is why the training data should be representative of the data we want to process.

In [17]:
from __future__ import unicode_literals, print_function

import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding

In [18]:
import spacy
import random
# Load the spaCy pipeline registered under the 'en' name; the cells below
# fetch (or create) its 'ner' component and train it on com_list.
nlp = spacy.load('en')

In [19]:
# nlp = spacy.load(model)  # load existing spaCy model
# print("Loaded model '%s'" % model)

In [20]:
# Add entity recognizer to model if it's not in the pipeline
# nlp.create_pipe works for built-ins that are registered with spaCy
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner)
# otherwise, get it, so we can add labels to it
else:
    ner = nlp.get_pipe('ner')

In [21]:
# Register the two custom entity labels used in the training annotations
# with the entity recognizer.
ner.add_label('DRUG')
ner.add_label('SIDE_EFFECT')

In [22]:
# Optimizer created from the pipeline's entity recognizer; it is passed to
# nlp.update() as `sgd` in the training loop.
optimizer = nlp.entity.create_optimizer()

In [23]:
# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

In [24]:
# Train only the NER component: all other pipes stay disabled inside the block.
with nlp.disable_pipes(*other_pipes):
    for _epoch in range(10):
        # Reorder the examples in place so every pass sees them differently.
        random.shuffle(com_list)
        losses = {}
        # spaCy's minibatch with a compounding size schedule (4. -> 32., factor 1.001).
        for batch in minibatch(com_list, size=compounding(4., 32., 1.001)):
            texts, annotations = zip(*batch)
            nlp.update(
                texts,
                annotations,
                sgd=optimizer,
                drop=0.1,  # an earlier run used 0.35
                losses=losses,
            )
        print('Losses', losses)

In [25]:
# test the trained model
no_entities[3500]

Entities in 'comparisons of responses to graded dobutamine infusion, y versus o'
DRUG dobutamine


In [None]:
# Run the trained pipeline on a free-text sentence and list what it tags.
# Alternative inputs kept for quick switching:
# test_text = 'comparisons of responses to graded dobutamine infusion, y versus o'
# test_text = no_entities[3500][0]
test_text = 'Cosentyx, trade name Secukinumab, is a human antibody that binds to the protein interleukin (IL)-17A, and is marketed by Novartis for the treatment of psoriasis, ankylosing spondylitis, and psoriatic arthritis.'
doc = nlp(test_text)
print("Entities in '%s'" % test_text)
for entity in doc.ents:
    print(entity.label_, entity.text)

In [None]:
# save model to output directory

# output_dir = Path('./')
# if not output_dir.exists():
#     output_dir.mkdir()
# nlp.meta['name'] = new_model_name  # rename model
# nlp.to_disk(output_dir)
# print("Saved model to", output_dir)

# # test the saved model
# print("Loading from", output_dir)
# nlp2 = spacy.load(output_dir)
# doc2 = nlp2(test_text)
# for ent in doc2.ents:
#     print(ent.label_, ent.text)