spaCy for Entity Recognition

References :
- https://spacy.io/usage/visualizers
- https://explosion.ai/blog/deep-learning-formula-nlp
- https://spacy.io/usage/training#ner

- video 
https://www.youtube.com/watch?v=l4scwf8KeIA
- training 
https://towardsdatascience.com/a-review-of-named-entity-recognition-ner-using-automatic-summarization-of-resumes-5248a75de175

## Creating training dataset

- format
    - ("Uber blew through $1 million a week", {'entities':[(0, 4, 'ORG')]}), 

### retrieve sample table data

In [1]:
import db_conn
import pandas as pd

  """)


In [2]:
# Obtain a connection from the project's db_conn helper and open a cursor
# that the query cells below reuse.
conn = db_conn.get_connection()
cur = conn.cursor()

In [3]:
# Fetch every table title from article_tables, lower-cased by the database.
cur.execute("""
select lower(table_title) from article_tables;
""")
# Rows come back as returned by the cursor; the title is the first column.
titles = cur.fetchall()

In [4]:
# Keep only the first column of each fetched row, i.e. the title itself.
titles = [row[0] for row in titles]
len(titles)

46691

In [5]:
titles[:5]

['table 2 comparison between rome iv functional dyspepsia subtypes',
 'table 3 characteristics of children included in analyses (n=2449)',
 'table 1 univariate association between baseline clinical and laboratory variables and aortic sclerosis',
 'table 3 comparisons of responses to graded dobutamine infusion, y versus o',
 'table 2 estimated marginal means and f tests of time × condition interactions']

## retrieve all drug names

In [6]:
# Distinct lower-cased names from the cui1_str column, loaded straight
# into a DataFrame.
cur.execute("""
SELECT distinct(lower(cui1_str)) FROM dict_collapsed_final;
""")
d1 = pd.DataFrame(cur.fetchall())
d1.head()

Unnamed: 0,0
0,indoramin
1,rescinnamine
2,rivastigmine
3,edoxaban
4,efaproxiral


In [7]:
# Distinct lower-cased names from the cui2_str column, loaded straight
# into a DataFrame.
cur.execute("""
SELECT distinct(lower(cui2_str)) FROM dict_collapsed_final;
""")
d2 = pd.DataFrame(cur.fetchall())
d2.head()

Unnamed: 0,0
0,digoxin didier
1,efaproxiral
2,vitaminum a
3,urion
4,klonopin


In [8]:
# Merge both name columns into one de-duplicated table.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4
# and removed in pandas 2.0.
drugs = pd.concat([d1, d2], ignore_index=True)
drugs.columns = ['name']
# Drop missing names (a NULL would break the .strip() call made when the
# names are fed to the keyword matcher) and duplicates, then KEEP the
# re-indexed frame: the original discarded the return value of
# reset_index(), so `drugs` retained repeated / gappy index labels.
drugs = drugs.dropna(subset=['name']).drop_duplicates().reset_index(drop=True)
drugs

Unnamed: 0,name
0,indoramin
1,rescinnamine
2,rivastigmine
3,edoxaban
4,efaproxiral
5,anidulafungin
6,methylergometrine
7,procarbazine
8,bambuterol
9,cyproheptadine


In [None]:
# Write the drug table to disk as a comma-separated file without a header row.
drugs.to_csv('./drug_dictionary.csv', sep=',', header=False)

## find drug names

In [9]:
from flashtext import KeywordProcessor

In [10]:
# Build a case-insensitive matcher over every known drug name.
keyword_processor = KeywordProcessor(case_sensitive=False)
for name in drugs['name']:
    # Skip missing or blank names instead of crashing on .strip().
    if isinstance(name, str) and name.strip():
        keyword_processor.add_keyword(name.strip())

# Training examples in the format shown at the top of the notebook:
# (text, {'entities': [(start, end, label), ...]}).
c_entities = []
for ti in titles:
    if not ti:
        continue  # missing / empty title: nothing to annotate
    # span_info=True makes flashtext return (keyword, start, end) for each
    # match. The original recomputed offsets with ti.find(f), which always
    # points at the FIRST substring occurrence: a drug mentioned twice
    # produced duplicate spans, and a name embedded in an earlier word
    # produced a span that did not line up with the matched token.
    found = keyword_processor.extract_keywords(ti, span_info=True)
    ents = [(start, end, 'DRUG') for _, start, end in found]
    if ents:
        c_entities.append((ti, {'entities': ents}))

In [11]:
c_entities[:2]

[('table 3 comparisons of responses to graded dobutamine infusion, y versus o',
  {'entities': [(43, 53, 'DRUG')]}),
 ('table univariate and multivariate analyses of baseline factors associated with progression-free survival and overall survival in patients treated with dabrafenib and trametinib',
  {'entities': [(151, 161, 'DRUG'), (166, 176, 'DRUG')]})]

Reference : https://spacy.io/usage/training#train-entity  

- The model's predictions are based on the examples it has seen during training.
- The model generalizes rather than memorizes, which is why the training data should be representative of the data we want to process.

In [12]:
from __future__ import unicode_literals, print_function

import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding

In [13]:
import spacy
import random
# Load the pipeline registered under the name 'en'.
# NOTE(review): 'en' looks like a spaCy v2 shortcut link; newer spaCy
# releases expect a full package name — confirm against the installed version.
nlp = spacy.load('en')

In [14]:
# nlp = spacy.load(model)  # load existing spaCy model
# print("Loaded model '%s'" % model)

In [15]:
# Reuse the pipeline's entity recognizer when it already has one, so labels
# can be added to it; otherwise create the built-in 'ner' component
# (nlp.create_pipe works for built-ins registered with spaCy) and add it.
if 'ner' in nlp.pipe_names:
    ner = nlp.get_pipe('ner')
else:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner)

In [16]:
# Register the new 'DRUG' entity label with the entity recognizer.
ner.add_label('DRUG') 

In [17]:
  # add new entity label to entity recognizer
# nlp.begin_training() is deliberately not used here: 'begin_training'
# initializes the models, so it would zero out the existing entity types.
# An optimizer is created for the already-loaded model instead.
# NOTE(review): nlp.entity presumably resolves to the 'ner' component
# obtained earlier — confirm against the installed spaCy version.
optimizer = nlp.entity.create_optimizer()

In [18]:
# Names of all pipeline components other than 'ner', collected so they can
# be disabled during training.
other_pipes = [name for name in nlp.pipe_names if name != 'ner']

In [None]:
# random.shuffle works in place, so the original call on the slice
# c_entities[50:] only shuffled a throwaway copy and every epoch saw the
# examples in exactly the same order. Shuffle a private copy of the full
# training set instead (c_entities itself is left untouched).
train_data = list(c_entities)
with nlp.disable_pipes(*other_pipes):  # only train NER
    for itn in range(100):
        random.shuffle(train_data)
        losses = {}  # passed to nlp.update, printed once per epoch
        # batch up the examples using spaCy's minibatch, with the batch
        # size driven by compounding(4., 32., 1.001)
        batches = minibatch(train_data, size=compounding(4., 32., 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.35,
                       losses=losses)
        print('Losses', losses)

Losses {'ner': 699.0591161333103}
Losses {'ner': 614.5265844495248}
Losses {'ner': 602.5799704423433}
Losses {'ner': 594.6451911860214}
Losses {'ner': 593.8750052786496}
Losses {'ner': 594.6505455840362}
Losses {'ner': 588.652153398819}
Losses {'ner': 592.572937130124}
Losses {'ner': 587.4335027273939}
Losses {'ner': 586.2609388498374}
Losses {'ner': 588.1102732083157}
Losses {'ner': 588.473771588657}
Losses {'ner': 586.2142116063847}
Losses {'ner': 589.2023367660195}


In [None]:
# Run the trained pipeline on one sample title and list what it tagged.
test_text = 'comparisons of responses to graded dobutamine infusion, y versus o'
doc = nlp(test_text)
print("Entities in '%s'" % test_text)
for entity in doc.ents:
    print(entity.label_, entity.text)

In [None]:
# save model to output directory

# new_model_name was referenced below without being defined in any visible
# cell, which raises NameError; give it an explicit value here.
new_model_name = 'drug_ner'

output_dir = Path('./')
# parents/exist_ok make directory creation safe for nested or already
# existing paths, replacing the separate exists() check.
output_dir.mkdir(parents=True, exist_ok=True)
nlp.meta['name'] = new_model_name  # rename model
nlp.to_disk(output_dir)
print("Saved model to", output_dir)

# test the saved model: reload it from disk and run it on the same sample
# text used for the in-memory check (test_text).
print("Loading from", output_dir)
nlp2 = spacy.load(output_dir)
doc2 = nlp2(test_text)
for ent in doc2.ents:
    print(ent.label_, ent.text)

In [None]:
# import spacy
# import random
# nlp = spacy.load('en')

# DRUG = nlp.vocab.strings.add('DRUG')

# optimizer = nlp.begin_training()
# for i in range(10):
#     random.shuffle(c_entities)
#     for text, annotations in c_entities:
#         nlp.update([text], [annotations], drop=0.5, sgd=optimizer)
# nlp.to_disk('./model')