In [1]:
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy import displacy

In [2]:
# Load spaCy's small English pipeline; its 'ner' component populates doc.ents used below.
nlp = spacy.load('en_core_web_sm')

In [3]:
# Run the sample sentence through the unmodified pipeline.
doc = nlp("Best Behavioral Economist of this generation is Dr. Richard Thaler")

In [4]:
# Show every detected entity as a (text, label) pair; note the PERSON span excludes the title "Dr.".
print([(ent.text,ent.label_) for ent in doc.ents])

[('Richard Thaler', 'PERSON')]


### Expanding Named Entities

The corpus that spaCy’s English models were trained on defines a PERSON entity as just the person’s name, without titles like “Mr” or “Dr”. This makes sense, because it makes it easier to resolve the entity type back to a knowledge base. But what if your application needs the full names, including the titles? The custom pipeline component below handles this by extending a PERSON span one token to the left whenever the preceding token is a title.

In [5]:
def addTitle(doc):
    """Pipeline component that widens PERSON entities to include a leading title.

    For every entity in ``doc.ents`` labelled ``PERSON`` that is not at the
    very start of the document, the token immediately before the entity is
    inspected. If its text is one of the recognised titles, the entity is
    replaced by a new ``Span`` that starts one token earlier and keeps the
    same label. All other entities (non-PERSON entities, PERSON entities at
    token 0, and PERSON entities not preceded by a title) are kept unchanged.

    Args:
        doc: The document being processed; its ``ents`` are read and reassigned.

    Returns:
        The same ``doc`` object, with ``doc.ents`` updated.
    """
    titles = ('Dr','Dr.','Mr','Mr.','Mrs','Mrs.','Master','Master.','Miss','Miss.')
    newEnt = []
    for ent in doc.ents:
        # ent.start != 0 is checked first so doc[ent.start - 1] never wraps
        # around to the last token of the document.
        if (ent.label_ == 'PERSON' and ent.start != 0
                and doc[ent.start - 1].text in titles):
            newEnt.append(Span(doc, ent.start - 1, ent.end, label = ent.label_))
        else:
            # Keep everything else as-is; previously these entities were
            # silently dropped when doc.ents was overwritten.
            newEnt.append(ent)
    doc.ents = newEnt
    return doc

In [6]:
# Register addTitle immediately after the built-in 'ner' component, so it runs once doc.ents is set.
# NOTE(review): passing the function object directly is the spaCy v2 add_pipe form — confirm the installed spaCy version.
nlp.add_pipe(addTitle, after = 'ner')

In [7]:
# Re-process the same sentence, now with addTitle in the pipeline.
doc = nlp("Best Behavioral Economist of this generation is Dr. Richard Thaler")

In [8]:
# The PERSON entity should now include the preceding title.
print([(ent.text,ent.label_) for ent in doc.ents])

[('Dr. Richard Thaler', 'PERSON')]


### Using Dependency Parsing and POS for implementing Custom Rules

In [9]:
# Reload the model to get a fresh pipeline without the addTitle component added earlier.
nlp = spacy.load('en_core_web_sm')

In [10]:
# Parse a sentence whose main verb is accompanied by an auxiliary ("was working").
doc = nlp('Alex Smith was working at Google')

In [11]:
# Visualise the dependency parse; 'compact' selects the compact arrow layout and 'distance' sets the spacing between words.
displacy.render(doc, style='dep', options = {'compact':True, 'distance':100})

In [12]:
def get_person_orgs(doc):
    """Print person/organisation relations found around the verb "work".

    For each PERSON entity whose root token's head has the lemma ``work``,
    every ``prep`` child of that head is examined and one dict is printed per
    preposition with the keys ``person``, ``orgs`` (the preposition's children
    whose entity type is ORG) and ``past`` (True only when the head verb is
    tagged VBD). Auxiliary verbs are not taken into account.

    Args:
        doc: The processed document.

    Returns:
        The same ``doc``, unmodified, so the function can be used as a
        pipeline component.
    """
    for person in doc.ents:
        if person.label_ != "PERSON":
            continue
        verb = person.root.head
        if verb.lemma_ != 'work':
            continue
        for child in verb.children:
            if child.dep_ != 'prep':
                continue
            employers = [tok for tok in child.children if tok.ent_type_ == 'ORG']
            is_past = verb.tag_ == "VBD"
            print({'person': person, 'orgs': employers, 'past': is_past})
    return doc

In [13]:
from spacy.pipeline import merge_entities

In [14]:
# Start again from a fresh pipeline before adding the custom components below.
nlp = spacy.load('en_core_web_sm')

In [15]:
# Merge every named entity into a single token, so multi-word entities such as "Alex Smith" are handled as one token downstream.
nlp.add_pipe(merge_entities)

In [16]:
# Append get_person_orgs to the pipeline; it prints its findings each time a text is processed.
nlp.add_pipe(get_person_orgs)

In [17]:
# Simple past tense: get_person_orgs sets 'past' from the head verb's VBD tag alone.
doc = nlp('Alex Smith worked at Google')

{'person': Alex Smith, 'orgs': [Google], 'past': True}


In [18]:
# Past progressive: the head verb "working" is not tagged VBD, so this version reports 'past': False.
doc = nlp('Alex Smith was working at Google')

{'person': Alex Smith, 'orgs': [Google], 'past': False}


### Modifying get_person_orgs to account for auxiliary verbs

In [19]:
def get_person_orgs(doc):
    """Print person/organisation relations around "work", honouring auxiliaries.

    For each PERSON entity whose root token's head has the lemma ``work``,
    one dict is printed per ``prep`` child of that head, with the keys
    ``person``, ``orgs`` (the preposition's children whose entity type is
    ORG) and ``past``. ``past`` is True when the head verb is tagged VBD, or
    when it is tagged VBG and has an ``aux`` child tagged VBD (e.g. "was
    working"). If the verb has no ``prep`` child, nothing is printed for
    that person.

    Args:
        doc: The processed document.

    Returns:
        The same ``doc``, unmodified, so the function can be used as a
        pipeline component.
    """
    person_entities = [ent for ent in doc.ents if ent.label_ == "PERSON"]
    for ent in person_entities:
        head = ent.root.head
        if head.lemma_ != 'work':
            continue

        # Materialise the children once: they are scanned for both
        # auxiliaries and prepositions.
        children = list(head.children)

        # Tense depends only on the verb and its auxiliaries, so compute it
        # once per person instead of once per preposition.
        past_aux = any(t.dep_ == 'aux' and t.tag_ == 'VBD' for t in children)
        past = head.tag_ == 'VBD' or (head.tag_ == 'VBG' and past_aux)

        for prep in children:
            if prep.dep_ != 'prep':
                continue
            orgs = [token for token in prep.children if token.ent_type_ == 'ORG']
            # Report inside the loop: with no preposition there is nothing to
            # print (and no unbound `orgs`), and with several each is reported.
            print({'person': ent, 'orgs': orgs, 'past': past})
    return doc

In [20]:
# Reload the model so the earlier get_person_orgs component is no longer attached to the pipeline.
nlp = spacy.load('en_core_web_sm')

In [21]:
# Merge every named entity into a single token before the custom rule runs.
nlp.add_pipe(merge_entities)

In [22]:
# Append the most recently defined get_person_orgs (the auxiliary-aware one) to the pipeline.
nlp.add_pipe(get_person_orgs)

In [23]:
# Simple past tense should still be reported as past.
doc = nlp('Alex Smith worked at Google')

{'person': Alex Smith, 'orgs': [Google], 'past': True}


In [24]:
# Past progressive: the VBD auxiliary "was" now makes 'past' True.
doc = nlp('Alex Smith was working at Google')

{'person': Alex Smith, 'orgs': [Google], 'past': True}
