In [1]:
import spacy
nlp = spacy.load("en_core_web_lg")
nlp.add_pipe("merge_entities")

<function spacy.pipeline.functions.merge_entities(doc: spacy.tokens.doc.Doc)>

In [2]:
doc = nlp("A leak was noticed from the RCP pump 1A.")
spacy.displacy.render(doc, style="dep")

In [3]:
doc = nlp("RCP pump 1A pressure gauge was found not operating.")
spacy.displacy.render(doc, style="dep")

In [4]:
doc = nlp("RCP pump 1A pressure gauge was found inoperative.")
spacy.displacy.render(doc, style="dep")

In [5]:
doc = nlp("Rupture of pump bearings caused shaft degradation.")
spacy.displacy.render(doc, style="dep")

In [6]:
doc = nlp("Rupture of pump bearings caused shaft degradation and consequent flow reduction.")
spacy.displacy.render(doc, style="dep")

In [7]:
doc = nlp("Pump power supply has been found burnout.")
spacy.displacy.render(doc, style="dep")

In [8]:
doc = nlp("Pump test failed due to power supply failure.")
spacy.displacy.render(doc, style="dep")

In [9]:
doc = nlp("Pump inspection revealed excessive impeller degradation.")
spacy.displacy.render(doc, style="dep")

In [10]:
doc = nlp("Pump inspection revealed excessive impeller degradation likely due to cavitation.")
spacy.displacy.render(doc, style="dep")

In [11]:
doc = nlp("Pump inspection revealed excessive impeller degradation likely due to cavitation.")
spacy.displacy.render(doc, style="ent")



In [32]:
if nlp.has_pipe('entity_ruler'):
    nlp.remove_pipe('entity_ruler')
if nlp.has_pipe('merge_entities'):
    nlp.remove_pipe('merge_entities')

ruler = nlp.add_pipe("entity_ruler")
patterns = [{"label": "ORG", "pattern": "MyCorp Inc."}]
ruler.add_patterns(patterns)
nlp.add_pipe("merge_entities")

doc = nlp("MyCorp Inc. is a company in the U.S.")
print([(ent.text, ent.label_) for ent in doc.ents])
spacy.displacy.render(doc, style='ent')

[('MyCorp Inc.', 'ORG'), ('U.S.', 'GPE')]


In [34]:
for ent in doc.ents:
    print(ent.label_)

ORG
GPE


In [13]:

spacy.displacy.render(doc, style='dep')

In [14]:
doc = nlp("Alex Smith worked at Acme Corp Inc.")
# If you're not in a Jupyter / IPython environment, use displacy.serve
spacy.displacy.render(doc, options={"fine_grained": True})

## Example that can be used for Relation Extraction after Named Entity Recognizer 

In [15]:
import spacy
from spacy.language import Language
from spacy import displacy

nlp = spacy.load("en_core_web_sm")

# @Language.component("extract_person_orgs")
# def extract_person_orgs(doc):
#     person_entities = [ent for ent in doc.ents if ent.label_ == "PERSON"]
#     for ent in person_entities:
#         head = ent.root.head
#         if head.lemma_ == "work":
#             preps = [token for token in head.children if token.dep_ == "prep"]
#             for prep in preps:
#                 orgs = [token for token in prep.children if token.ent_type_ == "ORG"]
#                 print({'person': ent, 'orgs': orgs, 'past': head.tag_ == "VBD"})
#     return doc

@Language.component("extract_person_orgs")
def extract_person_orgs(doc):
    person_entities = [ent for ent in doc.ents if ent.label_ == "PERSON"]
    for ent in person_entities:
        head = ent.root.head
        if head.lemma_ == "work":
            preps = [token for token in head.children if token.dep_ == "prep"]
            for prep in preps:
                orgs = [t for t in prep.children if t.ent_type_ == "ORG"]
                aux = [token for token in head.children if token.dep_ == "aux"]                
                past_aux = any(t.tag_ == "VBD" for t in aux)                
                past = head.tag_ == "VBD" or head.tag_ == "VBG" and past_aux                
                print({'person': ent, 'orgs': orgs, 'past': past})
    return doc

# To make the entities easier to work with, we'll merge them into single tokens
nlp.add_pipe("merge_entities")
nlp.add_pipe("extract_person_orgs")

doc = nlp("Alex Smith worked at Acme Corp Inc.")
# If you're not in a Jupyter / IPython environment, use displacy.serve
displacy.render(doc, options={"fine_grained": True})

{'person': Alex Smith, 'orgs': [Acme Corp Inc.], 'past': True}


## Coreferee Resolution: require spacy>=3.1.0,<3.2.0
WIP PR for spacy to integrate both coreferee and nueralcoef into spacy
https://github.com/explosion/spaCy/pull/7264

In [39]:
import coreferee, spacy
nlp = spacy.load('en_core_web_trf')
nlp.add_pipe('coreferee')

<coreferee.manager.CorefereeBroker at 0x1408bbd60>

In [43]:
doc = nlp("Although he was very busy with his work, Peter had had enough of it. He and his wife decided they needed a holiday. They travelled to Spain because they loved the country very much.")

In [44]:
doc._.coref_chains.print()

0: he(1), his(6), Peter(9), He(16), his(18)
1: work(7), it(14)
2: [He(16); wife(19)], they(21), They(26), they(31)
3: Spain(29), country(34)


coreferee.data_model.ChainHolder

In [66]:
for token in doc:
    if token._.coref_chains:
        # print(token._.coref_chains)
        for chain in token._.coref_chains:
            for tk in chain:
                print(doc[tk[0]])
        ccc
            

he
his
Peter
He
his


NameError: name 'ccc' is not defined

In [80]:
for ent in doc.ents:
    print(ent.root.head)

had
to


In [50]:
doc[0]

Although

In [82]:
doc[16]._.coref_chains.print()

0: he(1), his(6), Peter(9), He(16), his(18)
2: [He(16); wife(19)], they(21), They(26), they(31)


In [20]:
doc._.coref_chains.resolve(doc[31])

[Peter, wife]

#### chains and mentions can be navigated much as if they were lists 

In [21]:
for chain in doc._.coref_chains:
    for mention in chain:
        print(mention)

[1]
[6]
[9]
[16]
[18]
[7]
[14]
[16, 19]
[21]
[26]
[31]
[29]
[34]


In [22]:
doc._.coref_chains[1].pretty_representation

'1: work(7), it(14)'

## Example from https://github.com/blueprints-for-text-analytics-python/blueprints-text/blob/master/ch12/Knowledge_Graph.ipynb

In [24]:
import spacy
from spacy import displacy
from spacy.pipeline import EntityRuler
from spacy.tokens import Span

nlp = spacy.load('en_core_web_sm')
print(*nlp.pipeline, sep='\n')

('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec object at 0x14a7672c0>)
('tagger', <spacy.pipeline.tagger.Tagger object at 0x14a767360>)
('parser', <spacy.pipeline.dep_parser.DependencyParser object at 0x141fef100>)
('attribute_ruler', <spacy.pipeline.attributeruler.AttributeRuler object at 0x140e48dc0>)
('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer object at 0x14a165f40>)
('ner', <spacy.pipeline.ner.EntityRecognizer object at 0x141ff7160>)


In [35]:
from spacy.tokens import Span
from spacy.language import Language

@Language.component("normEntities")
def norm_entities(doc):
    ents = []
    for ent in doc.ents:
        if ent[0].pos_ == "DET": # leading article
            ent = Span(doc, ent.start+1, ent.end, label=ent.label)
        if len(ent) > 0:
            if ent[-1].pos_ == "PART": # trailing particle like 's
                ent = Span(doc, ent.start, ent.end-1, label=ent.label)
            if len(ent) > 0:
                ents.append(ent)
    doc.ents = tuple(ents)
    return doc

In [37]:
nlp.add_pipe("normEntities")

<function __main__.norm_entities(doc)>