In [3]:
import pandas as pd
import spacy
from spacy.tokens import Span

In [4]:
def displayNER(doc, includePunct=False):
  """
    Generate data frame for visualization of spaCy doc with custom attributes.
    One row is produced per selected token; the row index is the token position in doc.
    @ In, doc, spacy.tokens.doc.Doc, the processed document
    @ In, includePunct, bool, optional, if True punctuation tokens are listed as well
    @ Out, df, pandas.DataFrame, token attributes indexed by token position; an empty
      DataFrame when no token is selected (empty or punctuation-only doc)
  """
  rows = []
  # the doc-level extension check does not depend on the token, so evaluate it once
  hasDocCoref = doc.has_extension('coref_chains')
  for i, t in enumerate(doc):
    if t.is_punct and not includePunct:
      continue
    row = {'token': i,
           'text': t.text, 'lemma': t.lemma_,
           'pos': t.pos_, 'dep': t.dep_, 'ent_type': t.ent_type_,
           'ent_iob_': t.ent_iob_}
    if hasDocCoref:
      if t.has_extension('coref_chains') and t._.coref_chains: # coreference chains of the token
        row['coref_chains'] = t._.coref_chains.pretty_representation
      else:
        row['coref_chains'] = None
    # each custom attribute is guarded on its own, so a missing one is skipped instead of raising
    if t.has_extension('ref_n'): # referent name
      row['ref_n'] = t._.ref_n
    if t.has_extension('ref_t'): # referent type
      row['ref_t'] = t._.ref_t
    if t.has_extension('ref_ent'):
      row['ref_ent'] = t._.ref_ent
    rows.append(row)
  if not rows:
    # without rows there is no 'token' column and set_index would raise KeyError
    return pd.DataFrame()
  df = pd.DataFrame(rows).set_index('token')
  df.index.name = None

  return df

In [5]:
def resetPipeline(nlp, pipes):
  """
    Remove all custom pipes from the pipeline, then add the requested pipes.
    Pipes named 'tagger', 'parser', 'ner', 'tok2vec', 'attribute_ruler' and 'lemmatizer'
    are never removed. The resulting model name, language and pipe names are logged at INFO level.
    @ In, nlp, object exposing "pipeline", "remove_pipe", "add_pipe" and "meta"
    @ In, pipes, list, names of the pipes to add, in the given order
    @ Out, None
  """
  # Resolve the logger here instead of relying on a global that is only created in a
  # later cell; getLogger returns the same named logger object, so handlers attached
  # to it elsewhere still apply.
  import logging
  log = logging.getLogger(__name__)
  stockPipes = ('tagger', 'parser', 'ner',
                'tok2vec', 'attribute_ruler', 'lemmatizer')
  # collect the names first: the pipeline must not be modified while it is iterated
  customPipes = [name for (name, _) in nlp.pipeline if name not in stockPipes]
  for name in customPipes:
    nlp.remove_pipe(name)
  # re-add specified pipes
  for name in pipes:
    nlp.add_pipe(name)
  log.info(f"Model: {nlp.meta['name']}, Language: {nlp.meta['lang']}")
  log.info('\n'.join([name for (name, _) in nlp.pipeline]))

In [6]:
def printDepTree(doc, skipPunct=True):
  """
    Pretty print the dependency tree of every sentence in doc.
    Each token is printed as "text [pos, dep]", indented by four spaces per tree level;
    a token is followed by its left children, then its right children.
    @ In, doc, object exposing "sents", each sentence exposing "root"
    @ In, skipPunct, bool, optional, if True tokens with dependency 'punct' are not printed
      (their children are still visited)
    @ Out, None
  """
  for sent in doc.sents:
    # explicit stack of (token, indentation) pairs, popped in pre-order
    pending = [(sent.root, 0)]
    while pending:
      node, indent = pending.pop()
      hidden = skipPunct and node.dep_ == 'punct'
      if not hidden:
        print(" "*indent + f"{node} [{node.pos_}, {node.dep_}]")
      children = list(node.lefts) + list(node.rights)
      # pushed in reverse so that the leftmost child is popped first
      for child in reversed(children):
        pending.append((child, indent + 4))

## Custom pipelines  

In [7]:
from spacy.language import Language
from spacy.tokens import Span
from spacy.matcher import Matcher
from spacy.tokens import Token

In [8]:
# entity labels of interest; in this notebook only referenced by the commented-out filter in initCoref
customLabel = ['STRUCTURE', 'COMPONENT', 'SYSTEM']
# alias table read by aliasResolver: text of an entity's first token -> (name, type);
# left empty here, so aliasResolver finds no alias
aliasLookup = {}

In [9]:
@Language.component("normEntities")
def normEntities(doc):
  """
    Normalize the named entities of doc: strip a leading determiner (POS "DET") and a
    trailing particle (POS "PART", e.g. 's). Entities that become empty are dropped;
    entities that need no trimming are kept as they are.
    @ In, doc, spacy.tokens.doc.Doc
    @ Out, doc, spacy.tokens.doc.Doc
  """
  kept = []
  for ent in doc.ents:
    start, end = ent.start, ent.end
    if ent[0].pos_ == "DET": # leading article
      start += 1
    if start < end and doc[end - 1].pos_ == "PART": # trailing particle like 's
      end -= 1
    if start >= end:
      # nothing left after trimming
      continue
    if (start, end) == (ent.start, ent.end):
      kept.append(ent)
    else:
      kept.append(Span(doc, start, end, label=ent.label))
  doc.ents = tuple(kept)
  return doc

In [10]:
@Language.component("initCoref")
def initCoref(doc):
  """
    Seed the custom attributes on the first token of every entity:
    "ref_n" receives the entity text and "ref_t" the entity label.
    @ In, doc, spacy.tokens.doc.Doc
    @ Out, doc, spacy.tokens.doc.Doc
  """
  # every entity is seeded; a restriction to the labels in customLabel is currently disabled
  for ent in doc.ents:
    head = ent[0]
    head._.ref_n = ent.text
    head._.ref_t = ent.label_
  return doc

In [11]:
@Language.component("aliasResolver")
def aliasResolver(doc):
  """
    Look up the first token of every entity in aliasLookup; on a hit, store the
    resolved name in "ref_n" and the resolved type in "ref_t" of that token.
    The entity labels are then refreshed through propagateEntType.
    @ In, doc, spacy.tokens.doc.Doc
    @ Out, doc, spacy.tokens.doc.Doc
  """
  for ent in doc.ents:
    head = ent[0]
    if head.text not in aliasLookup:
      continue
    # each lookup value is a (name, type) pair
    head._.ref_n, head._.ref_t = aliasLookup[head.text]
  return propagateEntType(doc)

In [12]:
def propagateEntType(doc):
  """
    Propagate the entity type stored in "ref_t": every entity whose first token carries
    a non-empty "ref_n" is rebuilt over the same tokens with the label taken from that
    token's "ref_t"; all other entities are kept unchanged.
    @ In, doc, spacy.tokens.doc.Doc
    @ Out, doc, spacy.tokens.doc.Doc
  """
  def relabel(ent):
    """
      Return ent itself, or a relabeled copy when its first token has a referent.
    """
    head = ent[0]
    if head._.ref_n != '':
      return Span(doc, ent.start, ent.end, label=head._.ref_t)
    return ent

  doc.ents = tuple(relabel(ent) for ent in doc.ents)
  return doc

In [13]:
@Language.component("anaphorCoref")
def anaphorCoref(doc):
  """
    Anaphora resolution using coreferee.
    This pipeline needs to be added after NER.
    The assumption here is: the entities need to be recognized first, then pipeline
    "initCoref" assigns the initial custom attributes "ref_n" and "ref_t", then pipeline
    "aliasResolver" resolves all the aliases used in the text.
    After these pre-processes, "anaphorCoref" copies "ref_n"/"ref_t" onto every token
    that is part of a coreference chain and has no referent yet, taking them from the
    first mention (in chain order) whose first token already carries a "ref_n".
    If the Token extension "coref_chains" is not registered, doc is returned unchanged.
    @ In, doc, spacy.tokens.doc.Doc
    @ Out, doc, spacy.tokens.doc.Doc
  """
  if not Token.has_extension('coref_chains'):
    return doc
  for token in doc:
    coref = token._.coref_chains
    # only tokens that are part of a chain and not already dereferenced
    if not coref or token._.ref_n == '':
      if not coref:
        continue
    else:
      continue
    # first token of every mention of every chain, in chain order
    candidates = (doc[ref[0]] for chain in coref for ref in chain)
    # Stop at the FIRST candidate with a referent. A plain "break" inside the nested
    # loops would only leave the inner loop and let a later chain overwrite the result.
    referent = next((cand for cand in candidates if cand._.ref_n != ''), None)
    if referent is not None:
      token._.ref_n = referent._.ref_n
      token._.ref_t = referent._.ref_t
  return doc

In [14]:
@Language.component("expandEntities")
def expandEntities(doc):
  """
    Expand the current entities: every entity labeled "SSC" is extended to the left over
    all directly preceding NOUN tokens that do not already belong to another entity.
    All other entities, and "SSC" entities that cannot be extended, are kept unchanged.
    @ In, doc, spacy.tokens.doc.Doc
    @ Out, doc, spacy.tokens.doc.Doc
  """
  isUpdated = True
  # one token is absorbed per pass; repeat until a pass changes nothing
  while isUpdated:
    isUpdated = False
    newEnts = []
    for ent in doc.ents:
      if ent.label_ == "SSC" and ent.start != 0:
        prevToken = doc[ent.start - 1]
        # only absorb a NOUN that is outside any entity, otherwise the expanded span
        # would overlap its neighbour and the doc.ents assignment below would fail
        if prevToken.pos_ == 'NOUN' and prevToken.ent_iob_ in ('', 'O'):
          ent = Span(doc, ent.start - 1, ent.end, label=ent.label)
          isUpdated = True
      # always keep the entity, expanded or not, so that no entity is lost
      newEnts.append(ent)
    doc.ents = newEnts
  return doc

In [15]:
# coreferee is not referenced by name in this notebook; presumably it is imported so that
# the 'coreferee' pipe can be added by name later — confirm
import coreferee, spacy
# shared spaCy pipeline object (model "en_core_web_lg"); the custom pipes are added to it below
nlp = spacy.load("en_core_web_lg")
import logging
# module-level logger; resetPipeline reports the resulting pipeline through it
logger = logging.getLogger(__name__)

In [16]:
# Without an explicit level the logger falls back to the root logger's level (WARNING by
# default), so the logger.info calls in resetPipeline would be dropped silently.
logger.setLevel(logging.INFO)
ch = logging.StreamHandler()
# attach the handler only once, so re-running this cell does not duplicate every message
if not logger.handlers:
  logger.addHandler(ch)

In [17]:
#### Using spacy's Token extensions for coreferee
# Remove any earlier registration of the custom attributes (and of the name 'ref_t_',
# which this cell does not register itself), then register "ref_n" and "ref_t" with an
# empty-string default.
for _extName in ('ref_n', 'ref_t', 'ref_t_'):
  if Token.has_extension(_extName):
    _ = Token.remove_extension(_extName)
for _extName in ('ref_n', 'ref_t'):
  Token.set_extension(_extName, default='')

In [18]:
# names of the pipes handed to resetPipeline, in the order they are added
pipelines = [
  'entity_ruler',
  'normEntities',
  'initCoref',
  'aliasResolver',
  'coreferee',
  'anaphorCoref',
  'expandEntities',
]

In [19]:
# echo the configured pipe names
pipelines

['entity_ruler',
 'normEntities',
 'initCoref',
 'aliasResolver',
 'coreferee',
 'anaphorCoref',
 'expandEntities']

In [20]:
# drop the custom pipes currently in nlp and add the ones listed in pipelines
resetPipeline(nlp, pipelines)

In [21]:
# inspect the resulting (name, component) pairs
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x13e63a6d0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x10ea68180>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x13e350ee0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x13e616380>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x13e376040>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x13e350ca0>),
 ('entity_ruler', <spacy.pipeline.entityruler.EntityRuler at 0x13e36e080>),
 ('normEntities', <function __main__.normEntities(doc)>),
 ('initCoref', <function __main__.initCoref(doc)>),
 ('aliasResolver', <function __main__.aliasResolver(doc)>),
 ('coreferee', <coreferee.manager.CorefereeBroker at 0x14545d7c0>),
 ('anaphorCoref', <function __main__.anaphorCoref(doc)>),
 ('expandEntities', <function __main__.expandEntities(doc)>)]

In [22]:
# Sample text processed below, one statement per line.
# NOTE: the continuation lines are indented INSIDE the literal, so the line breaks and the
# leading blanks are part of the text handed to nlp (the dependency tree printed further
# down contains SPACE tokens).
text = r"""A leak was noticed from the RCP pump 1A.
          The RCP pump 1A pressure gauge was found not operating, and it was found inoperative.
          The RCP pump 1A pressure gauge was found inoperative.
          Rupture of pump bearings caused shaft degradation.
          Rupture of pump bearings caused shaft degradation and consequent flow reduction.
          Pump power supply has been found burnout.
          Pump test failed due to power supply failure.
          Pump inspection revealed excessive impeller degradation.
          Pump inspection revealed excessive impeller degradation likely due to cavitation.
        """

In [23]:
# entity_ruler pattern: the single token "gauge" (compared in lower case) gets label "comp" and id "ssc"
# NOTE(review): expandEntities only expands entities labeled "SSC", while this pattern
# assigns "comp" — confirm which label is intended
patterns = [{"label":"comp", "pattern":[{"LOWER":"gauge"}], "id":"ssc"}]
ruler = nlp.get_pipe('entity_ruler')
ruler.add_patterns(patterns)
# stand-alone Matcher rule for the token "pump"; it is applied directly to the doc
# further down and is not added to the nlp pipeline
rules = [{"LOWER":"pump"}]
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)
matcher.add('comp', [rules])

In [24]:
# run the configured pipeline on the sample text
doc = nlp(text)

In [25]:
# apply the stand-alone "pump" matcher; as_spans=True requests the matches as spans
matches = matcher(doc, as_spans=True)
for span in matches:
    # print the sentence containing the match, followed by the match label
    print(span.sent, span.label_)

A leak was noticed from the RCP pump 1A.
          The RCP pump 1A pressure gauge was found not operating, and it was found inoperative. comp
A leak was noticed from the RCP pump 1A.
          The RCP pump 1A pressure gauge was found not operating, and it was found inoperative. comp
The RCP pump 1A pressure gauge was found inoperative. comp

          Rupture of pump bearings caused shaft degradation. comp

          Rupture of pump bearings caused shaft degradation and consequent flow reduction. comp

          Pump power supply has been found burnout. comp

          Pump test failed due to power supply failure. comp

          Pump inspection revealed excessive impeller degradation. comp

          Pump inspection revealed excessive impeller degradation likely due to cavitation. comp


In [26]:
# check the container type of doc.ents
print(type(doc.ents))

<class 'tuple'>


In [27]:
from spacy import displacy

In [28]:
# render the entities of doc inline in the notebook
displacy.render(doc, style='ent', jupyter=True)

In [29]:
# Every dict of a token pattern describes exactly ONE token, so the two-word phrase
# "pressure gauge" must be written as two consecutive token specs; a single
# {"LOWER": "pressure gauge"} spec can never match. The phrase is followed by any NOUN.
patterns = [{"label":"comp",
             "pattern":[{"LOWER":"pressure"}, {"LOWER":"gauge"}, {"POS":"NOUN"}],
             "id":"ssc"}]

In [30]:
# print the dependency tree of every sentence (punctuation skipped by default)
printDepTree(doc)

noticed [VERB, ROOT]
    leak [NOUN, nsubjpass]
        A [DET, det]
    was [AUX, auxpass]
    from [ADP, prep]
        RCP [NOUN, pobj]
            the [DET, det]
    pump [VERB, advcl]
        
           [SPACE, dobj]
            1A. [NUM, compound]
    found [VERB, conj]
        gauge [NOUN, nsubjpass]
            pump [NOUN, compound]
                RCP [NOUN, nsubj]
                    The [DET, det]
            1A [NOUN, compound]
            pressure [NOUN, compound]
        was [AUX, auxpass]
        operating [VERB, xcomp]
            not [PART, neg]
        and [CCONJ, cc]
        found [VERB, conj]
            it [PRON, nsubjpass]
            was [AUX, auxpass]
            inoperative [ADJ, oprd]

           [SPACE, ROOT]
found [VERB, ROOT]
    gauge [NOUN, nsubjpass]
        pump [NOUN, compound]
            RCP [NOUN, nsubj]
                The [DET, det]
        1A [NOUN, compound]
        pressure [NOUN, compound]
    was [AUX, auxpass]
    inoperative [ADJ, oprd]
cau

In [31]:
# tabulate the token attributes of doc (punctuation tokens excluded by default)
df = displayNER(doc)

In [32]:
# display the token table
df

Unnamed: 0,text,lemma,pos,dep,ent_type,ent_iob_,coref_chains,ref_n,ref_t
0,A,a,DET,det,,O,,,
1,leak,leak,NOUN,nsubjpass,,O,,,
2,was,be,AUX,auxpass,,O,,,
3,noticed,notice,VERB,ROOT,,O,,,
4,from,from,ADP,prep,,O,,,
...,...,...,...,...,...,...,...,...,...
94,likely,likely,ADV,ccomp,,O,,,
95,due,due,ADP,prep,,O,,,
96,to,to,ADP,pcomp,,O,,,
97,cavitation,cavitation,NOUN,pobj,,O,,,


In [33]:
# one-line summary of the coreference chains found in doc, as "mention(token index)" lists
doc._.coref_chains.pretty_representation

'0: RCP(6), RCP(11), RCP(29); 1: gauge(15), it(22); 2: Rupture(39), Rupture(48)'

In [34]:
# list the entities left after the custom pipes have run
for ent in doc.ents:
    print(ent)

RCP
RCP
1A
gauge
RCP
1A
gauge


In [35]:
# token 22 is the pronoun "it" (chain 1 of the coreference output above); show the referent name stored on it
doc[22]._.ref_n

'gauge'

In [36]:
# Debug trace of the loop used in anaphorCoref: same logic, with prints added.
# Like the pipeline component, it writes ref_n/ref_t on the tokens of doc.
for token in doc:
    coref = token._.coref_chains
    
    # if token is coref and not already dereferenced
    if coref and token._.ref_n == '':
      print('token', token)
      # print(token,coref.pretty_representation)
      # check all the references, if "ref_n" is available (determined by NER and initCoref),
      # the value of "ref_n" will be assigned to current token
      for chain in coref:
        for ref in chain:
          # ref[0] is used as the index into doc of the mention's first token
          refToken = doc[ref[0]]
          print(refToken)
          print(refToken._.ref_n)
          if refToken._.ref_n != '':
            token._.ref_n = refToken._.ref_n
            token._.ref_t = refToken._.ref_t
            # NOTE: leaves only the inner loop; the remaining chains are still visited
            break

token Rupture
Rupture

Rupture

token Rupture
Rupture

Rupture



In [37]:
import spacy
from spacy.matcher import Matcher
# NOTE: rebinds the global `matcher`, replacing the "pump" matcher created earlier
matcher = Matcher(nlp.vocab)
# three-token pattern: "hello", one punctuation token, "world" (words compared in lower case)
pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
matcher.add("HelloWorld", [pattern])

In [38]:
# look up what is stored under the rule name; the second item of the result is the list of patterns
matcher.get('HelloWorld')

(None, [[{'LOWER': 'hello'}, {'IS_PUNCT': True}, {'LOWER': 'world'}]])

In [39]:
# show the entity tuple of doc
doc.ents

(RCP, RCP, 1A, gauge, RCP, 1A, gauge)

In [48]:
# collect the distinct sentences that contain at least one entity,
# in the order in which the entities are iterated
sl = []
for ent in doc.ents:
    sent = ent.sent
    # list membership test: keeps the first occurrence and skips repeated sentences
    if sent not in sl:
        sl.append(sent)
print(sl)

[A leak was noticed from the RCP pump 1A.
          The RCP pump 1A pressure gauge was found not operating, and it was found inoperative., The RCP pump 1A pressure gauge was found inoperative.]


In [50]:
# for every collected sentence show its entities, first as returned and then as a set
for sent in sl:
    print(sent.ents)
    print(set(sent.ents))

[RCP, RCP, 1A, gauge]
{1A, gauge, RCP, RCP}
[RCP, 1A, gauge]
{RCP, 1A, gauge}


In [58]:
# print the index (token.i) of every token of the collected sentences
for sent in sl:
    for token in sent:
        print(token.i)
        

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
28
29
30
31
32
33
34
35
36
37
