In [1]:
%reset -f

In [3]:
# One time run.
# ! python -m spacy download en_core_web_sm

  tok2vec.upstream_name if tok2vec.upstream_name is not "*" else "tok2vec"
Collecting en-core-web-sm==3.0.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0-py3-none-any.whl (13.7 MB)
[K     |████████████████████████████████| 13.7 MB 13.2 MB/s eta 0:00:01
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.0.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


## Imports

In [20]:
from pronto import Ontology
import spacy
from spacy.tokens import Doc, Span, Token
from spacy.matcher import PhraseMatcher
from spacy.util import filter_spans
from spacy.language import Language
#import progressbar

from spacy import displacy


In [21]:
class ENVOExtractor(object):
    
    @Language.component('envo_extractor')
    
    def __init__(self, nlp, label="ENVO"):
        self.label = label
        
        print('Loading ENVO')
        envo = Ontology("http://purl.obolibrary.org/obo/envo.owl")
        
        self.terms = {}
        patterns = []
        
        barStatus = 0
        ner_terms = len(envo)
        print("Importing terms")
        bar = progressbar.ProgressBar(maxval=ner_terms, 
                                      widgets=[progressbar.Bar('=', '[', ']'), progressbar.Percentage()])
        #bar.start()
        
        # Iterate over terms in ontology
        for term in envo:
            if term.name is not None:
                self.terms[term.name.lower()] = {'id': term.id}
                patterns.append(nlp(term.name))
            #barStatus += 1
            
        #bar.finish()
        
        # Initialize matcher & add patterns
        self.matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
        self.matcher.add(label, None, *patterns)
        
        # Set extensions to tokens, spans and docs
        Token.set_extension('is_envo_term', default=False, force=True)
        Token.set_extension("envo_id", default=False, force=True)
        Token.set_extension("merged_concept", default=False, force=True)
        Doc.set_extension("has_envo_ids", getter=self.has_envo_ids, force=True)
        Doc.set_extension("envo_ids", default=[], force=True)
        Span.set_extension("has_envo_ids", getter=self.has_envo_ids, force=True)
        
    def __call__(delf, doc):
        matches = self.matcher(doc)
        spans = [Span(doc, match[1], match[2], label=self.label) for match in matches]
        for i, span in enumerate(spans):
          span._.set("has_envo_ids", True)
          for token in span:
               token._.set("is_envo_term", True)
               token._.set("envo_id", self.terms[span.text.lower()]["id"])

        with doc.retokenize() as retokenizer:
            for span in filter_spans(spans):
                retokenizer.merge(span, attrs={"_": {"merged_concept": True}})
                doc._.envo_ids = list(doc._.envo_ids) + [span]

        return doc
        
    # Setter function for doc level
    def has_envo_ids(self, tokens):
        return any([t._.get("is_envo_term") for t in tokens])
        

In [22]:
%%time
nlp = spacy.load("en_core_web_sm")
envo_extractor = ENVOExtractor(nlp)
nlp.add_pipe('envo_extractor', after='ner')
nlp.pipeline

Loading ENVO
Importing terms
CPU times: user 18.5 s, sys: 58.2 ms, total: 18.5 s
Wall time: 19.7 s


[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7fd6d49e3a90>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7fd6d4b7f8b0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7fd6e22d2220>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7fd6e2290ca0>),
 ('envo_extractor',
  <function __main__.ENVOExtractor.__init__(self, nlp, label='ENVO')>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7fd6e227b580>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7fd6e2251d00>)]

In [18]:
test = "Whether this may represent an early manifestation of a viral myocarditis is not certain,\
but there was no significant brisk lymphocytic inflammatory infiltrate consistent with the\
typical pattern of viral myocarditis...\
There is prior evidence of viral infection causing activation of both maladaptive cytokine pathways,\
and platelet response, and our findings suggest that these immune functions may be related to\
severe forms of Covid-19. In response to systemic and pulmonary viral infections of H1N1\
influenza and dengue, megakaryocytes have been known to respond by overexpressing IFITM3,\
and producing platelets with the same over-expression."

In [19]:
doc = nlp(test)

TypeError: __init__() missing 1 required positional argument: 'nlp'

In [None]:
displacy.render(doc, style='ent')