In [1]:
%reset -f

In [2]:
# One time run.
# ! python -m spacy download en_core_web_sm

## Imports

In [3]:
#from pronto import Ontology
import spacy
from spacy.tokens import Doc, Span, Token
from spacy.matcher import PhraseMatcher
from spacy.util import filter_spans
from spacy.language import Language
from spacy.lang.en import English
from spacy import displacy
import os
import pandas as pd
import json

In [4]:
cols = ['CUI', 'source', 'CURIE', 'name', 'description', 'category']
envoTerms = pd.read_csv(os.path.join('terms', 'envo_termlist.tsv'), sep='\t', low_memory=False, names=cols)
envoTerms = envoTerms.drop(['CUI', 'source','description'], axis=1)
envoTerms.head()

Unnamed: 0,CURIE,name,category
0,CHEBI:25213,metal cation,biolink:ChemicalSubstance
1,CHEBI:25212,metabolite,biolink:ChemicalSubstance
2,CHEBI:25216,metalloporphyrin,biolink:ChemicalSubstance
3,UBERON:0011595,jaw region,biolink:AnatomicalEntity
4,CHEBI:76807,EC 3.5.1.* (non-peptide linear amide C-N hydro...,biolink:ChemicalSubstance


In [5]:
%%time
nlp = spacy.load("en_core_web_sm")
# Set terms and patterns
terms = {}
patterns = []
for curie, name, category in envoTerms.to_records(index=False):
    if name is not None:
        terms[name.lower()] = {'id': curie, 'category': category}
        patterns.append(nlp(name))

CPU times: user 34.1 s, sys: 106 ms, total: 34.3 s
Wall time: 34.3 s


In [6]:
@Language.component('envo_extractor')
def envo_extractor(doc):
    
    matches = matcher(doc)
    
    spans = [Span(doc, start, end, label = 'ENVO') for matchId, start, end in matches]
    
    doc.ents = spans
    
    for i, span in enumerate(spans):
        span._.set("has_envo_ids", True)
        for token in span:
            token._.set("is_envo_term", True)
            token._.set("envo_id", terms[span.text.lower()]["id"])
            token._.set("category", terms[span.text.lower()]["category"])

    '''with doc.retokenize() as retokenizer:
        for span in filter_spans(spans):
            retokenizer.merge(span, attrs={"_": {"merged_concept": True}})
            doc._.envo_ids = list(doc._.envo_ids) + [span]'''
    
    return doc

# Setter function for doc level
def has_envo_ids(self, tokens):
    return any([t._.get("is_envo_term") for t in tokens])

def resolve_substrings(matcher, doc, i, matches):
    # Get the current match and create tuple of entity label, start and end.
    # Append entity to the doc's entity. (Don't overwrite doc.ents!)
    match_id, start, end = matches[i]
    entity = Span(doc, start, end, label="DUPLICATE")
    doc.ents += (entity,)
    print(entity.text)
    

In [7]:
%%time
nlp = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp.vocab)
matcher.add("ENVO", None, *patterns, on_match=resolve_substrings)
nlp.add_pipe('envo_extractor', after='ner')
nlp.pipeline

CPU times: user 459 ms, sys: 16.3 ms, total: 476 ms
Wall time: 475 ms


[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7fa3310e7b80>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7fa3310fac20>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7fa3310b5040>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7fa3310b57c0>),
 ('envo_extractor', <function __main__.envo_extractor(doc)>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7fa2f2668900>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7fa2f26a1b40>)]

In [8]:
# Set extensions to tokens, spans and docs
Token.set_extension('is_envo_term', default=False, force=True)
Token.set_extension("envo_id", default=False, force=True)
#Token.set_extension("merged_concept", default=False, force=True)
Token.set_extension("category", default=False, force=True)
Doc.set_extension("has_envo_ids", getter=has_envo_ids, force=True)
Doc.set_extension("envo_ids", default=[], force=True)
Span.set_extension("has_envo_ids", getter=has_envo_ids, force=True)

In [9]:
test = """Treponema denticola ATCC 35405. Treponema denticola ATCC 35405 was isolated from a human periodontal
pocket and is the type strain of this organism.Treponema pallidum subsp. pallidum. This organism is the 
causative agent of endemic and venereal syphilis. This sexual transmitted disease was first discovered in 
Europe at the end of the fifteenth century, however, the causative agent was not identified until 1905. 
At one time syphilis was the third most commonly reported communicable disease in the USA. Syphilis is 
characterized by multiple clinical stages and long periods of latent, asymptomatic infection. Although 
effective therapies have been available since the introduction of penicillin, syphilis remains a global 
health problem. Contains many homopolymeric tracts in the genome which result in hypervariable sequences
in many genes having to do with cells. These result in hypervarible antigenic 
diversity and immune evasion. Overall, the genome contains very few repetitive DNA sequences and very few 
coding regions arranged in operons. Pathogenicity factors include cytolethal-distending toxin, hemolysin-like 
toxins, and a type II secretion system. Francisella tularensis subsp. tularensis strain Schu S4. 
This subspecies is virulent in humans, and the strain is a clinical isolate that is also virulent 
in an animal model. Originally isolated from a human case of tularemia in 1951. Genome analysis indicates 
the presence of a large number of pseudogenes, many that disrupt important metabolic pathways, which is the 
reason for the exacting nutritional requirements of this organism. There is a duplicated pathogenicity island 
that has also been detected in strain LVS and mutations in the iglA, iglC, or pdpD genes affect the ability 
of the pathogen to survive in macrophages. The genome also contains genes for type IV pili production and 
capsular polysaccharide genes similar to those of Bacillus anthracis as well as numerous ABC transporters. 
There are a large number of insertion sequences including a mariner element, which is a transposon typically 
found in eukaryotes and is the first instance of this element to be found in a microbe, which may have 
acquired it during transit through one of the insect vectors. Pseudomonas fluorescens strain Pf0-1. 
This strain was isolated from agricultural loam (sand, clay, and organic matter) soil in 1988 by 
Compeau et al. and is well adapted to soil environments. Ralstonia solanacearum strain GMI1000. 
This strain is a race 1 strain isolated from the tomato plant and is also pathogenic on Arabidopsis thaliana. 
It specifically invades the plant xylem. 
"""
doc = nlp(test)
displacy.render(doc, style='ent')

### old code

In [10]:
'''class ENVOExtractor(object):
    
    name = 'envo_extractor'
    
    def __init__(self, doc, label="ENVO"):
        self.label = label
        
        print('Loading ENVO')
        envo = Ontology("http://purl.obolibrary.org/obo/envo.owl")
        
        self.terms = {}
        patterns = []
        ner_terms = len(envo)
        print("Importing terms")
        
        # iterate over terms in ontology
        for idx, term in envo.terms.items():
            # if term has a name
            if term.name is not None:
                self.terms[term.name.lower()] = {'id': term.id}
                patterns.append(nlp(term.name))
        
        # Initialize matcher & add patterns
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add(label, None, *patterns)
        
        # Set extensions to tokens, spans and docs
        Token.set_extension('is_envo_term', default=False, force=True)
        Token.set_extension("envo_id", default=False, force=True)
        Token.set_extension("merged_concept", default=False, force=True)
        Doc.set_extension("has_envo_ids", getter=self.has_envo_ids, force=True)
        Doc.set_extension("envo_ids", default=[], force=True)
        Span.set_extension("has_envo_ids", getter=self.has_envo_ids, force=True)
        
    def __call__(self, doc):
        matches = self.matcher(doc)
        spans = [Span(doc, start, end, label=self.label) for matchId, start, end in matches]
        doc.ents = spans
        
        for i, span in enumerate(spans):
          span._.set("has_envo_ids", True)
          for token in span:
               token._.set("is_envo_term", True)
               token._.set("envo_id", self.terms[span.text.lower()]["id"])

        with doc.retokenize() as retokenizer:
            for span in filter_spans(spans):
                retokenizer.merge(span, attrs={"_": {"merged_concept": True}})
                doc._.envo_ids = list(doc._.envo_ids) + [span]

        return doc
        
    # Setter function for doc level
    def has_envo_ids(self, tokens):
        return any([t._.get("is_envo_term") for t in tokens])
        '''

'class ENVOExtractor(object):\n    \n    name = \'envo_extractor\'\n    \n    def __init__(self, doc, label="ENVO"):\n        self.label = label\n        \n        print(\'Loading ENVO\')\n        envo = Ontology("http://purl.obolibrary.org/obo/envo.owl")\n        \n        self.terms = {}\n        patterns = []\n        ner_terms = len(envo)\n        print("Importing terms")\n        \n        # iterate over terms in ontology\n        for idx, term in envo.terms.items():\n            # if term has a name\n            if term.name is not None:\n                self.terms[term.name.lower()] = {\'id\': term.id}\n                patterns.append(nlp(term.name))\n        \n        # Initialize matcher & add patterns\n        self.matcher = PhraseMatcher(nlp.vocab)\n        self.matcher.add(label, None, *patterns)\n        \n        # Set extensions to tokens, spans and docs\n        Token.set_extension(\'is_envo_term\', default=False, force=True)\n        Token.set_extension("envo_id", defa

## Gotcha!

In [11]:
##########
# IMPORTANT GOTCHAS!
# While creating termlists make sure terms that are the same after lemmatization are assigned the same ID.
# This dictates to undo the'_SYNONYM' added for synonyms
##########
#print(envoTerms[envoTerms['name'].str.contains('surface structure')])
test1 = """surface structures."""
doc = nlp(test1)


ValueError: [E1010] Unable to set entity information for token 0 which is included in more than one span in entities, blocked, missing or outside.

In [None]:
print(envoTerms[envoTerms['name'].str.contains('tissue culture')])
test2 = """tissue culture"""
doc = nlp(test2)


In [None]:
!python -m spacy info