In [1]:
# Record the spaCy version, platform and installed pipelines this notebook ran with.
!python -m spacy info

[1m

spaCy version    3.1.3                         
Location         /opt/anaconda3/envs/ontoSpacy/lib/python3.9/site-packages/spacy
Platform         macOS-10.16-x86_64-i386-64bit 
Python version   3.9.2                         
Pipelines        en_core_web_sm (3.1.0)        



In [2]:
# Clear the interactive namespace so the cells below start from a clean state.
%reset -f

In [3]:
# One-time setup: uncomment and run the next line once to download the model.
# ! python -m spacy download en_core_web_sm

## Imports

In [3]:
#from pronto import Ontology
import spacy
from spacy.tokens import Doc, Span, Token
from spacy.matcher import PhraseMatcher
from spacy.util import filter_spans
from spacy.language import Language
from spacy.lang.en import English
from spacy import displacy
import os
import pandas as pd
import json

from dframcy import DframCy

In [4]:
# Column names for the term-list TSV (supplied explicitly via `names=`).
cols = ['CUI', 'source', 'CURIE', 'name', 'description', 'category']
envoTerms = pd.read_csv(os.path.join('terms', 'envo_termlist.tsv'), sep='\t', low_memory=False, names=cols)
# Only CURIE, name and category are used by the cells below.
envoTerms = envoTerms.drop(['CUI', 'source', 'description'], axis=1)
# Remove the literal '_SYNONYM' suffix from synonym rows.
# NOTE: str.strip('_SYNONYM') treats its argument as a *set of characters* and
# trims them from both ends, which would also eat leading/trailing
# '_', 'S', 'Y', 'N', 'O', 'M' characters of legitimate CURIEs.
envoTerms['CURIE'] = envoTerms['CURIE'].str.replace(r'_SYNONYM$', '', regex=True)
# A synonym row and its primary row can now be identical; keep one of each.
envoTerms = envoTerms.drop_duplicates()
print(len(envoTerms))
envoTerms.head()

11581


Unnamed: 0,CURIE,name,category
0,CHEBI:25213,metal cation,biolink:ChemicalSubstance
1,CHEBI:25212,metabolite,biolink:ChemicalSubstance
2,CHEBI:25216,metalloporphyrin,biolink:ChemicalSubstance
3,UBERON:0011595,jaw region,biolink:AnatomicalEntity
4,CHEBI:76807,EC 3.5.1.* (non-peptide linear amide C-N hydro...,biolink:ChemicalSubstance


In [5]:
%%time
# A blank English pipeline is enough to tokenize the term names into patterns.
nlp = English() # spacy.load("en_core_web_sm")
# terms:    lower-cased term name -> {'id': CURIE, 'category': biolink category}
# patterns: one tokenized Doc per term name, fed to the PhraseMatcher later on
terms = {}
patterns = []
for curie, name, category in envoTerms.to_records(index=False):
    # Missing names arrive from pandas as float NaN (not None), so test for an
    # actual string before calling .lower() / tokenizing.
    if isinstance(name, str):
        terms[name.lower()] = {'id': curie, 'category': category}
        patterns.append(nlp(name))

CPU times: user 663 ms, sys: 21.4 ms, total: 685 ms
Wall time: 684 ms


### old code

In [7]:
# Earlier class-based version of the extractor. It is wrapped in a string
# literal so that it is never executed; the function-based `envo_extractor`
# component defined further down is what the pipeline actually uses.
'''class ENVOExtractor(object):
    
    name = 'envo_extractor'
    
    def __init__(self, doc, label="ENVO"):
        self.label = label
        
        print('Loading ENVO')
        envo = Ontology("http://purl.obolibrary.org/obo/envo.owl")
        
        self.terms = {}
        patterns = []
        ner_terms = len(envo)
        print("Importing terms")
        
        # iterate over terms in ontology
        for idx, term in envo.terms.items():
            # if term has a name
            if term.name is not None:
                self.terms[term.name.lower()] = {'id': term.id}
                patterns.append(nlp(term.name))
        
        # Initialize matcher & add patterns
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add(label, None, *patterns)
        
        # Set extensions to tokens, spans and docs
        Token.set_extension('is_envo_term', default=False, force=True)
        Token.set_extension("envo_id", default=False, force=True)
        Token.set_extension("merged_concept", default=False, force=True)
        Doc.set_extension("has_envo_ids", getter=self.has_envo_ids, force=True)
        Doc.set_extension("envo_ids", default=[], force=True)
        Span.set_extension("has_envo_ids", getter=self.has_envo_ids, force=True)
        
    def __call__(self, doc):
        matches = self.matcher(doc)
        spans = [Span(doc, start, end, label=self.label) for matchId, start, end in matches]
        doc.ents = spans
        
        for i, span in enumerate(spans):
          span._.set("has_envo_ids", True)
          for token in span:
               token._.set("is_envo_term", True)
               token._.set("envo_id", self.terms[span.text.lower()]["id"])

        with doc.retokenize() as retokenizer:
            for span in filter_spans(spans):
                retokenizer.merge(span, attrs={"_": {"merged_concept": True}})
                doc._.envo_ids = list(doc._.envo_ids) + [span]

        return doc
        
    # Setter function for doc level
    def has_envo_ids(self, tokens):
        return any([t._.get("is_envo_term") for t in tokens])
        '''

'class ENVOExtractor(object):\n    \n    name = \'envo_extractor\'\n    \n    def __init__(self, doc, label="ENVO"):\n        self.label = label\n        \n        print(\'Loading ENVO\')\n        envo = Ontology("http://purl.obolibrary.org/obo/envo.owl")\n        \n        self.terms = {}\n        patterns = []\n        ner_terms = len(envo)\n        print("Importing terms")\n        \n        # iterate over terms in ontology\n        for idx, term in envo.terms.items():\n            # if term has a name\n            if term.name is not None:\n                self.terms[term.name.lower()] = {\'id\': term.id}\n                patterns.append(nlp(term.name))\n        \n        # Initialize matcher & add patterns\n        self.matcher = PhraseMatcher(nlp.vocab)\n        self.matcher.add(label, None, *patterns)\n        \n        # Set extensions to tokens, spans and docs\n        Token.set_extension(\'is_envo_term\', default=False, force=True)\n        Token.set_extension("envo_id", defa

### Functions

In [6]:
@Language.component('envo_extractor')
def envo_extractor(doc):
    """Annotate ontology term matches on ``doc`` and merge them into ``doc.ents``.

    Runs the module-level ``matcher`` over ``doc`` and then:

    * stores every match as an ``ENVO``-labelled span in ``doc.spans['ENVO']``
      (overlapping matches are kept there);
    * sets the ``is_envo_term``, ``envo_id`` and ``category`` token extensions
      on each token of a match, using the module-level ``terms`` dict keyed by
      the lower-cased span text;
    * adds the matches to ``doc.ents`` next to the entities that are already
      there.

    Args:
        doc: The ``Doc`` handed over by the previous pipeline component.

    Returns:
        The same ``Doc``, annotated in place.

    Raises:
        KeyError: If the lower-cased text of a matched span is not a key of
            ``terms``.
    """
    spans = [Span(doc, start, end, label='ENVO') for _match_id, start, end in matcher(doc)]
    doc.spans['ENVO'] = spans

    for span in spans:
        span._.set("has_envo_ids", True)
        # The lookup depends only on the span, so do it once per span rather
        # than once per token.
        term_info = terms[span.text.lower()]
        for token in span:
            token._.set("is_envo_term", True)
            token._.set("envo_id", term_info["id"])
            token._.set("category", term_info["category"])

    # doc.ents must not contain overlapping spans; assigning ENVO matches that
    # overlap an existing entity (or each other) raises a ValueError. Filter
    # the combined list instead: filter_spans keeps the longest span of every
    # overlapping group, and the first one listed on ties, so an ENVO match
    # wins over an existing entity covering exactly the same tokens.
    doc.ents = filter_spans(spans + list(doc.ents))

    return doc

# Getter function for the Doc/Span-level ``has_envo_ids`` extension
def has_envo_ids(self, tokens=None):
    """Return True if any token carries the ``is_envo_term`` flag.

    The function is registered as an extension *getter*, which is invoked with
    a single argument (the ``Doc`` or ``Span`` itself). The original two
    argument form ``has_envo_ids(obj, tokens)`` is still accepted.

    Args:
        self: The ``Doc``/``Span`` being queried when called as a getter;
            ignored when ``tokens`` is given.
        tokens: Optional iterable of tokens to inspect instead of ``self``.

    Returns:
        bool: True when at least one inspected token has ``is_envo_term`` set
        to a truthy value, False otherwise (including for an empty iterable).
    """
    if tokens is None:
        # Getter-style call: the container itself is the token sequence.
        tokens = self
    return any(t._.get("is_envo_term") for t in tokens)

def resolve_substrings(matcher, doc, i, matches):
    """Matcher ``on_match`` callback that records match ``i`` as a ``DUPLICATE`` entity.

    The entity is merged into ``doc.ents`` without discarding the entities
    that are already set. Because ``doc.ents`` cannot hold overlapping spans,
    the combined list is passed through ``filter_spans`` first: of any group
    of overlapping spans only the longest survives (the earlier one on ties),
    so a match that is merely a substring of a longer entity is dropped.

    Args:
        matcher: The matcher that produced the match (unused).
        doc: The ``Doc`` being matched; its ``ents`` are updated in place.
        i: Index of the current match within ``matches``.
        matches: List of ``(match_id, start, end)`` tuples for ``doc``.
    """
    _match_id, start, end = matches[i]
    entity = Span(doc, start, end, label="DUPLICATE")
    # Extend rather than overwrite doc.ents, resolving overlaps on the way.
    doc.ents = filter_spans([*doc.ents, entity])
    

In [7]:
%%time
# Full pretrained pipeline; the ontology matcher shares its vocab.
nlp = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp.vocab)
# spaCy v3 signature: add(key, docs, *, on_match=None).
# NOTE(review): the previous call used the legacy form
# add("ENVO", None, *patterns, on_match=resolve_substrings); in spaCy 3.x the
# positional None appears to be taken as the callback, so resolve_substrings
# was never actually registered - confirm against the PhraseMatcher.add docs.
# The callback is deliberately left unregistered here: overlapping matches are
# already reduced with filter_spans inside envo_extractor.
matcher.add("ENVO", patterns)
# Run the ontology matcher right after the statistical NER component.
nlp.add_pipe('envo_extractor', after='ner')
nlp.pipeline

CPU times: user 379 ms, sys: 28.9 ms, total: 408 ms
Wall time: 419 ms


[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7fe26927cf40>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7fe238409e50>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7fe288947d60>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7fe269236e00>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7fe2692c6880>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7fe288947b80>),
 ('envo_extractor', <function __main__.envo_extractor(doc)>)]

In [8]:
# Register the custom attributes used by envo_extractor.
# force=True lets this cell be re-run without "already registered" errors.

# Token-level annotations, all defaulting to False until a match sets them.
# ("merged_concept" from the old implementation is intentionally not registered.)
for _token_attr in ('is_envo_term', 'envo_id', 'category'):
    Token.set_extension(_token_attr, default=False, force=True)

# Doc- and Span-level flag computed on access from the token annotations.
for _container in (Doc, Span):
    _container.set_extension("has_envo_ids", getter=has_envo_ids, force=True)

# Doc-level list of ENVO ids.
Doc.set_extension("envo_ids", default=[], force=True)

In [15]:
# Sample passage used to exercise the whole pipeline end to end.
test = """
However, while Dehalococcoides spp. 
have received much attention with respect to genome sequencing two published complete genomes, 
three draft genomes , only limited efforts have been undertaken to elucidate genomes of 
halorespiring bacteria belonging to the low-GC Gram-positive bacteria, which comprise one of 
the largest groups of environmental isolates able to grow on the expense of reductive dehalogenation. 
In fact, until recently no genome sequencing project has yet been initiated towards Dehalobacter spp.,
an intriguing group of isolates that are restricted in their metabolism to halorespiration, similar 
to Dehalococcoides spp. That is why we have now elucidated the genome sequence of Dehalobacter sp. 
strain E1, which grows in syntrophic coculture with a fermenative Sedimentibacter sp Maphosa, , unpublished . 
In line with the above mentioned lack of information with respect to their genomic microdiversity, 
we here propose the genomic sequencing of a number of closely related strains of the 
Gram-positive halorespiring phyla Desulfitobacterium and Dehalobacter, which, while highly related 
phylogenetically, differ significantly in their biodegradative activities and eco-physiological features, 
such as source of isolation, spectrum of electron donors, and alternative electron acceptors. 
In addition, we propose to include a number of populations that did not undergo a lengthy 
in vitro isolation procedure, which can be expected to impose non-natural selection pressures. 
This has become feasible because of recent developments in the areas of whole genome amplication 
of small amounts of genomic DNA, in combination with high throughput cultivation of environmental
microorganisms at the microcolony scale , . The recently obtained Fe III -reducing ‚
Deferrispora aromatica gen. nov. sp. nov.‚Äù Kunapuli, , unpublished results , 
phylogenetically most closely related to Desulfitobacterium and Dehalobacter, 
is rather dedicated to degradation of a wide range of aromatic hydrocarbons under Fe III -reducing conditions, 
and as such the first non-proteobacterial isolate with this feature. 
The availability of these genome sequences will provide pivotal information to 
further our understanding of evolution, speciation and functionality of this environmentally highly
relevant group of bacteria. The here proposed sequencing project is strongly supported by several 
researchers leading in the field of anaerobic degradation of environmental pollutants. 
Scientists agreed to actively participate in sequence annotation, building
on their strong expertise from previous and current genome sequence projects. 
"""
# Process the passage with the pipeline (which includes envo_extractor) and
# render the resulting entities inline.
doc = nlp(test)
displacy.render(doc, style='ent')

In [38]:
# Flatten the processed doc into one row per token ...
dframcy = DframCy(nlp)
annotation_df = dframcy.to_dataframe(doc)
# ... and show only the tokens whose entity type is ENVO.
annotation_df.loc[annotation_df['token_ent_type_'].eq('ENVO')]

Unnamed: 0,token_text,token_start,token_end,token_pos_,token_tag_,token_dep_,token_head,token_ent_type_
13,human,83,88,ADJ,JJ,amod,pocket,ENVO
24,organism,139,147,NOUN,NN,pobj,of,ENVO
33,organism,189,197,NOUN,NN,nsubj,is,ENVO
77,time,425,429,NOUN,NN,pobj,At,ENVO
100,long,568,572,ADJ,JJ,amod,periods,ENVO
166,contains,965,973,VERB,VBZ,ROOT,contains,ENVO
188,toxin,1110,1115,NOUN,NN,dobj,include,ENVO
200,secretion,1155,1164,NOUN,NN,compound,system,ENVO
201,system,1165,1171,NOUN,NN,conj,toxins,ENVO
218,humans,1262,1268,NOUN,NNS,pobj,in,ENVO


## Gotcha!

In [12]:
##########
# IMPORTANT GOTCHA!
# When creating term lists, make sure that terms which are the same after
# lemmatization are assigned the same ID.
# This is why the '_SYNONYM' suffix added to synonym CURIEs has to be undone.
##########
# na=False: rows with a missing name count as "no match" instead of producing
# NaN in the mask (which boolean indexing rejects); regex=False: literal search.
print(envoTerms[envoTerms['name'].str.contains('surface structure', na=False, regex=False)])
test1 = """surface structures."""
doc = nlp(test1)


              CURIE                name                  category
747  UBERON:0003102  surface structures  biolink:AnatomicalEntity
750  UBERON:0003102   surface structure  biolink:AnatomicalEntity


In [13]:
# Show every term whose name contains the phrase, then process the bare phrase.
# na=False: rows with a missing name count as "no match" instead of producing
# NaN in the mask (which boolean indexing rejects); regex=False: literal search.
print(envoTerms[envoTerms['name'].str.contains('tissue culture', na=False, regex=False)])
test2 = """tissue culture"""
doc = nlp(test2)


               CURIE                  name               category
10248  ENVO:02000009  plant tissue culture  biolink:OntologyClass
10271  ENVO:02000007        tissue culture  biolink:OntologyClass


In [14]:
# Render the entities of the most recently processed doc.
# (A bare `doc` line before the last expression displays nothing, so it was dropped.)
displacy.render(doc, style='ent')