In [1]:
# Record the installed spaCy version, platform and available pipelines alongside the notebook.
!python -m spacy info

[1m

spaCy version    3.2.3                         
Location         /opt/anaconda3/envs/ontospacy/lib/python3.9/site-packages/spacy
Platform         macOS-10.16-x86_64-i386-64bit 
Python version   3.9.7                         
Pipelines        en_ner_craft_md (0.5.0), en_core_web_sm (3.2.0), en_core_web_lg (3.0.0), en_core_sci_scibert (0.5.0)



In [2]:
# Clear the interactive namespace without asking for confirmation (-f) so later cells start clean.
%reset -f

In [None]:
# One-time setup: uncomment and run the line below once to download the model.
# ! python -m spacy download en_core_web_sm

## Imports

In [3]:
from spacy.language import Language
from spacy.pipeline import entityruler
import pandas as pd
import os
import spacy
from collections import defaultdict
from scispacy.linking import EntityLinker
from spacy.tokens import Doc, Span, Token
from spacy.matcher import PhraseMatcher
from spacy import displacy
from dframcy import DframCy
import pickle

Your CPU supports instructions that this binary was not compiled to use: AVX2
For maximum performance, you can install NMSLIB from sources 
pip install --no-binary :all: nmslib


## Pipeline definition (`OntoRuler`) and setup

In [5]:

class OntoRuler(object):
    """Build a spaCy pipeline that tags general, CRAFT and ontology entities.

    The pipeline starts from the ``en_ner_craft_md`` model. Its ``ner``
    component is renamed to ``craft_ner`` so that the ``ner`` component of
    ``en_core_web_sm`` can be sourced in front of it, and an ``entity_ruler``
    fed with pre-computed patterns is appended after ``craft_ner``.

    Pre-computed terms, ruler patterns and a phrase matcher are read from
    pickle files located in ``input_dir``.
    """

    def __init__(self, input_dir: str = "input"):
        """Load the models and pickled resources and register extensions.

        :param input_dir: Directory holding ``terms.pickle``,
            ``patterns.pickle`` and ``phrase_matcher.pickle``.
        :raises FileNotFoundError: If one of the pickle files is missing.
        """
        self.label = "ontology"
        self.phrase_matcher_attr = "LOWER"
        # Not used inside this class; kept because it is public state.
        self.processing_threshold = 100_000

        self.nlp = spacy.load("en_ner_craft_md")
        self.nlp.rename_pipe("ner", "craft_ner")  # To avoid conflict
        # Source for below: https://spacy.io/usage/processing-pipelines
        self.nlp.add_pipe(
            "ner", source=spacy.load("en_core_web_sm"), before="craft_ner"
        )

        # All pickles are read before the (slow) pattern registration so a
        # missing file fails fast. The phrase matcher comes straight from its
        # pickle; building an empty one first would only be thrown away.
        self.terms = self._load_pickle(input_dir, "terms.pickle")
        self.list_of_pattern_dicts = self._load_pickle(
            input_dir, "patterns.pickle"
        )
        self.phrase_matcher = self._load_pickle(
            input_dir, "phrase_matcher.pickle"
        )

        ruler_name = "entity_ruler"
        ruler = self.nlp.add_pipe(ruler_name, after="craft_ner")
        # add_patterns() pushes every phrase pattern through the pipeline.
        # The ruler is created with its default settings, so only the
        # tokenization of each pattern matters; disabling the other
        # components avoids running tagger/parser/NER over every pattern.
        other_pipes = [
            name for name in self.nlp.pipe_names if name != ruler_name
        ]
        with self.nlp.select_pipes(disable=other_pipes):
            ruler.add_patterns(self.list_of_pattern_dicts)

        # variables for spans and docs extensions
        self.span_term_extension = "is_an_ontology_term"
        self.span_id_extension = "object_id"
        self.has_id_extension = "has_curies"

        Span.set_extension(self.span_term_extension, default=False, force=True)
        Span.set_extension(self.span_id_extension, default=False, force=True)
        Span.set_extension("object_category", default=False, force=True)
        Span.set_extension("object_label", default=False, force=True)
        Span.set_extension("object_match_field", default=False, force=True)
        Span.set_extension("origin", default=False, force=True)
        Span.set_extension("start", default=False, force=True)
        Span.set_extension("end", default=False, force=True)

        # has_curies() iterates a Doc/Span, which yields Token objects, and
        # reads this flag from each of them. It therefore has to exist on
        # Token as well, otherwise evaluating the getter raises.
        Token.set_extension(
            self.span_term_extension, default=False, force=True
        )

        Span.set_extension(
            self.has_id_extension, getter=self.has_curies, force=True
        )

        Doc.set_extension(
            self.has_id_extension, getter=self.has_curies, force=True
        )
        Doc.set_extension(self.label.lower(), default=[], force=True)

#         self.nlp.add_pipe(
#             "scispacy_linker",
#             config={"resolve_abbreviations": True, "linker_name": "umls"},
#         )  # Must be one of 'umls' or 'mesh'.

    @staticmethod
    def _load_pickle(directory, file_name):
        """Return the object stored in ``directory/file_name``."""
        # SECURITY: pickle.load can execute arbitrary code. Only point this
        # at files produced by a trusted run of this project.
        with open(os.path.join(directory, file_name), "rb") as handle:
            return pickle.load(handle)

    # getter function for doc level
    def has_curies(self, tokens):
        """Check whether at least one token is flagged as an ontology term.

        :param tokens: Iterable of objects exposing ``._.get(name)``
            (used as the getter for ``Doc`` and ``Span``).
        :return: True if any token has the ontology-term flag set.
        """
        return any(t._.get(self.span_term_extension) for t in tokens)

    def get_terms_patterns(self, *args):
        """Build the term entry, ruler pattern and parsed doc for one row.

        :param args: ``args[0]`` must be a 5-item sequence
            ``(origin, object_id, matched_term, description, object_category)``.
        :return: Tuple ``(terms_dict, pattern_dict, doc)``. When
            ``matched_term`` is None or NaN both dictionaries are empty and
            ``doc`` is None; otherwise ``doc`` is ``self.nlp(matched_term)``.
        """
        origin, object_id, matched_term, description, object_category = args[0]
        terms_dict = {}
        pattern_dict = {}
        object_match_field = ""

        # A missing description (e.g. NaN) simply means "not a synonym".
        if isinstance(description, str) and "[SYNONYM_OF:" in description:
            object_label = description.split("[SYNONYM_OF:")[-1].rstrip("]")
            object_match_field = "hasRelatedSynonym"
        else:
            object_label = matched_term
            # object_match_field = "isExactMatch"

        # `x != x` is only true for NaN. There is nothing to index or to
        # parse for a missing term, so skip the pipeline call as well.
        if matched_term is None or matched_term != matched_term:
            return terms_dict, pattern_dict, None

        terms_dict[matched_term.lower()] = {
            "object_id": object_id,
            "object_category": object_category,
            "object_label": object_label,
            "object_match_field": object_match_field,
            "origin": origin,
        }
        pattern_dict["id"] = object_id
        # The part of `origin` before the first "." becomes the entity label.
        pattern_dict["label"] = origin.split(".")[0]
        pattern_dict["pattern"] = matched_term

        return terms_dict, pattern_dict, self.nlp(matched_term)

    def get_ont_terms_df(self, terms_path=None) -> pd.DataFrame:
        """Read the ontology term list into a pandas DataFrame.

        :param terms_path: Path of the header-less, tab-separated term list.
            Defaults to ``terms/onto_termlist.tsv``.
        :return: De-duplicated DataFrame without the ``CUI`` column and with
            missing values replaced by empty strings.
        """
        if terms_path is None:
            terms_path = os.path.join('terms', 'onto_termlist.tsv')

        cols = [
            "CUI",
            "origin",
            "CURIE",
            "matched_term",
            "description",
            "object_category",
        ]

        # `names=cols` already labels the columns, so no later renaming is
        # needed. Duplicates are dropped before CUI is removed.
        return (
            pd.read_csv(terms_path, sep='\t', low_memory=False, names=cols)
            .drop_duplicates()
            .drop(["CUI"], axis=1)
            .fillna("")
        )

In [6]:
%%time
# NOTE(review): `__spec__ = None` looks like a workaround for tooling that
# inspects `__main__.__spec__` inside a notebook kernel — confirm it is still needed.
__spec__ = None
# Build the full pipeline; the recorded output of this cell reports ~41 minutes of wall time.
onto_ruler_obj = OntoRuler()



CPU times: user 37min 54s, sys: 2min 37s, total: 40min 32s
Wall time: 41min 15s


In [None]:
# Sample passage used as a quick smoke test of the pipeline.
# The literal is kept exactly as pasted, including its line breaks.
test = """
However, while Dehalococcoides spp. 
have received much attention with respect to genome sequencing two published complete genomes, 
three draft genomes , only limited efforts have been undertaken to elucidate genomes of 
halorespiring bacteria belonging to the low-GC Gram-positive bacteria, which comprise one of 
the largest groups of environmental isolates able to grow on the expense of reductive dehalogenation. 
In fact, until recently no genome sequencing project has yet been initiated towards Dehalobacter spp.,
an intriguing group of isolates that are restricted in their metabolism to halorespiration, similar 
to Dehalococcoides spp. That is why we have now elucidated the genome sequence of Dehalobacter sp. 
strain E1, which grows in syntrophic coculture with a fermenative Sedimentibacter sp Maphosa, , unpublished . 
In line with the above mentioned lack of information with respect to their genomic microdiversity, 
we here propose the genomic sequencing of a number of closely related strains of the 
Gram-positive halorespiring phyla Desulfitobacterium and Dehalobacter, which, while highly related 
phylogenetically, differ significantly in their biodegradative activities and eco-physiological features, 
such as source of isolation, spectrum of electron donors, and alternative electron acceptors. 
In addition, we propose to include a number of populations that did not undergo a lengthy 
in vitro isolation procedure, which can be expected to impose non-natural selection pressures. 
This has become feasible because of recent developments in the areas of whole genome amplication 
of small amounts of genomic DNA, in combination with high throughput cultivation of environmental
microorganisms at the microcolony scale , . The recently obtained Fe III -reducing ‚
Deferrispora aromatica gen. nov. sp. nov.‚Äù Kunapuli, , unpublished results , 
phylogenetically most closely related to Desulfitobacterium and Dehalobacter, 
is rather dedicated to degradation of a wide range of aromatic hydrocarbons under Fe III -reducing conditions, 
and as such the first non-proteobacterial isolate with this feature. 
The availability of these genome sequences will provide pivotal information to 
further our understanding of evolution, speciation and functionality of this environmentally highly
relevant group of bacteria. The here proposed sequencing project is strongly supported by several 
researchers leading in the field of anaerobic degradation of environmental pollutants. 
Scientists agreed to actively participate in sequence annotation, building
on their strong expertise from previous and current genome sequence projects. 
"""
# Run the sample passage through the pipeline built by OntoRuler.
doc = onto_ruler_obj.nlp(test)
# Show the recognised entities with displaCy's entity visualiser.
displacy.render(doc, style='ent')

## Output

In [7]:
input_text_file = os.path.join('input', 'text.txt')

# Read with an explicit encoding so the result does not depend on the
# platform default. Lines are joined with a single space: simply deleting
# the newlines would fuse the last word of a line with the first word of the
# next one whenever a line has no trailing space. Blank lines are skipped.
with open(input_text_file, "r", encoding="utf-8") as txt:
    text = " ".join(line.strip() for line in txt if line.strip())
    

In [None]:
# Reference list of general-purpose entity labels and their meaning.
# NOTE(review): presumably the labels emitted by the `ner` component sourced
# from en_core_web_sm — confirm against the model's documentation.
'''
    PERSON:      People, including fictional.
    NORP:        Nationalities or religious or political groups.
    FAC:         Buildings, airports, highways, bridges, etc.
    ORG:         Companies, agencies, institutions, etc.
    GPE:         Countries, cities, states.
    LOC:         Non-GPE locations, mountain ranges, bodies of water.
    PRODUCT:     Objects, vehicles, foods, etc. (Not services.)
    EVENT:       Named hurricanes, battles, wars, sports events, etc.
    WORK_OF_ART: Titles of books, songs, etc.
    LAW:         Named documents made into laws.
    LANGUAGE:    Any named language.
    DATE:        Absolute or relative dates or periods.
    TIME:        Times smaller than a day.
    PERCENT:     Percentage, including ”%“.
    MONEY:       Monetary values, including unit.
    QUANTITY:    Measurements, as of weight or distance.
    ORDINAL:     “first”, “second”, etc.
    CARDINAL:    Numerals that do not fall under another type.
'''

In [8]:
# Annotate the input text read from disk; this rebinds `doc` from the sample-text cell.
doc = onto_ruler_obj.nlp(text)
# NOTE(review): inside a notebook displacy.render displays the markup itself,
# so `svg` is presumably None here — confirm before using it downstream.
svg = displacy.render(doc, style='ent')


In [9]:
# Wrap the pipeline in DframCy so the annotated doc can be exported as a table.
dframcy = DframCy(onto_ruler_obj.nlp)
# One row per token; the recorded output shows text, offsets, POS/tag/dep, head and entity type columns.
annotation_df = dframcy.to_dataframe(doc)

In [10]:
# Preview the first token rows of the annotation table.
annotation_df.head()

Unnamed: 0,token_text,token_start,token_end,token_pos_,token_tag_,token_dep_,token_head,token_ent_type_
0,ABSTRACT,0,8,NOUN,NN,nsubjpass,obtained,
1,A,9,10,DET,DT,det,isolate,to
2,bacterial,11,20,ADJ,JJ,amod,isolate,TAXON
3,isolate,21,28,NOUN,NN,dobj,ABSTRACT,
4,",",28,29,PUNCT,",",punct,isolate,


In [11]:
# Keep only the tokens that received an entity label and preview the first rows.
annotation_df.loc[annotation_df["token_ent_type_"].ne("")].head()

Unnamed: 0,token_text,token_start,token_end,token_pos_,token_tag_,token_dep_,token_head,token_ent_type_
1,A,9,10,DET,DT,det,isolate,to
2,bacterial,11,20,ADJ,JJ,amod,isolate,TAXON
6,strain,41,47,NOUN,NN,compound,SZ,pato
13,creek,86,91,NOUN,NN,compound,sediment,envo
14,sediment,92,100,NOUN,NN,compound,microcosms,envo
