## NER and RE using LLM and RAG

### Set the envirnment variables

In [1]:
import os
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
os.environ['LANGCHAIN_API_KEY'] = open('langchain-api-key.txt', 'r').read()
os.environ['OPENAI_API_KEY'] = open('openai-api-key-pitt.txt', 'r').read()

### Imports

In [2]:
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
import re
import urllib
from time import sleep
import requests
from xml.etree import ElementTree
from tqdm import tqdm
import pandas as pd
import tiktoken 
import openai
from datetime import datetime
from pronto import Ontology
from tqdm import tqdm
import json
import pickle
gpt3 = "gpt-3.5-turbo-0125"
gpt4 = "gpt-4-0125-preview"

### Process GO File
Read the go-basic.obo file and convert to a dict for indexin with Chroma

In [3]:
def parse_go_file(file_path):
    # Load the ontology from a file
    ont = Ontology(file_path)

    # Initialize a dictionary to hold the GO terms
    go_dict = {}

    # Iterate through each term in the ontology
    for term in ont.terms():
        if term.namespace == "biological_process":
            go_dict[term.id] = {
                'id': term.id,
                'name': term.name,
                'namespace': term.namespace,
                'definition': term.definition,
                'is_a': ',\n'.join([parent.name for parent in term.superclasses()]),
                'synonyms': ',\n'.join([syn.description for syn in term.synonyms])
            }
            
    return go_dict

# Path to your GO file
go_file_path = './Ontology/go-basic.obo'

# Parse the GO file and convert to a dictionary
go_dict = parse_go_file(go_file_path)

print(len(go_dict))

# Example: print details of a specific GO term
go_id = 'GO:0006915'  # Example GO ID
if go_id in go_dict:
    print(f"Details for {go_id}:")
    for key, value in go_dict[go_id].items():
        print(f"{key}: {value}")
else:
    print(f"{go_id} not found in the GO dictionary.")

30655
Details for GO:0006915:
id: GO:0006915
name: apoptotic process
namespace: biological_process
definition: A programmed cell death process which begins when a cell receives an internal (e.g. DNA damage) or external signal (e.g. an extracellular death ligand), and proceeds through a series of biochemical events (signaling pathway phase) which trigger an execution phase. The execution phase is the last step of an apoptotic process, and is typically characterized by rounding-up of the cell, retraction of pseudopodes, reduction of cellular volume (pyknosis), chromatin condensation, nuclear fragmentation (karyorrhexis), plasma membrane blebbing and fragmentation of the cell into apoptotic bodies. When the execution phase is completed, the cell has died.
is_a: apoptotic process,
programmed cell death,
cell death,
cellular process,
biological_process
synonyms: apoptosis,
apoptotic programmed cell death,
apoptotic program,
apoptosis activator activity,
apoptosis signaling,
caspase-dependen

### Index the GO Biological Processes with Chroma 
For implementating RAG, we need to index the go_dict object with Chroma. This will help LLM for the annotation of Biological Processes.
We'll save the vector database locally and load from the local store for later.

In [4]:
%%time

ids = list(go_dict.keys())
docs = [v.get('name', '') for v in go_dict.values()]
metadatas = [v for v in go_dict.values()]

# Embed
vectorstore = Chroma.from_texts(ids=ids,
                                texts=docs, 
                                metadatas=metadatas,
                                collection_name='GO_BP',
                                embedding=OpenAIEmbeddings())


CPU times: user 2min 47s, sys: 7.02 s, total: 2min 54s
Wall time: 2min 22s


### Index the GO Biological Processes with BM25 


In [5]:
%%time
ids = list(go_dict.keys())
docs = [v.get('name', '') for v in go_dict.values()]
metadatas = [v for v in go_dict.values()]

bm25_retriever = BM25Retriever.from_texts(texts = docs, metadatas = metadatas)
bm25_retriever.k = 20

CPU times: user 749 ms, sys: 72 ms, total: 821 ms
Wall time: 814 ms


In [7]:
# define the prompt templates for NER
ner_template = """
Following is the list of candidate GO biological process concepts:\n
{context}\n
Your job is to parse the following title and identify all instances of the same, or equivalent, or similar biological process concepts given the above list of candidate concepts.\n
Mark up the concepts (if any) in double square brackets preserving the original text of the title inside the brackets like [[original text]].\n
If no biological process is explicitly mentioned in the supplied text, return the supplied text unchanged.\n\n
Example:
text: "Interference with KCNJ2 inhibits proliferation, migration and EMT progression of papillary thyroid carcinoma cells by upregulating GNG2 expression."
output: "Interference with KCNJ2 inhibits [[proliferation]], [[migration]] and EMT progression of papillary thyroid carcinoma cells by upregulating GNG2 expression."
\n\n
title:{text}
output: 
"""

ner_prompt = ChatPromptTemplate.from_template(ner_template)
print(ner_prompt)

# LLM
llm = ChatOpenAI(model_name=gpt4, temperature=0)

# initialize the ector_retriver
vector_retriever = vectorstore.as_retriever(search_kwargs={"k": 50})
# initialize the ensemble retriever
ensemble_retriever = EnsembleRetriever(
    retrievers=[vector_retriever, bm25_retriever], weights=[1.0, 1.0]
)
# Post-processing
def format_docs(docs):
    print("\n".join(doc.page_content for doc in docs))
    return "\n".join(doc.page_content for doc in docs)
    #return "fibroblast proliferation, angiogenesis, wound healing, cell proliferation"

ner_rag_chain = (
    {"context": ensemble_retriever | format_docs,\
     "text": RunnablePassthrough()}
    | ner_prompt
    | llm
    | StrOutputParser()
)

input_variables=['context', 'text'] messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'text'], template='\nFollowing is the list of candidate GO biological process concepts:\n\n{context}\n\nYour job is to parse the following title and identify all instances of the same, or equivalent, or similar biological process concepts given the above list of candidate concepts.\n\nMark up the concepts (if any) in double square brackets preserving the original text of the title inside the brackets like [[original text]].\n\nIf no biological process is explicitly mentioned in the supplied text, return the supplied text unchanged.\n\n\nExample:\ntext: "Interference with KCNJ2 inhibits proliferation, migration and EMT progression of papillary thyroid carcinoma cells by upregulating GNG2 expression."\noutput: "Interference with KCNJ2 inhibits [[proliferation]], [[migration]] and EMT progression of papillary thyroid carcinoma cells by upregulating GNG2 expression."\n

## NER using GPT-4 and RAG

In [178]:
import json

gpt_annots = {}

gs_annots = json.load(open('gs_annots.json', 'r'))

for item in tqdm(gs_annots['documents']):
    pmid = item['pmid']
    title = item['title']

    response = ner_rag_chain.invoke(title)
    #matches = re.findall(annot_pattern, response)

    #if len(matches) > 0:
    gpt_annots[pmid] = response

       
df = pd.DataFrame.from_dict(gpt_annots, orient='index')
df.to_csv('gpt4_annots.csv')

100%|██████████| 104/104 [01:48<00:00,  1.04s/it]


### GPT annotations and concept grounding

In [237]:
import json
import pandas as pd
import re
with open('gs_annots_6.json', 'r') as f:
    gs_annots = json.load(f)
gpt_annots = pd.read_csv('gpt4_annots.csv')

gpt_annots_extended = {'documents':[]}
annot_pattern = r"\[\[([^\]]+)\]\]"
for item in tqdm(gs_annots['documents']):
    pmid = item['pmid']
    title = item['title']
    annotations = []
    for a_item in item['annotations']:
        if a_item['type'] != 'Biological Process':
            annotations.append(a_item)
    annot_sent = gpt_annots[gpt_annots['pmid'] == pmid]['annotation'].values[0]
    assert title == annot_sent.replace('[[', '').replace(']]', '')
    i = 0
    for match in re.finditer(annot_pattern, annot_sent):
        start, end = match.span()
        start = start - i
        end = end - (i + 4)
        annotations.append({'entity':title[start:end], 'type': 'Biological Process', 'kb_concept_name':'',\
                            'kb_id': '', 'start': start, 'end': end})
        #print(annotations)
        i = i + 4
    gpt_annots_extended['documents'].append({'pmid':pmid, 'title':title, 'annotations':annotations})

    
with open('gpt4_annots_2.json', 'w') as f:
    json.dump(gpt_annots_extended, f, indent = 4)

100%|██████████| 104/104 [00:00<00:00, 2818.19it/s]


### Concept Grounding

In [244]:
from operator import itemgetter

ground_prompt_template = """
Your role is to assign a concept ID that best matches the supplied text, using the supplied list of candidate concepts.
Return as a string "CONCEPT_NAME|CONCEPT_ID|Score" triple where the score represents the confidence 
of the assignment and should be between 0 and 1.
Only use concept IDs from the supplied list of candidate concepts.
Only return a row if the concept ID is a match for the input text.
If there is no match, return NOT FOUND.\n
Here are the candidate concepts, as CONCEPT_NAME|CONCEPT_ID pairs:
{candidates}
The overall context for this is the sentence: {context}
Concept to ground: {text}
"""

ground_prompt = ChatPromptTemplate.from_template(ground_prompt_template)
#print(prompt)
# Post-processing
def format_docs(docs):
    #print("\n".join(doc.metadata['name']+'//'+doc.metadata['id'] for doc in docs))
    return ("\n".join(doc.metadata['name']+'|'+doc.metadata['id'] for doc in docs))
    #return "fibroblast proliferation, angiogenesis, wound healing, cell proliferation"

rag_chain = (
    {"candidates": itemgetter("text") | vector_retriever | format_docs,\
     "context": itemgetter("context"), \
     "text": itemgetter("text")}
    | ground_prompt
    | llm
    | StrOutputParser()
)

response = rag_chain.invoke({'context':"Nicotinamide inhibits corneal endothelial mesenchymal transition and accelerates wound healing.",\
                             'text': 'corneal endothelial mesenchymal transition'})
#print(response)
# print({'kb_concept_name':response.split('|')[0], 'kb_id':response.split('|')[1], 
#        'confidence_score':response.split('|')[2]})

with open('gpt4_annots_2.json', 'r') as f:
    gpt_annots = json.load(f)
gpt_annots_new = {'documents':[]}
for item in tqdm(gpt_annots['documents']):
    new_item = {'pmid':item['pmid'], 'title':item['title'], 'annotations':[]}
    for ann in item['annotations']:
        if ann['type'] == 'Biological Process':
            text = ann['entity']
            response = rag_chain.invoke({'context': item['title'], 'text': text})
            if "NOT FOUND" not in response:
                ann['kb_concept_name'] = response.split('|')[0]
                ann['kb_id'] = response.split('|')[1]
        new_item['annotations'].append(ann)
    gpt_annots_new['documents'].append(item)
    #break
with open('gpt4_annots_3.json', 'w') as f:
    json.dump(gpt_annots_new, f, indent = 4)

100%|██████████| 104/104 [03:09<00:00,  1.82s/it]


### Relation Extraction

In [250]:
from operator import itemgetter

re_prompt_template = """{context}
Based on the above statement only, your job is to label the following sentences as true, false, or unknown if there's a strong evidence for the regulatory relationship between the two concepts in the above statement.
and provide a chain-of-thought (CoT) supporting the answer. 
The answers should be in the following format.
1. label//CoT
2. label//CoT
3. label//Cot
..... etc.
{candidates}
"""

re_prompt = ChatPromptTemplate.from_template(re_prompt_template)
#print(prompt)

# retriever
vector_retriever = vectorstore.as_retriever(search_kwargs={"k": 50})
# Post-processing
def format_docs(docs):
    #print("\n".join(doc.metadata['name']+'//'+doc.metadata['id'] for doc in docs))
    return ("\n".join(doc.metadata['name']+'//'+doc.metadata['id'] for doc in docs))
    #return "fibroblast proliferation, angiogenesis, wound healing, cell proliferation"

rag_chain_re = (
    {"context": itemgetter("context"), "candidates": itemgetter("candidates")}
    | re_prompt
    | llm
    | StrOutputParser()
)

gpt_annots = json.load(open('gpt4_annots_3.json','r'))
gpt_annots_new = {'documents': []}
def generate_candidate_sentences(annotations):
    sources = []
    targets = []
    for item in annotations:
        if item['type'] == 'Biological Process':
            targets.append(item['entity'])
        else:
            sources.append(item['entity'])
    if not sources:
        return None
    if not targets:
        return None
    candidates = []
    i = 1
    for s in sources:
        for t in targets:
            candidates.append(f"{i}. {s} upregulates {t}")
            i = i + 1
            candidates.append(f"{i}. {s} downregulates {t}")
            i = i + 1
    return '\n'.join(candidates)

def extract_relations(candidate, response):
    relations = []
    responses = response.splitlines()
    sents = candidates.splitlines()
    sents = [re.sub(r"(\d+)\. ", '', sent) for sent in sents]
    for sent, response in zip(sents, responses):
        if "true" in response.split('//')[0].lower():
            cot = response.split('//')[1]
            if 'upregulates' in sent:
                rel_type = 'positive'
                sent = sent.replace(' upregulates ', '||')
            else:
                rel_type = 'negative'
                sent = sent.replace(' downregulates ', '||')
            relations.append({'source':sent.split('||')[0], 'target': sent.split('||')[1], \
                              'relation': rel_type, 'chain-of-thought': cot})
    return relations

for item in tqdm(gpt_annots['documents']):
    title = item['title']
    annotations = item['annotations']
    candidates = generate_candidate_sentences(annotations)
    if candidates is None:
        item['relations'] = []
        gpt_annots_new['documents'].append(item)
        continue
    response = rag_chain_re.invoke({'context': title, 'candidates': candidates})
    item['relations'] = extract_relations(candidates, response)
    gpt_annots_new['documents'].append(item)


with open('gpt4_annots_4.json', 'w') as f:
    json.dump(gpt_annots_new, f, indent = 4) 

100%|██████████| 104/104 [03:56<00:00,  2.27s/it]


### Rule-Based NER, Grounding and RE

In [3]:
import spacy
import scispacy
from scispacy.linking import EntityLinker
from scispacy.abbreviation import AbbreviationDetector
from spacy.tokens import Span
from spacy.pipeline import merge_entities
import pandas as pd
from spacy import displacy
import pickle
import json


class SpacyExtractor:
    def __init__(self, model_name = "en_core_sci_sm", umls_cuis_path = "umls_cuis_go.pkl"):  
        self.nlp = self.load_model(model_name)
        self.tuis = pd.read_csv("umls_tuis.txt").set_index('tui')['label'].to_dict() 
        self.umls_cuis = pickle.load(open(umls_cuis_path, 'rb')) 
        Span.set_extension("cui", default='', force=True)
        Span.set_extension("tuis", default=[], force=True)
        Span.set_extension("mapping", default=[], force=True)  

    def load_model(self, model_name):
        self.nlp = spacy.load(model_name)
        # Add the abbreviation pipe to the spacy pipeline.
        #self.nlp.add_pipe("abbreviation_detector")
        # that linking will only be performed on the long form of abbreviations.
        self.nlp.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "linker_name": "umls", "threshold":0.7})
        # add merge_entities pipe for RE
        self.nlp.add_pipe("merge_entities")
        return self.nlp
    
    def expand_entities(self, doc):
        linker = self.nlp.get_pipe("scispacy_linker")
        ents = list(doc.ents)
        allowed_tuis = self.tuis.keys()
        new_ents = []
        for i in range(len(ents)):
            old_ent = ents[i]
            if old_ent._.kb_ents:
                umls_ent = linker.kb.cui_to_entity[old_ent._.kb_ents[0][0]]
                cui = umls_ent.concept_id
                # if cui in list(self.umls_cuis.keys()):
                #     mappings = self.umls_cuis[cui]
                # else:
                mapping = self.umls_cuis.get(cui, None)
                tuis = list([self.tuis[t] for t in umls_ent.types if t in allowed_tuis])
                if mapping is not None:
                    label = '[' + ', '.join(set(tuis)) + ']'
                    new_ent = Span(doc, old_ent.start, old_ent.end, label = label)
                    new_ent._.set('cui', cui)
                    #new_ent._.set('tuis', tuis)
                    new_ent._.set('mapping', mapping)
                    new_ents.append(new_ent)

        doc.ents = new_ents
        return doc
umls_cuis_go = pickle.load(open('umls_cuis_go.pkl', 'rb'))
extractor = SpacyExtractor()
doc = extractor.nlp("microRNA-23b suppresses epithelial-mesenchymal transition (EMT) and metastasis in hepatocellular carcinoma via targeting Pyk2.")
doc = extractor.expand_entities(doc)
for ent in doc.ents:
    print(ent.text, ent._.cui, ent._.mapping)

Your CPU supports instructions that this binary was not compiled to use: SSE3 SSE4.1 SSE4.2 AVX AVX2
For maximum performance, you can install NMSLIB from sources 
pip install --no-binary :all: nmslib
  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


epithelial-mesenchymal transition C1523298 {'kb': 'GO', 'term_type': 'PT', 'kb_cui': 'GO:0001837', 'kb_name': 'epithelial to mesenchymal transition'}


In [4]:
#### identify GO biological process concepts in the text
import json

with open('gs_annots_6.json', 'r') as f:
    gs_annots = json.load(f)

spacy_annots = {'documents':[]}

for item in gs_annots['documents']:
    title = item['title']
    pmid = item['pmid']
    annotations = item['annotations']
    new_annotations = []
    for ann in annotations:
        if ann['type'] != 'Biological Process':
            new_annotations.append(ann)
    doc = extractor.nlp(title)
    doc = extractor.expand_entities(doc)
    for ent in doc.ents:
        if ent._.mapping:
            new_annotations.append({'entity': ent.text, 'type': 'Biological Process', 'kb_concept_name':ent._.mapping['kb_name'],\
                        'kb_id': ent._.mapping['kb_cui'], 'start': ent.start_char, 'end': ent.end_char})
    spacy_annots['documents'].append({'pmid':pmid, 'title':title, 'annotations':new_annotations})

In [5]:
with open('spacy_annots.json', 'w') as f:
    json.dump(spacy_annots, f, indent = 4)

### Rule-based RE using Spacy

In [6]:
from rules import *
from spacy.util import filter_spans
from spacy.matcher import DependencyMatcher



def assign_rules_to_matcher(matcher, ent_types):
    dep_pattern1 = [{'RIGHT_ID': 'verb', 'RIGHT_ATTRS': {'DEP': {'IN':['ROOT','ccomp']},'POS': 'VERB', 
                                                     'LEMMA': {'IN':rule_1_triggers}}},
               {'LEFT_ID': 'verb', 'REL_OP': '>', 
                'RIGHT_ID': 'subject', 
                'RIGHT_ATTRS': {'DEP': 'nsubj','ENT_TYPE': {'IN': ent_types}}},
               {'LEFT_ID': 'verb', 'REL_OP': '>', 'RIGHT_ID': 'd_object', 
                'RIGHT_ATTRS': {'DEP': 'dobj', 'ENT_TYPE': {'IN': ent_types}}}]
    # matches ===> x promotes y and z; we found that x promotes y and z
    dep_pattern2 = [{'RIGHT_ID': 'verb', 'RIGHT_ATTRS': {'DEP': {'IN':['ROOT','ccomp']},'POS': 'VERB',
                                                        'LEMMA': {'IN':rule_1_triggers}}},
                   {'LEFT_ID': 'verb', 'REL_OP': '>', 
                    'RIGHT_ID': 'subject', 
                    'RIGHT_ATTRS': {'DEP': 'nsubj','ENT_TYPE': {'IN': ent_types}}},
                   {'LEFT_ID': 'verb', 'REL_OP': '.', 'RIGHT_ID': 'd_object', 
                    'RIGHT_ATTRS': {'DEP': 'conj', 'ENT_TYPE': {'IN': ent_types}}}]

    # matches ===> x ....., and supresses w and z
    dep_pattern3 = [{'RIGHT_ID': 'verb', 'RIGHT_ATTRS': {'DEP': {'IN':['conj','advcl']},'POS': 'VERB',
                                                        'LEMMA': {'IN':rule_1_triggers}}},
                   {'LEFT_ID': 'verb', 'REL_OP': '$--', 
                    'RIGHT_ID': 'subject', 
                    'RIGHT_ATTRS': {'DEP': 'nsubj','ENT_TYPE': {'IN': ent_types}}},
                   {'LEFT_ID': 'verb', 'REL_OP': '>>', 'RIGHT_ID': 'd_object', 
                    'RIGHT_ATTRS': {'DEP': {'IN':['dobj','conj']}, 'ENT_TYPE': {'IN': ent_types}}}
                    ]

    # matches ===> x causes upregulation of y; we found that x causes upregulation of y
    dep_pattern4 = [{'RIGHT_ID': 'verb', 'RIGHT_ATTRS': {'DEP': {'IN':['ROOT','ccomp']},'POS': 'VERB',
                                                        'LEMMA': {'IN':rule_2_verb_triggers}}},
                   {'LEFT_ID': 'verb', 'REL_OP': '>', 
                    'RIGHT_ID': 'subject', 
                    'RIGHT_ATTRS': {'DEP': 'nsubj','ENT_TYPE': {'IN': ent_types}}},
                   {'LEFT_ID': 'verb', 'REL_OP': '>', 'RIGHT_ID': 'effect', 
                    'RIGHT_ATTRS': {'DEP': 'dobj', 'LEMMA': {'IN':rule_2_effect_triggers}}},
                   {'LEFT_ID': 'effect', 'REL_OP': '>', 'RIGHT_ID': 'd_object', 
                    'RIGHT_ATTRS': {'DEP': 'nmod', 'ENT_TYPE': {'IN': ent_types},
                                    'LEMMA': {'NOT_IN':rule_2_effect_triggers}}}]

    # matches ===> x causes upregulation of y and z; we found that x causes upregulation of y and z
    dep_pattern5 = [{'RIGHT_ID': 'verb', 'RIGHT_ATTRS': {'DEP': {'IN':['ROOT','ccomp']},'POS': 'VERB',
                                                        'LEMMA': {'IN':rule_2_verb_triggers}}},
                   {'LEFT_ID': 'verb', 'REL_OP': '>', 
                    'RIGHT_ID': 'subject', 
                    'RIGHT_ATTRS': {'DEP': 'nsubj','ENT_TYPE': {'IN': ent_types}}},
                   {'LEFT_ID': 'verb', 'REL_OP': '>', 'RIGHT_ID': 'effect', 
                    'RIGHT_ATTRS': {'DEP': 'dobj', 'LEMMA': {'IN':rule_2_effect_triggers}}},
                   {'LEFT_ID': 'effect', 'REL_OP': '>>', 'RIGHT_ID': 'd_object', 
                    'RIGHT_ATTRS': {'DEP': 'conj', 'ENT_TYPE': {'IN': ent_types},
                                    'LEMMA': {'NOT_IN':rule_2_effect_triggers}}}]

    # matches ===> x ....., and downregulation of z and k
    dep_pattern6 = [{'RIGHT_ID': 'verb', 'RIGHT_ATTRS': {'DEP': {'IN':['ROOT','ccomp']},'POS': 'VERB',
                                                        'LEMMA': {'IN':rule_2_verb_triggers}}},
                   {'LEFT_ID': 'verb', 'REL_OP': '>', 
                    'RIGHT_ID': 'subject', 
                    'RIGHT_ATTRS': {'DEP': 'nsubj','ENT_TYPE': {'IN': ent_types}}},
                   {'LEFT_ID': 'verb', 'REL_OP': '>>', 'RIGHT_ID': 'effect', 
                    'RIGHT_ATTRS': {'DEP': 'conj', 'LEMMA': {'IN':rule_2_effect_triggers}}},
                   {'LEFT_ID': 'effect', 'REL_OP': '>>', 'RIGHT_ID': 'd_object', 
                    'RIGHT_ATTRS': {'DEP': {'IN':['nmod','conj']}, 'ENT_TYPE': {'IN': ent_types},
                                   'LEMMA': {'NOT_IN':rule_2_effect_triggers}}}]

    # matches ===> y is promoted by x
    dep_pattern7 = [{'RIGHT_ID': 'verb', 'RIGHT_ATTRS': {'DEP': {'IN':['ROOT','ccomp']},'POS': 'VERB', 
                                                         'LEMMA': {'IN':rule_1_triggers}}},
                   {'LEFT_ID': 'verb', 'REL_OP': '>', 
                    'RIGHT_ID': 'subject', 
                    'RIGHT_ATTRS': {'DEP': 'nmod','ENT_TYPE': {'IN': ent_types}}},
                   {'LEFT_ID': 'verb', 'REL_OP': '>', 'RIGHT_ID': 'd_object', 
                    'RIGHT_ATTRS': {'DEP': 'nsubjpass', 'ENT_TYPE': {'IN': ent_types}}}]

    # matches ===> .... and z is suppressed by x
    dep_pattern8 = [{'RIGHT_ID': 'verb', 'RIGHT_ATTRS': {'DEP': 'conj','POS': 'VERB',
                                                        'LEMMA': {'IN':rule_1_triggers}}},
                   {'LEFT_ID': 'verb', 'REL_OP': '>', 
                    'RIGHT_ID': 'subject', 
                    'RIGHT_ATTRS': {'DEP': 'nmod','ENT_TYPE': {'IN': ent_types}}},
                   {'LEFT_ID': 'verb', 'REL_OP': '>', 'RIGHT_ID': 'd_object', 
                    'RIGHT_ATTRS': {'DEP': 'nsubjpass', 'ENT_TYPE': {'IN': ent_types}}}]

    # matches ===> upregulation of y is caused by x; we found that upregulation of y is caused by x
    dep_pattern9 = [{'RIGHT_ID': 'verb', 'RIGHT_ATTRS': {'DEP': {'IN':['ROOT','ccomp']},'POS': 'VERB',
                                                        'LEMMA': {'IN':rule_2_verb_triggers}}},
                   {'LEFT_ID': 'verb', 'REL_OP': '>', 
                    'RIGHT_ID': 'subject', 
                    'RIGHT_ATTRS': {'DEP': 'nmod','ENT_TYPE': {'IN': ent_types}}},
                   {'LEFT_ID': 'verb', 'REL_OP': '>', 'RIGHT_ID': 'effect', 
                    'RIGHT_ATTRS': {'DEP': 'nsubjpass', 'LEMMA': {'IN':rule_2_effect_triggers}}},
                   {'LEFT_ID': 'effect', 'REL_OP': '>', 'RIGHT_ID': 'd_object', 
                    'RIGHT_ATTRS': {'DEP': 'nmod', 'ENT_TYPE': {'IN': ent_types},
                                    'LEMMA': {'NOT_IN':rule_2_effect_triggers}}}]

    # matches ===> x is an inhibitor of y

    # Add the pattern to the matcher under the name 'nsubj_verb'
    matcher.add('rule_1', patterns=[dep_pattern1, dep_pattern3, dep_pattern7, dep_pattern8])
    matcher.add('rule_2', patterns=[dep_pattern4, dep_pattern5, dep_pattern6, dep_pattern9])
    return matcher

In [40]:
### read from spacy_annots.json and create new entities
spacy_annots = json.load(open('spacy_annots.json', 'r'))

spacy_annots_new = {'documents':[]}

for item in tqdm(spacy_annots['documents']):
    title = item['title']
    pmid = item['pmid']
    annotations = item['annotations']
    doc = extractor.nlp(title.lower())
    new_ents = []
    item['relations'] = []
    #print(doc)
    doc.ents = []
    for ann in annotations:
        #print(ann)
        new_ent = doc.char_span(int(ann['start']), int(ann['end']), label = ann['type'], kb_id=ann['kb_id'])
        if new_ent is not None:
            new_ents.append(new_ent)

    if not new_ents:
        spacy_annots_new['documents'].append(item)
        continue
    
    doc.ents = filter_spans(new_ents)
    dep_matcher = DependencyMatcher(vocab=extractor.nlp.vocab)
    ent_types = list(set([ent.label_ for ent in doc.ents if ent.label_ != "ENTITY"]))
    
    dep_matcher = assign_rules_to_matcher(dep_matcher, ent_types)
    
    dep_matches = dep_matcher(doc)

    for match in dep_matches:

        # Take the first item in the tuple at [0] and assign it under
        # the variable 'pattern_name'. This item is a spaCy Lexeme object.
        pattern_name = extractor.nlp.vocab[match[0]].text
        #print(pattern_name)

        matches = match[1]

        if pattern_name == "rule_1":
            relation, subject, dobject = doc[matches[0]], doc[matches[1]], doc[matches[2]]


        elif pattern_name == "rule_2":
            subject, relation, dobject = doc[matches[1]], doc[matches[2]], doc[matches[3]]

        if dobject.ent_type_ != 'Biological Process':
            continue
        if subject.ent_type_ == 'Biological Process':
            continue

        item['relations'].append({'source':subject.text, 'target':dobject.text, 'relation':relation_triggers[relation.lemma_]})
    
    spacy_annots_new['documents'].append(item)
    
    
    

100%|██████████| 104/104 [00:01<00:00, 53.26it/s]


In [29]:
with open('spacy_annots_3.json', 'w') as f:
    json.dump(spacy_annots_new, f, indent = 4)

### EVALUATION of SPACY

In [101]:
# NER evaluation
gs_annots = json.load(open('gs_annots_6.json', 'r'))
spacy_annots = json.load(open('spacy_annots_3.json', 'r'))
tp=0
tp_fp=0
tp_fn=0

for gs_item, spacy_item in zip(gs_annots['documents'], spacy_annots['documents']):
    assert gs_item['pmid'] == spacy_item['pmid']
    gs_annotation = gs_item['annotations']
    spacy_annotation = spacy_item['annotations']

    for gs_ann in gs_annotation:
        if gs_ann['type'] == 'Biological Process':
            for spacy_ann in spacy_annotation:
                if spacy_ann['type'] == 'Biological Process':
                    if gs_ann['entity'].lower() in spacy_ann['entity'].lower():
                        tp = tp + 1

for gs_item in gs_annots['documents']:
    gs_annotation = gs_item['annotations']
    for gs_ann in gs_annotation:
        if gs_ann['type'] == 'Biological Process':
            tp_fn = tp_fn + 1

for spacy_item in spacy_annots['documents']:
    spacy_annotation = spacy_item['annotations']
    for spacy_ann in spacy_annotation:
        if spacy_ann['type'] == 'Biological Process':
            tp_fp = tp_fp + 1

print('Spacy Precision (NER): ', tp/tp_fp)
print('Spacy Recall (NER): ', tp/tp_fn)
print('Spacy F1: (NER)', 2*tp/(tp_fp + tp_fn))

Spacy Precision (NER):  0.7151515151515152
Spacy Recall (NER):  0.5784313725490197
Spacy F1: (NER) 0.6395663956639567


In [102]:
# GR evaluation
gs_annots = json.load(open('gs_annots_6.json', 'r'))
spacy_annots = json.load(open('spacy_annots_3.json', 'r'))

tp_fp = 0
tp = 0
tp_fn = 0

for gs_item, spacy_item in tqdm(zip(gs_annots['documents'], spacy_annots['documents'])):
    assert gs_item['pmid'] == spacy_item['pmid']
    gs_annotation = gs_item['annotations']
    spacy_annotation = spacy_item['annotations']

    for gs_ann in gs_annotation:
        if gs_ann['type'] == 'Biological Process':
            for spacy_ann in spacy_annotation:
                if spacy_ann['type'] == 'Biological Process':
                    if gs_ann['entity'].lower() in spacy_ann['entity'].lower() and gs_ann['kb_id'] == spacy_ann['kb_id']:
                        tp = tp + 1

for gs_item in gs_annots['documents']:
    gs_annotation = gs_item['annotations']
    for gs_ann in gs_annotation:
        if gs_ann['type'] == 'Biological Process':
            tp_fn = tp_fn + 1

for spacy_item in spacy_annots['documents']:
    spacy_annotation = spacy_item['annotations']
    for spacy_ann in spacy_annotation:
        if spacy_ann['type'] == 'Biological Process':
            tp_fp = tp_fp + 1

print('Spacy Precision (GR): ', tp/tp_fp)
print('Spacy Recall (GR): ', tp/tp_fn)
print('Spacy F1 (GR): ', 2*tp/(tp_fp + tp_fn))

104it [00:00, 212369.82it/s]

Spacy Precision (GR):  0.5757575757575758
Spacy Recall (GR):  0.46568627450980393
Spacy F1 (GR):  0.5149051490514905





In [104]:
# RE Evaluation
gs_annots = json.load(open('gs_annots_6.json', 'r'))
spacy_annots = json.load(open('spacy_annots_3.json', 'r'))

tp = 0
tp_fp = 0
tp_fn = 0

for gs_item, spacy_item in tqdm(zip(gs_annots['documents'], spacy_annots['documents'])):
    assert gs_item['pmid'] == spacy_item['pmid']
    gs_rels = gs_item['relations']
    spacy_rels = spacy_item['relations']
    for gs_rel in gs_rels:
        for spacy_rel in spacy_rels:
            if (gs_rel['source'].lower() == spacy_rel['source'].lower()) and (gs_rel['target'].lower() in spacy_rel['target'].lower()) and (gs_rel['relation'] == spacy_rel['relation']):
                tp = tp + 1

for gs_item in gs_annots['documents']:
    gs_rels = gs_item['relations']
    for gs_rel in gs_rels:
        tp_fn = tp_fn + 1

for spacy_item in spacy_annots['documents']:
    spacy_rels = spacy_item['relations']
    for spacy_rel in spacy_rels:
        tp_fp = tp_fp + 1

print('Spacy Precision (RE): ', tp/tp_fp)
print('Spacy Recall (RE): ', tp/tp_fn)
print('Spacy F1 (RE): ', 2*tp/(tp_fp + tp_fn))

104it [00:00, 384323.89it/s]

Spacy Precision (RE):  0.75
Spacy Recall (RE):  0.1477832512315271
Spacy F1 (RE):  0.24691358024691357





In [106]:
# RE/GR Evaluation
gs_annots = json.load(open('gs_annots_6.json', 'r'))
spacy_annots = json.load(open('spacy_annots_3.json', 'r'))

tp = 0
tp_fp = 0
tp_fn = 0

for gs_item, spacy_item in tqdm(zip(gs_annots['documents'], spacy_annots['documents'])):
    assert gs_item['pmid'] == spacy_item['pmid']
    gs_rels = gs_item['relations']
    spacy_rels = spacy_item['relations']
    gs_annotation = gs_item['annotations']
    spacy_annotation = spacy_item['annotations']
    for gs_rel in gs_rels:
        for spacy_rel in spacy_rels:
            if (gs_rel['source'].lower() == spacy_rel['source'].lower()) and (gs_rel['target'].lower() in spacy_rel['target'].lower()) and (gs_rel['relation'] == spacy_rel['relation']):
                gs_target = gs_rel['target']
                spacy_target = spacy_rel['target']
                for gs_ann in gs_annotation:
                    for spacy_ann in spacy_annotation:
                        if gs_ann['entity'].lower() == gs_target.lower() and spacy_ann['entity'].lower() == spacy_target.lower():
                            if gs_ann['kb_id'] == spacy_ann['kb_id']:
                                tp = tp + 1

for gs_item in gs_annots['documents']:
    gs_rels = gs_item['relations']
    for gs_rel in gs_rels:
        tp_fn = tp_fn + 1

for spacy_item in spacy_annots['documents']:
    spacy_rels = spacy_item['relations']
    for spacy_rel in spacy_rels:
        tp_fp = tp_fp + 1

print('Spacy Precision (RE/GR): ', tp/tp_fp)
print('Spacy Recall (RE/GR): ', tp/tp_fn)
print('Spacy F1 (RE/GR): ', 2*tp/(tp_fp + tp_fn))

104it [00:00, 223467.02it/s]

Spacy Precision (RE/GR):  0.625
Spacy Recall (RE/GR):  0.12315270935960591
Spacy F1 (RE/GR):  0.205761316872428





## Evaluation of GPT-4


In [8]:
### NER Evaluation

gs_annots = json.load(open('gs_annots_6.json', 'r'))
gpt_annots = json.load(open('gpt4_annots_4.json', 'r'))

tp = 0
tp_fp = 0
tp_fn = 0

for gs_item, gpt_item in tqdm(zip(gs_annots['documents'], gpt_annots['documents'])):
    assert gs_item['pmid'] == gpt_item['pmid']
    gs_annotation = gs_item['annotations']
    gpt_annotation = gpt_item['annotations']

    for gs_ann in gs_annotation:
        if gs_ann['type'] == 'Biological Process':
            for gpt_ann in gpt_annotation:
                if gpt_ann['type'] == 'Biological Process':
                    if gs_ann['entity'].lower() in gpt_ann['entity'].lower():
                        tp = tp + 1

for gs_item in gs_annots['documents']:
    gs_annotation = gs_item['annotations']
    for gs_ann in gs_annotation:
        if gs_ann['type'] == 'Biological Process':
            tp_fn = tp_fn + 1

for gpt_item in gpt_annots['documents']:
    gpt_annotation = gpt_item['annotations']
    for gpt_ann in gpt_annotation:
        if gpt_ann['type'] == 'Biological Process':
            tp_fp = tp_fp + 1

print('GPT Precision (NER): ', tp/tp_fp)
print('GPT Recall (NER): ', tp/tp_fn)
print('GPT F1: (NER)', 2*tp/(tp_fp + tp_fn))


104it [00:00, 139407.99it/s]

GPT Precision (NER):  0.8915094339622641
GPT Recall (NER):  0.9264705882352942
GPT F1: (NER) 0.9086538461538461





In [9]:
### GR Evaluation

gs_annots = json.load(open('gs_annots_6.json', 'r'))
gpt_annots = json.load(open('gpt4_annots_4.json', 'r'))

tp_fp = 0
tp = 0
tp_fn = 0

for gs_item, gpt_item in tqdm(zip(gs_annots['documents'], gpt_annots['documents'])):
    assert gs_item['pmid'] == gpt_item['pmid']
    gs_annotation = gs_item['annotations']
    gpt_annotation = gpt_item['annotations']

    for gs_ann in gs_annotation:
        if gs_ann['type'] == 'Biological Process':
            for gpt_ann in gpt_annotation:
                if gpt_ann['type'] == 'Biological Process':
                    if (gs_ann['entity'].lower() in gpt_ann['entity'].lower()) and gs_ann['kb_id'] == gpt_ann['kb_id']:
                        tp = tp + 1

for gs_item in gs_annots['documents']:
    gs_annotation = gs_item['annotations']
    for gs_ann in gs_annotation:
        if gs_ann['type'] == 'Biological Process':
            tp_fn = tp_fn + 1

for gpt_item in gpt_annots['documents']:
    gpt_annotation = gpt_item['annotations']
    for gpt_ann in gpt_annotation:
        if gpt_ann['type'] == 'Biological Process':
            tp_fp = tp_fp + 1

print('GPT Precision (GR): ', tp/tp_fp)
print('GPT Recall (GR): ', tp/tp_fn)
print('GPT F1 (GR): ', 2*tp/(tp_fp + tp_fn))


104it [00:00, 127845.14it/s]

GPT Precision (GR):  0.7122641509433962
GPT Recall (GR):  0.7401960784313726
GPT F1 (GR):  0.7259615384615384





In [10]:
# RE Evaluation
gs_annots = json.load(open('gs_annots_6.json', 'r'))
gpt_annots = json.load(open('gpt4_annots_4.json', 'r'))

tp = 0
tp_fp = 0
tp_fn = 0

for gs_item, gpt_item in tqdm(zip(gs_annots['documents'], gpt_annots['documents'])):
    assert gs_item['pmid'] == gpt_item['pmid']
    gs_rels = gs_item['relations']
    gpt_rels = gpt_item['relations']
    for gs_rel in gs_rels:
        for gpt_rel in gpt_rels:
            if (gs_rel['source'].lower() == gpt_rel['source'].lower()) and (gs_rel['target'].lower() in gpt_rel['target'].lower()) and (gs_rel['relation'].lower() == gpt_rel['relation'].lower()):
                tp = tp + 1

for gs_item in gs_annots['documents']:
    gs_rels = gs_item['relations']
    for gs_rel in gs_rels:
        tp_fn = tp_fn + 1

for gpt_item in gpt_annots['documents']:
    gpt_rels = gpt_item['relations']
    for gpt_rel in gpt_rels:
        tp_fp = tp_fp + 1

print('GPT Precision (RE): ', tp/tp_fp)
print('GPT Recall (RE): ', tp/tp_fn)
print('GPT F1 (RE): ', 2*tp/(tp_fp + tp_fn))

104it [00:00, 134965.23it/s]

GPT Precision (RE):  0.8798076923076923
GPT Recall (RE):  0.9014778325123153
GPT F1 (RE):  0.8905109489051095





In [11]:
# RE GR Evaluation
gs_annots = json.load(open('gs_annots_6.json', 'r'))
gpt_annots = json.load(open('gpt4_annots_4.json', 'r'))

tp = 0
tp_fp = 0
tp_fn = 0

for gs_item, gpt_item in tqdm(zip(gs_annots['documents'], gpt_annots['documents'])):
    assert gs_item['pmid'] == gpt_item['pmid']
    gs_rels = gs_item['relations']
    gpt_rels = gpt_item['relations']
    gs_annotation = gs_item['annotations']
    gpt_annotation = gpt_item['annotations']
    for gs_rel in gs_rels:
        for gpt_rel in gpt_rels:
            if (gs_rel['source'].lower() == gpt_rel['source'].lower()) and (gs_rel['target'].lower() in gpt_rel['target'].lower()) and (gs_rel['relation'] == gpt_rel['relation']):
                gs_target = gs_rel['target']
                gpt_target = gpt_rel['target']
                for gs_ann in gs_annotation:
                    for gpt_ann in gpt_annotation:
                        if gs_ann['entity'].lower() == gs_target.lower() and gpt_ann['entity'].lower() == gpt_target.lower():
                            if gs_ann['kb_id'] == gpt_ann['kb_id']:
                                tp = tp + 1

for gs_item in gs_annots['documents']:
    gs_rels = gs_item['relations']
    for gs_rel in gs_rels:
        tp_fn = tp_fn + 1

for gpt_item in gpt_annots['documents']:
    gpt_rels = gpt_item['relations']
    for gpt_rel in gpt_rels:
        tp_fp = tp_fp + 1

print('GPT Precision (RE/GR): ', tp/tp_fp)
print('GPT Recall (RE/GR): ', tp/tp_fn)
print('GPT F1 (RE/GR): ', 2*tp/(tp_fp + tp_fn))

104it [00:00, 53248.00it/s]

GPT Precision (RE/GR):  0.7019230769230769
GPT Recall (RE/GR):  0.7192118226600985
GPT F1 (RE/GR):  0.7104622871046229



