# Named Entity Recognition (NER) and Relation Extraction (RE)

## Settings

In [None]:
# First, install the third-party dependencies used below: backoff and dspy
!pip install backoff
!pip install dspy

Collecting backoff
  Downloading backoff-2.2.1-py3-none-any.whl.metadata (14 kB)
Downloading backoff-2.2.1-py3-none-any.whl (15 kB)
Installing collected packages: backoff
Successfully installed backoff-2.2.1
Collecting dspy
  Downloading dspy-3.0.1-py3-none-any.whl.metadata (7.1 kB)
Collecting ujson>=5.8.0 (from dspy)
  Downloading ujson-5.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.3 kB)
Collecting optuna>=3.4.0 (from dspy)
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting magicattr>=0.1.6 (from dspy)
  Downloading magicattr-0.1.6-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting litellm>=1.64.0 (from dspy)
  Downloading litellm-1.75.7-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.2/41.2 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting diskcache>=5.6.0 (from dspy)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Collecting json-repair>=0.30.0 (from 

In [None]:
# import
import json
import random
import requests
import hashlib
import backoff
import logging
from tqdm import tqdm
from time import sleep
from typing import List
from collections import defaultdict
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
from langchain_core.documents.base import Document


## PubTator Client for NER / RE

### PubTator Client Utility Class

In [None]:
class PubTatorClient:
    """Client for a PubTator BioC-JSON export endpoint.

    The client downloads annotations for batches of PMIDs, extracts the
    annotated biomedical entities and relations from each document, and
    merges duplicates found across documents.
    """

    def __init__(self, base_url: str, max_retries: int = 3):
        """Create an HTTP session that retries transient failures.

        Args:
            base_url: URL of the export endpoint that accepts a ``pmids``
                query parameter.
            max_retries: Number of transport-level retries performed by the
                ``requests`` adapter for the status codes listed below.
        """
        self.session = requests.Session()
        self.base_url = base_url
        # Configure retry strategy
        retry_strategy = Retry(
            total=max_retries,
            backoff_factor=1,
            status_forcelist=[429, 500, 502, 503, 504],
        )

        adapter = HTTPAdapter(max_retries=retry_strategy)
        self.session.mount("http://", adapter)
        self.session.mount("https://", adapter)
        self.logger = logging.getLogger(__name__)

    @backoff.on_exception(
        backoff.expo, requests.exceptions.RequestException, max_tries=3
    )
    def _fetch_publications(self, pmids: List[str]) -> dict:
        """Fetch the annotations of one batch of PMIDs.

        Args:
            pmids: PMIDs to request in a single call.

        Returns:
            The decoded JSON body of the response.

        Raises:
            requests.exceptions.RequestException: When the request still
                fails after the configured retries.
        """
        # str() lets callers pass PMIDs that were loaded as integers from JSON.
        params = {"pmids": ",".join(str(pmid) for pmid in pmids)}

        response = self.session.get(self.base_url, params=params, timeout=300)
        response.raise_for_status()
        return response.json()

    def _parse(self, response: dict):
        """Extract entities, relations and plain text from one document.

        Args:
            response: One document record with ``id``, ``passages`` and
                (optionally) ``relations_display`` entries.

        Returns:
            A ``(entities, relations, doc)`` tuple where ``entities`` and
            ``relations`` are lists of dicts and ``doc`` is
            ``{"pmid": ..., "text": ...}``.
        """
        passages = response["passages"]
        doc_text = "\n".join(passage["text"] for passage in passages)
        pmid = response["id"]
        doc = {"pmid": pmid, "text": doc_text}
        biomedical_entities = []
        biomedical_relations = []
        # Maps accession -> entity info so relations can be resolved back to entities.
        accession_dict = {}

        for passage in passages:
            for annotation in passage["annotations"]:
                infons = annotation["infons"]
                # For simplicity, only the 'valid' annotations are kept.
                if not infons.get("valid", False):
                    continue
                span = annotation["text"]
                name = infons["name"]
                annot_type = infons["type"]
                identifier = infons["identifier"]
                accession = infons["accession"]

                # An identifier of '-' means the entity was not normalised;
                # fall back to a stable hash of the span.
                if identifier == "-":
                    identifier = hashlib.sha256(span.encode("utf-8")).hexdigest()

                # Registered AFTER the fallback above so that relations carry
                # the same id as the entity they refer to.
                accession_dict[accession] = {
                    "type": annot_type,
                    "identifier": identifier,
                }
                biomedical_entities.append({
                    "label": annot_type,
                    "span": span,
                    "name": name,
                    "id": identifier,
                    "pmid": pmid,
                })

        for relation_info in response.get("relations_display") or []:
            # Relation names are expected as "<type>|<source accession>|<target accession>".
            parts = relation_info["name"].split("|")
            if len(parts) != 3:
                self.logger.warning(
                    "Skipping malformed relation %r in document %s",
                    relation_info["name"],
                    pmid,
                )
                continue
            rel_type, s_accession_id, o_accession_id = parts
            source = accession_dict.get(s_accession_id)
            target = accession_dict.get(o_accession_id)
            # Relations whose endpoints were filtered out above are dropped.
            if not source or not target:
                continue
            biomedical_relations.append({
                "source_id": source["identifier"],
                "source_label": source["type"],
                "target_id": target["identifier"],
                "target_label": target["type"],
                "relationship": rel_type,
                "pmid": pmid,
            })

        return biomedical_entities, biomedical_relations, doc

    def _merge(self, biomedical_entities: List[dict], biomedical_relations: List[dict]) -> tuple:
        """Deduplicate entities and relations collected across documents.

        Entities are grouped by ``id``; the first occurrence is kept and
        extended with ``alt_names`` (distinct spans) and ``pmids`` (distinct
        source documents). Relations are grouped by source, target and
        relationship, and accumulate ``pmids`` the same way.

        Args:
            biomedical_entities: Entity dicts as produced by ``_parse``.
            biomedical_relations: Relation dicts as produced by ``_parse``.

        Returns:
            A ``(merged_entities, merged_relations)`` tuple of lists of dicts.
            The input dicts are not modified.
        """
        all_merged_biomedical_entities = {}
        for biomedical_entity in biomedical_entities:
            entity_id = biomedical_entity["id"]
            merged_entity = all_merged_biomedical_entities.get(entity_id)
            if merged_entity is None:
                # Copy so the caller's dict is not mutated.
                merged_entity = dict(biomedical_entity)
                merged_entity["alt_names"] = [biomedical_entity["span"]]
                merged_entity["pmids"] = [biomedical_entity["pmid"]]
                all_merged_biomedical_entities[entity_id] = merged_entity
                continue

            _span = biomedical_entity["span"]
            _pmid = biomedical_entity["pmid"]
            if _span not in merged_entity["alt_names"]:
                merged_entity["alt_names"].append(_span)
            if _pmid not in merged_entity["pmids"]:
                merged_entity["pmids"].append(_pmid)

        all_merged_biomedical_relations = {}
        for biomedical_relation in biomedical_relations:
            source_id = biomedical_relation["source_id"]
            target_id = biomedical_relation["target_id"]
            source_label = biomedical_relation["source_label"]
            target_label = biomedical_relation["target_label"]
            relationship = biomedical_relation["relationship"]
            _pmid = biomedical_relation["pmid"]
            # A tuple key cannot collide the way an underscore-joined string
            # can when labels or ids themselves contain underscores.
            _key = (source_label, source_id, target_label, target_id, relationship)
            merged_relation = all_merged_biomedical_relations.get(_key)
            if merged_relation is None:
                all_merged_biomedical_relations[_key] = {
                    "source_id": source_id,
                    "source_label": source_label,
                    "target_id": target_id,
                    "target_label": target_label,
                    "relationship": relationship,
                    "pmids": [_pmid],
                }
            elif _pmid not in merged_relation["pmids"]:
                merged_relation["pmids"].append(_pmid)

        merged_biomedical_entities = list(all_merged_biomedical_entities.values())
        merged_biomedical_relations = list(all_merged_biomedical_relations.values())
        return merged_biomedical_entities, merged_biomedical_relations

    def request_and_parse_pubtator(self, pmids: List[str], pubtator_chunk_size=50):
        """Fetch, parse and merge the annotations of the given PMIDs.

        Args:
            pmids: PMIDs to process.
            pubtator_chunk_size: Number of PMIDs sent per request.

        Returns:
            A ``(merged_entities, merged_relations, documents_texts)`` tuple;
            ``documents_texts`` holds one ``{"pmid", "text"}`` dict per
            fetched document.

        Raises:
            ValueError: If ``pubtator_chunk_size`` is not positive.
        """
        if pubtator_chunk_size <= 0:
            raise ValueError("pubtator_chunk_size must be a positive integer")

        all_responses = []

        # STEP 1: Fetching the PubTator information
        self.logger.info("Fetching PubTator data...")
        for i in tqdm(
            range(0, len(pmids), pubtator_chunk_size),
            desc="Fetching PubTator data",
        ):
            chunk = pmids[i : i + pubtator_chunk_size]
            try:
                all_responses.append(self._fetch_publications(chunk))
            except requests.exceptions.RequestException as e:
                # Best effort: a failed chunk is logged and skipped.
                self.logger.error(
                    "Failed to fetch data from PubTator API for chunk %s: %s",
                    chunk,
                    e,
                )
            # Short pause between consecutive requests.
            sleep(0.5)

        fetched_ner_re = [
            doc
            for pubtator_response in all_responses
            for doc in pubtator_response["PubTator3"]
        ]
        self.logger.info("Fetched %d PubTator data.", len(fetched_ner_re))

        # STEP 2: Parsing the PubTator identified biomedical named entities and relations
        self.logger.info("Parsing PubTator data...")
        biomedical_ner = []
        biomedical_re = []
        documents_texts = []
        for item in tqdm(fetched_ner_re, desc="Parsing PubTator data"):
            _biomedical_ner, _biomedical_re, doc = self._parse(item)
            biomedical_ner.extend(_biomedical_ner)
            biomedical_re.extend(_biomedical_re)
            documents_texts.append(doc)

        # STEP 3: Merge biomedical entities and relations
        merged_biomedical_entities, merged_biomedical_relations = self._merge(
            biomedical_ner, biomedical_re
        )
        return merged_biomedical_entities, merged_biomedical_relations, documents_texts



### Getting NER and RE from PubTator

In [None]:
# Client bound to the PubTator3 BioC-JSON export endpoint.
pubtator = PubTatorClient(
    base_url="https://www.ncbi.nlm.nih.gov/research/pubtator3-api/publications/export/biocjson",
)

In [None]:
# Load the list of sampled PMIDs. The encoding is pinned to UTF-8 (the JSON
# interchange encoding) instead of relying on the platform default.
with open("/content/sampled_documents_pmids.json", "r", encoding="utf-8") as f:
    all_pmids = json.load(f)

In [None]:
display(f"There are {len(all_pmids)} pubmed documents.")

In [None]:
# Choose which PMIDs to process.
# For a quick trial run, use a slice instead, e.g.:
# pmids = all_pmids[: 100]
# By default the full list is processed.
pmids = all_pmids

In [None]:
# Fetch, parse and merge the PubTator annotations for the selected PMIDs.
# The call returns the merged entities, the merged relations, and one
# {"pmid", "text"} record per fetched document.
merged_biomedical_entities, merged_biomedical_relations, documents_texts = pubtator.request_and_parse_pubtator(pmids)

In [None]:
# Save to json
import json

# Each (file name, payload) pair is written with the same pretty-printed layout.
for _filename, _payload in (
    ("merged_biomedical_entities.json", merged_biomedical_entities),
    ("merged_biomedical_relations.json", merged_biomedical_relations),
    ("documents_texts.json", documents_texts),
):
    with open(_filename, "w") as f:
        json.dump(_payload, f, indent=4)

## Large Language Model: NER and RE

### Calling LLM with dspy and structured JSON outputs

In [None]:
import os
import json
import dspy
from enum import Enum
from dotenv import load_dotenv, find_dotenv
from pydantic import BaseModel, Field
from typing import List, Optional, Literal

# load .env vars
_ = load_dotenv(find_dotenv())

In [None]:

# Define entity and relation types using Pydantic models
class EntityType(str, Enum):
    """Entity types for biomedical NER"""
    # Closed set of labels accepted for Relation.entity1_type / entity2_type.
    # Members are also plain strings (str mixin), so they compare equal to
    # their values, e.g. EntityType.GENE == "Gene".
    # NOTE(review): the value for cell lines contains a space; PubTator may
    # spell this label differently ("CellLine") — confirm before comparing
    # labels across the two sources.
    GENE = "Gene"
    DISEASE = "Disease"
    CHEMICAL = "Chemical"
    VARIANT = "Variant"
    SPECIES = "Species"
    CELL_LINE = "Cell Line"

class RelationType(str, Enum):
    """Relation types for biomedical RE"""
    # Closed set of labels accepted for Relation.relation_type. The same
    # twelve names are described one by one in the BiomedicalExtraction
    # prompt; keep both lists in sync when adding or removing a member.
    ASSOCIATE = "ASSOCIATE"
    CAUSE = "CAUSE"
    COMPARE = "COMPARE"
    COTREAT = "COTREAT"
    DRUG_INTERACT = "DRUG_INTERACT"
    INHIBIT = "INHIBIT"
    INTERACT = "INTERACT"
    NEGATIVE_CORRELATE = "NEGATIVE_CORRELATE"
    POSITIVE_CORRELATE = "POSITIVE_CORRELATE"
    PREVENT = "PREVENT"
    STIMULATE = "STIMULATE"
    TREAT = "TREAT"

class Relation(BaseModel):
    """Represents a relation between two entities"""
    # One extracted (entity1, relation, entity2) triple. Both entity types and
    # the relation type are restricted to the enums declared above, so values
    # outside those sets fail validation.
    # NOTE(review): the Field descriptions are presumably surfaced to the LM
    # through the generated output schema — treat them as prompt text.
    entity1_span: str = Field(description="Text span of the first entity")
    entity1_type: EntityType = Field(
        description="Type of the first entity"
    )
    entity2_span: str = Field(description="Text span of the second entity")
    entity2_type: EntityType = Field(
        description="Type of the second entity"
    )
    relation_type: RelationType = Field(description="Type of relation between entities")


# DSPy signature for biomedical extraction from PubMed abstracts
class BiomedicalExtraction(dspy.Signature):
    """
    Extract biomedical entities and their relations from PubMed title and abstract text.
    Focus on scientifically relevant entities and relationships commonly found in biomedical literature.

    Entity Types with PubMed Context:
    - Gene: Protein-coding genes, gene symbols (e.g., BRCA1, TP53), gene families
    - Disease: Diseases, disorders, syndromes, pathological conditions from MeSH
    - Chemical: Drugs, compounds, metabolites, therapeutic agents from MeSH
    - Variant: SNPs (rs numbers), mutations (p.Val600Glu), copy number variants
    - Species: Organisms, model systems (Homo sapiens, Mus musculus, etc.)
    - Cell Line: Immortalized cell lines, primary cells (HeLa, MCF-7, etc.)

    Relation Types for PubMed Literature:
    - ASSOCIATE: General biological association or correlation
    - CAUSE: Causal relationships, pathogenesis, disease etiology
    - COMPARE: Comparative studies, drug comparisons, treatment efficacy
    - COTREAT: Combination therapies, drug combinations
    - DRUG_INTERACT: Drug-drug interactions, pharmacological interactions
    - INHIBIT: Inhibition, suppression, downregulation
    - INTERACT: Physical/molecular interactions, binding, complexes
    - NEGATIVE_CORRELATE: Inverse correlation, negative association
    - POSITIVE_CORRELATE: Positive correlation, co-expression, co-regulation
    - PREVENT: Prevention, protective effects, risk reduction
    - STIMULATE: Activation, upregulation, enhancement
    - TREAT: Therapeutic relationships, treatment efficacy

    Instructions:
    - Extract entities as they appear in the text (preserve original naming)
    - Focus on scientifically meaningful relationships
    - Consider both title and abstract content
    - Prioritize well-established biomedical relationships
    - Handle abbreviations and full names appropriately
    - Do NOT integrate novel Relation Types
    """

    # NOTE: dspy uses a Signature's docstring as the task instructions given
    # to the LM, so the text above is prompt text: changing its wording
    # changes model behaviour. The entity and relation names listed there
    # mirror the EntityType / RelationType enum values.
    # Single input: the raw title + abstract. Single output: a list of
    # Relation objects; entities are only returned as part of relations.
    pubmed_text: str = dspy.InputField(desc="PubMed title and abstract text")
    relations: List[Relation] = dspy.OutputField(desc="Extracted biomedical relations")

# Main extraction module
class BiomedicalNERRE(dspy.Module):
    """dspy module that runs a single ``dspy.Predict`` over ``BiomedicalExtraction``."""

    def __init__(self):
        super().__init__()
        self.extract = dspy.Predict(BiomedicalExtraction)

    def forward(self, pubmed_text: str, demonstrations: Optional[List[dspy.Example]] = None) -> List[Relation]:
        """Extract entities and relations from PubMed title and abstract

        Args:
            pubmed_text: Title and abstract text to analyse.
            demonstrations: Optional few-shot examples forwarded to the
                predictor as ``demos``. ``None`` (the default) means no
                demonstrations.

        Returns:
            The ``relations`` field of the prediction.
        """
        # A None default replaces the former shared mutable `[]` default; the
        # predictor still receives an empty list when nothing is supplied.
        demos = [] if demonstrations is None else demonstrations
        result = self.extract(pubmed_text=pubmed_text, demos=demos)
        return result.relations

# Configuration class for different LLM backends
class LLMConfig:
    """Factory helpers that build a ``dspy.LM`` and install it as the global dspy LM."""

    @staticmethod
    def _configure(lm):
        """Install ``lm`` globally with the JSON adapter and return it."""
        dspy.configure(lm=lm, adapter=dspy.JSONAdapter())
        return lm

    @staticmethod
    def setup_openai(api_key: str, model: str = "gpt-4o-mini", max_tokens=2048):
        """Setup OpenAI API

        Args:
            api_key: OpenAI API key. When empty or ``None`` the key is left
                for the underlying client to resolve (e.g. from the
                environment).
            model: Model name passed to ``dspy.LM``.
            max_tokens: Maximum number of tokens to generate per call.

        Returns:
            The configured ``dspy.LM`` instance.
        """
        # Temperature 0.0 is set at construction for deterministic decoding.
        lm_kwargs = {"max_tokens": max_tokens, "temperature": 0.0}
        if api_key:
            # Hand the key to the LM itself so an explicitly supplied key is
            # the one used for requests.
            lm_kwargs["api_key"] = api_key
        return LLMConfig._configure(dspy.LM(model, **lm_kwargs))

    @staticmethod
    def setup_ollama(api_base: str = "http://127.0.0.1:11434", model: str = "llama3.1:8b", max_tokens=2048):
        """Setup Ollama local model

        Args:
            api_base: Base URL of the Ollama server.
            model: Model name passed to ``dspy.LM``.
            max_tokens: Maximum number of tokens to generate per call.

        Returns:
            The configured ``dspy.LM`` instance.
        """
        lm = dspy.LM(model, api_base=api_base, max_tokens=max_tokens, temperature=0.0)
        return LLMConfig._configure(lm)

# Main execution class
class BiomedicalExtractor:
    """Convenience wrapper: configures an LLM backend and runs ``BiomedicalNERRE``."""

    def __init__(self, llm_type: str = "openai", **kwargs):
        """
        Initialize the biomedical extractor

        Args:
            llm_type: "openai" or "ollama"
            **kwargs: Additional arguments for LLM setup:
                ``model`` (both backends), ``api_key`` (openai; defaults to
                the ``OPENAI_API_KEY`` environment variable), ``base_url``
                (ollama) and ``max_tokens`` (both backends, default 2048).

        Raises:
            ValueError: If ``llm_type`` is not one of the supported backends.
        """
        self.llm_type = llm_type
        max_tokens = kwargs.get("max_tokens", 2048)

        if llm_type == "openai":
            api_key = kwargs.get("api_key", os.getenv("OPENAI_API_KEY"))
            # NOTE(review): this default differs from LLMConfig.setup_openai's
            # own default ("gpt-4o-mini") — confirm which one is intended.
            model = kwargs.get("model", "gpt-4-turbo-preview")
            self.lm = LLMConfig.setup_openai(api_key, model, max_tokens=max_tokens)
        elif llm_type == "ollama":
            model = kwargs.get("model", "llama3.1:8b")
            base_url = kwargs.get("base_url", "http://127.0.0.1:11434")
            print(f"model: {model} / base_url: {base_url}")
            self.lm = LLMConfig.setup_ollama(base_url, model, max_tokens=max_tokens)
        else:
            # Fail early instead of leaving `self.lm` unset.
            raise ValueError(
                f"Unsupported llm_type {llm_type!r}; expected 'openai' or 'ollama'"
            )

        # Initialize extractor
        self.extractor = BiomedicalNERRE()

    def extract(self, pubmed_text: str, demonstrations=None) -> List[Relation]:
        """Extract entities and relations from PubMed text

        Args:
            pubmed_text: Title and abstract text to analyse.
            demonstrations: Optional few-shot examples; ``None`` means none.

        Returns:
            The extracted relations, or an empty list if extraction failed.
        """
        # Normalise here so the module always receives a list.
        demos = [] if demonstrations is None else demonstrations
        try:
            return self.extractor(pubmed_text, demonstrations=demos)
        except Exception as e:
            # Deliberate best effort: report the failure and return no
            # relations so that batch runs can continue.
            print(f"Extraction failed: {e}")
            return []


In [None]:
# SOME DEMONSTRATIONS
# PubMed-specific demonstration examples
# Few-shot demonstrations: each entry pairs a PubMed title + abstract with the
# relations expected for it. The keys match the BiomedicalExtraction fields.
PUBMED_EXAMPLES = [
    {
        "pubmed_text": "Bimekizumab: The First Dual Inhibitor of Interleukin (IL)-17A and IL-17F for the Treatment of Psoriatic Disease and Ankylosing Spondylitis. Psoriasis is a chronic inflammatory skin disease with significant psychological and physical impact. Over the last few decades, several highly effective target therapies have been developed, leading to a major paradigm shift in the way psoriatic disease is managed. Despite this, a proportion of patients still do not respond or lose response over time. Bispecific antibodies target two different cytokines simultaneously, potentially offering a better disease control. Interleukin (IL)-17A and IL-17F share structural homology and have similar biologic function. IL-17A is classically considered to be the most biologically active, but recent studies have shown that IL-17F is also increased in psoriatic skin and synovial cell in psoriatic arthritis, supporting the rationale for targeting both IL-17A and IL-17F in psoriatic disease. Bimekizumab is the first-in-class monoclonal antibody designed to simultaneously target IL-17A and IL-17F. Bimekizumab is currently in clinical development for psoriasis, psoriatic arthritis, and ankylosing spondylitis, with promising results. In early clinical trials, bimekizumab demonstrated a rapid onset of action, good safety profile, and high tolerability by treated study participants. Long-term results and head-to-head trials comparing bimekizumab with other agents will be crucial to define the role of bimekizumab in the treatment of psoriatic disease.",
        "relations": [
            Relation(
                entity1_span="IL-17A", entity1_type="Gene",
                entity2_span="Psoriatic Disease", entity2_type="Disease",
                relation_type="ASSOCIATE"
            ),
            Relation(
                entity1_span="IL-17F", entity1_type="Gene",
                entity2_span="Psoriatic Disease", entity2_type="Disease",
                relation_type="ASSOCIATE"
            ),
            Relation(
                entity1_span="IL-17A", entity1_type="Gene",
                entity2_span="Ankylosing Spondylitis", entity2_type="Disease",
                relation_type="ASSOCIATE"
            ),
            Relation(
                entity1_span="IL-17F", entity1_type="Gene",
                entity2_span="Ankylosing Spondylitis", entity2_type="Disease",
                relation_type="ASSOCIATE"
            ),
            Relation(
                entity1_span="bimekizumab", entity1_type="Chemical",
                entity2_span="IL-17A", entity2_type="Gene",
                relation_type="NEGATIVE_CORRELATE"
            ),
            Relation(
                entity1_span="bimekizumab", entity1_type="Chemical",
                entity2_span="IL-17F", entity2_type="Gene",
                relation_type="NEGATIVE_CORRELATE"
            ),
            Relation(
                entity1_span="bimekizumab", entity1_type="Chemical",
                entity2_span="psoriasis", entity2_type="Disease",
                relation_type="TREAT"
            ),
            Relation(
                entity1_span="bimekizumab", entity1_type="Chemical",
                entity2_span="psoriatic arthritis", entity2_type="Disease",
                relation_type="TREAT"
            ),
            Relation(
                entity1_span="bimekizumab", entity1_type="Chemical",
                entity2_span="ankylosing spondylitis", entity2_type="Disease",
                relation_type="TREAT"
            ),
        ],
    },
    {
        "pubmed_text": "Molecular dynamics and protein frustration analysis of human fused in Sarcoma protein variants in Amyotrophic Lateral Sclerosis type 6: An In Silico approach. Amyotrophic lateral sclerosis (ALS) is the most frequent adult-onset motor neuron disorder. The disease is characterized by degeneration of upper and lower motor neurons, leading to death usually within five years after the onset of symptoms. While most cases are sporadic, 5%-10% of cases can be associated with familial inheritance, including ALS type 6, which is associated with mutations in the Fused in Sarcoma (FUS) gene. This work aimed to evaluate how the most frequent ALS-related mutations in FUS, R521C, R521H, and P525L affect the protein structure and function. We used prediction algorithms to analyze the effects of the non-synonymous single nucleotide polymorphisms and performed evolutionary conservation analysis, protein frustration analysis, and molecular dynamics simulations. Most of the prediction algorithms classified the three mutations as deleterious. All three mutations were predicted to reduce protein stability, especially the mutation R521C, which was also predicted to increase chaperone binding tendency. The protein frustration analysis showed an increase in frustration in the interactions involving the mutated residue 521C. Evolutionary conservation analysis showed that residues 521 and 525 of human FUS are highly conserved sites. The molecular dynamics results indicate that protein stability could be compromised in all three mutations. They also affected the exposed surface area and protein compactness. The analyzed mutations also displayed high flexibility in most residues in all variants, most notably in the interaction site with the nuclear import protein of FUS.",
        "relations": [
            Relation(
                entity1_span="Amyotrophic lateral sclerosis", entity1_type="Disease",
                entity2_span="Fused in Sarcoma", entity2_type="Gene",
                relation_type="ASSOCIATE"
            ),
            Relation(
                entity1_span="Amyotrophic lateral sclerosis", entity1_type="Disease",
                entity2_span="p.R521C_FUS", entity2_type="Variant",
                relation_type="ASSOCIATE"
            ),
            Relation(
                entity1_span="Amyotrophic lateral sclerosis", entity1_type="Disease",
                entity2_span="p.R521H_FUS", entity2_type="Variant",
                relation_type="ASSOCIATE"
            ),
            Relation(
                entity1_span="Amyotrophic lateral sclerosis", entity1_type="Disease",
                entity2_span="p.P525L_FUS", entity2_type="Variant",
                relation_type="ASSOCIATE"
            ),
        ],
    },
    {
        # The abstract is kept on a single physical line: a plain quoted
        # string cannot contain a raw line break.
        "pubmed_text": "Management of infusion-related reactions (IRRs) in patients receiving amivantamab in the CHRYSALIS study. BACKGROUND: Amivantamab, a fully humanized EGFR-MET bispecific antibody, has antitumor activity in diverse EGFR- and MET-driven non-small cell lung cancer (NSCLC) and a safety profile consistent with associated on-target activities. Infusion-related reaction(s) (IRR[s]) are reported commonly with amivantamab. We review IRR and subsequent management in amivantamab-treated patients. METHODS: Patients treated with the approved dose of intravenous amivantamab (1050 mg, <80 kg; 1400 mg, >=80 kg) in CHRYSALIS-an ongoing, phase 1 study in advanced EGFR-mutated NSCLC-were included in this analysis. IRR mitigations included split first dose (350 mg, day 1 [D1]; remainder, D2), reduced initial infusion rates with proactive infusion interruption, and steroid premedication before initial dose. For all doses, pre-infusion antihistamines and antipyretics were required. Steroids were optional after the initial dose. RESULTS: As of 3/30/2021, 380 patients received amivantamab. IRRs were reported in 256 (67%) patients. Signs/symptoms of IRR included chills, dyspnea, flushing, nausea, chest discomfort, and vomiting. Most of the 279 IRRs were grade 1 or 2; grade 3 and 4 IRR occurred in 7 and 1 patients, respectively. Most (90%) IRRs occurred on cycle 1, D1 (C1D1); median time-to-first-IRR onset during C1D1 was 60 min; and first-infusion IRRs did not compromise subsequent infusions. Per protocol, IRR was mitigated on C1D1 with holding of infusion (56% [214/380]), reinitiating at reduced rate (53% [202/380]), and aborting infusion (14% [53/380]). C1D2 infusions were completed in 85% (45/53) of patients who had C1D1 infusions aborted. Four patients (1% [4/380]) discontinued treatment due to IRR. In studies aimed at elucidating the underlying mechanism(s) of IRR, no pattern was observed between patients with versus without IRR. CONCLUSION: IRRs with amivantamab were predominantly low grade and limited to first infusion, and rarely occurred with subsequent dosing. Close monitoring for IRR with the initial amivantamab dose and early intervention at first IRR signs/symptoms should be part of routine amivantamab administration.",
        "relations": [
            Relation(
                entity1_span="non-small cell lung cancer", entity1_type="Disease",
                entity2_span="EGFR", entity2_type="Gene",
                relation_type="ASSOCIATE"
            ),
            # The abstract names the gene "MET"; the span follows the text.
            Relation(
                entity1_span="non-small cell lung cancer", entity1_type="Disease",
                entity2_span="MET", entity2_type="Gene",
                relation_type="ASSOCIATE"
            ),
            Relation(
                entity1_span="amivantamab", entity1_type="Chemical",
                entity2_span="Chills", entity2_type="Disease",
                relation_type="CAUSE"
            ),
            Relation(
                entity1_span="amivantamab", entity1_type="Chemical",
                entity2_span="Dyspnea", entity2_type="Disease",
                relation_type="CAUSE"
            ),
            Relation(
                entity1_span="amivantamab", entity1_type="Chemical",
                entity2_span="Flushing", entity2_type="Disease",
                relation_type="CAUSE"
            ),
            # Listed once; an identical second entry added no information.
            Relation(
                entity1_span="amivantamab", entity1_type="Chemical",
                entity2_span="nausea", entity2_type="Disease",
                relation_type="CAUSE"
            ),
            Relation(
                entity1_span="amivantamab", entity1_type="Chemical",
                entity2_span="chest discomfort", entity2_type="Disease",
                relation_type="CAUSE"
            ),
            Relation(
                entity1_span="amivantamab", entity1_type="Chemical",
                entity2_span="vomiting", entity2_type="Disease",
                relation_type="CAUSE"
            ),
            Relation(
                entity1_span="amivantamab", entity1_type="Chemical",
                entity2_span="non-small cell lung cancer", entity2_type="Disease",
                relation_type="TREAT"
            ),
        ],
    },
]


### An example with OpenAI GPT-4o-mini

In [None]:
# Build an OpenAI-backed extractor. When no api_key is passed, the key is read
# from the OPENAI_API_KEY environment variable.
# NOTE: constructing the extractor calls dspy.configure(...), so it also sets
# the LM used globally by dspy from this point on.
extractor_openai = BiomedicalExtractor(
    llm_type="openai",
    model="gpt-4o-mini"
)

# Test with PubMed-style abstract
pubmed_text = "PCOS is an endocrine disorder characterized by chronic anovulation, hyperandrogenism, and polycystic ovaries. Its etiology is uncertain. It is debated whether BPA would be a component of the environmental factor in the etiology of PCOS. Contamination by BPA can occur from food packaging (exposure during the diet) and through skin absorption and/or inhalation. It can be transferred to the fetus via the placenta or to the infant via breast milk, and it can be found in follicular fluid, fetal serum, and amniotic fluid. The phenolic structure of BPA allows it to interact with Estrogen Receptors (ERs) through genomic signaling, in which BPA binds to nuclear ERalpha or Erbeta, or through nongenomic signaling by binding to membrane ERs, prompting a rapid and intense response. With daily and constant exposure, BPA's tendency to bioaccumulate and its ability to activate nongenomic signaling pathways can alter women's metabolic and reproductive function, leading to hyperandrogenism, insulin resistance, obesity, atherogenic dyslipidemia, chronic inflammatory state, and anovulation and favoring PCOS. The harmful changes caused by BPA can be passed on to future generations without the need for additional exposure because of epigenetic modifications. Not only high BPA levels can produce harmful effects, but at low levels, BPA may be harmful when exposure occurs during the most vulnerable periods, such as the fetal and neonatal periods, as well as during the prepubertal age causing an early accumulation of BPA in the body. Learning how BPA participates in the pathogenesis of PCOS poses a challenge and further studies should be conducted."

# The three PUBMED_EXAMPLES entries are passed as few-shot demonstrations.
result = extractor_openai.extract(pubmed_text, demonstrations=PUBMED_EXAMPLES)


In [None]:
result

## Using local models (with ollama)

### Installing ollama

In [None]:
# Install the hardware-detection tools, then Ollama itself.
# `-y` answers apt's confirmation prompt, which cannot be answered from a notebook cell.
! sudo apt update && sudo apt install -y pciutils lshw
!curl -fsSL https://ollama.com/install.sh | sh

In [None]:
# Start the Ollama server in the background; stdout and stderr are redirected to ollama.log.
!nohup ollama serve > ollama.log 2>&1 &

In [None]:
# Smoke test: this pulls the model on first use and answers a single prompt.
# Plain ASCII quotes are required; typographic quotes are not shell quoting
# characters and would be sent to the model as part of the prompt.
! ollama run llama3.1:8b "What is the capital of the Netherlands?"

### An example with Llama 3.1:8b

In [None]:
# Build an extractor backed by the local Ollama server.
# NOTE(review): the variable is still called `extractor_openai` although it now
# wraps an Ollama model; it also rebinds the name used in the OpenAI example.
# Consider renaming once all later usages have been checked.
# NOTE: constructing the extractor calls dspy.configure(...), which switches
# the LM used globally by dspy to this Ollama model.
extractor_openai = BiomedicalExtractor(
    llm_type="ollama",
    model="ollama_chat/llama3.1:8b",
    base_url="http://127.0.0.1:11434"
)

# Test with PubMed-style abstract
pubmed_text = "Does bisphenol A (BPA) participates in the pathogenesis of Polycystic Ovary Syndrome (PCOS)? PCOS is an endocrine disorder characterized by chronic anovulation, hyperandrogenism, and polycystic ovaries. Its etiology is uncertain. It is debated whether BPA would be a component of the environmental factor in the etiology of PCOS. Contamination by BPA can occur from food packaging (exposure during the diet) and through skin absorption and/or inhalation. It can be transferred to the fetus via the placenta or to the infant via breast milk, and it can be found in follicular fluid, fetal serum, and amniotic fluid. The phenolic structure of BPA allows it to interact with Estrogen Receptors (ERs) through genomic signaling, in which BPA binds to nuclear ERalpha or Erbeta, or through nongenomic signaling by binding to membrane ERs, prompting a rapid and intense response. With daily and constant exposure, BPA's tendency to bioaccumulate and its ability to activate nongenomic signaling pathways can alter women's metabolic and reproductive function, leading to hyperandrogenism, insulin resistance, obesity, atherogenic dyslipidemia, chronic inflammatory state, and anovulation and favoring PCOS. The harmful changes caused by BPA can be passed on to future generations without the need for additional exposure because of epigenetic modifications. Not only high BPA levels can produce harmful effects, but at low levels, BPA may be harmful when exposure occurs during the most vulnerable periods, such as the fetal and neonatal periods, as well as during the prepubertal age causing an early accumulation of BPA in the body. Learning how BPA participates in the pathogenesis of PCOS poses a challenge and further studies should be conducted."

# Same demonstrations as in the OpenAI example; `result` is overwritten.
result = extractor_openai.extract(pubmed_text, demonstrations=PUBMED_EXAMPLES)

## Let's compare some examples

Let's take a first example of PMID 38008036.

### Utility methods for comparison

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib_venn import venn2, venn2_circles
import numpy as np
from collections import defaultdict, Counter
from typing import List, Dict, Set, Tuple, Any
import warnings
# NOTE(review): this silences *every* warning category for the rest of the
# session, not only plotting noise — presumably done to keep the notebook
# output short; narrow the filter if warnings need to be seen.
warnings.filterwarnings('ignore')

# Install required packages if not already installed
# !pip install matplotlib-venn seaborn pandas matplotlib

class BiomedicalnteractionComparison:
    """
    Simplified tool for 1-vs-1 comparison of biomedical NER/RE methods.

    Every result is first converted to a common format: a list of dicts with
    the keys ``entity1_span``, ``entity1_type``, ``entity2_span``,
    ``entity2_type`` and ``relation_type``. Entity spans are stored as lists
    so that one entity can carry several possible surface forms.

    NOTE: the class name is missing an "I" ("Biomedicalnteraction"); it is
    kept unchanged so that existing callers keep working.
    """

    def __init__(self):
        """The comparator holds no state."""
        pass

    @staticmethod
    def _as_text(value):
        """Return ``value`` itself when it is a str, otherwise its ``.value``."""
        return value if isinstance(value, str) else value.value

    @staticmethod
    def _display_label(method):
        """Map a method identifier to the label shown in prints and plots."""
        return "PubTator" if method == "pubtator" else str(method)

    def convert_pubtator_to_common_format(self, pubtator_relations: List[Dict],
                                        pubtator_entities: List[Dict]) -> List[Dict]:
        """
        Convert PubTator results to common format

        Args:
            pubtator_relations: List of PubTator relations; each one is read
                through the keys ``source_id``, ``target_id``,
                ``source_label``, ``target_label`` and ``relationship``.
            pubtator_entities: List of PubTator entities; each one is read
                through the keys ``id``, ``span``, ``name`` and, when
                present, ``alt_names``.

        Returns:
            List of relations in common format with entity spans as lists.
            Relations whose source or target ID has no usable span are
            dropped.
        """
        # Map every entity ID to its surface forms. A dict is used as an
        # ordered set: duplicates are removed while the first-seen order is
        # kept, so the output is deterministic from run to run.
        # Entries that share an ID are merged instead of overwriting each
        # other, otherwise only the spans of the last entry would survive.
        id_to_spans = {}
        for entity in pubtator_entities:
            candidates = [entity['span'], entity['name']]
            candidates.extend(entity.get('alt_names') or [])
            known_spans = id_to_spans.setdefault(entity['id'], {})
            for span in candidates:
                # Skip empty / whitespace-only strings, keep original casing
                cleaned = span.strip() if span else ""
                if cleaned:
                    known_spans[cleaned] = None

        # Convert relations
        common_format_relations = []
        for relation in pubtator_relations:
            entity1_spans = list(id_to_spans.get(relation['source_id'], ()))
            entity2_spans = list(id_to_spans.get(relation['target_id'], ()))

            # Only keep the relation if we have spans for both entities
            if not (entity1_spans and entity2_spans):
                continue

            common_format_relations.append({
                "entity1_span": entity1_spans,
                "entity1_type": relation['source_label'],
                "entity2_span": entity2_spans,
                "entity2_type": relation['target_label'],
                "relation_type": relation['relationship'].lower(),  # Normalize to lowercase
            })

        return common_format_relations

    def convert_llm_to_common_format(self, llm_relations: List) -> List[Dict]:
        """
        Convert LLM results to common format

        Args:
            llm_relations: List of LLM relations. Each item is either an
                object exposing ``entity1_span``, ``entity1_type``,
                ``entity2_span``, ``entity2_type`` and ``relation_type`` as
                attributes (e.g. a Pydantic model) or a dict with the same
                keys. Types and relation type may be plain strings or
                objects with a ``.value`` attribute.

        Returns:
            List of relations in common format with entity spans as
            single-item lists.
        """
        common_format_relations = []

        for relation in llm_relations:
            if hasattr(relation, 'entity1_span'):
                # Attribute access (Pydantic model format)
                e1_span = relation.entity1_span
                e1_type = relation.entity1_type
                e2_span = relation.entity2_span
                e2_type = relation.entity2_type
                rel_type = relation.relation_type
            else:
                # Dictionary format
                e1_span = relation['entity1_span']
                e1_type = relation['entity1_type']
                e2_span = relation['entity2_span']
                e2_type = relation['entity2_type']
                rel_type = relation['relation_type']

            common_format_relations.append({
                "entity1_span": [e1_span.strip()],  # Single span in list format
                "entity1_type": self._as_text(e1_type).strip(),
                "entity2_span": [e2_span.strip()],  # Single span in list format
                "entity2_type": self._as_text(e2_type).strip(),
                "relation_type": self._as_text(rel_type).lower().strip(),  # Normalize to lowercase
            })

        return common_format_relations

    def normalize_relation(self, rel):
        """Convert relation dict into a normalized tuple so it can be compared."""
        return (
            tuple(rel['entity1_span']),  # keep all aliases
            rel['entity1_type'],
            tuple(rel['entity2_span']),
            rel['entity2_type'],
            rel['relation_type']
        )

    def match_relations(self, pubtator_list, llm_list):
        """
        Split two lists of common-format relations into shared and unique ones.

        Two relations match when their entity types and relation type are
        equal and, for each entity, at least one span of the first relation
        equals (case-insensitively) one span of the second relation.

        Args:
            pubtator_list: Relations of the first method (common format).
            llm_list: Relations of the second method (common format).

        Returns:
            Tuple ``(common, pub_unique, llm_unique)``:
            ``common`` holds one ``(rel1, rel2)`` pair per relation of the
            first list that has a counterpart (``rel2`` being the first
            matching relation of the second list); ``pub_unique`` holds the
            relations of the first list without counterpart; ``llm_unique``
            holds the relations of the second list that match no relation of
            the first list.
        """
        def lowered(spans):
            return {span.lower() for span in spans}

        # Lower-case the second list once instead of once per comparison.
        llm_keys = [
            (lowered(rel['entity1_span']), lowered(rel['entity2_span']),
             rel['entity1_type'], rel['entity2_type'], rel['relation_type'])
            for rel in llm_list
        ]

        common = []
        pub_unique = []
        matched_llm_indices = set()

        for rel1 in pubtator_list:
            spans1 = lowered(rel1['entity1_span'])
            spans2 = lowered(rel1['entity2_span'])
            first_match = None
            # Deliberately no early exit: every relation of the second list
            # that matches must be flagged, otherwise duplicates or alias
            # variants of an already matched relation would be reported as
            # unique to the second method.
            for j, (l_spans1, l_spans2, l_type1, l_type2, l_rel) in enumerate(llm_keys):
                if (rel1['entity1_type'] == l_type1
                        and rel1['entity2_type'] == l_type2
                        and rel1['relation_type'] == l_rel
                        and not spans1.isdisjoint(l_spans1)
                        and not spans2.isdisjoint(l_spans2)):
                    matched_llm_indices.add(j)
                    if first_match is None:
                        first_match = llm_list[j]

            if first_match is None:
                pub_unique.append(rel1)
            else:
                common.append((rel1, first_match))

        llm_unique = [rel2 for j, rel2 in enumerate(llm_list)
                      if j not in matched_llm_indices]

        return common, pub_unique, llm_unique

    def plot_venn(self, pub_unique, llm_unique, common,
                  label_1="PubTator", label_2="LLM"):
        """
        Draw a two-set Venn diagram of the relation overlap.

        Args:
            pub_unique: Relations found only by the first method.
            llm_unique: Relations found only by the second method.
            common: Matched relation pairs.
            label_1: Caption of the first set.
            label_2: Caption of the second set.
        """
        plt.figure(figsize=(6, 6))
        # venn2 takes the sizes as (only first set, only second set, both)
        venn2(subsets=(len(pub_unique), len(llm_unique), len(common)),
              set_labels=(label_1, label_2))
        plt.title("Relation Overlap")
        plt.show()

    def print_relations(self, common, pub_unique, llm_unique,
                        label_1="PubTator", label_2="LLM"):
        """
        Print the shared relations and the relations unique to each method.

        Args:
            common: List of ``(rel1, rel2)`` matched pairs.
            pub_unique: Relations found only by the first method.
            llm_unique: Relations found only by the second method.
            label_1: Name printed for the first method.
            label_2: Name printed for the second method.
        """
        def format_rel(rel):
            e1 = "(" + "; ".join(rel['entity1_span']) + ")"
            e2 = "(" + "; ".join(rel['entity2_span']) + ")"
            return f"[{rel['entity1_type']}] {e1}  --{rel['relation_type']}-->  [{rel['entity2_type']}] {e2}"

        # Pad both names to the same width so the two lines of a pair align.
        width = max(len(label_1), len(label_2))

        print("\n=== Common Relations ===")
        for idx, (first, second) in enumerate(common, 1):
            print(f"{idx}.")
            print(f"  {label_1.ljust(width)}: {format_rel(first)}")
            print(f"  {label_2.ljust(width)}: {format_rel(second)}\n")

        print(f"\n=== Unique to {label_1} ===")
        for idx, r in enumerate(pub_unique, 1):
            print(f"{idx}. {format_rel(r)}")

        print(f"\n=== Unique to {label_2} ===")
        for idx, r in enumerate(llm_unique, 1):
            print(f"{idx}. {format_rel(r)}")

    def _to_common_format(self, result, method):
        """Convert one method's raw result to the common format."""
        if method == "pubtator":
            # PubTator results come as a (relations, entities) pair
            return self.convert_pubtator_to_common_format(*result)
        return self.convert_llm_to_common_format(result)

    def compare(self, result_1, method_1, result_2, method_2):
        """
        Compare the relations of two methods: print them and plot the overlap.

        Args:
            result_1: Result of the first method. For ``"pubtator"`` a
                ``(relations, entities)`` pair, otherwise a list of LLM
                relations.
            method_1: ``"pubtator"`` or any other name (treated as an LLM
                result); also used as the label in the output.
            result_2: Result of the second method, same conventions.
            method_2: Name of the second method, same conventions.
        """
        relations_1 = self._to_common_format(result_1, method_1)
        relations_2 = self._to_common_format(result_2, method_2)

        # Label the output with the methods actually compared instead of a
        # fixed "PubTator" / "LLM" pair, which is wrong for LLM-vs-LLM runs.
        label_1 = self._display_label(method_1)
        label_2 = self._display_label(method_2)

        common, unique_1, unique_2 = self.match_relations(relations_1, relations_2)

        self.print_relations(common, unique_1, unique_2, label_1, label_2)

        self.plot_venn(unique_1, unique_2, common, label_1, label_2)


### Run comparison

In [None]:
# Let's choose a PMID (kept in a single-element list). For instance the article
# "Does bisphenol A (BPA) participates in the pathogenesis of Polycystic Ovary Syndrome (PCOS)?" (PMID: 38008036)
selected_pmid = ["38008036"]

# Or even a random one.
# NOTE: this assignment replaces the fixed PMID chosen just above; comment it
# out to keep working on PMID 38008036.
selected_pmid = [random.choice(all_pmids)]

In [None]:
# Show which PMID was finally selected (fixed or random).
print(f"Selected PMID: {selected_pmid}")

In [None]:
# Let's use PubTator (classic NER) - small models and scalable.
# The call returns three values: the entities, the relations and the document
# text(s). `doc_text` is indexed with [0] further down, so it is presumably a
# list with one text per requested PMID.
entities_pubtator, relations_pubtator, doc_text = pubtator.request_and_parse_pubtator(selected_pmid)

In [None]:
# We then reuse the text retrieved by PubTator as input for the LLM extractors.
# First extractor: OpenAI (gpt-4o-mini).
extractor_openai = BiomedicalExtractor(
    llm_type="openai",
    model="gpt-4o-mini"
)

# Extract from the first (and only requested) document text.
result_openai = extractor_openai.extract(doc_text[0], demonstrations=PUBMED_EXAMPLES)

In [None]:
# Second extractor: a local Ollama model (llama3.1:8b) served at 127.0.0.1:11434.
# It gets its own variable so that `extractor_openai` keeps pointing at the
# OpenAI extractor instead of being silently replaced by the Ollama one.
extractor_llama = BiomedicalExtractor(
    llm_type="ollama",
    model="ollama_chat/llama3.1:8b",
    base_url="http://127.0.0.1:11434"
)

# Same input text and demonstrations as for the OpenAI run.
result_llama = extractor_llama.extract(doc_text[0], demonstrations=PUBMED_EXAMPLES)

In [None]:
# Instantiate the comparator (the class name is spelled without the "I" in its definition).
comparator = BiomedicalnteractionComparison()

In [None]:
# Let's compare PubTator with the Ollama model!
# For "pubtator" the result must be the pair (relations, entities), in that
# order: compare() unpacks it into convert_pubtator_to_common_format().
comparator.compare((relations_pubtator, entities_pubtator), "pubtator", result_llama, "llama")

In [None]:
# What about Ollama (llama3.1:8b) vs GPT-4o-mini?
# The first result passed is the Ollama one, the second the OpenAI one.
comparator.compare(result_llama, "llama", result_openai, "openai")

# More on the comparison

[*Thinking about GPT-3 In-Context Learning for Biomedical IE? Think Again*](https://doi.org/10.18653/v1/2022.findings-emnlp.329)

[*Improving large language models for clinical named entity recognition via prompt engineering*](https://doi.org/10.1093/jamia/ocad259)

[*Advancing entity recognition in biomedicine via instruction tuning of large language models*](https://doi.org/10.1093/bioinformatics/btae163)

[*LLMs in Biomedicine: A study on clinical Named Entity Recognition*](https://doi.org/10.48550/arXiv.2404.07376)

[*Do LLMs Surpass Encoders for Biomedical NER?*](https://doi.org/10.48550/arXiv.2504.00664)

[*On-the-fly Definition Augmentation of LLMs for Biomedical NER*](https://doi.org/10.18653/v1/2024.naacl-long.212)

[*LLMs are not Zero-Shot Reasoners for Biomedical Information Extraction*](https://doi.org/10.18653/v1/2025.insights-1.11)