In [11]:
import os
from langchain_neo4j import Neo4jGraph, GraphCypherQAChain
from langchain_ollama import ChatOllama
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_core.documents import Document

import xml.etree.ElementTree as ET
from langchain.schema import Document

import re
from langchain_ollama import ChatOllama
from langchain_core.prompts import PromptTemplate
import pandas as pd

import requests

In [5]:
llm = ChatOllama(model="llama3.2", temperature=0, max_tokens=2048) 

# API call for first abstract

In [42]:
# pmid = "1571683"
pmid = "21645374"
url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
params = {
    "db": "pubmed",
    "id": pmid,
    "retmode": "xml"
}

response = requests.get(url, params=params)
root = ET.fromstring(response.content)

article = root.find('.//PubmedArticle')

# Get title
title = article.find('.//ArticleTitle').text

# Get full abstract, preserving labels
abstract_elements = article.findall('.//Abstract/AbstractText')

abstract_parts = []
for elem in abstract_elements:
    if elem.text:
        label = elem.attrib.get('Label')
        if label:
            abstract_parts.append(f"{label}: {elem.text.strip()}")
        else:
            abstract_parts.append(elem.text.strip())

abstract = ' '.join(abstract_parts)

print(f"Title: {title}")
print(f"Abstract: {abstract}")


Title: Do mitochondria play a role in remodelling lace plant leaves during programmed cell death?
Abstract: BACKGROUND: Programmed cell death (PCD) is the regulated death of cells within an organism. The lace plant (Aponogeton madagascariensis) produces perforations in its leaves through PCD. The leaves of the plant consist of a latticework of longitudinal and transverse veins enclosing areoles. PCD occurs in the cells at the center of these areoles and progresses outwards, stopping approximately five cells from the vasculature. The role of mitochondria during PCD has been recognized in animals; however, it has been less studied during PCD in plants. RESULTS: The following paper elucidates the role of mitochondrial dynamics during developmentally regulated PCD in vivo in A. madagascariensis. A single areole within a window stage leaf (PCD is occurring) was divided into three areas based on the progression of PCD; cells that will not undergo PCD (NPCD), cells in early stages of PCD (EPC

Prompt needs tweaking to get consistent clean output

In [43]:
structured_text = f'''
Capture the main question being answersed by this abstract in structured relationships for a knowledge graph.
Abstract Title: {title}\n Abstract Text: {abstract}
Extract medical relationships as structured triples (Entity1, Relationship, Entity2). Put all elements of a tuple on the same line
in the format ("entity1", "relationship", "entity2") . There must be two entities and a relationship and the relationship should not incluede the entities. 
'''

In [44]:
combined_formatted_graph = []
bad_batch = []

response = llm.invoke(structured_text)
# Extract the actual text from the LLM response
raw_output = response.content

# Print the cleaned raw output (for debugging)
# print("Cleaned Raw LLM Output:\n", raw_output)

# Extract (Entity1, Relationship, Entity2) triples using regex
# matches = re.findall(r'"(.*?)" , "(.*?)" , "(.*?)"', raw_output)
# matches = re.findall(r'\("([^"]+)",\s*"([^"]+)",\s*"([^"]+)"\)', raw_output)
matches = re.findall(r'^([^,]+),\s*([^,]+),\s*(.+)$', raw_output, re.MULTILINE)


print(f"matches:\n{matches}")

# Convert extracted triples into nodes and relationships
nodes = set()
relationships = []

for entity1, relation, entity2 in matches:
    nodes.add(entity1)
    nodes.add(entity2)
    relationships.append((entity1, relation, entity2))

# Convert to Full Node-Relationship-Node Format
formatted_graph = [f"({e1}) -[:{r.replace(' ', '_').upper()}]-> ({e2})" for e1, r, e2 in relationships]
if len(formatted_graph) == 0:
    print("Error no nodes found")
    bad_batch.append(pmid)

combined_formatted_graph.extend(formatted_graph)
print(len(combined_formatted_graph))
# print()
print(formatted_graph)
print("Complete")

matches:
[('Here are the extracted medical relationships as structured triples:\n\nMitochondria', 'plays a role in', 'programmed cell death (PCD)'), ('Mitochondria', 'undergoes changes during', 'PCD'), ('Mitochondria', 'dynamics', 'are delineated into, four categories (M1-M4)'), ('Mitochondrial permeability transition pore (PTP)', 'formation', 'is indirectly examined via, cyclosporine A (CsA) treatment'), ('Mitochondria', 'movement', 'is observed on, transvacuolar strands'), ('Mitochondria', 'membrane potential (ΔΨm)', 'changes during, PCD'), ('Mitochondria', 'distribution', 'is characterized by, four categories (M1-M4)'), ('Mitochondrial dynamics', 'are correlated with', 'other organelles during, PCD'), ('PCD', 'occurs in the cells at the center of', 'areoles'), ('PCD', 'progresses outwards from', 'cells that will not undergo PCD (NPCD)'), ('PCD', 'is stopped approximately five cells from', 'vasculature'), ('Mitochondria', 'dye MitoTracker Red CMXRos', 'stains, window stage leaves')]


In [31]:
# def extract_structured_triples(raw_output):
#     triples = []

#     for line in raw_output.splitlines():
#         line = line.strip()
#         if not line or line.lower().startswith("here are"):
#             continue  # Skip empty lines and intros

#         # Try to split into exactly 3 parts
#         parts = [p.strip().strip('"').strip('“”') for p in line.split(',')]
#         if len(parts) == 3:
#             entity1, relation, entity2 = parts
#             # Basic sanity checks (optional)
#             if entity1 and relation and entity2:
#                 triples.append((entity1, relation, entity2))

#     return triples


In [36]:
# extract_structured_triples(raw_output)

[('("Vaccines', 'are stored', 'in community")'),
 ('("General Practices', 'conduct', 'vaccine storage assessments")'),
 ('("Child Health Clinics', 'provide', 'vaccine storage services")'),
 ('("Department of Health', 'issues', 'guidelines for vaccine storage")'),
 ('("Guidelines', 'specify', 'temperature ranges for vaccines")'),
 ('("Vaccines', 'require', 'cold chain maintenance")'),
 ('("Refrigerators', 'are used', 'to store vaccines")'),
 ('("Thermometers', 'monitor', 'refrigerator temperatures")'),
 ('("Staff training', 'is necessary', 'for vaccine storage")'),
 ('("Equipment provision',
  'is crucial',
  'for successful immunisation programme")'),
 ("Note: I've tried to capture the main relationships between entities",
  "but some inferences were made to fill gaps in the original text. If you'd like me to revise or expand on any of these triples",
  'please let me know!')]

# Gather all article ids

In [8]:
import json

filename = "/Users/brianmann/Downloads/ori_pqal.json"

# Load the JSON data
with open(filename, 'r') as f:
    data = json.load(f)

# The article IDs are the top-level keys
article_ids = list(data.keys())

print(f"Found {len(article_ids)} article IDs")
print(article_ids[:10])  # Show a preview of the first 10


Found 1000 article IDs
['21645374', '16418930', '9488747', '17208539', '10808977', '23831910', '26037986', '26852225', '17113061', '10966337']


# Loop API call through all articles

In [9]:
import requests
import xml.etree.ElementTree as ET
import time

abstract_dict = {}

for pmid in article_ids[:10]:
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {
        "db": "pubmed",
        "id": pmid,
        "retmode": "xml",
        "email": "your.email@example.com"  # optional but recommended
    }

    response = requests.get(url, params=params)

    if response.status_code != 200:
        print(f"Failed to fetch PMID {pmid} (status {response.status_code})")
        continue

    try:
        root = ET.fromstring(response.content)
    except ET.ParseError:
        print(f"⚠️ Could not parse XML for PMID {pmid}")
        print("Response content:")
        print(response.text[:300])  # print a snippet to debug
        continue

    article = root.find('.//PubmedArticle')
    if article is None:
        print(f"No article found in response for PMID {pmid}")
        continue

    title_elem = article.find('.//ArticleTitle')
    title = title_elem.text if title_elem is not None else "No title"

    abstract_elements = article.findall('.//Abstract/AbstractText')
    if not abstract_elements:
        continue

    abstract_parts = []
    for elem in abstract_elements:
        if elem.text:
            label = elem.attrib.get('Label') or elem.attrib.get('label')
            if label:
                abstract_parts.append(f"{label}: {elem.text.strip()}")
            else:
                abstract_parts.append(elem.text.strip())

    abstract = ' '.join(abstract_parts)
    abstract_dict[pmid] = {
        "Abstract Title": title,
        "Abstract Text": abstract
    }

    time.sleep(0.34)  # delay to avoid NCBI rate limits (3 requests/sec)


In [10]:
print(len(abstract_dict))

10


In [16]:
# Convert to DataFrame
df = pd.DataFrame.from_dict(abstract_dict, orient='index')

# Optional: Add PMID as a column instead of index
df.reset_index(inplace=True)
df.rename(columns={"index": "PMID"}, inplace=True)

# Preview
print(df.head())

       PMID                                     Abstract Title  \
0  21645374  Do mitochondria play a role in remodelling lac...   
1  16418930  [Landolt C and snellen e acuity: differences i...   
2   9488747  [Syncope during bathing in infants, a pediatri...   
3  17208539  Are the long-term results of the transanal pul...   
4  10808977  Can tailored interventions increase mammograph...   

                                       Abstract Text  
0  BACKGROUND: Programmed cell death (PCD) is the...  
1  BACKGROUND: Assessment of visual acuity depend...  
2  BACKGROUND: Apparent life-threatening events i...  
3  PURPOSE: The transanal endorectal pull-through...  
4  BACKGROUND: Telephone counseling and tailored ...  


In [20]:
df.to_csv("pqal.csv", sep='|', index=False)