In [138]:
import json
import os
import re
import time
import xml.etree.ElementTree as ET
from collections import Counter

import pandas as pd
import requests
# Both Document imports are kept in their original order, so the name still
# resolves to the langchain.schema import.
from langchain_core.documents import Document
from langchain.schema import Document
from langchain_core.prompts import PromptTemplate
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_neo4j import Neo4jGraph, GraphCypherQAChain
from langchain_ollama import ChatOllama
from neo4j import GraphDatabase


In [139]:
# Shared chat model used by every prompt in this notebook.
# `num_predict` is the ChatOllama option that caps the number of generated tokens;
# `max_tokens` does not appear among ChatOllama's declared fields, so the limit was
# presumably being ignored. NOTE(review): confirm against the installed version.
llm = ChatOllama(model="llama3.2", temperature=0, num_predict=2048)

# Read in full abstracts

Load the questions, the answer key, and the article IDs from the JSON dataset.

In [140]:
# NOTE(review): absolute, user-specific path; consider a configurable data directory.
with open("/Users/brianmann/Downloads/ori_pqal.json", "r") as f:
    json_data = json.load(f)

# Number of leading records (in file order) used for the experiment.
num_questions = 50

# The JSON keys are the record ids; the question and the gold label are read
# from each record, with "N/A" standing in for a missing field.
ids = list(json_data.keys())[:num_questions]
questions = [json_data[key].get("QUESTION", "N/A") for key in ids]
answer_key = [json_data[key].get("final_decision", "N/A") for key in ids]


Fetch the full abstracts using the NCBI E-utilities API.

In [141]:
def read_abstracts(pmid):
    """Fetch one PubMed record through NCBI E-utilities and return its abstract text.

    Args:
        pmid: PubMed identifier, sent as the ``id`` query parameter.

    Returns:
        The abstract sections joined by single spaces. A section that carries a
        ``Label`` attribute is rendered as ``"<Label>: <text>"``. An empty string
        is returned when the record has no abstract text.

    Raises:
        requests.RequestException: on a network failure, timeout, or HTTP error status.
        ValueError: if the response contains no ``PubmedArticle`` element.
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {
        "db": "pubmed",
        "id": pmid,
        "retmode": "xml"
    }

    # A timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status() stops an error page from being parsed as a record.
    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()
    root = ET.fromstring(response.content)

    article = root.find('.//PubmedArticle')
    if article is None:
        raise ValueError(f"No PubmedArticle found for PMID {pmid}")

    # Collect every abstract section, preserving labels
    abstract_parts = []
    for elem in article.findall('.//Abstract/AbstractText'):
        # itertext() also collects text nested inside inline markup (<i>, <sub>, ...);
        # elem.text alone stops at the first child element.
        text = ''.join(elem.itertext()).strip()
        if not text:
            continue
        label = elem.attrib.get('Label')
        abstract_parts.append(f"{label}: {text}" if label else text)

    return ' '.join(abstract_parts)

In [142]:
# abstracts = []

# for id in ids:
#     abstracts.append(read_abstracts(id))
#     time.sleep(0.35)


# Load the abstracts from the local file instead of the API (if already saved)

In [143]:
# with open('abstracts_list.txt', 'w') as f:
#     for item in abstracts:
#         f.write(f"{item}\n")

In [144]:
# One abstract per line; surrounding whitespace (including the newline) is removed.
with open('abstracts_list.txt', 'r') as f:
    abstracts = list(map(str.strip, f))

# Generate Conclusions

In [172]:
def create_query_for_conclusion(abstract: str):
    """Build the prompt that asks the model to summarize an abstract's conclusion.

    The prompt tells the model to wrap its answer in START ... FINISH markers.
    """
    return (
        "\n"
        "    Summarize the conclusion of this abstract. Begin with START and end with FINISH.\n"
        f"    Abstract: {abstract}\n"
        "    "
    )

def extract_summary(raw_output: str):
    """Return the text found between the START and FINISH markers.

    Args:
        raw_output: Text that is expected to contain ``START ... FINISH``.

    Returns:
        The stripped text between the markers. If the markers are not found, a
        warning is printed and the whole stripped input is returned instead.
    """
    # This handles: "START", "START:", "START   \n", etc.
    # The \b anchors stop the case-insensitive markers from matching inside
    # ordinary words such as "finished", which used to cut the summary short.
    pattern = r"\bSTART\b[:\s]*([\s\S]*?)\s*\bFINISH\b"
    match = re.search(pattern, raw_output, re.IGNORECASE)
    if match:
        return match.group(1).strip()

    print("Warning: START/FINISH not found properly.")
    return raw_output.strip()  # fallback

def get_conclusion(abstract: str):
    """Ask the shared `llm` for the abstract's conclusion and return the extracted summary."""
    prompt = create_query_for_conclusion(abstract)
    # The reply's text is read from its `.content` attribute.
    return extract_summary(llm.invoke(prompt).content)

    

In [173]:
# One model call per abstract; the result order follows `abstracts`.
conclusions = list(map(get_conclusion, abstracts))


In [147]:
# with open('conclusions_list.txt', 'w') as f:
#     for item in conclusions:
#         f.write(f"{item}\n")

In [148]:
# with open('conclusions_list.txt', 'r') as f:
#     conclusions = [line.strip() for line in f]

# Generate examples

In [149]:
example_abstracts = []

example_ids = [14499029, 14499049]

# The loop variable is `pmid` rather than `id` so that the builtin id() is not
# shadowed for the rest of the notebook.
for pmid in example_ids:
    example_abstracts.append(read_abstracts(pmid))
    time.sleep(0.35)  # pause between API calls (presumably for rate limiting)

In [150]:
# Hand-written example answer in the "START ... FINISH" list format. Within this
# notebook it is only referenced by the commented-out few-shot version of
# create_query_entities.
example_entities = ["START Naturopathic clinics, Conventional medical clinics, Community Health Centers, Women aged 40 years or more, Patients with menopausal symptoms FINISH"]

In [151]:
# entity_lists = [get_entities(abstract) for abstract in example_abstracts]

# Find Key entities from Title

(or Key entities from Abstract so it isn't too easy)

In [193]:
def create_query_entities(title: str):
    """Build the prompt that asks the model to list the key entities in `title`.

    The prompt asks for an unnumbered list wrapped in START ... FINISH markers.
    """
    return (
        f"List the key entities concisely in this text: {title} \n"
        "    Do not number the items. Start the list with START and end with FINISH\n"
        "    "
    )

# def create_query_entities(title: str):
#     query = f'''
# List key entities assocciated with the important conclusions concisely. Do not number the items. Start the list with START and end with FINISH.

# Example Prompt:{example_abstracts[0]}
# Example Answer:{example_entities[0]}

# Do not use the entities from the example. Base the entities on the following prompt.

# Prompt:{title}
# Answer:
#     '''
#     return query




def clean_entities(raw_output: str):
    """Turn a model reply into a list of lower-cased entity strings.

    Args:
        raw_output: Text that is expected to contain ``START ... FINISH`` with
            one entity per line in between.

    Returns:
        The non-empty lines between the markers, lower-cased, with leading
        bullet symbols and "1." / "2)" style numbering removed. If the markers
        are not found, a warning is printed and the stripped input is returned
        as a single-item list.
    """
    # This handles: "START", "START:", "START   \n", etc.
    # The \b anchors stop the case-insensitive markers from matching inside
    # entity words such as "unfinished", which used to truncate the list.
    pattern = r"\bSTART\b[:\s]*([\s\S]*?)\s*\bFINISH\b"
    match = re.search(pattern, raw_output, re.IGNORECASE)
    if not match:
        print("Warning: START/FINISH not found properly.")
        return [raw_output.strip()]  # fallback as single-item list

    entities = []
    for line in match.group(1).strip().splitlines():
        line = line.strip().lower()
        line = re.sub(r"^[•\-–\*]+\s*", "", line)  # Remove bullet symbols
        # The prompt asks for no numbering, but strip it if the model adds it
        # anyway; otherwise "1. foo" could never equal "foo" during matching.
        line = re.sub(r"^\d+[.)]\s+", "", line)
        if line:
            entities.append(line)
    return entities

def get_entities(title: str):
    """Ask the shared `llm` for the key entities in `title` and return them as a cleaned list."""
    prompt = create_query_entities(title)
    # The reply's text is read from its `.content` attribute.
    return clean_entities(llm.invoke(prompt).content)

Either the titles (questions) or the full abstracts can be used to generate the key entities.

In [194]:
# Alternative: build the entities from the questions instead of the abstracts.
# entity_lists = [get_entities(title) for title in questions]
entity_lists = list(map(get_entities, abstracts))

# Create KG

In [195]:
# Serialize the graph as one edge per line:
#   (<article id>) -[:RELATES_TO]-> ("<entity>")
#   (<article id>) -[:CONCLUDES]-> ("<conclusion>")
# Apostrophes are removed from the text and inner double quotes are written as-is.
# The previously computed `safe_entity` / `safe_conclusion` values were never
# written, so they are dropped rather than left to suggest escaping takes place.
with open("graph_output.txt", "w") as f:
    # `pmid` rather than `id`, so the builtin id() is not shadowed in later cells.
    for pmid, entity_list in zip(ids, entity_lists):
        for entity in entity_list:
            entity = entity.replace("'", "").strip()
            f.write(f"({pmid}) -[:RELATES_TO]-> (\"{entity}\")\n")

    for pmid, conclusion in zip(ids, conclusions):
        # Newlines are flattened so every edge stays on a single line.
        conclusion = conclusion.replace("\n", " ").replace("'", "").strip()
        f.write(f"({pmid}) -[:CONCLUDES]-> (\"{conclusion}\")\n")


In [196]:
from neo4j import GraphDatabase

# Neo4j connection settings. Each value can be overridden through an environment
# variable so that credentials do not have to live in the notebook.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
# NOTE(review): the literal fallback only keeps existing local runs working.
# Rotate this password and remove the fallback once NEO4J_PASSWORD is set.
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "sunsh1ne1")

# Initialize the Neo4j driver
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

def insert_data_to_neo4j(data):
    """Replace the contents of the Neo4j database with the edges listed in `data`.

    Every existing node and relationship is deleted first. Each well-formed
    line of the form ``(source) -[:REL_TYPE]-> ("target")`` is then merged as
    two ``Entity`` nodes joined by a ``REL_TYPE`` relationship.

    Args:
        data: Iterable of text lines in the format above.

    Malformed lines, lines with an unusable relationship type, and lines whose
    query fails are reported with ``print`` and skipped.
    """
    with driver.session() as session:
        session.run("MATCH (n) DETACH DELETE n")

        for line in data:
            line = line.strip()
            if not line or ' -[:' not in line or ']->' not in line:
                print(f"Skipping malformed line: {line}")
                continue

            try:
                # maxsplit=1 keeps separator-like text inside the target intact
                # instead of silently truncating it.
                left, rest = line.split(' -[:', 1)
                relationship, right = rest.split(']->', 1)
                relationship = relationship.strip()

                # Relationship types cannot be passed as query parameters, so the
                # value is only interpolated when it is a plain identifier.
                if not re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*", relationship):
                    print(f"Skipping line with invalid relationship type: {line}")
                    continue

                # Remove the surrounding parentheses, then wrapping quotes/parentheses
                node_1 = left.strip()[1:-1].strip(' "\'()')
                node_2 = right.strip()[1:-1].strip(' "\'()')

                # Node names travel as parameters, so quotes or backslashes in the
                # text need no escaping and cannot alter the query.
                cypher_query = f"""
                MERGE (a:Entity {{name: $source_name}})
                MERGE (b:Entity {{name: $target_name}})
                MERGE (a)-[:{relationship}]->(b)
                """
                session.run(cypher_query, source_name=node_1, target_name=node_2)
            except Exception as e:
                # Best effort: report the failing line and continue with the rest.
                print(f"Error processing line: {line}")
                print(f"Exception: {e}")

# Load the serialized edges, one per line
with open("graph_output.txt", "r") as file:
    data = list(file)

# Rebuild the Neo4j graph from those lines
insert_data_to_neo4j(data)

print("Data inserted into Neo4j successfully.")


Data inserted into Neo4j successfully.


# Find Entities in Question

In [197]:
# One model call per question; the result order follows `questions`.
question_entity_lists = list(map(get_entities, questions))


In [198]:
# Inspect the entities extracted for the first question.
question_entity_lists[0]

['mitochondria',
 'lace plant leaves',
 'programmed cell death',
 'remodelling of leaf structure',
 'cell death pathways',
 'apoptosis mechanisms']

# Find Similar Abstracts

In [199]:
def find_similar_articles(entities, top_k=10):
    """Rank article nodes by how many of the given entities they relate to.

    Args:
        entities: Entity names; they are lower-cased before matching.
        top_k: Maximum number of rows to return.

    Returns:
        A list of dicts with ``article_id`` and ``shared_entities`` keys, ordered
        by descending ``shared_entities``.
    """
    cypher = """
    WITH $entities AS input_entities
    MATCH (article:Entity)-[:RELATES_TO]->(e:Entity)
    WHERE toLower(e.name) IN input_entities 
      AND NOT toLower(article.name) IN input_entities
    RETURN article.name AS article_id, COUNT(e) AS shared_entities
    ORDER BY shared_entities DESC
    LIMIT $top_k
    """

    # The query lower-cases stored names, so the inputs are lower-cased to match.
    lowered = [name.lower() for name in entities]

    with driver.session() as session:
        return session.run(cypher, entities=lowered, top_k=top_k).data()


In [159]:
# entities = ['Mitochondria', 'Lace plant leaves', 'Programmed cell death', 'Remodeling']
# matches = find_similar_articles(entities)

# print("Top matching articles:")
# for match in matches:
#     print(f"Article ID: {match['article_id']}, Shared Entities: {match['shared_entities']}")


In [200]:
best_matching_abstracts = []

for question_entity_list in question_entity_lists:
    matches = find_similar_articles(question_entity_list)
    # Take the first returned row as the best match; an empty string marks a
    # question for which no article was returned.
    top_article_id = matches[0]['article_id'] if matches else ""
    best_matching_abstracts.append(top_article_id)

best_matching_abstracts[:10]



['21645374', '', '', '', '', '', '', '', '', '']

# Get the Conclusion for Each Retrieved Abstract

In [201]:
def get_conclusion_from_kg(pmid):
    """Look up the conclusion text linked to an article node by a CONCLUDES edge.

    Args:
        pmid: Value matched against the article node's ``name`` property.

    Returns:
        The ``name`` of the connected conclusion node, or None when the query
        yields no record.
    """
    cypher = """
    MATCH (:Entity {name: $pmid})-[:CONCLUDES]->(conclusion:Entity)
    RETURN conclusion.name AS conclusion_text
    """
    with driver.session() as session:
        record = session.run(cypher, pmid=pmid).single()
        if record:
            return record["conclusion_text"]
        return None

In [202]:
# One lookup per retrieved article id; entries are None where the lookup finds nothing.
found_conclusions = list(map(get_conclusion_from_kg, best_matching_abstracts))

found_conclusions[:3]

['The abstract concludes that mitochondrial dynamics play a critical and early role in developmental programmed cell death (PCD) in the lace plant.  Mitochondrial dynamics were found to be correlated with other organelles during PCD, including chloroplasts and transvacuolar strands. The study also demonstrated the feasibility of using cyclosporine A (CsA) as a treatment to reduce the number of perforations in leaves, suggesting that mitochondrial permeability transition pore formation is involved in PCD.',
 None,
 None]

# Generate Answers/Accuracy

In [203]:
def create_query(question: str, conclusion: str):
    """Build the yes/no/maybe prompt from a question and its supporting conclusion.

    NOTE(review): the prompt text, including the "quesiton" typo, is kept
    verbatim because the recorded results were produced with this exact prompt.
    """
    return (
        "\n"
        f"    Using this information {conclusion}\n"
        "\n"
        f"    Answer the following quesiton with yes, no, or maybe.: {question} \n"
        "    "
    )

In [204]:

def generate_answers(question, conclusion):
    """Ask the model the question and reduce its reply to one normalized word.

    Args:
        question: Question text inserted into the prompt.
        conclusion: Supporting text inserted into the prompt.

    Returns:
        The first whitespace-separated token of the reply, lower-cased, with
        leading and trailing punctuation removed ("Yes," -> "yes",
        "**No**." -> "no"). An empty string if the reply contains no token.
    """
    response = llm.invoke(create_query(question, conclusion))
    tokens = response.content.split()
    if not tokens:
        # An empty reply used to raise IndexError and abort the whole evaluation.
        return ""
    # Strip punctuation on both sides; stripping only trailing dots left
    # answers such as "yes," or "**yes**" unequal to the answer key.
    return re.sub(r"^[\W_]+|[\W_]+$", "", tokens[0]).lower()

In [205]:
# Answer every question using the conclusion retrieved from the knowledge graph.
generated_answers = [
    generate_answers(q, retrieved)
    for q, retrieved in zip(questions, found_conclusions)
]

In [206]:
# Number of generated answers that exactly equal the answer key.
sum(a == b for a, b in zip(generated_answers, answer_key))

26

In [207]:
# Side-by-side listing of each generated answer and the expected one.
for i, pair in enumerate(zip(generated_answers, answer_key), start=1):
    model_answer, correct_answer = pair
    print(f"Q{i}: Our Answer: {model_answer.lower()}\tAnswer Key: {correct_answer}")

Q1: Our Answer: yes	Answer Key: yes
Q2: Our Answer: maybe	Answer Key: no
Q3: Our Answer: yes	Answer Key: yes
Q4: Our Answer: maybe	Answer Key: no
Q5: Our Answer: yes	Answer Key: yes
Q6: Our Answer: yes	Answer Key: yes
Q7: Our Answer: yes	Answer Key: maybe
Q8: Our Answer: maybe	Answer Key: no
Q9: Our Answer: yes	Answer Key: no
Q10: Our Answer: yes	Answer Key: yes
Q11: Our Answer: yes	Answer Key: yes
Q12: Our Answer: no	Answer Key: no
Q13: Our Answer: no	Answer Key: yes
Q14: Our Answer: maybe	Answer Key: no
Q15: Our Answer: yes	Answer Key: yes
Q16: Our Answer: yes	Answer Key: yes
Q17: Our Answer: yes	Answer Key: yes
Q18: Our Answer: yes	Answer Key: yes
Q19: Our Answer: yes	Answer Key: yes
Q20: Our Answer: yes	Answer Key: yes
Q21: Our Answer: maybe	Answer Key: yes
Q22: Our Answer: maybe	Answer Key: yes
Q23: Our Answer: yes	Answer Key: yes
Q24: Our Answer: maybe	Answer Key: yes
Q25: Our Answer: yes	Answer Key: yes
Q26: Our Answer: no	Answer Key: no
Q27: Our Answer: maybe	Answer Key: yes
Q2

# Baseline of using conclusions with perfect retrieval

In [208]:
# Baseline: pair each question with the conclusion of its own abstract
# (`conclusions` is index-aligned with `questions`), bypassing retrieval.
generated_answers_perfect_retrieval = [
    generate_answers(q, own_conclusion)
    for q, own_conclusion in zip(questions, conclusions)
]

In [209]:
# Number of baseline answers that exactly equal the answer key.
sum(a == b for a, b in zip(generated_answers_perfect_retrieval, answer_key))

40