In [158]:
import json #
import pandas as pd

file_name = "/Users/brianmann/Downloads/ori_pqal.json"

# Read the whole dataset into memory; the top level is a mapping of entry id -> record.
with open(file_name, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Keep only the question (stored under "title") and the long answer of each record.
# A record missing either key contributes an empty string for that field.
abstracts_dict = {
    entry_id: {
        "title": entry.get("QUESTION", ""),
        "answer": entry.get("LONG_ANSWER", ""),
    }
    for entry_id, entry in data.items()
}

# One row per entry id, limited to the first 50 rows.
abstracts = pd.DataFrame.from_dict(abstracts_dict, orient='index').head(50)

In [159]:
import os
# from langchain_neo4j import Neo4jGraph, GraphCypherQAChain
from langchain_ollama import ChatOllama
# from langchain_experimental.graph_transformers import LLMGraphTransformer
# from langchain_core.documents import Document

import xml.etree.ElementTree as ET
# from langchain.schema import Document

import re
# from langchain_ollama import ChatOllama
# from langchain_core.prompts import PromptTemplate

In [160]:
# Chat model used for triple extraction. temperature=0 is requested for repeatable output.
# The generation cap is passed as `num_predict` (Ollama's option name); ChatOllama does not
# declare a `max_tokens` field, so the limit would not reach the model under that name.
llm = ChatOllama(model="llama3.2", temperature=0, num_predict=2048)

In [161]:
# possible_relationships = "CAUSES, TREATS, DIAGNOSES, REDUCES_RISK_OF, INCREASES_RISK_OF, IS_A_RISK_FACTOR_FOR, IS_ASSOCIATED_WITH, PREDICTS, IS_AS_EFFECTIVE_AS, IS_MORE_EFFECTIVE_THAN, IMPROVES, WORSENS, IS_COST_EFFECTIVE_FOR, INFLUENCES, or IS_USEFUL_FOR"
# Relationship labels interpolated into the extraction prompt:
# affirmative labels first, followed by the negated / comparative ones.
_RELATIONSHIP_LABELS = (
    "CAUSES",
    "TREATS",
    "DIAGNOSES",
    "REDUCES_RISK_OF",
    "INCREASES_RISK_OF",
    "IS_A_RISK_FACTOR_FOR",
    "IS_ASSOCIATED_WITH",
    "PREDICTS",
    "IS_AS_EFFECTIVE_AS",
    "IS_MORE_EFFECTIVE_THAN",
    "IMPROVES",
    "WORSENS",
    "IS_COST_EFFECTIVE_FOR",
    "INFLUENCES",
    "IS_USEFUL_FOR",
    "DOES_NOT_CAUSE",
    "DOES_NOT_TREAT",
    "DOES_NOT_DIAGNOSE",
    "IS_NOT_A_RISK_FACTOR_FOR",
    "IS_NOT_ASSOCIATED_WITH",
    "DOES_NOT_PREDICT",
    "IS_LESS_EFFECTIVE_THAN",
    "IS_NOT_COST_EFFECTIVE_FOR",
    "DOES_NOT_INFLUENCE",
    "IS_NOT_USEFUL_FOR",
)

# Newline-separated list with a leading newline and no trailing newline.
possible_relationships = "\n" + "\n".join(_RELATIONSHIP_LABELS)

In [162]:
def create_query(title: str, answer: str) -> str:
    """Build the triple-extraction prompt for one question/answer pair.

    The prompt embeds the module-level ``possible_relationships`` text twice,
    ``title`` twice and ``answer`` once, and asks for one triple per line
    between the markers START and FINISH.

    Args:
        title: The question text.
        answer: The answer text paired with that question.

    Returns:
        The complete prompt string.
    """
    return f'''
    Use the relationships {possible_relationships} to build a knowledge graph with answers to these questions
    Abstract Question: {title}\n Answer: {answer}
    Use the only the relationships {possible_relationships} and use as many of the words in the question in the nodes as possible: {title}. Extract medical relationships that provide answers to the question as structured triples (Entity1, Relationship, Entity2). Put all elements of a tuple on the same line
    in the format ("entity1", "relationship", "entity2") . The relationship should not incluede the entitiess. Start the list with START and end with FINISH
    '''

In [163]:
import re

combined_formatted_graph = []  # formatted edges accumulated across all abstracts
bad_abstracts = []             # ids of abstracts whose reply yielded no usable triple

# One candidate triple per line: the text before the first comma, the text between
# the first and second commas, and the remainder of the line (later commas included).
# `.` never matches a newline, so a line that ends in a comma cannot absorb the
# following line as its third field. Surrounding whitespace is stripped afterwards.
_TRIPLE_LINE = re.compile(r'^(.*?),(.*?),(.*)$', re.MULTILINE)
_LIST_MARKERS = {"START", "FINISH"}


def _parse_triples(raw_output):
    """Extract (entity1, relationship, entity2) triples from the model's reply.

    Args:
        raw_output: The text content of the model response.

    Returns:
        A list of 3-tuples of stripped strings, in order of appearance. A line is
        skipped when any of its three fields is empty, or when either entity is
        one of the START / FINISH list markers (compared case-insensitively).
    """
    triples = []
    for fields in _TRIPLE_LINE.findall(raw_output):
        e1, rel, e2 = (field.strip() for field in fields)
        if not (e1 and rel and e2):
            continue  # an edge with a blank endpoint or label is unusable
        if e1.upper() in _LIST_MARKERS or e2.upper() in _LIST_MARKERS:
            continue
        triples.append((e1, rel, e2))
    return triples


for i, (entry_id, row) in enumerate(abstracts.iterrows()):
    if i >= 50:
        break

    structured_text = create_query(row["title"], row["answer"])
    response = llm.invoke(structured_text)
    print(response)

    raw_output = response.content

    matches = _parse_triples(raw_output)
    print(f"matches:\n{matches}")

    # Render each triple as "(entity1) -[:RELATIONSHIP]-> (entity2)"; spaces in the
    # relationship become underscores and the label is upper-cased.
    formatted_graph = [
        f"({e1}) -[:{rel.replace(' ', '_').upper()}]-> ({e2})"
        for e1, rel, e2 in matches
    ]
    if not formatted_graph:
        print(f"Error no nodes found for: {entry_id}")
        bad_abstracts.append(entry_id)

    combined_formatted_graph.extend(formatted_graph)

print("Complete")


content='START\n\nMitochondria, CAUSES, programmed cell death in lace plant leaves\n\nProgrammed cell death in lace plant leaves, IS_ASSOCIATED_WITH, mitochondrial dynamics\n\nMitochondrial dynamics, PREDICTS, remodelling of lace plant leaves\n\nRemodelling of lace plant leaves, WORSENS, quality of life in lace plant\n\nQuality of life in lace plant, DOES_NOT_INFLUENCE, programmed cell death in lace plant leaves\n\nProgrammed cell death in lace plant leaves, IS_A_RISK_FACTOR_FOR, remodelling of lace plant leaves\n\nRemodelling of lace plant leaves, IS_USEFUL_FOR, understanding of programmed cell death mechanisms\n\nUnderstanding of programmed cell death mechanisms, DOES_NOT_PREDICT, remodelling of lace plant leaves\n\nFINISH' additional_kwargs={} response_metadata={'model': 'llama3.2', 'created_at': '2025-04-08T23:25:59.894211Z', 'done': True, 'done_reason': 'stop', 'total_duration': 7393882833, 'load_duration': 816958875, 'prompt_eval_count': 593, 'prompt_eval_duration': 3720000000, '

# Clean up the output

In [164]:
# Persist the accumulated edges, one per line.
with open('graph_output.txt', 'w') as f:
    f.writelines(f"{item}\n" for item in combined_formatted_graph)

In [165]:
def clean_file(text):
    """Return ``text`` without quote characters and with doubled parentheses collapsed.

    Every double quote and single quote (apostrophes included) is deleted first;
    afterwards each "((" becomes "(" and each "))" becomes ")".
    """
    unquoted = text.translate(str.maketrans("", "", "\"'"))
    collapsed_open = unquoted.replace("((", "(")
    return collapsed_open.replace("))", ")")

# Load the edge list from disk as a single string
with open('graph_output.txt', 'r') as raw_file:
    file_content = raw_file.read()

# Drop quote characters and collapse doubled parentheses
cleaned_content = clean_file(file_content)

# Overwrite the same file with the cleaned text
with open('graph_output.txt', 'w') as out_file:
    out_file.write(cleaned_content)

print("File cleaned and saved as 'graph_output.txt'")


File cleaned and saved as 'graph_output.txt'
