In [1]:
import json
import pandas as pd

# Path of the JSON file to load; its top level is iterated as id -> record pairs.
file_name = "/Users/brianmann/Downloads/ori_pqal.json"

with open(file_name, 'r', encoding='utf-8') as source_file:
    data = json.load(source_file)

# Keep only each record's question and long answer (empty string when the key
# is absent), keyed by the record's id.
abstracts_dict = {
    entry_id: {
        "title": entry.get("QUESTION", ""),
        "answer": entry.get("LONG_ANSWER", ""),
    }
    for entry_id, entry in data.items()
}

# One row per entry id, with the inner dict keys ("title", "answer") as columns.
abstracts = pd.DataFrame.from_dict(abstracts_dict, orient='index')

In [2]:
import os
# from langchain_neo4j import Neo4jGraph, GraphCypherQAChain
from langchain_ollama import ChatOllama
# from langchain_experimental.graph_transformers import LLMGraphTransformer
# from langchain_core.documents import Document

import xml.etree.ElementTree as ET
# from langchain.schema import Document

import re
# from langchain_ollama import ChatOllama
# from langchain_core.prompts import PromptTemplate

In [3]:
def create_query(title: str, answer: str):
    """Build the triple-extraction prompt for one question/answer pair.

    Args:
        title: Question text, inserted after "Abstract Question:".
        answer: Answer text, inserted after "Answer:".

    Returns:
        The prompt string, which asks for relationships written as
        ("entity1", "relationship", "entity2") triples, one triple per line.
    """
    # NOTE(review): the misspelling "incluede" is part of the prompt text sent
    # at runtime; it is kept verbatim so the prompt stays byte-identical.
    return f'''
    Capture relationships that answer the given medical question.
    Abstract Question: {title}\n Answer: {answer}
    Extract medical relationships that provide answers to the question as structured triples (Entity1, Relationship, Entity2). Put all elements of a tuple on the same line
    in the format ("entity1", "relationship", "entity2") . The relationship should not incluede the entities.
    '''

In [4]:
# Chat model used for triple extraction, created with temperature=0.
# ChatOllama does not define a `max_tokens` field, so the earlier
# `max_tokens=2048` had no effect on generation length; `num_predict` is the
# ChatOllama option that limits the number of generated tokens.
llm = ChatOllama(model="llama3.2", temperature=0, num_predict=2048)

In [5]:
# Number of leading rows of `abstracts` that are sent to the model in this run.
MAX_ABSTRACTS = 5

# Matches one triple written as ("entity1", "relationship", "entity2") and
# captures the three quoted parts.
TRIPLE_PATTERN = re.compile(r'\("([^"]+)",\s*"([^"]+)",\s*"([^"]+)"\)')

# Formatted "(entity1) -[:RELATION]-> (entity2)" strings across all processed entries.
combined_formatted_graph = []
# Ids of entries whose model output contained no parseable triple.
bad_abstracts = []

for entry_id, row in abstracts.head(MAX_ABSTRACTS).iterrows():
    structured_text = create_query(row["title"], row["answer"])
    response = llm.invoke(structured_text)
    print(response)

    # The generated text is carried on the response's `content` attribute.
    raw_output = response.content
    matches = TRIPLE_PATTERN.findall(raw_output)

    # Relationship labels are upper-cased with spaces turned into underscores;
    # entity text is inserted unchanged.
    formatted_graph = [
        f"({e1}) -[:{r.replace(' ', '_').upper()}]-> ({e2})"
        for e1, r, e2 in matches
    ]
    if not formatted_graph:
        print(f"Error no nodes found for: {entry_id}")
        bad_abstracts.append(entry_id)

    combined_formatted_graph.extend(formatted_graph)

print("Complete")

content='Here are the extracted medical relationships:\n\nMitochondria and chloroplasts moving on transvacuolar strands to form a ring structure surrounding the nucleus during developmental PCD, Mitochondria play a critical role in developmentally regulated PCD in the lace plant, CsA is used in whole plant system.\n\n("Mitochondria", "moving on transvacuolar strands", "chloroplasts")\n("Mitochondria", "play a critical role", "developmentally regulated PCD")\n("CsA", "is used", "whole plant system")' additional_kwargs={} response_metadata={'model': 'llama3.2', 'created_at': '2025-04-02T17:22:48.762896Z', 'done': True, 'done_reason': 'stop', 'total_duration': 6544310458, 'load_duration': 835756583, 'prompt_eval_count': 242, 'prompt_eval_duration': 3866000000, 'eval_count': 112, 'eval_duration': 1838000000, 'message': Message(role='assistant', content='', images=None, tool_calls=None)} id='run-40cb35a7-d0bf-424c-8f99-9416906336a7-0' usage_metadata={'input_tokens': 242, 'output_tokens': 11

In [6]:
# Preview the first five rows of the title/answer table.
abstracts.head(n=5)

Unnamed: 0,title,answer
21645374,Do mitochondria play a role in remodelling lac...,Results depicted mitochondrial dynamics in viv...
16418930,Landolt C and snellen e acuity: differences in...,"Using the charts described, there was only a s..."
9488747,"Syncope during bathing in infants, a pediatric...","""Aquagenic maladies"" could be a pediatric form..."
17208539,Are the long-term results of the transanal pul...,Our long-term study showed significantly bette...
10808977,Can tailored interventions increase mammograph...,The effects of the intervention were most pron...


In [7]:
# Display every formatted "(entity1) -[:RELATION]-> (entity2)" string collected by the extraction loop.
combined_formatted_graph

['(Mitochondria) -[:MOVING_ON_TRANSVACUOLAR_STRANDS]-> (chloroplasts)',
 '(Mitochondria) -[:PLAY_A_CRITICAL_ROLE]-> (developmentally regulated PCD)',
 '(CsA) -[:IS_USED]-> (whole plant system)',
 '(Snellen E) -[:HAS_SIMILAR_VISUAL_ACUITY]-> (Landolt C)',
 '(Strabismus Amblyopia) -[:IS_ASSOCIATED_WITH]-> (Visual Acuity)',
 '(Landolt C) -[:HAS_BETTER_VISUAL_ACUITY_THAN]-> (Snellen E)',
 '(Visual Acuity) -[:DIFFERS_SLIGHTLY_BETWEEN]-> (Snellen E and Landolt C)',
 '(Aquagenic Urticaria) -[:CAUSES]-> (Syncope During Bathing in Infants)',
 '(Aquagenic Urticaria) -[:IS_A_PEDIATRIC_FORM_OF]-> (Water-Induced Uricaria)',
 '(Transanal pull-through) -[:HAS_WORSE_RESULTS_THAN]-> (Abdominal approach)',
 '(Continence score) -[:IS_BETTER_FOR]-> (Abdominal approach)',
 '(Stool pattern) -[:IS_SOMEWHAT_BETTER_FOR]-> (TERPT group)',
 '(Enterocolitis scores) -[:ARE_SOMEWHAT_BETTER_FOR]-> (TERPT group)',
 '(Transanal pull-through) -[:HAS_SIMILAR_RESULTS_TO]-> (TERPT group)',
 "(HD (Hirschsprung's disease)) 

In [8]:
# Display the ids of entries for which no triple was parsed from the model output.
bad_abstracts

[]

In [12]:
import os

# Directory to switch the kernel into; edit this path for a different checkout.
# Notebook location: /Users/brianmann/git/knowledge_graph_creation/knowledge_graph.ipynb
project_dir = '/Users/brianmann/git/knowledge_graph_creation'
os.chdir(project_dir)

# Echo the working directory to confirm the change took effect.
print(os.getcwd())

/Users/brianmann/git/knowledge_graph_creation


In [14]:
# Write the collected graph to disk, one formatted triple per line.
# The encoding is pinned to UTF-8 (the same encoding used to read the source
# JSON) so non-ASCII entity text is written identically on every platform
# rather than through the locale's default encoding.
with open('graph_output2.txt', 'w', encoding='utf-8') as f:
    f.writelines(f"{item}\n" for item in combined_formatted_graph)

In [13]:
# Shell escape: print the current working directory.
! pwd

/Users/brianmann/git/knowledge_graph_creation
