In [76]:
import json #
import pandas as pd

file_name = "/Users/brianmann/Downloads/ori_pqal.json"

# Load the JSON file
with open(file_name, 'r', encoding='utf-8') as f:
    data = json.load(f)

abstracts_dict = {}

# Extract question and long answer
for entry_id, entry in data.items():
    question = entry.get("QUESTION", "")
    long_answer = entry.get("LONG_ANSWER", "")

    abstracts_dict[entry_id] = {"title": question, "answer": long_answer}
    

abstracts = pd.DataFrame.from_dict(abstracts_dict, orient='index')

In [77]:
import os
# from langchain_neo4j import Neo4jGraph, GraphCypherQAChain
from langchain_ollama import ChatOllama
# from langchain_experimental.graph_transformers import LLMGraphTransformer
# from langchain_core.documents import Document

import xml.etree.ElementTree as ET
# from langchain.schema import Document

import re
# from langchain_ollama import ChatOllama
# from langchain_core.prompts import PromptTemplate

In [78]:
llm = ChatOllama(model="llama3.2", temperature=0, max_tokens=2048) 

In [79]:
possible_relationships = "CAUSES, TREATS, DIAGNOSES, REDUCES_RISK_OF, INCREASES_RISK_OF, IS_A_RISK_FACTOR_FOR, IS_ASSOCIATED_WITH, PREDICTS, IS_AS_EFFECTIVE_AS, IS_MORE_EFFECTIVE_THAN, IMPROVES, WORSENS, IS_COST_EFFECTIVE_FOR, INFLUENCES, or IS_USEFUL_FOR"

In [80]:
def create_query(title: str, answer: str):
    query = f'''
    Use the relationships {possible_relationships} to build a knowledge graph with answers to these questions
    Abstract Question: {title}\n Answer: {answer}
    Extract medical relationships that provide answers to the question as structured triples (Entity1, Relationship, Entity2). Put all elements of a tuple on the same line
    in the format ("entity1", "relationship", "entity2") . The relationship should not incluede the entitiess.
    '''
    return query

In [81]:
combined_formatted_graph = []
bad_abstracts = []

for i, (entry_id, row) in enumerate(abstracts.iterrows()):
    if i >= 50:
        break


    structured_text = create_query(row["title"], row["answer"])
    response = llm.invoke(structured_text)
    print(response)
    # Extract the actual text from the LLM response
    raw_output = response.content

    # Print the cleaned raw output (for debugging)
    # print("Cleaned Raw LLM Output:\n", raw_output)

    # Extract (Entity1, Relationship, Entity2) triples using regex
    # matches = re.findall(r'"(.*?)" , "(.*?)" , "(.*?)"', raw_output)
    matches = re.findall(r'\("([^"]+)",\s*"([^"]+)",\s*"([^"]+)"\)', raw_output)

    # print(f"matches:\n{matches}")

    # Convert extracted triples into nodes and relationships
    nodes = set()
    relationships = []

    for entity1, relation, entity2 in matches:
        nodes.add(entity1)
        nodes.add(entity2)
        relationships.append((entity1, relation, entity2))

    # Convert to Full Node-Relationship-Node Format
    formatted_graph = [f"({e1}) -[:{r.replace(' ', '_').upper()}]-> ({e2})" for e1, r, e2 in relationships]
    if len(formatted_graph) == 0:
        print(f"Error no nodes found for: {entry_id}")
        bad_abstracts.append(entry_id)

    combined_formatted_graph.extend(formatted_graph)
    # print(len(combined_formatted_graph))
    # print()
    # print(formatted_graph)
print("Complete")

content='Here are the extracted medical relationships in the format you requested:\n\n("Mitochondria", "IS_A_RISK_FACTOR_FOR", "Programmed Cell Death")\n("Mitochondria", "REDUCES_RISK_OF", "Apoptosis")\n("Mitochondria", "PREDICTS", "Cell Death")\n("Chloroplasts", "IS_A_RISK_FACTOR_FOR", "Programmed Cell Death")\n("CsA", "TREATS", "Inflammation")\n("CsA", "IMPROVES", "Plant Growth")\n("Mitochondria", "IS_ASSOCIATED_WITH", "Nucleus")\n("Mitochondria", "INFLUENCES", "Cellular Dynamics")\n\nNote: Some of these relationships may not be directly related to the specific question about lace plant leaves, but they are extracted from the text and may provide context or supporting information.\n\nAlso, note that some relationships like "REDUCES_RISK_OF" and "PREDICTS" are not explicitly mentioned in the original text, but they can be inferred based on the context.' additional_kwargs={} response_metadata={'model': 'llama3.2', 'created_at': '2025-04-04T15:43:25.524267Z', 'done': True, 'done_reason'

In [82]:
abstracts.head()

Unnamed: 0,title,answer
21645374,Do mitochondria play a role in remodelling lac...,Results depicted mitochondrial dynamics in viv...
16418930,Landolt C and snellen e acuity: differences in...,"Using the charts described, there was only a s..."
9488747,"Syncope during bathing in infants, a pediatric...","""Aquagenic maladies"" could be a pediatric form..."
17208539,Are the long-term results of the transanal pul...,Our long-term study showed significantly bette...
10808977,Can tailored interventions increase mammograph...,The effects of the intervention were most pron...


In [83]:
import os

os.chdir('/Users/brianmann/git/knowledge_graph_creation')
print(os.getcwd())

/Users/brianmann/git/knowledge_graph_creation


In [84]:
with open('graph_output.txt', 'w') as f:
    for item in combined_formatted_graph:
        f.write(f"{item}\n")