# Knowledge Graph Creation

In [None]:
import json
import re
from langchain_ollama import ChatOllama
from neo4j import GraphDatabase

### This is where we alter the prompt for our experiment. 
#### These are currently extremely simplified and logically not very sound (in terms of medical question answering but fine for just finding relations)
#### If the kg syntax errors it's because I haven't implemented strict output types yet

In [None]:
# Queries a LLM to extract entities only
def entities_query(text: str):
     return f'''Please return a list of all single and multi-word terms and phrases from the following text that could describe a term, entity, or idea: {text}.
       Ignore stop words like and, but, or, with, and other words like that.
       If a set of words combined describe one specific thing, please return it as one item in the list. 
       For example, in the example sentence "The auxiliary medical device effects the blood pressure of the patient", the list would look like [auxiliary medical device, medical device, device, blood pressure of the patient, blood pressure]. 
       Another example, in the example input "Do radiologists tend to perform better under cooler temperatures?", the list would look like [radiologists, cooler, cooler tempreatures, temperatures].
       If an item in the list contains adjectives describing a noun, please make sure that noun is included on its own in the list as well. 
       Now please respond with absolutely nothing except the list of any and all single and multi-word terms and phrases from the following text that could describe a term, entity, or idea: {text}.'''

# Returns a query to a LLM that includes a text sample and a list of entities. The query instructs the LLM to return a set of triples with relations. 
# The possible relations are: POS, NEG, MAY
def kg_query(entities: list[str], sample: str):
    return f'''Here is a text sample with information about a medical topic: {sample}.
    Here is a list of entities pulled from that list: {entities}.
    For each entity in the list above (entity1), find every single other entities (entity2) that it is related to in any way from the list based on the text sample.
    If that relationship could be described as neutral to positive (or synonymous with positive given the context), please return that relationship as follows: [entity1, POS, entity2]
    If that relationship could be described as less than neutral to negative (or synonymous with negative given the context), please return that relationship as follows: term1 = entity1, relation = [entity1, NEG, entity2]
    If and only if that relationship is stated to be unknown or unsure with proper descriptors to match, only then return that relationship as follows: [entity1, MAY, entity2]
    Return all of these possible permutations as a list of string relationships in the format described above. 
    Do not respond with anything except that list in the following format, and make sure syntax is exact (do not add newline characters):
    [(entity1, relation, entity2), ... ]
    '''

#### Extra function definition

In [5]:
def convert_to_tuples(input_list):
    result = []
    
    for item in input_list:
        # Remove unnecessary characters and parse the string representation of a list of tuples
        item_clean = item.strip('[]').replace('\n', '')  # Remove square brackets and newlines
        tuples = re.findall(r'\(([^)]+)\)', item_clean)  # Match content inside parentheses
        for t in tuples:
            # Convert each tuple string into a tuple of terms (term, relation, term)
            term1, relation, term2 = [x.strip() for x in t.split(',')]
            result.append((term1, relation, term2))
    
    return result

### Read in json file  

In [None]:
# Global variable, set to the number of abstracts to be read in
# Set to -1 to read in all abstracts
NUM = 15

In [None]:
filename = "../ori_pqal.json"
with open(filename, 'r', encoding='utf-8') as f:
    labeled_instances = json.load(f)

#get questions and long answers 
# IMPORTANT: THIS ASSUMES THAT EACH ENTRY IN THE JSON FILE HAS A QUESTION, CONCLUSION, GROUND TRUTH IN THIS FORMAT. 
# IF WE FIND BAD RESULTS IN LATER QUESTIONS LOOK HERE
questions = []
conclusions = []
ground_truth = []
for paper in labeled_instances:
    questions.append(labeled_instances[paper]['QUESTION'])
    conclusions.append(labeled_instances[paper]['LONG_ANSWER'])
    ground_truth.append(labeled_instances[paper]['final_decision'])
    
#Only use first 5 for now
questions = questions[:NUM]
titles = questions[:NUM]
conclusions = conclusions[:NUM]
ground_truth = ground_truth[:NUM]

### Set LLM as desired model

In [None]:
llm = ChatOllama(model="llama3.2", temperature=0) 

## Querying LLM to create KG

#### First, query LLM to extract relevant entities from each conclusion, then query LLM with entities and conclusion to create KG

In [10]:
kg = []
for idx in range(len(conclusions)):
    #Use query defined above to extract entities from conclusion
    sentence_output = llm.invoke(entities_query(titles[idx])).content.replace("'", "")
    #Regex in place of strict relation types
    entities = [re.sub(r'[^a-zA-Z0-9]', '', re.sub(r'^\d+\.\s*', '', item)) for item in sentence_output.split("\n")]
    #Create kg using those entities and query defined above
    kg.append(llm.invoke(kg_query(entities, conclusions[idx])).content)

#Fix formatting of list (function defined above)
print(kg)
knowledge_graph = convert_to_tuples(kg)

['[(Mitochondria, POS, Remodelling), (Mitochondria, POS, Programmedcelldeath), (Mitochondria, POS, Celldeath), \n (Mitochondria, POS, Plantleaves), (Mitochondria, POS, Death), (Remodelling, MAY, Mitochondria), \n (Programmedcelldeath, POS, Remodelling), (Programmedcelldeath, POS, Celldeath), (Programmedcelldeath, POS, Plantleaves), \n (Programmedcelldeath, POS, Death), (Celldeath, POS, Programmedcelldeath), (Celldeath, POS, Plantleaves), \n (Celldeath, POS, Death), (Plantleaves, MAY, Mitochondria), (Death, MAY, Remodelling), \n (Remodelling, NEG, Celldeath), (Programmedcelldeath, NEG, Plantleaves)]', '[(Strabismus, POS, Amblyopia), \n (Amblyopia, POS, Strabismus), \n (Acuity, POS, LandoltC), \n (LandoltC, POS, Acuity), \n (SnellenE, POS, LandoltC), \n (LandoltC, POS, SnellenE), \n (Strabismicamblyopia, POS, Amblyopia), \n (Amblyopia, POS, Strabismicamblyopia), \n (Binocularvision, POS, Vision), \n (Vision, POS, Binocularvision)]', "['Syncope', POS, 'Urticaria']\n['Syncope', POS, 'Water

## Neo4j setup

In [12]:
uri = "bolt://localhost:7687"
username = "neo4j"
password = "neo4j_password" 

### Neo4j Knowledge Graph Creation

In [55]:
driver = GraphDatabase.driver(uri, auth=(username, password))

### Knowledge Graph Creation

def create_knowledge_graph(data):
    with driver.session() as session:
        for node1, relation, node2 in data:
            session.execute_write(create_relationship, node1.capitalize(), relation, node2.capitalize())


def create_relationship(tx, node1, relation, node2):
    relation_map = {
        "POS": "positively_related_to",
        "NEG": "negatively_related_to",
        "MAY": "may_be_related_to"
    }
    
    relationship = relation_map.get(relation, "related_to") 

    query = (
        "MERGE (a:Entity {name: $node1}) "
        "MERGE (b:Entity {name: $node2}) "
        "MERGE (a)-[r:" + relationship + "]->(b)"
    )
    tx.run(query, node1=node1, relation=relationship, node2=node2)

create_knowledge_graph(knowledge_graph)

# Close the driver connection after use


#### Take a peek in the knowledge graph

In [56]:

'''
def query_graph():
    # Open a session with the Neo4j database
    with driver.session() as session:
        # Run the query to get the first 25 nodes and their relationships
        result = session.run("""
            MATCH (n)-[r]->(m) 
            RETURN n, r, m 
            LIMIT 25
        """)
        
        # Iterate through the result and print the nodes and relationships
        for record in result:
            node1 = record['n']
            relation = record['r']
            node2 = record['m']
            
            print({
                'Node1': node1['name'],  # Assuming 'name' is a property
                'Relation': type(relation).__name__,  # Type of the relationship
                'Node2': node2['name']  # Assuming 'name' is a property
            })
    session.close()

query_graph()

'''


'\ndef query_graph():\n    # Open a session with the Neo4j database\n    with driver.session() as session:\n        # Run the query to get the first 25 nodes and their relationships\n        result = session.run("""\n            MATCH (n)-[r]->(m) \n            RETURN n, r, m \n            LIMIT 25\n        """)\n\n        # Iterate through the result and print the nodes and relationships\n        for record in result:\n            node1 = record[\'n\']\n            relation = record[\'r\']\n            node2 = record[\'m\']\n\n            print({\n                \'Node1\': node1[\'name\'],  # Assuming \'name\' is a property\n                \'Relation\': type(relation).__name__,  # Type of the relationship\n                \'Node2\': node2[\'name\']  # Assuming \'name\' is a property\n            })\n    session.close()\n\nquery_graph()\n\n'

### Question evaluation (will change with relation type)

In [None]:
with driver.session() as session:
    truth_index = -1
    numerator = 0
    not_found = 0
    for question in questions:
        truth_index = truth_index + 1
        response = "no answer found"
        question_output = llm.invoke(entities_query(question)).content.replace("'", "")
        nodes = [re.sub(r'[^a-zA-Z0-9]', '', re.sub(r'^\d+\.\s*', '', item)) for item in question_output.split("\n")]
        print("Question nodes:", nodes)
        for i in range(len(nodes)):
            for j in range(i+1, len(nodes)): 
                cypher_query = f"""
                            MATCH (n {{name: '{nodes[i].capitalize()}'}})-[r]->(m {{name: '{nodes[j].capitalize()}'}})
                            RETURN n, r, m
                        """
                result = session.run(cypher_query)
                
                if result.peek():
                    # Logic based on current relation types
                    for record in result:
                        if record['r'].type == 'negatively_related_to':
                            response = 'no'
                        elif record['r'].type == 'positively_related_to':
                            response = 'yes'
                        else: 
                            response = 'maybe'                        
                    break 
        print(truth_index)
        print('Question:', question, '\nResponse:', response, '\nGround Truth:', ground_truth[truth_index], '\n\n')
        if response == ground_truth[truth_index]:
            numerator += 1
        if response == "no answer found":
            not_found += 1
    print('Accuracy:', numerator / len(questions))
    print('Not found rate:', not_found / len(questions))
driver.close()

Question nodes: ['Mitochondria', 'Remodelling', 'Laceplant', 'Programmedcelldeath', 'Celldeath', 'Plantleaves', 'Death', 'Remodelling']
0
Question: Do mitochondria play a role in remodelling lace plant leaves during programmed cell death? 
Response: maybe 
Ground Truth: yes 


Question nodes: ['Strabismus', 'Amblyopia', 'Acuity', 'LandoltC', 'SnellenE', 'Eye', 'Vision', 'Blurredvision', 'Visualimpairment', 'Eyecondition', 'Strabismicamblyopia', 'Binocularvision']
1
Question: Landolt C and snellen e acuity: differences in strabismus amblyopia? 
Response: yes 
Ground Truth: no 


Question nodes: ['Syncope', 'Urticaria', 'Waterinducedurticaria', 'Pediatricform', 'Infants', 'Bathing', 'Water', 'Medicalcondition', 'Syndrome', 'Reaction', 'Infantileform', 'Pediatric']
2
Question: Syncope during bathing in infants, a pediatric form of water-induced urticaria? 
Response: no answer found 
Ground Truth: yes 


Question nodes: ['Transanal', 'Transabdominal', 'Pullthrough', 'Longterm', 'Results', 