In [1]:
import os
from langchain_neo4j import Neo4jGraph, GraphCypherQAChain
from langchain_ollama import ChatOllama
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_core.documents import Document

import xml.etree.ElementTree as ET
from langchain.schema import Document

import re
from langchain_ollama import ChatOllama
from langchain_core.prompts import PromptTemplate


In [2]:
os.environ["NEO4J_URI"] = "bolt://localhost:7687"
os.environ["NEO4J_USERNAME"] = "neo4j"
os.environ["NEO4J_PASSWORD"] = "sunsh1ne1"

In [3]:
# Set LLM
# llama3.3 has 70B params (see:https://github.com/ollama/ollama?tab=readme-ov-file)
# llama3.2 has 
# llm = ChatOllama(model="llama3.1", temperature=0)
# llm = ChatOllama(model="llama3.2", temperature=0)
llm = ChatOllama(model="llama3.2", temperature=0, max_tokens=2048) 




# Put it all together

In [42]:
# Read the test file
file_path = "/Users/brianmann/Downloads/test/test_file.txt"
# file_path="/Users/brianmann/Downloads/test/pubmed25n0001.xml"
with open(file_path, "r", encoding="utf-8") as file:
    content = file.read()

# Structure the text for better entity & relationship extraction

# Bad Prompt
# structured_text = f"Abstract: {content}\nExtract medical relationships as structured triples (Entity1, Relationship, Entity2)."
    
# Good Prompt
structured_text = f'''Abstract: {content}\nExtract medical relationships as structured triples (Entity1, Relationship, Entity2). Put all elements of a tuple on the same line
in the format ("entity1", "relationship", "entity2") . There must be two entities and a relationship and the relationship should not incluee the entities.'''

# structured_text = f"""Abstract: {content}\nExtract all meaningful relationships from the following medical abstract. 
# Provide structured triples in this exact format:
# (Entity1) - [Relationship] -> (Entity2)."""


# Call Llama3.2 dynamically and extract the response text
# This could be a real problem doesn't take very many tokens by default
response = llm.invoke(structured_text)
# raw_output = llm.invoke(structured_text, max_tokens=1024)

# Extract the actual text from the LLM response
raw_output = response.content

# Print the cleaned raw output (for debugging)
print("Cleaned Raw LLM Output:\n", raw_output)

# Extract (Entity1, Relationship, Entity2) triples using regex
# matches = re.findall(r'"(.*?)" , "(.*?)" , "(.*?)"', raw_output)
matches = re.findall(r'\("([^"]+)",\s*"([^"]+)",\s*"([^"]+)"\)', raw_output)

print(f"matches:\n{matches}")

# Convert extracted triples into nodes and relationships
nodes = set()
relationships = []

for entity1, relation, entity2 in matches:
    nodes.add(entity1)
    nodes.add(entity2)
    relationships.append((entity1, relation, entity2))

# Convert to Full Node-Relationship-Node Format
formatted_graph = [f"({e1}) -[:{r.replace(' ', '_').upper()}]-> ({e2})" for e1, r, e2 in relationships]

# Print formatted output
print("Extracted Nodes:", list(nodes))
print("Extracted Relationships:", formatted_graph)


Cleaned Raw LLM Output:
 Here are the extracted medical relationships in the format you requested:

("Paracetamol", "may reduce", "Hepatic damage")
("Methionine", "may reduce", "Hepatic damage")
("Catecholamines", "are synthesized", "Phaeochromocytomas")
("Vitamin B12", "deficiency causes", "Neurological complications")
("Folate metabolism", "is affected by", "Vitamin B12 deficiency")
("P.T.T. test", "should use", "Standardised reagent and technique")
("Intrinsic clotting abnormality", "can be detected by", "P.T.T. test")
("Commercial reagents", "may fail to detect", "Intrinsic clotting abnormality")
("Cephalin extracts", "may be insensitive to", "Commercial reagents")
("Manufacturers' techniques", "may be unreliable for", "Commercial reagents")
matches:
[('Paracetamol', 'may reduce', 'Hepatic damage'), ('Methionine', 'may reduce', 'Hepatic damage'), ('Catecholamines', 'are synthesized', 'Phaeochromocytomas'), ('Vitamin B12', 'deficiency causes', 'Neurological complications'), ('Folate

In [6]:

# Extract (Entity1, Relationship, Entity2) triples using regex
matches = re.findall(r"`(.*?)` - `(.*?)` - `(.*?)`", raw_output)

# Convert extracted triples into nodes and relationships
nodes = set()
relationships = []

for entity1, relation, entity2 in matches:
    nodes.add(entity1)
    nodes.add(entity2)
    relationships.append((entity1, relation, entity2))

# Convert to Full Node-Relationship-Node Format
formatted_graph = [f"({e1}) -[:{r.replace(' ', '_').upper()}]-> ({e2})" for e1, r, e2 in relationships]

# Print formatted output
print("Extracted Nodes:", list(nodes))
print("Extracted Relationships:", formatted_graph)
print()
# Print formatted output
print("Full Node-Relationship-Node Format:\n")
for triple in formatted_graph:
    print(triple)


Extracted Nodes: []
Extracted Relationships: []

Full Node-Relationship-Node Format:



# With Chunking

In [7]:
import re
from langchain_ollama import ChatOllama
from concurrent.futures import ThreadPoolExecutor

# Initialize Llama3.2
llm = ChatOllama(model="llama3.2", temperature=0, max_tokens=2048)

# Read test file (each line is a separate abstract)
file_path = "/Users/brianmann/Downloads/test/test_file_small.txt"
with open(file_path, "r", encoding="utf-8") as file:
    abstracts = file.readlines()  # Read each line as an abstract

# Remove empty lines and strip whitespace
abstracts = [line.strip() for line in abstracts if line.strip()]

print(f"\n📌 Total Abstracts: {len(abstracts)}")  # Debugging: See how many abstracts are created

# Function to process each abstract (single line)
def process_abstract(abstract, index):
    print(f"\n📌 Processing Abstract {index+1}/{len(abstracts)}")

    structured_text = f"Abstract: {abstract}\nExtract medical relationships as structured triples (Entity1, Relationship, Entity2)."

    # Call Llama3.2
    response = llm.invoke(structured_text)
    raw_output = response["content"] if isinstance(response, dict) and "content" in response else str(response)

    # Extract relationships using regex
    matches = re.findall(r"`(.*?)` - `(.*?)` - `(.*?)`", raw_output)
    return matches

# Process abstracts in parallel
all_relationships = []
with ThreadPoolExecutor() as executor:
    results = executor.map(lambda pair: process_abstract(pair[1], pair[0]), enumerate(abstracts))
    for result in results:
        all_relationships.extend(result)

# Extract nodes
all_nodes = set()
for entity1, relation, entity2 in all_relationships:
    all_nodes.add(entity1)
    all_nodes.add(entity2)

# Convert to Full Node-Relationship-Node Format
formatted_graph = [f"({e1}) -[:{r.replace(' ', '_').upper()}]-> ({e2})" for e1, r, e2 in all_relationships]

# Print formatted results
print("\n✅ Extracted Nodes:", list(all_nodes))
print("\n✅ Extracted Relationships:", formatted_graph)



📌 Total Abstracts: 10

📌 Processing Abstract 1/10

📌 Processing Abstract 2/10

📌 Processing Abstract 3/10

📌 Processing Abstract 4/10

📌 Processing Abstract 5/10

📌 Processing Abstract 6/10

📌 Processing Abstract 7/10

📌 Processing Abstract 8/10

📌 Processing Abstract 9/10

📌 Processing Abstract 10/10

✅ Extracted Nodes: []

✅ Extracted Relationships: []


# Changing the prompt

In [33]:
# Read the test file
file_path = "/Users/brianmann/Downloads/test/test_file.txt"
# filepath="/Users/brianmann/Downloads/pubmed25n0001.xml"
with open(file_path, "r", encoding="utf-8") as file:
    content = file.read()

relationship_prompt = PromptTemplate(
    input_variables=["content"],  # This must match the placeholder in the template
    template="""
    Given the following biomedical text:

    {content}

    Identify and extract key medical relationships as structured triples.
    The output **MUST** be in **JSON format** with the following structure:

    ```json
    [
        {"entity1": "X-ray", "relationship": "Induces", "entity2": "DNA double strand breaks"},
        {"entity1": "Neutral filter elution method", "relationship": "Detects", "entity2": "DNA double strand breaks"}
    ]
    ```

    **Rules:**
    1. Use precise, domain-specific biomedical relationships.
    2. Include only factual relationships found in the text.
    3. Maintain a structured JSON format with "entity1", "relationship", and "entity2".

    Return ONLY the JSON object, with no extra text.
    """
)

formatted_prompt = relationship_prompt.format(content=content)

response = llm.invoke(formatted_prompt)
# raw_output = llm.invoke(structured_text, max_tokens=1024)

print(response)


KeyError: '"entity1"'

# Iterative approach

In [56]:
from langchain_core.prompts import PromptTemplate

# Read the test file
file_path = "/Users/brianmann/Downloads/test/test_file.txt"
with open(file_path, "r", encoding="utf-8") as file:
    content = file.read()

# ✅ Define a flexible prompt (not enforcing JSON)
relationship_prompt = f"""
    Given the following text:

    {content}

    Identify and extract key medical relationships from this text.
    
    **Guidelines:**
    - List all meaningful **biomedical entities**.
    - Show how these entities are **related**.
    - Format relationships as **structured triples** when possible.
    - Provide at least one relationship per line of the document

    **Example Output:**
    1. **Entity 1**: X-ray
       - **Relationship**: Induces
       - **Entity 2**: DNA double strand breaks
       
    2. **Entity 1**: Neutral filter elution method
       - **Relationship**: Detects
       - **Entity 2**: DNA double strand breaks
   
   Do not provide any extra output once you have completed the task.
    """

# ✅ Correctly format the prompt
# formatted_prompt = relationship_prompt.format(content=content)

# ✅ Call the LLM
response = llm.invoke(relationship_prompt)

# ✅ Print raw output first for inspection
print("First Iteration:\n", response, "\n")

# Take the response and feed it as the content
content2 = str(response).split("additional_kwargs")[0]

# before_additional_kwargs = text.split("additional_kwargs")[0]




# Iteration 2
structured_text = f"""
Previously, I provided this text:

{content}

and you provided these relations:

{content2}

Now, please extract additional medical relationships from the text I provided.
"""

response = llm.invoke(structured_text)

# ✅ Print raw output first for inspection
print("Second Iteration:\n", response, "\n")

# Take the response and feed it as the content
content3 = str(response).split("additional_kwargs")[0]




# Iteration 3
relationship_prompt = f"""
    Given the following previous responses:

    {content2}

    and 

    {content3}

Remove any introduction and format all of the data in the form:
    1. **Entity 1**: X-ray
       - **Relationship**: Induces
       - **Entity 2**: DNA double strand breaks
       
    2. **Entity 1**: Neutral filter elution method
       - **Relationship**: Detects
       - **Entity 2**: DNA double strand breaks

    """

# ✅ Correctly format the prompt
# formatted_prompt = relationship_prompt.format(content2=content2, content3=content3)

# ✅ Call the LLM
response = llm.invoke(relationship_prompt)

response_clean = str(response).split("additional_kwargs")[0]



# ✅ Print raw output first for inspection
print("Final LLM Output:\n", response_clean, "\n")


First Iteration:
 content="Here are the extracted key medical relationships:\n\n1. **X-ray** \n    - **Induces**\n    - **DNA double strand breaks**\n\n2. **Neutral filter elution method** \n    - **Detects**\n    - **DNA double strand breaks**\n\n3. **HpA I restriction endonuclease** \n    - **Introduces**\n    - **Double strand cuts in DNA**\n\n4. **Cucumber mosaic virus** \n    - **Has RNAs with sequences of 270 residues from the 3'-terminus**\n       - **Corresponding to each segment of the influenza virus genome**\n\n5. **Influenza virus** \n    - **Has segments corresponding to each RNA in the genome**\n       - **Mapped by restriction endonuclease analysis**\n\n6. **Poly(A(+))-RNA(tot)** \n    - **Contains a major peak in the 10-13 S region**\n       - **Accounting for approximately 35% of the total poly(A(+))-RNA applied**\n\n7. **Poly(A(+))-RNA(11S)** \n    - **Contains a single major peak in the 11S region**\n       - **Resolving to a single major component**\n\n8. **(3)H-cDN

# Trying just 10 abstracts

In [67]:
# Read the test file
file_path = "/Users/brianmann/Downloads/test/test_file_small.txt"
# filepath="/Users/brianmann/Downloads/pubmed25n0001.xml"
with open(file_path, "r", encoding="utf-8") as file:
    content = file.read()

# Structure the text for better entity & relationship extraction
structured_text = f"""Abstract: {content} \n
You are an expert Knowledge Graph creater with the ability to extract relationships from medical research.
You must connect two entities and state their relationship from the content provided. Extract medical relationships as structured triples (Entity1, Relationship, Entity2).
Don't explain your reasoning just provide me with a list of the tuples that you are able to find. There must be two entities and their relationship which gives a total of three things.
\n"""

# print(structured_text)

response = llm.invoke(structured_text)

# Extract the actual text from the LLM response
# raw_output = response["content"] if isinstance(response, dict) and "content" in response else str(response)

# Print the cleaned raw output (for debugging)
print("Cleaned Raw LLM Output:\n", response.content)

# Extract (Entity1, Relationship, Entity2) triples using regex
matches = re.findall(r"`(.*?)` - `(.*?)` - `(.*?)`", raw_output)

# Convert extracted triples into nodes and relationships
nodes = set()
relationships = []

for entity1, relation, entity2 in matches:
    nodes.add(entity1)
    nodes.add(entity2)
    relationships.append((entity1, relation, entity2))

# Convert to Full Node-Relationship-Node Format
formatted_graph = [f"({e1}) -[:{r.replace(' ', '_').upper()}]-> ({e2})" for e1, r, e2 in relationships]

# Print formatted output
print("Extracted Nodes:", list(nodes))
print("Extracted Relationships:", formatted_graph)


Cleaned Raw LLM Output:
 Here is the list of extracted medical relationships:

1. Multiple Sclerosis, Myelinotoxicity, Cerebrospinal Fluid
2. Alpha Fetoprotein, Neural Tube Defects, Open Neural Tube Defects
3. Beta2-Microglobulin, Proximal Tubular Function, Diuresis
4. Gluten-Free Diet, Villopathy, Malabsorption
5. Papanicolaou Smear, Cigarette Consumption, Oral Health
6. Hepatitis B Virus, Polyarteritis, e Ag/Ab System
Extracted Nodes: []
Extracted Relationships: []
