In [2]:
import os
from langchain_neo4j import Neo4jGraph, GraphCypherQAChain
from langchain_ollama import ChatOllama
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_core.documents import Document

import xml.etree.ElementTree as ET
from langchain.schema import Document

import re
from langchain_ollama import ChatOllama
from langchain_core.prompts import PromptTemplate


In [3]:
# Set LLM
# llama3.3 has 70B params (see:https://github.com/ollama/ollama?tab=readme-ov-file)
# llama3.2 has 
# llm = ChatOllama(model="llama3.1", temperature=0)
# llm = ChatOllama(model="llama3.2", temperature=0)
llm = ChatOllama(model="llama3.2", temperature=0, max_tokens=2048) 




In [4]:
def read_file_in_chunks(filename, chunk_size=10):
    result = []
    current_chunk = []
    
    with open(filename, 'r') as file:
        for line in file:
            current_chunk.append(line.strip())
            
            if len(current_chunk) == chunk_size:
                result.append(current_chunk)
                current_chunk = []
    
    # Don't forget to add the last chunk if it's not empty
    if current_chunk:
        result.append(current_chunk)
    
    return result

# Example usage
file_path = "/Users/brianmann/Downloads/test/test_file1.txt"
chunks = read_file_in_chunks(file_path)
print(f"Number of chunks: {len(chunks)}")
print(f"First chunk: {chunks[0]}")

# Example usage
# file_path = "/Users/brianmann/Downloads/test/test_file1.txt"
# process_file_in_chunks(file_path)


Number of chunks: 1546
First chunk: ['<AbstractText>(--)-alpha-Bisabolol has a primary antipeptic action depending on dosage, which is not caused by an alteration of the pH-value. The proteolytic activity of pepsin is reduced by 50 percent through addition of bisabolol in the ratio of 1/0.5. The antipeptic action of bisabolol only occurs in case of direct contact. In case of a previous contact with the substrate, the inhibiting effect is lost.</AbstractText>', '<AbstractText>A report is given on the recent discovery of outstanding immunological properties in BA 1 [N-(2-cyanoethylene)-urea] having a (low) molecular mass M = 111.104. Experiments in 214 DS carcinosarcoma bearing Wistar rats have shown that BA 1, at a dosage of only about 12 percent LD50 (150 mg kg) and negligible lethality (1.7 percent), results in a recovery rate of 40 percent without hyperglycemia and, in one test, of 80 percent with hyperglycemia. Under otherwise unchanged conditions the reference substance ifosfamide 

In [9]:
combined_formatted_graph = []
bad_batch = []

for batch in chunks:
    structured_text = f'''Abstract: {batch}\nExtract medical relationships as structured triples (Entity1, Relationship, Entity2). Put all elements of a tuple on the same line
in the format ("entity1", "relationship", "entity2") . There must be two entities and a relationship and the relationship should not incluee the entities. Please at least 1 example for every line in the format specified'''

    response = llm.invoke(structured_text)
    # Extract the actual text from the LLM response
    raw_output = response.content

    # Print the cleaned raw output (for debugging)
    # print("Cleaned Raw LLM Output:\n", raw_output)

    # Extract (Entity1, Relationship, Entity2) triples using regex
    # matches = re.findall(r'"(.*?)" , "(.*?)" , "(.*?)"', raw_output)
    matches = re.findall(r'\("([^"]+)",\s*"([^"]+)",\s*"([^"]+)"\)', raw_output)

    # print(f"matches:\n{matches}")

    # Convert extracted triples into nodes and relationships
    nodes = set()
    relationships = []

    for entity1, relation, entity2 in matches:
        nodes.add(entity1)
        nodes.add(entity2)
        relationships.append((entity1, relation, entity2))

    # Convert to Full Node-Relationship-Node Format
    formatted_graph = [f"({e1}) -[:{r.replace(' ', '_').upper()}]-> ({e2})" for e1, r, e2 in relationships]
    if len(formatted_graph) == 0:
        print("Error no nodes found")
        bad_batch.append(batch)

    combined_formatted_graph.extend(formatted_graph)
    print(len(combined_formatted_graph))
    # print()
    # print(formatted_graph)
print("Complete")

12
24
Error no nodes found
24
Error no nodes found
24
Error no nodes found
24


KeyboardInterrupt: 

In [12]:
bad_batch[0]
# print(combined_formatted_graph)

['<AbstractText>A purification procedure is reported for obtaining bovine liver dihydrofolate reductase in high yield and amounts of 100-200 mg. A key step in the procedure is the use of an affinity gel prepared by coupling pteroyl-L-lysine to Sepharose. The purified reductase has a specific activity of about 100 units/mg and is homogeneous as judged by analytical ultracentrifugation, polyacrylamide gel electrophoresis, and titration with methotrexate. The products of the first step of Edman degradation indicated a minimum purity of 79%. The reductase has a molecular weight of about 21500 on the basis of amino acid composition and 22100 +/- 300 from equilibrium sedimentation. It is not inhibited by antiserum to the Streptococcus faecium reductase (isoenzyme 2). Unlike the reductase of many other vertebrate tissues, the bovine enzyme is inhibited by mercurials rather than activated and it has a single pH optimum at both low and high ionic strength. However, the position of the pH optimu

In [7]:
# Create output of all relationships found
with open('graph_output.txt', 'w') as output_file:
    for line in combined_formatted_graph:
        output_file.write(line + '\n')

print(f"Output written to graph_output.txt with {len(combined_formatted_graph)} lines")

Output written to graph_output.txt with 218 lines


# More Experimentation

In [15]:
# Read the test file
file_path = "/Users/brianmann/Downloads/test/test_file.txt"
# file_path="/Users/brianmann/Downloads/test/pubmed25n0001.xml"
with open(file_path, "r", encoding="utf-8") as file:
    content = file.read()

# Structure the text for better entity & relationship extraction

# Bad Prompt
# structured_text = f"Abstract: {content}\nExtract medical relationships as structured triples (Entity1, Relationship, Entity2). Please find 100 examples in this format and mention which line they come from"
    
# Good Prompt
structured_text = f'''Abstract: {content}\nExtract medical relationships as structured triples (Entity1, Relationship, Entity2). Put all elements of a tuple on the same line
in the format ("entity1", "relationship", "entity2") . There must be two entities and a relationship and the relationship should not incluee the entities. Please at least 1 example for every line in the format specified'''

# structured_text = f"""Abstract: {content}\nExtract all meaningful relationships from the following medical abstract. 
# Provide structured triples in this exact format:
# (Entity1) - [Relationship] -> (Entity2)."""


# Call Llama3.2 dynamically and extract the response text
# This could be a real problem doesn't take very many tokens by default
response = llm.invoke(structured_text)
# raw_output = llm.invoke(structured_text, max_tokens=1024)

# Extract the actual text from the LLM response
raw_output = response.content

# Print the cleaned raw output (for debugging)
print("Cleaned Raw LLM Output:\n", raw_output)

# Extract (Entity1, Relationship, Entity2) triples using regex
# matches = re.findall(r'"(.*?)" , "(.*?)" , "(.*?)"', raw_output)
matches = re.findall(r'\("([^"]+)",\s*"([^"]+)",\s*"([^"]+)"\)', raw_output)

# print(f"matches:\n{matches}")

# Convert extracted triples into nodes and relationships
nodes = set()
relationships = []

for entity1, relation, entity2 in matches:
    nodes.add(entity1)
    nodes.add(entity2)
    relationships.append((entity1, relation, entity2))

# Convert to Full Node-Relationship-Node Format
formatted_graph = [f"({e1}) -[:{r.replace(' ', '_').upper()}]-> ({e2})" for e1, r, e2 in relationships]
print()
print(formatted_graph)
# Print formatted output
# print("Extracted Nodes:", list(nodes))
# print("Extracted Relationships:", formatted_graph)


Cleaned Raw LLM Output:
 Here are the extracted medical relationships as structured triples:

("Sch 1000", "was found to be equipotent with", "oxazepam")
("Sch 1000", "produced a clear trend towards lowered mean values of acid", "stimulated acid")
("Sch 1000", "increased pH value only during basal period", "pH value")
("Sch 1000 + oxazepam", "was found to be equipotent with", "basal acid")
("Oxazepam", "reduced amount of basal acid only during first 30 min", "basal acid")
("Stroma from normal red cells", "inhibited lysis in sucrose test", "lysis")
("Stroma from PNH-like red cells", "enhanced lysis in acidified-serum test", "lysis")
("Exclusion peak from Sephadex G-200", "induced lysis of PNH-like cells in unacidified serum", "lysis")
("Alternate pathway activation", "controlled classical pathway activation", "classical pathway activation")
("Stroma from normal red cells", "activated complement through alternate pathway", "complement")
("Stroma from PNH-like red cells", "induced lysis o

In [14]:
len(formatted_graph)

10

In [6]:

# Extract (Entity1, Relationship, Entity2) triples using regex
matches = re.findall(r"`(.*?)` - `(.*?)` - `(.*?)`", raw_output)

# Convert extracted triples into nodes and relationships
nodes = set()
relationships = []

for entity1, relation, entity2 in matches:
    nodes.add(entity1)
    nodes.add(entity2)
    relationships.append((entity1, relation, entity2))

# Convert to Full Node-Relationship-Node Format
formatted_graph = [f"({e1}) -[:{r.replace(' ', '_').upper()}]-> ({e2})" for e1, r, e2 in relationships]

# Print formatted output
print("Extracted Nodes:", list(nodes))
print("Extracted Relationships:", formatted_graph)
print()
# Print formatted output
print("Full Node-Relationship-Node Format:\n")
for triple in formatted_graph:
    print(triple)


Extracted Nodes: []
Extracted Relationships: []

Full Node-Relationship-Node Format:



# With Chunking

In [3]:
import re
from langchain_ollama import ChatOllama
from concurrent.futures import ThreadPoolExecutor

# Initialize Llama3.2
llm = ChatOllama(model="llama3.2", temperature=0, max_tokens=2048)

# Read test file (each line is a separate abstract)
file_path = "/Users/brianmann/Downloads/test/test_file_small.txt"
with open(file_path, "r", encoding="utf-8") as file:
    abstracts = file.readlines()  # Read each line as an abstract

# Remove empty lines and strip whitespace
abstracts = [line.strip() for line in abstracts if line.strip()]

print(f"\n📌 Total Abstracts: {len(abstracts)}")  # Debugging: See how many abstracts are created

# Function to process each abstract (single line)
def process_abstract(abstract, index):
    print(f"\n📌 Processing Abstract {index+1}/{len(abstracts)}")

    structured_text = f"Abstract: {abstract}\nExtract medical relationships as structured triples (Entity1, Relationship, Entity2)."

    # Call Llama3.2
    response = llm.invoke(structured_text)
    raw_output = response["content"] if isinstance(response, dict) and "content" in response else str(response)

    # Extract relationships using regex
    matches = re.findall(r"`(.*?)` - `(.*?)` - `(.*?)`", raw_output)
    return matches

# Process abstracts in parallel
all_relationships = []
with ThreadPoolExecutor() as executor:
    results = executor.map(lambda pair: process_abstract(pair[1], pair[0]), enumerate(abstracts))
    for result in results:
        all_relationships.extend(result)

# Extract nodes
all_nodes = set()
for entity1, relation, entity2 in all_relationships:
    all_nodes.add(entity1)
    all_nodes.add(entity2)

# Convert to Full Node-Relationship-Node Format
formatted_graph = [f"({e1}) -[:{r.replace(' ', '_').upper()}]-> ({e2})" for e1, r, e2 in all_relationships]

# Print formatted results
print("\n✅ Extracted Nodes:", list(all_nodes))
print("\n✅ Extracted Relationships:", formatted_graph)



📌 Total Abstracts: 10

📌 Processing Abstract 1/10

📌 Processing Abstract 2/10

📌 Processing Abstract 3/10

📌 Processing Abstract 4/10

📌 Processing Abstract 5/10

📌 Processing Abstract 6/10

📌 Processing Abstract 7/10

📌 Processing Abstract 8/10

📌 Processing Abstract 9/10

📌 Processing Abstract 10/10


# Changing the prompt

In [33]:
# Read the test file
file_path = "/Users/brianmann/Downloads/test/test_file.txt"
# filepath="/Users/brianmann/Downloads/pubmed25n0001.xml"
with open(file_path, "r", encoding="utf-8") as file:
    content = file.read()

relationship_prompt = PromptTemplate(
    input_variables=["content"],  # This must match the placeholder in the template
    template="""
    Given the following biomedical text:

    {content}

    Identify and extract key medical relationships as structured triples.
    The output **MUST** be in **JSON format** with the following structure:

    ```json
    [
        {"entity1": "X-ray", "relationship": "Induces", "entity2": "DNA double strand breaks"},
        {"entity1": "Neutral filter elution method", "relationship": "Detects", "entity2": "DNA double strand breaks"}
    ]
    ```

    **Rules:**
    1. Use precise, domain-specific biomedical relationships.
    2. Include only factual relationships found in the text.
    3. Maintain a structured JSON format with "entity1", "relationship", and "entity2".

    Return ONLY the JSON object, with no extra text.
    """
)

formatted_prompt = relationship_prompt.format(content=content)

response = llm.invoke(formatted_prompt)
# raw_output = llm.invoke(structured_text, max_tokens=1024)

print(response)


KeyError: '"entity1"'

# Iterative approach

In [56]:
from langchain_core.prompts import PromptTemplate

# Read the test file
file_path = "/Users/brianmann/Downloads/test/test_file.txt"
with open(file_path, "r", encoding="utf-8") as file:
    content = file.read()

# ✅ Define a flexible prompt (not enforcing JSON)
relationship_prompt = f"""
    Given the following text:

    {content}

    Identify and extract key medical relationships from this text.
    
    **Guidelines:**
    - List all meaningful **biomedical entities**.
    - Show how these entities are **related**.
    - Format relationships as **structured triples** when possible.
    - Provide at least one relationship per line of the document

    **Example Output:**
    1. **Entity 1**: X-ray
       - **Relationship**: Induces
       - **Entity 2**: DNA double strand breaks
       
    2. **Entity 1**: Neutral filter elution method
       - **Relationship**: Detects
       - **Entity 2**: DNA double strand breaks
   
   Do not provide any extra output once you have completed the task.
    """

# ✅ Correctly format the prompt
# formatted_prompt = relationship_prompt.format(content=content)

# ✅ Call the LLM
response = llm.invoke(relationship_prompt)

# ✅ Print raw output first for inspection
print("First Iteration:\n", response, "\n")

# Take the response and feed it as the content
content2 = str(response).split("additional_kwargs")[0]

# before_additional_kwargs = text.split("additional_kwargs")[0]




# Iteration 2
structured_text = f"""
Previously, I provided this text:

{content}

and you provided these relations:

{content2}

Now, please extract additional medical relationships from the text I provided.
"""

response = llm.invoke(structured_text)

# ✅ Print raw output first for inspection
print("Second Iteration:\n", response, "\n")

# Take the response and feed it as the content
content3 = str(response).split("additional_kwargs")[0]




# Iteration 3
relationship_prompt = f"""
    Given the following previous responses:

    {content2}

    and 

    {content3}

Remove any introduction and format all of the data in the form:
    1. **Entity 1**: X-ray
       - **Relationship**: Induces
       - **Entity 2**: DNA double strand breaks
       
    2. **Entity 1**: Neutral filter elution method
       - **Relationship**: Detects
       - **Entity 2**: DNA double strand breaks

    """

# ✅ Correctly format the prompt
# formatted_prompt = relationship_prompt.format(content2=content2, content3=content3)

# ✅ Call the LLM
response = llm.invoke(relationship_prompt)

response_clean = str(response).split("additional_kwargs")[0]



# ✅ Print raw output first for inspection
print("Final LLM Output:\n", response_clean, "\n")


First Iteration:
 content="Here are the extracted key medical relationships:\n\n1. **X-ray** \n    - **Induces**\n    - **DNA double strand breaks**\n\n2. **Neutral filter elution method** \n    - **Detects**\n    - **DNA double strand breaks**\n\n3. **HpA I restriction endonuclease** \n    - **Introduces**\n    - **Double strand cuts in DNA**\n\n4. **Cucumber mosaic virus** \n    - **Has RNAs with sequences of 270 residues from the 3'-terminus**\n       - **Corresponding to each segment of the influenza virus genome**\n\n5. **Influenza virus** \n    - **Has segments corresponding to each RNA in the genome**\n       - **Mapped by restriction endonuclease analysis**\n\n6. **Poly(A(+))-RNA(tot)** \n    - **Contains a major peak in the 10-13 S region**\n       - **Accounting for approximately 35% of the total poly(A(+))-RNA applied**\n\n7. **Poly(A(+))-RNA(11S)** \n    - **Contains a single major peak in the 11S region**\n       - **Resolving to a single major component**\n\n8. **(3)H-cDN

# Trying just 10 abstracts

In [67]:
# Read the test file
file_path = "/Users/brianmann/Downloads/test/test_file_small.txt"
# filepath="/Users/brianmann/Downloads/pubmed25n0001.xml"
with open(file_path, "r", encoding="utf-8") as file:
    content = file.read()

# Structure the text for better entity & relationship extraction
structured_text = f"""Abstract: {content} \n
You are an expert Knowledge Graph creater with the ability to extract relationships from medical research.
You must connect two entities and state their relationship from the content provided. Extract medical relationships as structured triples (Entity1, Relationship, Entity2).
Don't explain your reasoning just provide me with a list of the tuples that you are able to find. There must be two entities and their relationship which gives a total of three things.
\n"""

# print(structured_text)

response = llm.invoke(structured_text)

# Extract the actual text from the LLM response
# raw_output = response["content"] if isinstance(response, dict) and "content" in response else str(response)

# Print the cleaned raw output (for debugging)
print("Cleaned Raw LLM Output:\n", response.content)

# Extract (Entity1, Relationship, Entity2) triples using regex
matches = re.findall(r"`(.*?)` - `(.*?)` - `(.*?)`", raw_output)

# Convert extracted triples into nodes and relationships
nodes = set()
relationships = []

for entity1, relation, entity2 in matches:
    nodes.add(entity1)
    nodes.add(entity2)
    relationships.append((entity1, relation, entity2))

# Convert to Full Node-Relationship-Node Format
formatted_graph = [f"({e1}) -[:{r.replace(' ', '_').upper()}]-> ({e2})" for e1, r, e2 in relationships]

# Print formatted output
print("Extracted Nodes:", list(nodes))
print("Extracted Relationships:", formatted_graph)


Cleaned Raw LLM Output:
 Here is the list of extracted medical relationships:

1. Multiple Sclerosis, Myelinotoxicity, Cerebrospinal Fluid
2. Alpha Fetoprotein, Neural Tube Defects, Open Neural Tube Defects
3. Beta2-Microglobulin, Proximal Tubular Function, Diuresis
4. Gluten-Free Diet, Villopathy, Malabsorption
5. Papanicolaou Smear, Cigarette Consumption, Oral Health
6. Hepatitis B Virus, Polyarteritis, e Ag/Ab System
Extracted Nodes: []
Extracted Relationships: []
