In [1]:
# Import necessary libraries
import os
import re
from pathlib import Path

from google.adk.models.lite_llm import LiteLlm # For OpenAI support

# Convenience libraries for working with Neo4j inside of Google ADK
from neo4j_for_adk import graphdb, tool_success, tool_error

import warnings
# Silence every Python warning for the rest of the session. This keeps the
# notebook output clean, but it also hides deprecation notices.
warnings.filterwarnings("ignore")

import logging
# Configure logging so that only CRITICAL records get through.
logging.basicConfig(level=logging.CRITICAL)

# Marker so the reader can see that the setup cell ran to completion.
print("Libraries imported.")

Libraries imported.


In [2]:
# --- Define Model Constants for easier use ---
# Model identifier handed to LiteLlm below.
MODEL_GPT_4O = "openai/gpt-4o"

# ADK-side LLM wrapper. The KG pipeline further down builds its own client
# (`llm_for_neo4j`) and does not use this object.
llm = LiteLlm(model=MODEL_GPT_4O)

# Test LLM with a direct call
# NOTE(review): this sends a real completion request, so it presumably needs
# OpenAI credentials in the environment — confirm before running.
print(llm.llm_client.completion(model=llm.model, messages=[{"role": "user", "content": "Are you ready?"}], tools=[]))

print("\nOpenAI ready.")

ModelResponse(id='chatcmpl-CHVPZKEaRZxgZo7MG3Bn3kEfNsMzB', created=1758288357, model='gpt-4o-2024-08-06', object='chat.completion', system_fingerprint='fp_cbf1785567', choices=[Choices(finish_reason='stop', index=0, message=Message(content="Yes, I'm ready. How can I assist you today?", role='assistant', tool_calls=None, function_call=None, provider_specific_fields={'refusal': None}, annotations=[]), provider_specific_fields={})], usage=Usage(completion_tokens=13, prompt_tokens=27, total_tokens=40, completion_tokens_details=CompletionTokensDetailsWrapper(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0, text_tokens=None), prompt_tokens_details=PromptTokensDetailsWrapper(audio_tokens=0, cached_tokens=0, text_tokens=None, image_tokens=None)), service_tier='default')

OpenAI ready.


In [3]:
# Check connection to Neo4j by sending a query
# The query touches no data; it only returns a constant string as `message`.
neo4j_is_ready = graphdb.send_query("RETURN 'Neo4j is Ready!' as message")

# On success the result is a dict with 'status' and 'query_result' keys.
print(neo4j_is_ready)

{'status': 'success', 'query_result': [{'message': 'Neo4j is Ready!'}]}


In [4]:
from tools import load_product_nodes

# Helper imported from the local `tools` module; judging by its name it loads the
# Product nodes into the graph — see tools.py for the details.
load_product_nodes()

# expect to find non-entity nodes with a "Product" label
# Lists the distinct label sets of every node that lacks the `__Entity__` label.
graphdb.send_query("MATCH (n) WHERE NOT n:`__Entity__` return DISTINCT labels(n) as nonEntityLabels")

{'status': 'success',
 'query_result': [{'nonEntityLabels': ['Assembly']},
  {'nonEntityLabels': ['Part']},
  {'nonEntityLabels': ['Product']},
  {'nonEntityLabels': ['Supplier']}]}

## 8.2.3 Initialize State from Previous Workflow

In [5]:
# the approved construction plan should look something like this...
#
# Each top-level key names one construction rule. Two shapes appear:
#   * "construction_type": "node"
#       source_file, label, unique_column_name, properties
#   * "construction_type": "relationship"
#       source_file, relationship_type, from_node_label / from_node_column,
#       to_node_label / to_node_column, properties
# The cells visible in this section do not read this plan again; it is presumably
# consumed by the workflow that produced it — confirm against the previous lesson.
approved_construction_plan = {
    "Assembly": {
        "construction_type": "node", 
        "source_file": "assemblies.csv", 
        "label": "Assembly", 
        "unique_column_name": "assembly_id", 
        "properties": ["assembly_name", "quantity", "product_id"]
    }, 
    "Part": {
        "construction_type": "node", 
        "source_file": "parts.csv", 
        "label": "Part", 
        "unique_column_name": "part_id", 
        "properties": ["part_name", "quantity", "assembly_id"]
    }, 
    "Product": {
        "construction_type": "node", 
        "source_file": "products.csv", 
        "label": "Product", 
        "unique_column_name": "product_id", 
        "properties": ["product_name", "price", "description"]
    }, 
    "Supplier": {
        "construction_type": "node", 
        "source_file": "suppliers.csv", 
        "label": "Supplier", 
        "unique_column_name": "supplier_id", 
        "properties": ["name", "specialty", "city", "country", "website", "contact_email"]
    }, 
    "Contains": {
        "construction_type": "relationship", 
        "source_file": "assemblies.csv", 
        "relationship_type": "Contains", 
        "from_node_label": "Product", 
        "from_node_column": "product_id", 
        "to_node_label": "Assembly", 
        "to_node_column": "assembly_id", 
        "properties": ["quantity"]
    }, 
    "Is_Part_Of": {
        "construction_type": "relationship", 
        "source_file": "parts.csv", 
        "relationship_type": "Is_Part_Of", 
        "from_node_label": "Part", 
        "from_node_column": "part_id", 
        "to_node_label": "Assembly", 
        "to_node_column": "assembly_id", 
        "properties": ["quantity"]
    }, 
    "Supplied_By": {
        "construction_type": "relationship", 
        "source_file": "part_supplier_mapping.csv", 
        "relationship_type": "Supplied_By", 
        "from_node_label": "Part", 
        "from_node_column": "part_id", 
        "to_node_label": "Supplier", 
        "to_node_column": "supplier_id", 
        "properties": ["supplier_name", "lead_time_days", "unit_cost", "minimum_order_quantity", "preferred_supplier"]
    }
}



In [6]:
# Markdown review files to turn into the subject graph. The paths are relative:
# the processing loop joins each one onto the Neo4j import directory.
approved_files = [
    "product_reviews/gothenburg_table_reviews.md",
    "product_reviews/helsingborg_dresser_reviews.md",
    "product_reviews/jonkoping_coffee_table_reviews.md",
    "product_reviews/linkoping_bed_reviews.md",
    "product_reviews/malmo_desk_reviews.md",
    "product_reviews/norrkoping_nightstand_reviews.md",
    "product_reviews/orebro_lamp_reviews.md",
    "product_reviews/stockholm_chair_reviews.md",
    "product_reviews/uppsala_sofa_reviews.md",
    "product_reviews/vasteras_bookshelf_reviews.md"
]

In [7]:
# approved entities from the `ner_agent` of Lesson 7
# These labels become the node types of the extraction schema further down.
approved_entities = ['Product', 'Issue', 'Feature', 'Location']

In [8]:
# approved fact types from the `relevant_fact_agent` of Lesson 7
# Keyed by predicate name; each value records the subject label, the predicate
# label and the object label of one (subject)-[predicate]->(object) fact type.
approved_fact_types = {'has_issue': {'subject_label': 'Product', 'predicate_label': 'has_issue', 'object_label': 'Issue'}, 'includes_feature': {'subject_label': 'Product', 'predicate_label': 'includes_feature', 'object_label': 'Feature'}, 'used_in_location': {'subject_label': 'Product', 'predicate_label': 'used_in_location', 'object_label': 'Location'}}

### 8.3 Tool Definitions for loading, chunking and entity extraction

In [9]:
from neo4j_graphrag.experimental.pipeline.kg_builder import SimpleKGPipeline
 
# Illustration only: the `if False:` guard means this block never runs. It lists
# the arguments a KG pipeline needs; the real values are built in later cells.
if False:
    example = SimpleKGPipeline(
        llm=None, # the LLM to use for entity and relation extraction
        driver=None,  # a neo4j driver to write results to the graph
        embedder=None,  # an Embedder for chunks
        from_pdf=True,   # sort of True: a custom loader stands in for the PDF loader
        pdf_loader=None, # the custom loader for Markdown (defined later)
        text_splitter=None, # the text splitter (defined later)
        schema=None, # the entity schema (defined later)
        prompt_template=None, # the template used for entity extraction on each chunk
    )

## 8.3.2 Text-Splitter for Chunking up the Markdown
Define a custom text splitter that uses regex patterns to chunk markdown text. This splitter breaks documents at specified delimiters (like "---") to create meaningful text segments for processing.

In [10]:
from neo4j_graphrag.experimental.components.text_splitters.base import TextSplitter
from neo4j_graphrag.experimental.components.types import TextChunk, TextChunks

# Define a custom text splitter. Chunking strategy could be yet-another-agent
class RegexTextSplitter(TextSplitter):
    """Split text into chunks wherever a regular-expression delimiter matches.

    Args:
        re (str): Regular-expression pattern used as the chunk delimiter,
            for example "---". The parameter name shadows the ``re`` module,
            but only inside ``__init__``.
    """
    def __init__(self, re: str):
        # Kept under the attribute name `re` so existing readers of
        # `splitter.re` keep working.
        self.re = re

    async def run(self, text: str) -> TextChunks:
        """Splits a piece of text into chunks.

        Args:
            text (str): The text to be split.

        Returns:
            TextChunks: The chunks in document order; each chunk's ``index`` is
            its position in the split result, starting at 0.
        """
        # `re` here is the module: the constructor parameter of the same name
        # is not in scope in this method.
        pieces = re.split(self.re, text)
        chunks = [
            TextChunk(text=str(piece), index=position)
            for position, piece in enumerate(pieces)
        ]
        return TextChunks(chunks=chunks)



## 8.3.3 Custom Markdown Data Loader
This custom loader adapts the Neo4j GraphRAG PDF loader to work with markdown files. It reads markdown content, extracts the document title from the first H1 header, and wraps it in the expected document format for the pipeline.

In [11]:
# custom file data loader

from neo4j_graphrag.experimental.components.pdf_loader import DataLoader
from neo4j_graphrag.experimental.components.types import PdfDocument, DocumentInfo

class MarkdownDataLoader(DataLoader):
    """Load a Markdown file in the document shape the PDF-oriented pipeline expects."""

    def extract_title(self,markdown_text):
        """Return the text of the first H1 header, or "Untitled" when there is none.

        Args:
            markdown_text (str): Full Markdown content of the document.

        Returns:
            str: Everything after "# " on the first line that starts with "# ".
        """
        # MULTILINE lets ^ and $ anchor on each line, so the header does not
        # have to be the very first line of the file.
        match = re.search(r'^# (.+)$', markdown_text, re.MULTILINE)

        return match.group(1) if match else "Untitled"

    async def run(self, filepath: Path, metadata = None) -> PdfDocument:
        """Read a Markdown file and wrap it as a ``PdfDocument``.

        Args:
            filepath (Path): Location of the Markdown file.
            metadata (dict, optional): Extra document metadata supplied by the
                caller. It is copied into the document info; the extracted
                "title" always takes precedence. Defaults to None.

        Returns:
            PdfDocument: The raw Markdown text plus a ``DocumentInfo`` carrying
            the file path and the metadata (including "title").
        """
        # Explicit encoding: do not depend on the platform default.
        with open(filepath, "r", encoding="utf-8") as f:
            markdown_text = f.read()

        # Copy instead of mutating the caller's dict; a None default avoids
        # sharing one mutable default object between calls.
        document_metadata = dict(metadata or {})
        document_metadata["title"] = self.extract_title(markdown_text)

        markdown_info = DocumentInfo(
            path=str(filepath),
            metadata=document_metadata,
        )
        return PdfDocument(text=markdown_text, document_info=markdown_info)

In [12]:
from neo4j_graphrag.llm import OpenAILLM
from neo4j_graphrag.embeddings import OpenAIEmbeddings

# create an OpenAI client for use by Neo4j GraphRAG
# temperature is pinned to 0 in the model parameters.
llm_for_neo4j = OpenAILLM(model_name="gpt-4o", model_params={"temperature": 0})

# use OpenAI for creating embeddings
embedder = OpenAIEmbeddings(model="text-embedding-3-large")

# use the same driver set up by neo4j_for_adk.py
# Sharing the driver means the pipeline writes to the same database that the
# `graphdb.send_query` calls in this notebook talk to.
neo4j_driver = graphdb.get_driver()

## 8.3.5 Entity Schema

In [13]:
# approved entities list can be used directly 
# Note: this is the same list object as `approved_entities`, not a copy.
schema_node_types = approved_entities

print("schema_node_types: ", schema_node_types)

schema_node_types:  ['Product', 'Issue', 'Feature', 'Location']


In [14]:
# the keys from approved fact types dictionary can be used for relationship types
# Keys are upper-cased, e.g. 'has_issue' becomes 'HAS_ISSUE'.
schema_relationship_types = [key.upper() for key in approved_fact_types.keys()]

print("schema_relationship_types: ", schema_relationship_types)

schema_relationship_types:  ['HAS_ISSUE', 'INCLUDES_FEATURE', 'USED_IN_LOCATION']


In [15]:
# rewrite the fact types into a list of [subject, PREDICATE, object] patterns
# (each pattern is a 3-item list; the predicate is upper-cased to match
# `schema_relationship_types`)
schema_patterns = [
    [ fact['subject_label'], fact['predicate_label'].upper(), fact['object_label'] ]
    for fact in approved_fact_types.values()
]

print("schema_patterns:", schema_patterns)

schema_patterns: [['Product', 'HAS_ISSUE', 'Issue'], ['Product', 'INCLUDES_FEATURE', 'Feature'], ['Product', 'USED_IN_LOCATION', 'Location']]


In [16]:
# the complete entity schema
# Bundles the node types, relationship types and allowed patterns built in the
# previous cells; this dict is what gets passed as `schema` to the KG pipeline.
entity_schema = {
    "node_types": schema_node_types,
    "relationship_types": schema_relationship_types,
    "patterns": schema_patterns,
    "additional_node_types": False, # True would be less strict, allowing unknown node types
}

### 8.3.6 Contextualized Entity Extraction Prompt
This helper function extracts the first few lines from a file to provide context for entity extraction. This context helps the LLM better understand the document structure and content when processing individual chunks.

In [17]:
def file_context(file_path:str, num_lines=5) -> str:
    """Helper function to extract the first few lines of a file

    The lines are returned exactly as they appear in the file, each keeping its
    own line ending. No extra blank lines are inserted between them.

    Args:
        file_path (str): Path to the file
        num_lines (int, optional): Maximum number of lines to extract. Fewer are
            returned when the file is shorter. Defaults to 5.

    Returns:
        str: First few lines of the file
    """
    # Explicit encoding: do not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as f:
        # zip() stops as soon as either the counter or the file runs out, so
        # at most `num_lines` lines are read.
        head = [line for _, line in zip(range(num_lines), f)]
    # Each line already ends with its newline, so join without a separator.
    return "".join(head)

In [18]:
# per-chunk entity extraction prompt, with context
def contextualize_er_extraction_prompt(context:str) -> str:
    """Creates a prompt template with pre-amble file content for entity+relationship extraction.

    The returned string is a template: the {schema} and {text} placeholders are
    left in place to be filled in later with str.format-style substitution, while
    `context` is baked in immediately.

    Args:
        context (str): Pre-amble text from the source file (for example its first
            few lines) that helps the LLM interpret each individual chunk. It may
            contain literal curly braces.

    Returns:
        str: The general instructions, the context wrapped in <context> tags, and
        the input section holding the {text} placeholder.
    """
    general_instructions = """
    You are a top-tier algorithm designed for extracting
    information in structured formats to build a knowledge graph.

    Extract the entities (nodes) and specify their type from the following text.
    Also extract the relationships between these nodes.

    Return result as JSON using the following format:
    {{"nodes": [ {{"id": "0", "label": "Person", "properties": {{"name": "John"}} }}],
    "relationships": [{{"type": "KNOWS", "start_node_id": "0", "end_node_id": "1", "properties": {{"since": "2024-08-01"}} }}] }}

    Use only the following node and relationship types (if provided):
    {schema}

    Assign a unique ID (string) to each node, and reuse it to define relationships.
    Do respect the source and target node types for relationship and
    the relationship direction.

    Make sure you adhere to the following rules to produce valid JSON objects:
    - Do not return any additional information other than the JSON in it.
    - Omit any backticks around the JSON - simply output the JSON on its own.
    - The JSON object must not wrapped into a list - it is its own JSON object.
    - Property names must be enclosed in double quotes
    """

    # The result is formatted again later, so literal braces coming from the file
    # must be doubled. Otherwise a context such as "{draft}" would be mistaken for
    # a placeholder and break the substitution.
    escaped_context = context.replace("{", "{{").replace("}", "}}")

    context_goes_here = f"""
    Consider the following context to help identify entities and relationships:
    <context>
    {escaped_context}  
    </context>"""

    input_goes_here = """
    Input text:

    {text}
    """

    return general_instructions + "\n" + context_goes_here + "\n" + input_goes_here

## 8.4 Make and Use the Knowledge Graph (KG) builder
This function creates a customized KG builder pipeline for a specific file by extracting file context and creating a contextualized extraction prompt. It combines all the previously defined components (loader, splitter, schema, LLM) into a complete pipeline.

Process each approved markdown file by creating a KG builder pipeline and running it asynchronously. This extracts entities and relationships from the text chunks and stores them in the Neo4j database as the subject graph.

In [23]:
def make_kg_builder(file_path:str) -> SimpleKGPipeline:
    """Assemble a KG pipeline whose extraction prompt is tailored to one file.

    The opening lines of `file_path` are embedded in the prompt so that every
    chunk is extracted with some knowledge of the document it came from.
    """
    prompt_for_file = contextualize_er_extraction_prompt(file_context(file_path))
    print(prompt_for_file)

    pipeline_settings = dict(
        llm=llm_for_neo4j,  # LLM that performs entity and relation extraction
        driver=neo4j_driver,  # neo4j driver used to write results to the graph
        embedder=embedder,  # Embedder for chunks
        from_pdf=True,  # "PDF" mode, with the Markdown loader standing in for the PDF loader
        pdf_loader=MarkdownDataLoader(),  # custom loader for Markdown
        text_splitter=RegexTextSplitter("---"),  # chunk on "---" delimiters
        schema=entity_schema,  # allowed node types, relationship types and patterns
        prompt_template=prompt_for_file,
    )
    return SimpleKGPipeline(**pipeline_settings)

In [24]:
from helper import get_neo4j_import_dir

# Fall back to the current directory when the helper returns a falsy value.
neo4j_import_dir = get_neo4j_import_dir() or "."
print(neo4j_import_dir)
for file_name in approved_files:
    # approved_files holds paths relative to the import directory
    file_path = os.path.join(neo4j_import_dir, file_name)
    print(f"Processing file: {file_path}")
    # A fresh pipeline per file: the extraction prompt embeds that file's opening lines.
    kg_builder = make_kg_builder(file_path)
    # Top-level `await` needs an environment that supports it (e.g. a notebook cell).
    results = await kg_builder.run_async(file_path=str(file_path))
    print("\tResults:", results.result)
print("All files processed.")

/Users/ifilimon/Documents/neo4j/import
Processing file: /Users/ifilimon/Documents/neo4j/import/product_reviews/gothenburg_table_reviews.md

    You are a top-tier algorithm designed for extracting
    information in structured formats to build a knowledge graph.

    Extract the entities (nodes) and specify their type from the following text.
    Also extract the relationships between these nodes.

    Return result as JSON using the following format:
    {{"nodes": [ {{"id": "0", "label": "Person", "properties": {{"name": "John"}} }}],
    "relationships": [{{"type": "KNOWS", "start_node_id": "0", "end_node_id": "1", "properties": {{"since": "2024-08-01"}} }}] }}

    Use only the following node and relationship types (if provided):
    {schema}

    Assign a unique ID (string) to each node, and reuse it to define relationships.
    Do respect the source and target node types for relationship and
    the relationship direction.

    Make sure you adhere to the following rules to produ

CancelledError: 

### GraphRAG documentation here: https://graphrag.com/reference/knowledge-graph/domain-graph/