### Prepare the code and set up the database

In [66]:
import os
import json
import pandas as pd
import re
import unicodedata
#from langchain_community.graphs import Neo4jGraph
from langchain_neo4j import Neo4jGraph
from langchain_neo4j import Neo4jVector
from langchain_ollama import OllamaEmbeddings, ChatOllama
from utils.pdf_utils import extract_pdf_text_by_page, chunk_pages
from langchain_core.messages import SystemMessage, HumanMessage

In [67]:
# Read the source JSON twice: once as plain Python objects, once as a DataFrame
data_path = "data_json/test_data.json"

with open(data_path, mode='r', encoding='utf-8') as f:
    # Raw records; the insertion loop iterates over these
    data = json.loads(f.read())

# Tabular view of the same file
df = pd.read_json(data_path)

In [68]:
# Neo4j connection settings.
# Credentials are read from the environment (NEO4J_URI / NEO4J_USERNAME /
# NEO4J_PASSWORD) so they do not have to be stored in the notebook. The
# previous hard-coded values remain as fallbacks for a local development
# database, so behaviour is unchanged when the variables are not set.
url = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "password")

# Shared graph handle; the insertion and retrieval code calls graph.query(...)
graph = Neo4jGraph(
    url=url,
    username=username,
    password=password,
)

### Define helpful prompts and linkages

In [69]:
# Human-readable description of every node type; embedded into the LLM system prompt
entity_types = dict(
    document="Represents a digital file stored in the system, such as a PDF, drawing, or report. Each document node stores metadata like name, path, size, imported date, and template flag.",
    user="Represents the person or account who imported, created, or owns the document. Typically identified by username.",
    file_type="Represents the document format, such as 'PDF', 'DOCX', or 'DWG'. Shared across multiple documents to avoid duplication.",
    status="Represents the current or historical status of a document (e.g., draft, approved, archived). May also hold status codes or timestamps.",
    file_date="Represents temporal information related to the document, such as the creation or modification date and time. Useful for time-based queries.",
    description="Represents a semantic classification or content type of the document ‚Äî for example 'PROCEDURE', 'MANUAL', 'DRAWING', 'SPECIFICATION'.",
)

# Human-readable description of every relationship type; also embedded into the system prompt
relation_types = dict(
    IMPORTED_BY="Indicates which user imported the document into the system.",
    HAS_FORMAT="Links a document to its file format (PDF, DOCX, etc.).",
    HAS_STATUS="Associates a document with its workflow or approval status.",
    HAS_FILETIME="Links a document to its file date and time metadata.",
    IS_TYPE="Classifies the document by description or semantic category (e.g., PROCEDURE, MANUAL).",
)

# Entity key -> relationship that connects a Document to that entity
entity_relationship_match = dict(
    user="IMPORTED_BY",
    file_type="HAS_FORMAT",
    status="HAS_STATUS",
    file_date="HAS_FILETIME",
    description="IS_TYPE",
)

In [70]:
# System prompt for the query-parsing LLM call (used as the SystemMessage in
# define_query). It is an f-string: the entity_types and relation_types
# dictionaries are rendered into it as indented JSON, and the literal braces of
# the example outputs are therefore doubled ({{ ... }}).
# NOTE(review): rule 2 restricts keys to the entity_types list, whose keys are
# lower-case ("user", "file_type", ...), while rule 3 and the examples use
# "Document", "User", "FileType", ... and similarity_search reads the keys
# 'User', 'FileType' and 'Description'. Confirm which spelling the model
# should emit and align the rule with it.
system_prompt = f"""
You are an intelligent assistant that converts natural language questions into structured JSON
queries for a Neo4j document graph.

The graph stores these entity types:
{json.dumps(entity_types, indent=2)}

Relationships between them:
{json.dumps(relation_types, indent=2)}

Each user query may reference one or more of these entities (e.g., User, Description, FileType, Status, FileDate).
Your goal is to extract as many of them as possible and return a JSON object with their corresponding values.

Follow these rules:
1Ô∏è‚É£ Always output valid JSON.
2Ô∏è‚É£ Include keys only from the entity_types list.
3Ô∏è‚É£ If the query references a document or file, include `"Document": "<name>"`.
4Ô∏è‚É£ If unsure, infer the most likely match (e.g., "procedure" ‚Üí `"Description": "Procedure"`, "PDF" ‚Üí `"FileType": "PDF"`).
5Ô∏è‚É£ If no information can be extracted, return an empty JSON.

Examples:

User: "Find all procedures imported by haaler related to PDF files."
Output:
{{
  "Description": "Procedure",
  "FileType": "PDF",
  "User": "haaler"
}}

User: "When was the document Kasra Taheri created?"
Output:
{{
  "Document": "Kasra Taheri",
  "FileDate": ""
}}

User: "What is the current status on the document Dick Ackerman?"
Output:
{{
  "Document": "Dick Ackerman",
  "Status": ""
}}
"""


In [71]:
# System prompt for the final answering step: graphrag_retrieve sends it as the
# SystemMessage, followed by a HumanMessage that carries the user question, the
# structured query and the retrieved context summary.
SYSTEM_REASONING_PROMPT = """
You are an assistant that answers questions about a knowledge graph of documents.
The graph contains entities like Document, User, FileType, Description, and Status.
You will be given:
1. A structured query (key‚Äìvalue pairs)
2. A subgraph context (retrieved from the graph)
Use them to answer concisely and factually.
"""

### Create the graph

In [72]:
# --- HELPER ---
def sanitize(text):
    """Return ``str(text)`` with all double quotes, single quotes and curly braces removed."""
    unwanted_chars = str.maketrans("", "", "\"'{}")
    return str(text).translate(unwanted_chars)

# Cypher statement with a fixed schema (no dynamic labels or relationships).
# It is identical for every record, so it is built once, outside the loop;
# all record values are supplied through query parameters.
# Note: document properties are written with ON CREATE SET only, so re-running
# this cell does not overwrite properties of Document nodes that already exist.
query = """
    MERGE (d:Document {id: $id})
    ON CREATE SET
        d.name        = $name,
        d.title       = $title,
        d.path        = $path,
        d.user        = $user,
        d.description = $description,
        d.format      = $file_format,
        d.size        = $size_str,
        d.imported    = $imported,
        d.template    = $template

    MERGE (u:User {name: $user})
    MERGE (ff:FileType {name: $file_format})
    MERGE (s:Status {code: $status_code})
    MERGE (fd:FileDate {date_str: $file_date, time_str: $file_time})
    MERGE (t:Description {type: $description})

    MERGE (d)-[:IMPORTED_BY]->(u)
    MERGE (d)-[:HAS_FORMAT]->(ff)
    MERGE (d)-[hs:HAS_STATUS]->(s)
      ON CREATE SET hs.at = $status_date
    MERGE (d)-[:HAS_FILETIME]->(fd)
    MERGE (d)-[:IS_TYPE]->(t)
    """

# Loop through each JSON object and add it to the DB
for i, obj in enumerate(data, start=1):
    # "uniqueid" is mandatory (it is the MERGE key); every other field is
    # optional, so the progress line reads Description the same tolerant way
    # the parameters below do instead of raising KeyError.
    print(f"{i}. Inserting document #{obj['uniqueid']} ({obj.get('Description', '')})")

    # Parameters for this record
    params = {
        "id": obj["uniqueid"],
        "name": obj.get("filename", ""),
        "title": obj.get("orig.filename", ""),
        "path": obj.get("path", ""),
        "user": obj.get("User", ""),
        "description": obj.get("Description", ""),
        "file_format": obj.get("FileType", ""),
        "size_str": obj.get("FileSize", ""),
        "imported": obj.get("Imported", ""),
        "template": obj.get("Template", 0),
        "status_code": obj.get("Status", 0),
        "status_date": obj.get("StatusDate", ""),
        "file_date": obj.get("FileDate", ""),
        "file_time": obj.get("FileTime", "")
    }

    # Run safely with parameters
    graph.query(query, params=params)

1. Inserting document #1 (PROCEDURE)
2. Inserting document #2 (REPORT)
3. Inserting document #3 (Onshore Risk Assessment)
4. Inserting document #4 (Jobcard)


### Embed the data

In [73]:
ollama_embedding_model = "qwen3-embedding:0.6b"
def embed_neo4j_nodes(node_label, index_name=None, text_props=None, embedding_model=None):
    """
    Create or update the Neo4j vector index for nodes of a given label.

    The work is delegated to ``Neo4jVector.from_existing_graph``, using the
    module-level ``url`` / ``username`` / ``password`` connection settings and
    storing vectors in the ``embedding`` node property.

    Args:
        node_label (str): The Neo4j node label (e.g. "Document", "User", "FileType").
        index_name (str, optional): Name of the vector index. Defaults to the
            lower-cased label followed by "_index" (e.g. "user_index").
        text_props (list, optional): Node properties whose text is embedded.
            Defaults to a per-label list, falling back to ["name"].
        embedding_model (str, optional): Ollama embedding model name. Defaults
            to the module-level ``ollama_embedding_model``.

    Returns:
        The object returned by ``Neo4jVector.from_existing_graph``, or ``None``
        when that call raised (the error is printed, not re-raised).
    """

    # Honour the documented default; previously None was passed straight
    # through to OllamaEmbeddings(model=None).
    if embedding_model is None:
        embedding_model = ollama_embedding_model

    if index_name is None:
        index_name = node_label.lower() + "_index"

    # Default text properties per node type
    if text_props is None:
        default_text_props = {
            "Document": ["name", "title", "description", "path", "format", "user"],
            "User": ["name"],
            "FileType": ["name"],
            "Status": ["code"],
            "FileDate": ["date_str", "time_str"],
            "Description": ["type"],
        }
        text_props = default_text_props.get(node_label, ["name"])  # fallback

    print(f"üîπ Creating/Updating embedding index for '{node_label}' using properties: {text_props}")

    try:
        vector_index = Neo4jVector.from_existing_graph(
            OllamaEmbeddings(model=embedding_model),
            url=url,
            username=username,
            password=password,
            index_name=index_name,
            node_label=node_label,
            text_node_properties=text_props,
            embedding_node_property="embedding",
        )
        print(f"‚úÖ Successfully embedded '{node_label}' nodes into vector index '{index_name}'")
        return vector_index
    except Exception as e:
        # Best effort: one failing label must not stop the remaining labels
        print(f"‚ö†Ô∏è Failed to embed '{node_label}': {e}")
        return None


# --- 1. Embed the main Document nodes under an explicitly named index ---
embed_neo4j_nodes(
    node_label="Document",
    index_name="documents",
    embedding_model=ollama_embedding_model,
)

# --- 2. Embed all related entities (Users, FileTypes, etc.); no index name is passed ---
related_labels = ["User", "FileType", "Status", "FileDate", "Description"]

for label in related_labels:
    embed_neo4j_nodes(node_label=label, embedding_model=ollama_embedding_model)

üîπ Creating/Updating embedding index for 'Document' using properties: ['name', 'title', 'description', 'path', 'format', 'user']
‚úÖ Successfully embedded 'Document' nodes into vector index 'documents'
üîπ Creating/Updating embedding index for 'User' using properties: ['name']
‚úÖ Successfully embedded 'User' nodes into vector index 'user_index'
üîπ Creating/Updating embedding index for 'FileType' using properties: ['name']
‚úÖ Successfully embedded 'FileType' nodes into vector index 'filetype_index'
üîπ Creating/Updating embedding index for 'Status' using properties: ['code']
‚úÖ Successfully embedded 'Status' nodes into vector index 'status_index'
üîπ Creating/Updating embedding index for 'FileDate' using properties: ['date_str', 'time_str']
‚úÖ Successfully embedded 'FileDate' nodes into vector index 'filedate_index'
üîπ Creating/Updating embedding index for 'Description' using properties: ['type']
‚úÖ Successfully embedded 'Description' nodes into vector index 'description_i

### An LLM processes the user query

In [74]:
ollama_model = "gemma3:4b"
def define_query(prompt, model=ollama_model):
    """
    Ask the chat model to translate a natural-language question into JSON.

    Args:
        prompt (str): The user's question; sent as the human message.
        model (str): Ollama chat model name.

    Returns:
        The ``content`` of the model's reply. The model is invoked with
        ``format="json"`` and ``temperature=0``, with ``system_prompt`` as the
        system message.
    """
    chat = ChatOllama(model=model, temperature=0, format="json")
    conversation = [SystemMessage(content=system_prompt), HumanMessage(content=prompt)]
    reply = chat.invoke(conversation)
    return reply.content

#### Embed the user query

In [75]:
# One shared embeddings client, reused for every query embedding
embeddings_client = OllamaEmbeddings(model=ollama_embedding_model)

def create_embedding(text):
    """Embed ``text`` with the shared Ollama embeddings client and return the result."""
    vector = embeddings_client.embed_query(text)
    return vector

### Vector / semantic retrieval

In [76]:
def _normalize_filter(value):
    """Turn one structured-query value into a usable filter string, or ``None`` for 'no filter'."""
    # Containers cannot be expressed as a single equality/CONTAINS filter.
    if value is None or isinstance(value, (list, tuple, set, dict)):
        return None
    text = str(value).strip()
    # The parsing prompt uses "" for "asked about but unknown"; an empty
    # string must disable the filter instead of matching nothing.
    return text or None


def similarity_search(prompt, threshold, structured_query=None):
    """
    Hybrid semantic + symbolic search over Document nodes.

    Args:
        prompt (str): Text to embed and compare against ``d.embedding``.
        threshold (float): Minimum cosine similarity (exclusive) for a match.
        structured_query (dict, optional): Parsed query; its 'User', 'FileType'
            and 'Description' values, when non-empty, restrict the candidates.
            Anything that is not a dict is treated as "no filters".

    Returns:
        list[dict]: Up to 10 matches, best first, each with the keys
        ``id``, ``name`` and ``similarity``.
    """
    filters = structured_query if isinstance(structured_query, dict) else {}
    embedding = create_embedding(prompt)

    # The similarity is computed once per candidate and reused for both the
    # threshold filter and the ordering.
    query = '''
        MATCH (d:Document)
        WHERE d.embedding IS NOT NULL
          AND ($user IS NULL OR toLower(d.user) = toLower($user))
          AND ($fileType IS NULL OR toLower(d.format) = toLower($fileType))
          AND ($description IS NULL OR toLower(d.description) CONTAINS toLower($description))
        WITH d, gds.similarity.cosine($embedding, d.embedding) AS sim
        WHERE sim > $threshold
        RETURN d, sim
        ORDER BY sim DESC
        LIMIT 10
    '''

    params = {
        'embedding': embedding,
        'threshold': threshold,
        'user': _normalize_filter(filters.get('User')),
        'fileType': _normalize_filter(filters.get('FileType')),
        'description': _normalize_filter(filters.get('Description')),
    }

    matches = []
    for row in graph.query(query, params=params):
        d = row.get('d', {})
        matches.append({
            "id": d.get('id'),
            "name": d.get('name'),
            "similarity": row.get('sim')
        })
    return matches


### Graph-based retrieval

In [77]:
def query_document_context(document_id, relationships_threshold):
    """
    Fetches the connected entities around a single Document node.
    Used for context enrichment in Mode A (no 'similar docs' expansion).

    Args:
        document_id: Value of the Document ``id`` property; coerced with ``int()``.
        relationships_threshold: Maximum number of hops for the neighbour
            traversal. Coerced to an integer and clamped to at least 1 because
            it has to be written into the Cypher text itself.

    Returns:
        dict | None: structured metadata for the given document (including
        ``neighbor_types``, the sorted distinct labels of the nodes reached by
        the traversal), or ``None`` when the id cannot be converted to an
        integer or no such document exists.

    Raises:
        ValueError: if ``relationships_threshold`` is not convertible to an integer.
    """
    # The hop bound is interpolated into the query string, so it must be
    # forced to a plain integer before it gets anywhere near the Cypher text.
    try:
        max_hops = max(1, int(relationships_threshold))
    except (TypeError, ValueError) as exc:
        raise ValueError(
            f"relationships_threshold must be an integer, got {relationships_threshold!r}"
        ) from exc

    # A missing or non-numeric id cannot match any document
    try:
        doc_id = int(document_id)
    except (TypeError, ValueError):
        return None

    query = f"""
    MATCH (d:Document {{id: $document_id}})
    OPTIONAL MATCH path=(d)-[*1..{max_hops}]-(n)
    WITH d, collect(DISTINCT n) AS neighbors
    OPTIONAL MATCH (d)-[:IMPORTED_BY]->(u:User)
    OPTIONAL MATCH (d)-[:HAS_FORMAT]->(ft:FileType)
    OPTIONAL MATCH (d)-[:HAS_STATUS]->(s:Status)
    OPTIONAL MATCH (d)-[:HAS_FILETIME]->(fd:FileDate)
    OPTIONAL MATCH (d)-[:IS_TYPE]->(desc:Description)
    RETURN d,
            u.name AS user,
            ft.name AS file_type,
            s.code AS status,
            fd.date_str AS file_date,
            desc.type AS description,
            [n IN neighbors | labels(n)] AS neighbor_types
    """

    result = graph.query(query, params={"document_id": doc_id})
    if not result:
        return None

    # Only the first row is used, even if the query returned several
    r = result[0]
    d = r.get("d", {})

    # neighbor_types arrives as one label list per neighbour; flatten and
    # de-duplicate it so the traversal result is actually surfaced.
    neighbor_labels = sorted({
        label
        for labels in (r.get("neighbor_types") or [])
        for label in (labels or [])
    })

    return {
        "id": d.get("id", "N/A"),
        "name": d.get("name", "Unnamed Document"),
        "title": d.get("title", ""),
        "description": r.get("description"),
        "file_type": r.get("file_type"),
        "user": r.get("user"),
        "status": r.get("status"),
        "file_date": r.get("file_date"),
        "imported": d.get("imported", ""),
        "neighbor_types": neighbor_labels,
    }


### Both approaches fused into one pipeline

In [78]:
def graphrag_retrieve(prompt, threshold, relationships_threshold):
    """
    Hybrid GraphRAG retriever combining:
      1. LLM-guided query parsing
      2. Semantic vector retrieval
      3. Graph-based context expansion
      4. LLM reasoning over the retrieved subgraph

    Args:
        prompt (str): The user's natural-language question.
        threshold (float): Similarity threshold forwarded to ``similarity_search``.
        relationships_threshold (int): Hop limit forwarded to ``query_document_context``.

    Returns:
        str: The LLM's answer, or a fixed notice when the semantic search
        returned no documents.
    """

    print(f"\nüîç Processing query: '{prompt}'")

    # --- Step 1: Let the LLM interpret the user prompt ---
    # Any failure here (model call, invalid JSON, or valid JSON that is not an
    # object) falls back to an empty structured query so that the raw prompt
    # is still used for the semantic search below.
    try:
        structured_query = json.loads(define_query(prompt))
        if not isinstance(structured_query, dict):
            raise ValueError(
                f"expected a JSON object, got {type(structured_query).__name__}"
            )
    except Exception as e:
        print("‚ö†Ô∏è Failed to parse structured query:", e)
        structured_query = {}

    print(f"üß© Structured query interpretation:\n{structured_query}\n")

    # --- Step 2: Perform semantic similarity search over documents ---
    # Search on the extracted values when there are any, otherwise on the prompt
    query_terms = " ".join(str(v) for v in structured_query.values() if v)
    search_text = query_terms if query_terms else prompt
    semantic_results = similarity_search(search_text, threshold, structured_query)

    print(f"üß† Found {len(semantic_results)} semantically similar document(s).\n")

    if not semantic_results:
        return "‚ö†Ô∏è No semantically relevant documents found."

    # --- Step 3: Graph-based expansion: fetch metadata for every hit ---
    context_data = []
    for doc in semantic_results:
        metadata = query_document_context(doc["id"], relationships_threshold)
        if metadata:
            context_data.append(metadata)

    print(f"üß© Enriched {len(context_data)} document(s) with graph metadata.\n")

    # --- Step 4: Build a readable context summary for the LLM ---
    context_summary = "\n".join(
        f"‚Ä¢ {c['name']} (ID: {c['id']})\n"
        f"  Type: {c['description']}\n"
        f"  Format: {c['file_type']}\n"
        f"  User: {c['user']}\n"
        f"  Status: {c['status']}\n"
        f"  File Date: {c['file_date']}\n"
        for c in context_data
    )

    print(f"IDs from semantic search: {[d['id'] for d in semantic_results]}")
    print(f"IDs enriched with metadata: {[c['id'] for c in context_data]}")

    # --- Step 5: Ask the LLM again with the combined context ---
    llm = ChatOllama(model=ollama_model, temperature=0)
    messages = [
        SystemMessage(content=SYSTEM_REASONING_PROMPT),
        HumanMessage(content=f"User question: {prompt}\n\nStructured query:\n{json.dumps(structured_query, indent=2)}\n\nContext:\n{context_summary}\n\nAnswer the question using this information only.")
    ]

    response = llm.invoke(messages)

    print("\nüí¨ LLM Response:\n")
    print(response.content)
    return response.content


### Define user query and execute GraphRAG program

In [79]:
# Example question, run end-to-end through the GraphRAG pipeline
user_prompt = "Find all documents imported by dkar."
response = graphrag_retrieve(
    prompt=user_prompt,
    threshold=0.5,
    relationships_threshold=3,
)


üîç Processing query: 'Find all documents imported by dkar.'
üß© Structured query interpretation:
{'User': 'dkar'}

üß† Found 1 semantically similar document(s).

üß© Enriched 1 document(s) with graph metadata.

IDs from semantic search: [2]
IDs enriched with metadata: [2]

üí¨ LLM Response:

32352-F-RA-0003 is a document imported by dkar.


## TODO
- Which information have we instructed the LLM to return about the documents it finds? More nodes are connected to each document than the answer mentions. This is why I am asking whether it can find the description, i.e. the "type" field on the Description node, which now shows the embedding as its name.
- We only get one answer for "Find all documents imported by dkar", but the correct answer is two documents. Implement weights?
- The query in the semantic search might be too specific: it filters specifically on user, description and file type.
- Fix the graph search so that it is an additional search through the graph nodes, instead of only retrieving the nodes connected to the documents that the vector search found.