### Imports

In [33]:
import neo4j
import os
from dotenv import load_dotenv
# LLM and Embedding Model
from neo4j_graphrag.llm import OpenAILLM
from neo4j_graphrag.embeddings.openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

### Load env variables

In [34]:
# Pull Neo4j connection settings from the environment. load_dotenv() runs first so
# that values kept in a local .env file are visible to os.getenv().
load_dotenv()
NEO4J_URI = os.getenv('NEO4J_URI')
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')

# os.getenv() returns None for unset variables; fail here with a readable message
# instead of letting the driver cell fail later with an opaque error.
# Only the variable names are reported, never their values.
_missing_neo4j_settings = [
    name
    for name, value in (
        ('NEO4J_URI', NEO4J_URI),
        ('NEO4J_USERNAME', NEO4J_USERNAME),
        ('NEO4J_PASSWORD', NEO4J_PASSWORD),
    )
    if not value
]
if _missing_neo4j_settings:
    raise RuntimeError(
        'Missing required environment variable(s): '
        + ', '.join(_missing_neo4j_settings)
        + '. Define them in the environment or in a .env file.'
    )

In [35]:
# Driver handle that is later passed to the KG pipeline; credentials come from
# the environment variables loaded above.
neo4j_driver = neo4j.GraphDatabase.driver(
    NEO4J_URI,
    auth=(NEO4J_USERNAME, NEO4J_PASSWORD),
)

In [36]:
# Chat model shared by schema inference and the KG pipeline.
llm_model_params = {
    # use json_object formatting for best results
    "response_format": {"type": "json_object"},
    # turning temperature down for more deterministic results
    "temperature": 0,
}
llm = OpenAILLM(model_name="gpt-4o-mini", model_params=llm_model_params)

# Embedding model handed to the KG pipeline (library defaults).
embedder = OpenAIEmbeddings()

# Extraction prompt, generalized for any document type; passed to the KG pipeline
# as its prompt_template.
# Placeholders: {schema}, {examples} and {text}. The JSON example uses doubled
# braces ({{ }}), which str.format-style substitution renders as literal { } --
# presumably that is how the pipeline fills the template; TODO confirm.
# NOTE(review): "task with" in the first sentence looks like a typo for
# "tasked with"; left untouched here because it is runtime text.
prompt_template = '''
You are a Knowledge Engineer task with extracting structured information from unstructured text 
to build a comprehensive property graph for advanced data analysis and retrieval.

Extract the entities (nodes) and specify their type from the following Input text based on the provided schema. 
Also, extract the directed relationships between these nodes.

Return the result strictly as a JSON object using the following format:
{{"nodes": [ {{"id": "unique_id", "label": "Entity_Type", "properties": {{"name": "Entity Name" }} }}],
  "relationships": [{{"type": "RELATIONSHIP_TYPE", "start_node_id": "unique_id", "end_node_id": "unique_id", "properties": {{"details": "Brief description of how they interact"}} }}] }}

---

### Constraints:
1. **Schema Adherence:** Use only the following node labels and relationship types:
{schema}

2. **Node IDs:** Assign a unique string ID to each node and use these IDs to define the relationships.
3. **Directionality:** Ensure the relationship direction (start_node to end_node) reflects the logic of the source text.
4. **Format:** Do not include any conversational text, preamble, or markdown formatting outside of the JSON block.

---

### Examples:
{examples}

---

### Input text:
{text}
'''

### Extract text from documents

`DocumentLoader` auto-detects whether a PDF has native text (→ pdfplumber) or is scanned (→ OCR via pytesseract).  
Set `force_ocr=True` to force the OCR path on any file.

In [45]:
from doc_to_graphrag.ingestion import DocumentLoader

# Load the worksheet PDF. The loader decides between native-text extraction and
# OCR on its own; set force_ocr=True to force OCR even on native-text PDFs.
loader = DocumentLoader()
result = loader.load('test-documents/2PX3WorksheetWeek2_restore.pdf')

# Print every metadata field the loader returned, one per line.
print('=== Document Metadata ===')
for field_name, field_value in result['metadata'].items():
    print(f'  {field_name}: {field_value}')

# Keep the full text for the later cells and show only its first 500 characters.
extracted_text = result['text']
print(f'\n=== Text Preview ({len(extracted_text)} total chars) ===')
print(extracted_text[:500], '...', sep='\n')

=== Document Metadata ===
  file_path: /Users/karlo/Documents/Wizonix/graph-rag/test-documents/2PX3WorksheetWeek2_restore.pdf
  file_type: pdf
  page_count: 19
  ocr_used: False
  ocr_confidence: None

=== Text Preview (20180 total chars) ===
Week 2
Overview and Goals
· Introduce the recycling project.
· Construct a rudimentary framework to evaluate and rank proposed designs.
· Introduce unitless cost equations.
· Construct a rudimentary unitless cost equation.
Table of Contents
Overview and Goals........................................................................................................................................i
Table of Figures.......................................................................................
...


### Chunk text (overlapping chunks)

In [38]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from neo4j_graphrag.experimental.components.text_splitters.langchain import LangChainTextSplitterAdapter

# Chunking configuration. The same splitter instance produces the chunks used for
# schema inference (split_texts) and, through the adapter below, the chunks the
# KG pipeline extracts from.
langchain_text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=600,       # Smaller chunks = better entity extraction
    chunk_overlap=60,     # 10% overlap to keep context
    length_function=len,
    # Candidate split points, given in the order listed; includes CJK punctuation
    # so non-Latin documents can also be split on sentence boundaries.
    separators=[
        "\n\n",
        "\n",
        " ",
        ".",
        ",",
        "\u200b",  # Zero-width space
        "\uff0c",  # Fullwidth comma
        "\u3001",  # Ideographic comma
        "\uff0e",  # Fullwidth full stop
        "\u3002",  # Ideographic full stop
        "",
    ],
    is_separator_regex=False,
)

# Wrap the *configured* splitter. Wrapping a fresh RecursiveCharacterTextSplitter()
# here would make the pipeline fall back to the library's default chunk size and
# overlap, silently ignoring the settings above.
adapted_text_splitter = LangChainTextSplitterAdapter(langchain_text_splitter)

# Use the extracted text from the document loader above
split_texts = langchain_text_splitter.split_text(extracted_text)
print(f'Split into {len(split_texts)} chunks')
print(f'First chunk preview: {split_texts[0][:200]}...')

Split into 44 chunks
First chunk preview: Week 2
Overview and Goals
· Introduce the recycling project.
· Construct a rudimentary framework to evaluate and rank proposed designs.
· Introduce unitless cost equations.
· Construct a rudimentary u...


### Step 1: Infer graph schema (LLM-based)

`SchemaInferrer` sends the first few chunks to the LLM to discover what **node labels** (entity types) and **relationship types** are relevant for this document. It then samples later chunks to refine.

In [None]:
from doc_to_graphrag.extraction import SchemaInferrer

# Let the LLM propose node labels and relationship types from the chunk list.
inferrer = SchemaInferrer(
    llm,
    initial_chunk_count=3,
    refine_sample_count=5,
)
schema = inferrer.infer(split_texts)

# Report both halves of the inferred schema.
print('=== Inferred Schema ===')
for caption, schema_key in (
    ('Node labels: ', 'node_labels'),
    ('Rel types:   ', 'rel_types'),
):
    print(f'{caption}{schema[schema_key]}')

### Step 2: Build knowledge graph (SimpleKGPipeline)

`SimpleKGPipeline` takes the inferred schema and uses the LLM internally to extract entities and relationships from the text, then writes them directly to Neo4j.

In [None]:
from neo4j_graphrag.experimental.pipeline.kg_builder import SimpleKGPipeline

# Collect the pipeline arguments in one mapping so the configuration reads as a
# single table, then build the pipeline from it.
pipeline_settings = dict(
    llm=llm,
    driver=neo4j_driver,
    embedder=embedder,
    entities=schema['node_labels'],     # e.g. ["Person", "Organization", ...]
    relations=schema['rel_types'],      # e.g. ["FOUNDED", "WORKS_FOR", ...]
    from_pdf=False,                     # we already extracted the text
    text_splitter=adapted_text_splitter,
    perform_entity_resolution=True,
    prompt_template=prompt_template,
)
kg_builder = SimpleKGPipeline(**pipeline_settings)

# Echo the schema the pipeline was configured with.
print(
    'SimpleKGPipeline ready.',
    f'  Entities: {schema["node_labels"]}',
    f'  Relations: {schema["rel_types"]}',
    sep='\n',
)

### Step 3: Run the pipeline

This sends the extracted text through the pipeline — the LLM scans each chunk, extracts entities/relationships using the schema, and writes them to Neo4j.

In [None]:
# Run the pipeline (async under the hood)
kg_result = await kg_builder.run_async(text=extracted_text)

print('\n=== Pipeline Result ===')
print(kg_result)