In [1]:
import sys
import os

# Add the parent of the current working directory to the Python path so the
# graphrag package can be imported from within the notebook.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
# Only append once: re-running this cell must not keep growing sys.path
# with duplicate entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [2]:
import logging

# Logger used for this notebook's own messages
logger = logging.getLogger(__name__)
# Configure root logging at INFO so both notebook and library records are shown
logging.basicConfig(level=logging.INFO)

In [3]:
from pathlib import Path

# Set up directories: input documents and pipeline output share one root
root_dir = Path("./inputs/hello-graphrag")
input_dir = root_dir / "input"
output_dir = root_dir / "output"

# Create directories if they don't exist
input_dir.mkdir(parents=True, exist_ok=True)
output_dir.mkdir(parents=True, exist_ok=True)

# Path of the sample .txt document to index. This cell does NOT create the
# file; it has to be placed there by hand, so warn early when it is missing
# instead of letting the pipeline start without its input.
sample_file = input_dir / "emf-review.txt"
if not sample_file.is_file():
    logger.warning(
        "Sample input file not found: %s - add a .txt document there before running the pipeline",
        sample_file,
    )

In [4]:
from graphrag.config import create_graphrag_config
from graphrag.config.enums import InputFileType, InputType

# Input: plain-text files in the "input" folder whose names end in ".txt"
input_settings = {
    "type": InputType.file,
    "file_type": InputFileType.text,
    "base_dir": "input",
    "file_pattern": r".*\.txt$",
}
# Storage: pipeline artifacts go to the "output" folder
storage_settings = {"base_dir": "output"}
# Claim extraction is switched on explicitly
claim_settings = {"enabled": True}

# NOTE(review): an earlier draft carried a disabled option to simplify test runs
# by skipping "create_final_community_reports" and "create_final_covariates".
# Presumably that belongs in a "skip_workflows" list inside `values` - confirm
# against the installed graphrag version before enabling it.

# Create a basic configuration rooted at root_dir
config = create_graphrag_config(
    root_dir=str(root_dir),
    values={
        "input": input_settings,
        "storage": storage_settings,
        "claim_extraction": claim_settings,
    },
)

INFO:datashaper.engine.verbs:Found module: datashaper.engine.verbs.aggregate
INFO:datashaper.engine.verbs:Found module: datashaper.engine.verbs.bin
INFO:datashaper.engine.verbs:Found module: datashaper.engine.verbs.binarize
INFO:datashaper.engine.verbs:Found module: datashaper.engine.verbs.boolean
INFO:datashaper.engine.verbs:Found module: datashaper.engine.verbs.concat
INFO:datashaper.engine.verbs:Found module: datashaper.engine.verbs.convert
INFO:datashaper.engine.verbs:Found module: datashaper.engine.verbs.copy
INFO:datashaper.engine.verbs:Found module: datashaper.engine.verbs.dedupe
INFO:datashaper.engine.verbs:Found module: datashaper.engine.verbs.derive
INFO:datashaper.engine.verbs:Found module: datashaper.engine.verbs.destructure
INFO:datashaper.engine.verbs:Found module: datashaper.engine.verbs.difference
INFO:datashaper.engine.verbs:Found module: datashaper.engine.verbs.drop
INFO:datashaper.engine.verbs:Found module: datashaper.engine.verbs.erase
INFO:datashaper.engine.verbs:F

In [5]:
from graphrag.index import create_pipeline_config

# Build the pipeline configuration from the GraphRAG config (verbose mode on)
pipeline_config = create_pipeline_config(config, verbose=True)

# Log the initial workflow order as a list numbered from 1
logger.info("\n\nInitial Workflow order:")
for position, step in enumerate(pipeline_config.workflows, start=1):
    logger.info(f"{position}. {step.name}")

INFO:graphrag.index.create_pipeline_config:Using LLM Config {
    "api_key": "*****",
    "type": "azure_openai_chat",
    "model": "gpt-4-turbo-preview",
    "max_tokens": 4000,
    "temperature": 0.0,
    "top_p": 1.0,
    "n": 1,
    "request_timeout": 180.0,
    "api_base": "http://0.0.0.0:8000",
    "api_version": "1978-02-16",
    "organization": null,
    "proxy": null,
    "cognitive_services_endpoint": null,
    "deployment_name": "kiku-deployment",
    "model_supports_json": true,
    "tokens_per_minute": 0,
    "requests_per_minute": 0,
    "max_retries": 10,
    "max_retry_wait": 10.0,
    "sleep_on_rate_limit_recommendation": true,
    "concurrent_requests": 25
}
INFO:graphrag.index.create_pipeline_config:Using Embeddings Config {
    "api_key": "*****",
    "type": "openai_embedding",
    "model": "text-embedding-3-small",
    "max_tokens": 4000,
    "temperature": 0,
    "top_p": 1,
    "n": 1,
    "request_timeout": 180.0,
    "api_base": "http://0.0.0.0:8000",
    "api

In [6]:
from graphrag.index import run_pipeline_with_config

# Run the pipeline
async def run_pipeline(config=None):
    """Run the indexing pipeline and log the outcome of every workflow result.

    Args:
        config: Pipeline configuration to execute. When omitted, the
            notebook-level ``pipeline_config`` is used, so existing
            ``await run_pipeline()`` calls keep working.

    Returns:
        list: The ``workflow`` value of each result that carried errors, in
        the order the results were yielded. Empty when no result reported
        errors.
    """
    # Resolve the default at call time so a re-created pipeline_config is picked up
    active_config = pipeline_config if config is None else config
    failed_workflows = []
    async for result in run_pipeline_with_config(active_config):
        if result.errors:
            logger.error(f"Error in workflow {result.workflow}: {result.errors}")
            # Remember the failure so the caller can react to it, not just read the log
            failed_workflows.append(result.workflow)
        else:
            logger.info(f"Completed workflow: {result.workflow}")
    return failed_workflows

In [8]:
# For Jupyter notebooks, use await instead of asyncio.run()
await run_pipeline()

# Check the output
logger.info("Output files:")
for file in output_dir.glob("**/*"):
    if file.is_file():
        logger.info(f"- {file.relative_to(output_dir)}")

logger.info("Pipeline execution completed.")

INFO:graphrag.index.run:Running pipeline
INFO:graphrag.index.storage.file_pipeline_storage:Creating file storage at inputs/hello-graphrag/output
INFO:graphrag.index.input.load_input:loading input from root_dir=input
INFO:graphrag.index.input.load_input:using file storage for input
INFO:graphrag.index.storage.file_pipeline_storage:search inputs/hello-graphrag/input for files matching .*\.txt$
INFO:graphrag.index.input.text:found text files from input, found [('emf-review.txt', {})]
INFO:graphrag.index.input.text:Found 1 files, loading 1
INFO:graphrag.index.workflows.load:Workflow Run Order: ['create_base_text_units', 'create_base_extracted_entities', 'create_final_covariates', 'create_summarized_entities', 'join_text_units_to_covariate_ids', 'create_base_entity_graph', 'create_final_entities', 'create_final_nodes', 'create_final_communities', 'join_text_units_to_entity_ids', 'create_final_relationships', 'join_text_units_to_relationship_ids', 'create_final_community_reports', 'create_fi