In [1]:
import asyncio
import json
import logging
import sys
import time
import warnings
from pathlib import Path

from graphrag.config import create_graphrag_config
from graphrag.config.config_file_loader import (
    load_config_from_file,
    resolve_config_path_with_root,
)
from graphrag.config.enums import CacheType
from graphrag.config.logging import enable_logging_with_config

from graphrag.index.api import build_index
from graphrag.index.graph.extractors.claims.prompts import CLAIM_EXTRACTION_PROMPT
from graphrag.index.graph.extractors.community_reports.prompts import COMMUNITY_REPORT_PROMPT
from graphrag.index.graph.extractors.graph.prompts import GRAPH_EXTRACTION_PROMPT
from graphrag.index.graph.extractors.summarize.prompts import SUMMARIZE_PROMPT
from graphrag.index.init_content import INIT_DOTENV, INIT_YAML
from graphrag.index.progress import ProgressReporter
from graphrag.index.progress.load_progress_reporter import load_progress_reporter
from graphrag.index.validate_config import validate_config_names

# Silence numba deprecation noise. `message` is a regular expression that is
# matched against the warning text, so only warnings whose text mentions
# "NumbaDeprecationWarning" are suppressed.
warnings.filterwarnings("ignore", message=".*NumbaDeprecationWarning.*")

# Module-level logger shared by the helper functions defined in this notebook.
log = logging.getLogger(__name__)

# The notebook kernel already runs an event loop; nest_asyncio is applied so
# that library code which starts its own loop can presumably still run inside
# it (see the nest_asyncio documentation) — TODO confirm this is still needed.
import nest_asyncio
nest_asyncio.apply()

In [2]:

# Run parameters for the indexing pipeline (notebook stand-ins for CLI flags).

# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
root = "/home/azureuser/autogen_uscases/autosearch/notebooks/graphrag"
init = False
verbose = True
# Was `None,` — the trailing comma bound the one-element tuple `(None,)`,
# which is truthy and therefore not equivalent to "no resume".
resume = None
memprofile = True
nocache = False
reporter = None
# Previously assigned twice ("settings.yaml", then None); only the last
# assignment took effect, so the effective value None is kept. With a falsy
# `config`, the settings file is located under `root` by the pipeline cell.
config = None
emit = None
dryrun = False
overlay_defaults = False
skip_validations = False

In [3]:
def _redact(input: dict) -> str:
    """Sanitize the config json."""

    # Redact any sensitive configuration
    def redact_dict(input: dict) -> dict:
        if not isinstance(input, dict):
            return input

        result = {}
        for key, value in input.items():
            if key in {
                "api_key",
                "connection_string",
                "container_name",
                "organization",
            }:
                if value is not None:
                    result[key] = "==== REDACTED ===="
            elif isinstance(value, dict):
                result[key] = redact_dict(value)
            elif isinstance(value, list):
                result[key] = [redact_dict(i) for i in value]
            else:
                result[key] = value
        return result

    redacted_dict = redact_dict(input)
    return json.dumps(redacted_dict, indent=4)

def _logger(reporter: ProgressReporter):
    """Create ``info``, ``error`` and ``success`` message helpers.

    Each helper always writes to the module logger (``success`` logs at info
    level) and, only when called with ``verbose=True``, also forwards the
    message to the matching method of ``reporter``.

    Returns:
        The tuple ``(info, error, success)``.
    """

    def info(msg: str, verbose: bool = False):
        log.info(msg)
        if not verbose:
            return
        reporter.info(msg)

    def error(msg: str, verbose: bool = False):
        log.error(msg)
        if not verbose:
            return
        reporter.error(msg)

    def success(msg: str, verbose: bool = False):
        # There is no dedicated "success" log level; info is used instead.
        log.info(msg)
        if not verbose:
            return
        reporter.success(msg)

    helpers = (info, error, success)
    return helpers


def _register_signal_handlers(reporter: ProgressReporter):
    """Install a shutdown handler for SIGINT and, off Windows, SIGHUP.

    When one of those signals arrives, the handler disposes ``reporter`` and
    cancels every task returned by ``asyncio.all_tasks()``.
    """
    import signal

    def handle_signal(signum, _):
        # Release the reporter first, then cancel all outstanding tasks.
        reporter.dispose()
        for pending in asyncio.all_tasks():
            pending.cancel()

    # SIGHUP is only registered when not running on win32.
    handled = [signal.SIGINT]
    if sys.platform != "win32":
        handled.append(signal.SIGHUP)

    for sig in handled:
        signal.signal(sig, handle_signal)


In [4]:
"""Run the pipeline with the given config."""
# Build a concrete reporter: validate_config_names below receives it, and a
# None reporter would presumably fail as soon as it is used to report results.
# "print" is one of the reporter types accepted by load_progress_reporter.
progress_reporter = load_progress_reporter(reporter or "print")
# info, error, success = _logger(progress_reporter)
run_id = time.strftime("%Y%m%d-%H%M%S")

if overlay_defaults or config:
    # An explicit config file (relative to root) wins; otherwise the settings
    # file is looked up under root, and a missing file is an error here.
    config_path = (
        Path(root) / config if config else resolve_config_path_with_root(root)
    )
    default_config = load_config_from_file(config_path)
else:
    try:
        config_path = resolve_config_path_with_root(root)
        default_config = load_config_from_file(config_path)
    except FileNotFoundError:
        # No settings file under root: fall back to a generated default config.
        default_config = create_graphrag_config(root_dir=root)

if nocache:
    default_config.cache.type = CacheType.none

enabled_logging = None
# if enabled_logging:
#     info(f"Logging enabled at {log_path}", True)
# else:
#     info(
#         f"Logging not enabled for config {_redact(default_config.model_dump())}",
#         True,
#     )

# The condition was inverted (`if skip_validations:`), which ran validation
# only when the user asked to skip it and never otherwise.
if not skip_validations:
    validate_config_names(progress_reporter, default_config)

# info(f"Starting pipeline run for: {run_id}, {dryrun=}", verbose)
# info(
#     f"Using default configuration: {_redact(default_config.model_dump())}",
#     verbose,
# )

# if dryrun:
#     info("Dry run complete, exiting...", True)
#     sys.exit(0)

# Comma-separated emitter list -> list of names; None when nothing was given.
pipeline_emit = emit.split(",") if emit else None

# _register_signal_handlers(progress_reporter)

In [5]:
# Show the effective configuration with the values of api_key,
# connection_string, container_name and organization masked by _redact.
# NOTE(review): every other field (e.g. api_base) is printed verbatim — clear
# this cell's output before sharing if endpoints are considered sensitive.
print(_redact(default_config.model_dump()))

{
    "llm": {
        "api_key": "==== REDACTED ====",
        "type": "azure_openai_chat",
        "model": "gpt-4o",
        "max_tokens": 4000,
        "temperature": 0.0,
        "top_p": 1.0,
        "n": 1,
        "request_timeout": 180.0,
        "api_base": "https://aoai-sweden-505.openai.azure.com/",
        "api_version": "2023-08-01-preview",
        "proxy": null,
        "cognitive_services_endpoint": null,
        "deployment_name": "gpt-4o",
        "model_supports_json": true,
        "tokens_per_minute": 0,
        "requests_per_minute": 0,
        "max_retries": 10,
        "max_retry_wait": 10.0,
        "sleep_on_rate_limit_recommendation": true,
        "concurrent_requests": 25
    },
    "parallelization": {
        "stagger": 0.3,
        "num_threads": 50
    },
    "async_mode": "threaded",
    "root_dir": "/home/azureuser/autogen_uscases/autosearch/notebooks/graphrag",
    "reporting": {
        "type": "file",
        "base_dir": "output/${timestamp}/repor

In [None]:
# Display the run identifier (a %Y%m%d-%H%M%S timestamp created in the pipeline setup cell).
run_id

In [7]:
# Run the indexing pipeline with the loaded config and keep its result in
# `outputs`. Uses the notebook's top-level await.
outputs = await build_index(
        default_config,
        run_id,
        memprofile,
        None,  # NOTE(review): presumably the progress-reporter argument — confirm against build_index's signature
        pipeline_emit,
    )


  return bound(*args, **kwds)
Process failed to invoke LLM 1/10 attempts. Cause: rate limit exceeded, will retry. Recommended sleep for 60 seconds. Follow recommendation? True
Process failed to invoke LLM 1/10 attempts. Cause: rate limit exceeded, will retry. Recommended sleep for 60 seconds. Follow recommendation? True
Process failed to invoke LLM 1/10 attempts. Cause: rate limit exceeded, will retry. Recommended sleep for 60 seconds. Follow recommendation? True
Process failed to invoke LLM 1/10 attempts. Cause: rate limit exceeded, will retry. Recommended sleep for 60 seconds. Follow recommendation? True
Process failed to invoke LLM 1/10 attempts. Cause: rate limit exceeded, will retry. Recommended sleep for 60 seconds. Follow recommendation? True
Process failed to invoke LLM 1/10 attempts. Cause: rate limit exceeded, will retry. Recommended sleep for 57 seconds. Follow recommendation? True
Process failed to invoke LLM 1/10 attempts. Cause: rate limit exceeded, will retry. Recommende

CancelledError: 