In [27]:
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

"""Main definition."""

import asyncio
import warnings
import uvloop 
from graphrag.config import create_graphrag_config
import os
from graphrag.index import PipelineConfig, create_pipeline_config
import yaml
from graphrag.index.run import run_pipeline_with_config

# Ignore warnings from numba
warnings.filterwarnings("ignore", message=".*NumbaDeprecationWarning.*")
warnings.filterwarnings("ignore", message="FutureWarning")

api_key = os.environ["OPENAI_API_KEY"]
llm_model = "gpt-4o-mini"

def index(
    root: str,
    config: str | None,
):
    """Run the pipeline with the given config."""
    run_id = "test"
    pipeline_config: str | PipelineConfig = _create_default_config(
        root, config
    )

    def _run_workflow_async() -> None:
        async def execute():
            async for output in run_pipeline_with_config(
                pipeline_config,
                run_id=run_id,
            ):
                if output.errors and len(output.errors) > 0:
                    return
        
        uvloop.install()

        # This part replaces asyncio.run(execute())
        if not asyncio.get_event_loop().is_running():
            asyncio.run(execute())
        else:
            asyncio.create_task(execute())
    _run_workflow_async()

def _create_default_config(
    root_dir: str,
    config_path: str ,
) -> PipelineConfig:
    """Overlay default values on an existing config or create a default config if none is provided."""
    with open(config_path, "rb") as file:
        file_content = file.read().decode(encoding="utf-8", errors="strict")
        replaced_content = file_content.replace("${OPENAI_API_KEY}", api_key).replace("${LLM_MODEL}", llm_model)
        data = yaml.safe_load(replaced_content)
        parameters = create_graphrag_config(data, root_dir)

    result = create_pipeline_config(parameters)
    return result

In [28]:
root = './data'
config = './config/graphrag_index.yaml'

# index_cli 関数を呼び出す
index(
    root=root,
    config=config,
)

In [21]:
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

"""Command line interface for the query module."""

import os
from pathlib import Path
from typing import cast

import pandas as pd

from graphrag.config import (
    GraphRagConfig,
    create_graphrag_config,
)
from graphrag.index.progress import PrintProgressReporter
from graphrag.model.entity import Entity
from graphrag.query.input.loaders.dfs import (
    store_entity_semantic_embeddings,
)
from graphrag.vector_stores import VectorStoreFactory, VectorStoreType
from graphrag.vector_stores.lancedb import LanceDBVectorStore

from graphrag.query.factories import get_global_search_engine, get_local_search_engine
from graphrag.query.indexer_adapters import (
    read_indexer_covariates,
    read_indexer_entities,
    read_indexer_relationships,
    read_indexer_reports,
    read_indexer_text_units,
)

reporter = PrintProgressReporter("")

api_key = os.environ["OPENAI_API_KEY"]
llm_model = "gpt-4o-mini"
def __get_embedding_description_store(
    entities: list[Entity],
    vector_store_type: str = VectorStoreType.LanceDB,
    config_args: dict | None = None,
):
    """Get the embedding description store."""
    if not config_args:
        config_args = {}

    collection_name = config_args.get(
        "query_collection_name", "entity_description_embeddings"
    )
    config_args.update({"collection_name": collection_name})
    description_embedding_store = VectorStoreFactory.get_vector_store(
        vector_store_type=vector_store_type, kwargs=config_args
    )

    description_embedding_store.connect(**config_args)

    if config_args.get("overwrite", False):
        # this step assumps the embeddings where originally stored in a file rather
        # than a vector database

        # dump embeddings from the entities list to the description_embedding_store
        store_entity_semantic_embeddings(
            entities=entities, vectorstore=description_embedding_store
        )
    else:
        # load description embeddings to an in-memory lancedb vectorstore
        # to connect to a remote db, specify url and port values.
        description_embedding_store = LanceDBVectorStore(
            collection_name=collection_name
        )
        description_embedding_store.connect(
            db_uri=config_args.get("db_uri", "./lancedb")
        )

        # load data from an existing table
        description_embedding_store.document_collection = (
            description_embedding_store.db_connection.open_table(
                description_embedding_store.collection_name
            )
        )

    return description_embedding_store


def run_global_search(
    config_path: str | None,
    data_dir: str | None,
    root_dir: str | None,
    community_level: int,
    response_type: str,
):
    """Run a global search with the given query."""
    config =  _create_default_config(
        root_dir, config_path, api_key, llm_model
    )
    data_path = Path(data_dir)

    final_nodes: pd.DataFrame = pd.read_parquet(
        data_path / "create_final_nodes.parquet"
    )
    final_entities: pd.DataFrame = pd.read_parquet(
        data_path / "create_final_entities.parquet"
    )
    final_community_reports: pd.DataFrame = pd.read_parquet(
        data_path / "create_final_community_reports.parquet"
    )

    reports = read_indexer_reports(
        final_community_reports, final_nodes, community_level
    )
    entities = read_indexer_entities(final_nodes, final_entities, community_level)
    search_engine = get_global_search_engine(
        config,
        reports=reports,
        entities=entities,
        response_type=response_type,
    )

    return search_engine


def run_local_search(
    config_path: str | None,
    data_dir: str | None,
    root_dir: str | None,
    community_level: int,
    response_type: str,
    query: str,
):
    """Run a local search with the given query."""
    config =  _create_default_config(
        root_dir ,config_path, api_key, llm_model
    )
    data_path = Path(data_dir)

    final_nodes = pd.read_parquet(data_path / "create_final_nodes.parquet")
    final_community_reports = pd.read_parquet(
        data_path / "create_final_community_reports.parquet"
    )
    final_text_units = pd.read_parquet(data_path / "create_final_text_units.parquet")
    final_relationships = pd.read_parquet(
        data_path / "create_final_relationships.parquet"
    )
    final_entities = pd.read_parquet(data_path / "create_final_entities.parquet")
    final_covariates_path = data_path / "create_final_covariates.parquet"
    final_covariates = (
        pd.read_parquet(final_covariates_path)
        if final_covariates_path.exists()
        else None
    )

    vector_store_args = (
        config.embeddings.vector_store if config.embeddings.vector_store else {}
    )

    reporter.info(f"Vector Store Args: {vector_store_args}")
    vector_store_type = vector_store_args.get("type", VectorStoreType.LanceDB)

    entities = read_indexer_entities(final_nodes, final_entities, community_level)
    description_embedding_store = __get_embedding_description_store(
        entities=entities,
        vector_store_type=vector_store_type,
        config_args=vector_store_args,
    )
    covariates = (
        read_indexer_covariates(final_covariates)
        if final_covariates is not None
        else []
    )

    search_engine = get_local_search_engine(
        config,
        reports=read_indexer_reports(
            final_community_reports, final_nodes, community_level
        ),
        text_units=read_indexer_text_units(final_text_units),
        entities=entities,
        relationships=read_indexer_relationships(final_relationships),
        covariates={"claims": covariates},
        description_embedding_store=description_embedding_store,
        response_type=response_type,
    )

    result = search_engine.search(query=query)
    return result.response

def  _create_default_config(root_dir, config_path: str, api_key, llm_model):
    with open(config_path, "rb") as file:
        import yaml

        content = file.read().decode(encoding="utf-8", errors="strict")
        content = content.replace("${OPENAI_API_KEY}", api_key)
        content = content.replace("${LLM_MODEL}", llm_model)
        data = yaml.safe_load(content)
        return create_graphrag_config(data, root_dir)




In [27]:
search_engine = run_global_search(
    "./config/graphrag.yaml",
    "./data/graphrag/gs_q6OLkTkx9NIQ0Wobtqn62tOy/output/default/artifacts", 
    "./data/graphrag/gs_q6OLkTkx9NIQ0Wobtqn62tOy", 
    2, 
    response_type="multiple paragraphs")

creating llm client with {'api_key': 'REDACTED,len=56', 'type': "openai_chat", 'model': 'gpt-4o-mini', 'max_tokens': 4000, 'temperature': 0.0, 'top_p': 1.0, 'n': 1, 'request_timeout': 180.0, 'api_base': None, 'api_version': None, 'organization': None, 'proxy': None, 'cognitive_services_endpoint': None, 'deployment_name': None, 'model_supports_json': True, 'tokens_per_minute': 0, 'requests_per_minute': 0, 'max_retries': 10, 'max_retry_wait': 10.0, 'sleep_on_rate_limit_recommendation': True, 'concurrent_requests': 25}


In [30]:
result = search_engine.asearch(
    "主人公について教えて"
)

RuntimeError: asyncio.run() cannot be called from a running event loop