# Local search

ローカルサーチ（局所検索）法は、知識グラフの構造化データと入力文書の非構造化データを組み合わせ、クエリ時に関連するエンティティ情報でLLMコンテキストを補強する。これは、入力文書で言及されている特定のエンティティの理解を必要とする質問に答えるのに適している（例えば、「カモミールの治癒特性は何か？」）。

In [1]:
%load_ext dotenv

import os

import pandas as pd
import tiktoken

from graphrag.query.context_builder.entity_extraction import EntityVectorStoreKey
from graphrag.query.indexer_adapters import (
    read_indexer_covariates,
    read_indexer_entities,
    read_indexer_relationships,
    read_indexer_reports,
    read_indexer_text_units,
)
from graphrag.query.input.loaders.dfs import (
    store_entity_semantic_embeddings,
)
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.embedding import OpenAIEmbedding
from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.question_gen.local_gen import LocalQuestionGen
from graphrag.query.structured_search.local_search.mixed_context import (
    LocalSearchMixedContext,
)
from graphrag.query.structured_search.local_search.search import LocalSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore

  from .autonotebook import tqdm as notebook_tqdm


ベースとなるLLMとEmbeddingの指定

In [2]:
# Credentials and model names shared by the chat and embedding clients.
# A missing OPENAI_API_KEY raises KeyError here, before any client is built.
# NOTE(review): the first cell only runs `%load_ext dotenv`; confirm the key is
# really present in the environment (e.g. exported, or loaded via `%dotenv`).
api_key = os.environ["OPENAI_API_KEY"]
llm_model = "gpt-4o"
embedding_model = "text-embedding-3-small"

# Chat client. api_type accepts OpenaiApiType.OpenAI or OpenaiApiType.AzureOpenAI.
llm = ChatOpenAI(
    model=llm_model,
    api_type=OpenaiApiType.OpenAI,
    api_key=api_key,
    max_retries=20,
)

# Tokenizer that is later passed to the context builder and the search engine.
token_encoder = tiktoken.get_encoding("cl100k_base")

# Embedding client; the model name is reused as the deployment name.
text_embedder = OpenAIEmbedding(
    model=embedding_model,
    deployment_name=embedding_model,
    api_type=OpenaiApiType.OpenAI,
    api_key=api_key,
    api_base=None,
    max_retries=20,
)

`python -m graphrag.index --root ./ragtest`で生成された`.parquet`ファイルとLanceDBへのパスを指定

In [3]:
# Directory holding the indexing artifacts. The timestamped folder name looks
# run-specific, so adjust it to match your own output directory.
input_dir = "ragtest/output/20240801-212257/artifacts"
# The LanceDB location is the "lancedb" sub-folder of the artifacts directory.
lancedb_uri = input_dir + "/lancedb"

# Base names (without the ".parquet" suffix) of the tables loaded from input_dir.
text_unit_table = "create_final_text_units"
covariate_table = "create_final_covariates"
relationship_table = "create_final_relationships"
entity_embedding_table = "create_final_entities"
entity_table = "create_final_nodes"
community_report_table = "create_final_community_reports"

`pd.DataFrame`として読み出し

In [4]:
def _read_table(table_name: str) -> pd.DataFrame:
    """Load ``<input_dir>/<table_name>.parquet`` into a DataFrame."""
    return pd.read_parquet(f"{input_dir}/{table_name}.parquet")


# One DataFrame per table name defined in the configuration cell.
report_df = _read_table(community_report_table)
entity_df = _read_table(entity_table)
entity_embedding_df = _read_table(entity_embedding_table)
relationship_df = _read_table(relationship_table)
covariate_df = _read_table(covariate_table)
text_unit_df = _read_table(text_unit_table)

DataFrameからGraphRAGとして使える形に変換

In [5]:
# Convert the raw DataFrames into the objects consumed by the search classes.
# NOTE(review): the literal 2 is presumably the community level to read at;
# confirm against the graphrag indexer-adapter signatures.
reports = read_indexer_reports(report_df, entity_df, 2)
entities = read_indexer_entities(entity_df, entity_embedding_df, 2)
relationships = read_indexer_relationships(relationship_df)
text_units = read_indexer_text_units(text_unit_df)

# Covariates are grouped in a dict; only the "claims" entry is filled in here.
claims = read_indexer_covariates(covariate_df)
covariates = {"claims": claims}

# Open the "entity_description_embeddings" collection in the LanceDB database
# at lancedb_uri, then store the entities' description embeddings in it.
description_embedding_store = LanceDBVectorStore(collection_name="entity_description_embeddings")
description_embedding_store.connect(db_uri=lancedb_uri)
entity_description_embeddings = store_entity_semantic_embeddings(
    vectorstore=description_embedding_store,
    entities=entities,
)

NameError: name 'lancedbvectorstore' is not defined

ローカルな文脈を構築

In [None]:
# Build the context builder that combines community reports, text units,
# entities, relationships and covariates for local search.
context_builder = LocalSearchMixedContext(
    community_reports=reports,
    text_units=text_units,
    entities=entities,
    relationships=relationships,
    covariates=covariates,
    entity_text_embeddings=description_embedding_store,
    # If the vector store uses entity titles as ids, use EntityVectorStoreKey.TITLE.
    embedding_vectorstore_key=EntityVectorStoreKey.ID,
    # Pass the embedding client created next to the LLM. With None there is
    # nothing to embed the query with when matching it against the entity
    # description embeddings.
    text_embedder=text_embedder,
    token_encoder=token_encoder,
)

ローカルサーチのためのエンジンをインスタンス化

In [None]:
# Options handed to LocalSearch as its context_builder_params.
local_context_params = {
    # Proportion settings for text units and community reports.
    "text_unit_prop": 0.5,
    "community_prop": 0.1,
    # Conversation-history settings.
    "conversation_history_max_turns": 5,
    "conversation_history_user_turns_only": True,
    # Top-k limits for mapped entities and relationships.
    "top_k_mapped_entities": 10,
    "top_k_relationships": 10,
    # Include/return toggles.
    "include_entity_rank": True,
    "include_relationship_weight": True,
    "include_community_rank": False,
    "return_candidate_context": False,
    # Switch to EntityVectorStoreKey.TITLE if the vector store uses entity
    # titles as ids.
    "embedding_vectorstore_key": EntityVectorStoreKey.ID,
    # Tune to the token limit of your model (for a model with an 8k limit,
    # 5000 could be a good setting).
    "max_tokens": 12_000,
}

# Options handed to LocalSearch as its llm_params.
llm_params = {
    # Tune to the token limit of your model (for a model with an 8k limit,
    # 1000-1500 could be a good setting).
    "max_tokens": 2000,
    "temperature": 0.0,
}

In [None]:
# Assemble the local-search engine from the LLM, the context builder and the
# two parameter dictionaries.
search_engine = LocalSearch(
    llm=llm,
    llm_params=llm_params,
    context_builder=context_builder,
    context_builder_params=local_context_params,
    token_encoder=token_encoder,
    # Free-form text describing the response type and format; can be anything,
    # e.g. prioritized list, single paragraph, multiple paragraphs,
    # multiple-page report.
    response_type="multiple paragraphs",
)

ローカルサーチ

In [None]:
result = await search_engine.asearch(
    "主人公の交友関係を説明して"
)

print(result.response)

In [None]:
# Display the "reports" entry of the context data attached to the search result.
result.context_data["reports"]

In [None]:
# Print the llm_calls and prompt_tokens values recorded on the search result.
print(f"LLM calls: {result.llm_calls}. LLM tokens: {result.prompt_tokens}")