In [1]:
import os

import pandas as pd
import tiktoken

from graphrag.query.context_builder.entity_extraction import EntityVectorStoreKey
from graphrag.query.indexer_adapters import (
    read_indexer_covariates,
    read_indexer_entities,
    read_indexer_relationships,
    read_indexer_reports,
    read_indexer_text_units,
)
from graphrag.query.input.loaders.dfs import (
    store_entity_semantic_embeddings,
)
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.embedding import OpenAIEmbedding
from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.structured_search.local_search.mixed_context import (
    LocalSearchMixedContext,
)
from graphrag.query.structured_search.local_search.search import LocalSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore
from yfiles_jupyter_graphs import GraphWidget

In [2]:
# Directory holding the parquet artifacts of one indexing run; every table below is read from it.
input_dir = "ragtest/output/20240801-212257/artifacts"
# Location passed to the vector store's connect() call in the search-setup cell.
lancedb_uri = f"{input_dir}/lancedb"

# Parquet file stems (the ".parquet" suffix is appended where each table is read).
community_report_table = "create_final_community_reports"
entity_table = "create_final_nodes"
entity_embedding_table = "create_final_entities"
relationship_table = "create_final_relationships"
covariate_table = "create_final_covariates"
text_unit_table = "create_final_text_units"
# Passed to read_indexer_entities and read_indexer_reports in the search-setup cell.
community_level = 2

In [3]:
# Load the node, entity-embedding and relationship tables; they feed both the graph view and the search setup.
entity_df = pd.read_parquet(f"{input_dir}/{entity_table}.parquet")
entity_embedding_df = pd.read_parquet(f"{input_dir}/{entity_embedding_table}.parquet")
relationship_df = pd.read_parquet(f"{input_dir}/{relationship_table}.parquet")

In [4]:
# Convert the relationship rows into the objects later handed to LocalSearchMixedContext.
relationships = read_indexer_relationships(relationship_df)

In [5]:
def convert_entities_to_dicts(df):
    """Turn an entities dataframe into node dicts for yfiles-jupyter-graphs.

    Each distinct value of the "title" column yields one node of the form
    ``{"id": title, "properties": <row as dict>}``. When a title occurs in
    several rows, only the first row is kept. Nodes are returned in order of
    first appearance.
    """
    unique_nodes = {}
    for _index, entity_row in df.iterrows():
        title = entity_row["title"]
        if title in unique_nodes:
            # A node for this title already exists; later rows are ignored.
            continue
        unique_nodes[title] = {"id": title, "properties": entity_row.to_dict()}
    return [*unique_nodes.values()]


# converts the relationships dataframe to a list of dicts for yfiles-jupyter-graphs
def convert_relationships_to_dicts(df):
    """Turn a relationships dataframe into edge dicts for yfiles-jupyter-graphs.

    Every row yields one edge: "start" and "end" come from the "source" and
    "target" columns, and "properties" carries the whole row as a dict.
    """
    return [
        {
            "start": edge_row["source"],
            "end": edge_row["target"],
            "properties": edge_row.to_dict(),
        }
        for _index, edge_row in df.iterrows()
    ]


# Build the full-graph widget from every row of the entity and relationship tables.
w = GraphWidget()
w.directed = True
w.nodes = convert_entities_to_dicts(entity_df)
w.edges = convert_relationships_to_dicts(relationship_df)

In [6]:
# show title on the node
w.node_label_mapping = "title"


# map community to a color
def community_to_color(community):
    """Map a community to a color.

    Args:
        community: Community identifier. Anything ``int()`` accepts (an int,
            a numeric string such as ``"3"``, a float), or a missing value
            such as ``None`` or NaN.

    Returns:
        A color name from a fixed seven-color palette, selected by the
        identifier modulo the palette length, or ``"lightgray"`` when the
        identifier is missing or cannot be converted to an integer.
    """
    colors = [
        "crimson",
        "darkorange",
        "indigo",
        "cornflowerblue",
        "cyan",
        "teal",
        "green",
    ]
    fallback = "lightgray"
    if community is None:
        return fallback
    try:
        index = int(community)
    except (TypeError, ValueError, OverflowError):
        # Missing values read from parquet can arrive as NaN or pd.NA rather
        # than None; those (and non-numeric labels) get the neutral color
        # instead of aborting the whole rendering.
        return fallback
    return colors[index % len(colors)]


def edge_to_source_community(edge):
    """Get the community of the source node of an edge.

    Args:
        edge: Edge dict whose ``"start"`` entry holds the title of the
            source node.

    Returns:
        The ``"community"`` property of the first node in ``w.nodes`` whose
        ``"title"`` property equals ``edge["start"]``, or ``None`` when no
        node matches.
    """
    source_title = edge["start"]
    for entry in w.nodes:
        if entry["properties"]["title"] == source_title:
            return entry["properties"]["community"]
    # An edge may reference an entity that is absent from the node list;
    # report "no community" instead of failing on a missing node.
    return None


# color nodes by their own community and edges by the community of their source node
w.node_color_mapping = lambda node: community_to_color(node["properties"]["community"])
w.edge_color_mapping = lambda edge: community_to_color(edge_to_source_community(edge))
# map size data to a reasonable factor: 0.5 at size 0, rising by 1.5 for every 20 units of size
w.node_scale_factor_mapping = lambda node: 0.5 + node["properties"]["size"] * 1.5 / 20
# use weight for edge thickness
w.edge_thickness_factor_mapping = "weight"

In [7]:
# arrange the nodes with the widget's circular layout
w.circular_layout()

In [8]:
# render the full-graph widget as this cell's output
display(w)

GraphWidget(layout=Layout(height='800px', width='100%'))

In [10]:
# Build the entity objects from the node table, the entity-embedding table and community_level.
entities = read_indexer_entities(entity_df, entity_embedding_df, community_level)

# Connect the LanceDB-backed store, then hand it the entities via store_entity_semantic_embeddings.
description_embedding_store = LanceDBVectorStore(
    collection_name="entity_description_embeddings",
)
description_embedding_store.connect(db_uri=lancedb_uri)
entity_description_embeddings = store_entity_semantic_embeddings(
    entities=entities, vectorstore=description_embedding_store
)
# NOTE(review): this read fails if the covariates parquet is absent from input_dir
# (presumably the case when the indexing run extracted no claims) — confirm for this run.
covariate_df = pd.read_parquet(f"{input_dir}/{covariate_table}.parquet")
claims = read_indexer_covariates(covariate_df)
covariates = {"claims": claims}
# Community reports are built from the report table together with the node table and community_level.
report_df = pd.read_parquet(f"{input_dir}/{community_report_table}.parquet")
reports = read_indexer_reports(report_df, entity_df, community_level)
# Text units are the last input handed to the context builder.
text_unit_df = pd.read_parquet(f"{input_dir}/{text_unit_table}.parquet")
text_units = read_indexer_text_units(text_unit_df)

# Read the key from the environment so it is never stored in the notebook; raises KeyError when unset.
api_key = os.environ["OPENAI_API_KEY"]
llm_model = 'gpt-4o'
embedding_model = 'text-embedding-3-small'

# Chat client passed to LocalSearch as `llm`.
llm = ChatOpenAI(
    api_key=api_key,
    model=llm_model,
    api_type=OpenaiApiType.OpenAI,  # OpenaiApiType.OpenAI or OpenaiApiType.AzureOpenAI
    max_retries=20,
)

# Tokenizer shared by the context builder and the search engine.
# NOTE(review): cl100k_base may not be the encoding used by llm_model — confirm with
# tiktoken.encoding_for_model(llm_model); a mismatch would make token budgets approximate.
token_encoder = tiktoken.get_encoding("cl100k_base")

# Embedding client passed to the context builder as `text_embedder`.
text_embedder = OpenAIEmbedding(
    api_key=api_key,
    api_base=None,
    api_type=OpenaiApiType.OpenAI,
    model=embedding_model,
    deployment_name=embedding_model,
    max_retries=20,
)

# Combine every loaded input (reports, text units, entities, relationships, covariates)
# with the embedding store, embedder and tokenizer into one context builder.
context_builder = LocalSearchMixedContext(
    community_reports=reports,
    text_units=text_units,
    entities=entities,
    relationships=relationships,
    covariates=covariates,
    entity_text_embeddings=description_embedding_store,
    embedding_vectorstore_key=EntityVectorStoreKey.ID,  # if the vectorstore uses entity title as ids, set this to EntityVectorStoreKey.TITLE
    text_embedder=text_embedder,
    token_encoder=token_encoder,
)

# Options forwarded to the context builder through LocalSearch(context_builder_params=...).
local_context_params = {
    "text_unit_prop": 0.5,
    "community_prop": 0.1,
    "conversation_history_max_turns": 5,
    "conversation_history_user_turns_only": True,
    "top_k_mapped_entities": 10,
    "top_k_relationships": 10,
    "include_entity_rank": True,
    "include_relationship_weight": True,
    "include_community_rank": False,
    "return_candidate_context": False,
    "embedding_vectorstore_key": EntityVectorStoreKey.ID,  # set this to EntityVectorStoreKey.TITLE if the vectorstore uses entity title as ids
    "max_tokens": 12_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000)
}

# Options forwarded to the chat model through LocalSearch(llm_params=...).
llm_params = {
    "max_tokens": 2_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 1000-1500)
    "temperature": 0.0,
}

# Assemble the local-search engine from the chat model, context builder, tokenizer and both option dicts.
search_engine = LocalSearch(
    llm=llm,
    context_builder=context_builder,
    token_encoder=token_encoder,
    llm_params=llm_params,
    context_builder_params=local_context_params,
    response_type="multiple paragraphs",  # free form text describing the response type and format, can be anything, e.g. prioritized list, single paragraph, multiple paragraphs, multiple-page report
)

In [11]:
# Top-level `await` relies on the notebook's already-running event loop.
# The question is Japanese for "Explain the protagonist's relationships with others."
result = await search_engine.asearch("主人公の交友関係を説明して")
print(result.response)

# 主人公の交友関係

## リオとリナの絆

リオとリナは、村の中で非常に重要な役割を果たす兄妹です。リオは村の長老として、村人たちを導き、文化や伝統を次世代に伝える役割を担っています。一方、リナは村の平和と調和を保つために尽力し、村人たちとの対話を通じて共感と理解を深めています。この兄妹は、村の人々と自然との関係を守るために協力し合い、強い絆で結ばれています [Data: Relationships (7); Entities (1, 2)]。

## 村人たちとの関係

リオとリナは、村人たちとの深い関係を築いています。リオは村の歴史と伝統を守るために、村人たちに積極的に働きかけ、自然との調和を大切にするよう促しています [Data: Relationships (1); Entities (1, 5)]。リナもまた、村人たちとの対話を通じて、共感と理解を深め、村の平和を保つために努力しています [Data: Relationships (4); Entities (2)]。

## 竜との関係

リオとリナは、村の守護者である竜との関係も重要視しています。リナは石を使って竜に話しかけ、その心を動かすことに成功しました。これにより、竜との関係が築かれ、村の平和が保たれるようになりました [Data: Relationships (11, 12); Entities (10)]。リオもまた、竜との出会いを通じて自然との調和の重要性を学び、村人たちにその教えを伝えています [Data: Relationships (8); Entities (1)]。

## 次世代との関係

リオとリナの子孫たちは、彼らの物語と竜との絆を次世代に伝え続けています。これにより、村の豊かな歴史と文化が次世代に受け継がれ、村の未来が守られています [Data: Relationships (6, 9); Entities (23, 22)]。

## 自然との関係

リオとリナは、村の自然環境の再生にも積極的に関与しています。彼らは村人たちと協力して破壊された土地を再生させ、自然との調和を取り戻すための努力を続けています [Data: Entities (21); Relationships (16)]。

リオとリナの交友関係は、村の人々、竜、そして自然との深い結びつきを通じて、村

In [12]:
def show_graph(result):
    """Render a search result's context as a yfiles-jupyter-graphs widget.

    Reads the "entities" and "relationships" entries of
    ``result.context_data``, converts them to node and edge dicts, and
    displays a directed graph labelled by entity name with edge thickness
    driven by "weight".

    Raises:
        ValueError: if either "entities" or "relationships" is missing from
            ``result.context_data``.
    """
    context = result.context_data
    if not ("entities" in context and "relationships" in context):
        raise ValueError(
            "The passed results do not contain 'entities' or 'relationships'"
        )

    def _nodes_from(entity_frame):
        """Build one node dict per distinct "entity" value, keeping the first row seen."""
        seen = {}
        for _index, entity_row in entity_frame.iterrows():
            name = entity_row["entity"]
            if name in seen:
                continue
            seen[name] = {"id": name, "properties": entity_row.to_dict()}
        return [*seen.values()]

    def _edges_from(relationship_frame):
        """Build one edge dict per row from its "source" and "target" columns."""
        return [
            {
                "start": edge_row["source"],
                "end": edge_row["target"],
                "properties": edge_row.to_dict(),
            }
            for _index, edge_row in relationship_frame.iterrows()
        ]

    widget = GraphWidget()
    # feed the converted context data into the widget
    widget.nodes = _nodes_from(context["entities"])
    widget.edges = _edges_from(context["relationships"])
    widget.directed = True
    # label nodes by entity name
    widget.node_label_mapping = "entity"
    # edge thickness follows the relationship weight
    widget.edge_thickness_factor_mapping = "weight"
    display(widget)


# Draw the entities and relationships returned as context for the query result above.
show_graph(result)

GraphWidget(layout=Layout(height='700px', width='100%'))