# Querying Philadelphia Phillies with LlamaIndex and NebulaGraph

## 安装依赖和载入环境变量

In [None]:
# Install LlamaIndex plus the reader, LLM and NebulaGraph integrations and the graph-drawing helpers used below.
# NOTE(review): no versions are pinned, so a fresh install may resolve to incompatible releases — consider pinning.
%pip install -U llama_index wikipedia llama-index-llms-openai-like llama-index-readers-wikipedia llama-index-readers-youtube-transcript llama-index-graph-stores-nebula llama-index-llms-openai ipython-ngql nebula3-python pyvis networkx youtube_transcript_api 

In [None]:
from dotenv import load_dotenv
import os
import logging
import sys

# Load settings such as OPENAI_API_KEY / OPENAI_API_BASE from a local .env file
load_dotenv()

# Indexing os.environ fails fast (KeyError) when a required setting is missing.
# The key itself is never echoed: cell outputs are saved with the notebook and
# would leak the secret to anyone the file is shared with.
print("OPENAI_API_KEY is set" if os.environ["OPENAI_API_KEY"] else "OPENAI_API_KEY is empty")
print(os.environ["OPENAI_API_BASE"])

# Write INFO-and-above log records to rag.log, truncating the file on every run
logging.basicConfig(
    filename="rag.log",
    filemode="w",
    format="%(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

## 连接到图数据库，并新建图空间

In [None]:
os.environ["GRAPHD_HOST"] = "127.0.0.1"
os.environ["NEBULA_USER"] = "root"
os.environ["NEBULA_PASSWORD"] = "nebula" 
os.environ["NEBULA_ADDRESS"] = "127.0.0.1:9669"  

%reload_ext ngql
connection_string = f"--address {os.environ['GRAPHD_HOST']} --port 9669 --user root --password {os.environ['NEBULA_PASSWORD']}"
%ngql {connection_string}

创建一个名为 phillies_rag 的图空间

In [None]:
%ngql CREATE SPACE IF NOT EXISTS phillies_rag(vid_type=FIXED_STRING(256), partition_num=1, replica_factor=1);

In [None]:
%ngql SHOW SPACES;

在新的图空间中创建标签、边和标签索引

In [None]:
%%ngql
# Switch to the phillies_rag space and declare its schema:
# one vertex tag `entity` with a string `name`, and one edge type `relationship` with a string `relationship` property.
# NOTE(review): a freshly created space/schema may need a short wait before it is usable — confirm on your deployment.
USE phillies_rag;
CREATE TAG IF NOT EXISTS entity(name string);
CREATE EDGE IF NOT EXISTS relationship(relationship string);

In [None]:
%ngql CREATE TAG INDEX IF NOT EXISTS entity_index ON entity(name(256));

构建 NebulaGraphStore

In [None]:
from llama_index.graph_stores.nebula import NebulaGraphStore

# Graph space plus the single tag / edge type / edge property the store should use
space_name = "phillies_rag"
tags = ["entity"]
edge_types = ["relationship"]
rel_prop_names = ["relationship"]

# Graph store handle that the KG index and the graph query engines are built on
graph_store = NebulaGraphStore(
    tags=tags,
    edge_types=edge_types,
    rel_prop_names=rel_prop_names,
    space_name=space_name,
)

## 加载数据并创建 KG 索引

是时候加载数据了。我们的源数据来自 Philadelphia Phillies 的维基百科页面和一个关于 Trea Turner 在 2023 年 8 月收到 standing ovation 的 YouTube 视频。

为了节省时间和成本，我们先检查下本地 storage_context 来加载 KG 索引。如果存在索引，我们就加载索引。如果不存在索引（例如初次访问应用程序时），我们需要加载这两个源文档（上文提到的维基百科页面和 YouTube 视频），再构建 KG 索引，并在项目 root 目录的本地 storage_graph 中持久化地存储 doc、index 和 vector。

In [None]:
# from llama_index import KnowledgeGraphIndex
# from llama_index.graph_stores import SimpleGraphStore
# from llama_index import download_loader
# from llama_index.llms import OpenAI
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings
from llama_index.core.llms import ChatMessage

# Global LlamaIndex defaults: chunk size, embedding model and chat model
Settings.chunk_size = 512
Settings.embed_model = OpenAIEmbedding()
Settings.llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)

# Smoke test: send a one-message chat to confirm the configured model responds
response = Settings.llm.chat(
    [ChatMessage(role="user", content="Hello")]
)
print(str(response))

In [None]:
from llama_index.readers.youtube_transcript import YoutubeTranscriptReader
from llama_index.core import StorageContext
from llama_index.core import SimpleDirectoryReader, KnowledgeGraphIndex
from llama_index.readers.wikipedia import WikipediaReader

from llama_index.core import load_index_from_storage

# Directory holding the persisted doc store, index store and vector store
PERSIST_DIR = "./storage_graph"

try:
    # Reload the previously persisted index rather than re-reading the
    # persisted JSON files as if they were source documents. The triplets
    # themselves live in NebulaGraph, reached through `graph_store`.
    storage_context = StorageContext.from_defaults(
        persist_dir=PERSIST_DIR, graph_store=graph_store
    )
    kg_index = load_index_from_storage(
        storage_context=storage_context,
        max_triplets_per_chunk=15,
        space_name=space_name,
        edge_types=edge_types,
        rel_prop_names=rel_prop_names,
        tags=tags,
        verbose=True,
    )
    index_loaded = True
except (FileNotFoundError, ValueError) as load_error:
    # Nothing persisted yet (first run) or no usable index in the directory:
    # report why and fall back to building from the source documents.
    # Other errors (bad credentials, network, typos) are left to surface.
    print(f"No reusable index in {PERSIST_DIR}: {load_error}")
    storage_context = StorageContext.from_defaults(graph_store=graph_store)
    index_loaded = False

if not index_loaded:
    print("Load data and persist index")
    loader = WikipediaReader()
    wiki_documents = loader.load_data(pages=['Philadelphia Phillies'], auto_suggest=False)
    print(f'Loaded {len(wiki_documents)} documents')

    youtube_loader = YoutubeTranscriptReader()
    youtube_documents = youtube_loader.load_data(ytlinks=['https://www.youtube.com/watch?v=k-HTQ8T7oVw'])
    print(f'Loaded {len(youtube_documents)} YouTube documents')

    # Extract up to 15 triplets per chunk and keep embeddings in the index so
    # the embedding-based / hybrid retrieval modes can be used later.
    kg_index = KnowledgeGraphIndex.from_documents(
        documents=wiki_documents + youtube_documents,
        storage_context=storage_context,
        max_triplets_per_chunk=15,
        space_name=space_name,
        edge_types=edge_types,
        rel_prop_names=rel_prop_names,
        tags=tags,
        include_embeddings=True,
    )

    # Persist so the next run takes the load branch above
    kg_index.storage_context.persist(persist_dir=PERSIST_DIR)
    print("Done")

在构建 KG 索引时，需要注意以下几点：


max_triplets_per_chunk：每个块提取三元组的最大数。将其设置为 15，可覆盖大多数（可能不是所有）块中的内容；



include_embeddings：说明创建 KG 索引时，是否包含数据的 Embedding。Embedding 是一种用向量来表示文本语义的方法，通常用来让模型理解不同文本片段之间的语义相似性。当设置 include_embeddings=True 时，KnowledgeGraphIndex 会在索引中包含这些 Embedding。当你想在知识图谱上执行语义搜索时，include_embeddings=True 会很有用，因为 Embedding 可用来找到与查询在语义上相似的节点和边。

## Query with Text2Cypher

In [None]:
from llama_index.core.query_engine import KnowledgeGraphQueryEngine
from llama_index.core import StorageContext
from IPython.display import Markdown, display

### 图探索的方法 1：KG 基于向量的检索

In [None]:
query_engine = kg_index.as_query_engine()
response = query_engine.query("Tell me about some of the facts of Philadelphia Phillies.")
display(Markdown(f"<b>{response}</b>"))

query_engine = kg_index.as_query_engine() 这种方法通过向量相似性查找 KG 实体，获取连接的文本块，并选择性探索关系。是 LlamaIndex 基于索引构建的默认查询方式。它非常简单、开箱即用，不用额外的参数。

### 图探索的方法 2：KG 基于关键词的检索

In [None]:
# Keyword-based KG retrieval. include_text=False answers from the raw triplets
# instead of adding the text of the corresponding nodes.
kg_keyword_query_engine = kg_index.as_query_engine(
    retriever_mode="keyword",
    response_mode="tree_summarize",
    include_text=False,
)
response = kg_keyword_query_engine.query(
    "Tell me about some of the facts of Philadelphia Phillies."
)
display(Markdown("<b>" + str(response) + "</b>"))

这个查询用了关键词来检索相关的 KG 实体，来获取连接的文本块，并选择性地探索关系以获取更多的上下文。

参数 retriever_mode="keyword" 指定了本次检索采用关键词形式。

include_text=False：查询引擎只用原生三元组进行查询，查询不包含对应节点的文本信息；

response_mode="tree_summarize"：返回结果（响应形式）是知识图谱的树结构的总结。这个树以递归方式构建，查询作为根节点，最相关的答案作为叶节点。tree_summarize 响应模式对于总结性任务非常有用，比如：提供某个话题的高度概括，或是回答某个需要考虑周全的问题。当然，它还可以生成更复杂的响应，比如：解释某个事物发生的真实原因，或者解释某个过程涉及了哪些步骤。

### 图探索的方法 3：KG 混合检索

In [None]:
# Hybrid KG retrieval: embedding_mode="hybrid", top-3 similarity matches,
# node text included, global knowledge exploration enabled.
hybrid_query_engine = kg_index.as_query_engine(
    embedding_mode="hybrid",
    similarity_top_k=3,
    explore_global_knowledge=True,
    include_text=True,
    response_mode="tree_summarize",
)

response = hybrid_query_engine.query(
    "Tell me about Bryce Harper."
)
display(Markdown("<b>" + str(response) + "</b>"))

通过设定 embedding_mode="hybrid"，指定查询引擎为基于向量的检索和基于关键词的检索二者的混合方式，从知识图谱中检索信息，并进行去重。KG 混合检索方式不仅使用关键词找到相关的三元组，它也使用基于向量的检索来找到基于语义相似性的相似三元组。所以，本质上，混合模式结合了关键词搜索和语义搜索，并利用这两种方法的优势来提高搜索结果的准确性和相关性。

include_text=True：同上文的字段一样，用来指定是否包含节点的文本信息；

similarity_top_k=3：Top K 设定，它将根据 Embedding 检索出最相似结果的前三个结果。你可以根据你的使用场景弹性地调整这个值；

explore_global_knowledge=True：指定查询引擎是否要考虑知识图谱的全局上下文来检索信息。当设置 explore_global_knowledge=True时，查询引擎不会将其搜索限制在本地上下文（即，一个节点的直接邻居），而是会考虑知识图谱的更广泛的全局上下文。当你想检索与查询不直接相关，但在该知识图谱的更大上下文中有关的信息时，这可能很有用。

基于关键词的检索和混合检索二者主要区别，在于我们从知识图谱中检索信息的方法：基于关键词的检索使用关键词方法，而混合检索使用结合 Embedding 和关键词的混合方法。

### 绘制图

In [None]:
%%ngql
# Return up to 10 outgoing `relationship` edges, with both endpoints, for the entity whose name is exactly 'Phillies'
MATCH (p:`entity`)-[e:relationship]->(m:`entity`)
  WHERE p.`entity`.`name` == 'Phillies'
RETURN p, e, m LIMIT 10;

In [None]:
%ng_draw

我们可以试一下查询引擎是否有正确使用到图中的数据：

In [None]:
response = query_engine.query("Tell about Ryan howard.")
display(Markdown(f"<b>{response}</b>"))

In [None]:
%%ngql 
# Return every outgoing `relationship` edge, with both endpoints, for the entity named 'Ryan howard'
# NOTE(review): the comparison is an exact string match, so the lower-case "howard" must match how the entity was stored — confirm
MATCH (p:`entity`)-[r:`relationship`]->(q:`entity`)
WHERE p.`entity`.`name` == 'Ryan howard' 
RETURN p, r, q;

In [None]:
%ng_draw

## 使用向量索引

In [None]:
from llama_index.core import VectorStoreIndex

# Fetch both sources again here: the earlier cell only loads them when the KG
# index had to be rebuilt, so the documents may not exist in the kernel yet.
loader = WikipediaReader()
wiki_documents = loader.load_data(pages=['Philadelphia Phillies'], auto_suggest=False)
print(f'Loaded {len(wiki_documents)} documents')

youtube_loader = YoutubeTranscriptReader()
youtube_documents = youtube_loader.load_data(ytlinks=['https://www.youtube.com/watch?v=k-HTQ8T7oVw'])    
print(f'Loaded {len(youtube_documents)} YouTube documents')

# Plain vector index over the same documents, used as the baseline to compare against the KG index
vector_index = VectorStoreIndex.from_documents(wiki_documents + youtube_documents)
vector_query_engine = vector_index.as_query_engine()

response = vector_query_engine.query("Tell me about Bryce Harper.")
display(Markdown(f"<b>{response}</b>"))

In [None]:
# NOTE(review): the notebook prose spells the player's name "Trea Turner" while this query says "Trey" —
# confirm which spelling the loaded documents use, since it may affect retrieval.
response = vector_query_engine.query("How did the standing ovation Trey Turner received change his season?")
display(Markdown(f"<b>{response}</b>"))

In [None]:
response = vector_query_engine.query("Tell me about some of the facts of Philadelphia Phillies.")
display(Markdown(f"<b>{response}</b>"))

## Create CustomRetriever to combine vector index and KG index

In [None]:
from typing import List

from llama_index.core import QueryBundle
from llama_index.core.retrievers import BaseRetriever, KGTableRetriever, VectorIndexRetriever
from llama_index.core.schema import NodeWithScore


class CustomRetriever(BaseRetriever):
    """Retriever that merges the results of a vector search and a knowledge-graph search.

    In "OR" mode the result is the union of both result sets; in "AND" mode
    only nodes returned by both retrievers are kept.
    """

    def __init__(
        self,
        vector_retriever: VectorIndexRetriever,
        kg_retriever: KGTableRetriever,
        mode: str = "OR",
    ) -> None:
        """Store the two underlying retrievers and the combination mode.

        Args:
            vector_retriever: retriever over the vector index.
            kg_retriever: retriever over the knowledge-graph index.
            mode: "AND" (intersection) or "OR" (union).

        Raises:
            ValueError: if ``mode`` is neither "AND" nor "OR".
        """
        if mode not in ("AND", "OR"):
            raise ValueError("Invalid mode.")

        self._vector_retriever = vector_retriever
        self._kg_retriever = kg_retriever
        self._mode = mode
        # Let the base class set up its own state (callback manager etc.)
        super().__init__()

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Run both retrievers on the query and combine their nodes by node id."""
        vector_nodes = self._vector_retriever.retrieve(query_bundle)
        kg_nodes = self._kg_retriever.retrieve(query_bundle)

        vector_ids = {n.node.node_id for n in vector_nodes}
        kg_ids = {n.node.node_id for n in kg_nodes}

        # One entry per node id; when both retrievers return the same id the
        # KG entry replaces the vector entry.
        combined_dict = {n.node.node_id: n for n in vector_nodes}
        combined_dict.update({n.node.node_id: n for n in kg_nodes})

        if self._mode == "AND":
            retrieve_ids = vector_ids & kg_ids
        else:
            retrieve_ids = vector_ids | kg_ids

        # Walk the dict (insertion-ordered: vector hits first, then KG-only
        # hits) instead of the id set, so the result order is deterministic.
        return [
            node for node_id, node in combined_dict.items() if node_id in retrieve_ids
        ]

In [None]:
from llama_index.core import get_response_synthesizer
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import KGTableRetriever, VectorIndexRetriever

# Combine a vector retriever with a keyword-mode KG retriever (raw triplets only)
vector_retriever = VectorIndexRetriever(index=vector_index)
kg_retriever = KGTableRetriever(
    index=kg_index, retriever_mode="keyword", include_text=False
)
custom_retriever = CustomRetriever(vector_retriever, kg_retriever)

# The notebook configures its models through the global `Settings` object and
# never defines a `service_context`, so no service context is passed here.
response_synthesizer = get_response_synthesizer(
    response_mode="tree_summarize",
)

## Create 7 query engines and run queries

In [None]:
from llama_index.core import Settings
from llama_index.core.query_engine import KnowledgeGraphQueryEngine, RetrieverQueryEngine
from llama_index.core.retrievers import KnowledgeGraphRAGRetriever

# 1. KG vector-based entity retrieval (the index's default query engine)
kg_query_engine = kg_index.as_query_engine()

# 2. KG keyword-based entity retrieval
kg_keyword_query_engine = kg_index.as_query_engine(
    # setting to false uses the raw triplets instead of adding the text from the corresponding nodes
    include_text=False,
    retriever_mode="keyword",
    response_mode="tree_summarize",
)

# 3. KG hybrid entity retrieval
kg_hybrid_query_engine = kg_index.as_query_engine(
    include_text=True,
    response_mode="tree_summarize",
    embedding_mode="hybrid",
    similarity_top_k=3,
    explore_global_knowledge=True,
)

# 4. Raw vector index retrieval
vector_query_engine = vector_index.as_query_engine()

# 5. Custom combo query engine (vector + KG retrievers)
custom_query_engine = RetrieverQueryEngine(
    retriever=custom_retriever,
    response_synthesizer=response_synthesizer,
)

# 6. KnowledgeGraphQueryEngine.
# The notebook never defines `service_context` or `llm`; the model configured
# on the global Settings object is passed explicitly instead.
kgqe_query_engine = KnowledgeGraphQueryEngine(
    storage_context=storage_context,
    llm=Settings.llm,
    verbose=True,
)

# 7. KnowledgeGraphRAGRetriever wrapped in a RetrieverQueryEngine
graph_rag_retriever = KnowledgeGraphRAGRetriever(
    storage_context=storage_context,
    llm=Settings.llm,
    verbose=True,
)

kg_rag_query_engine = RetrieverQueryEngine.from_args(graph_rag_retriever)

In [None]:
# The seven engines, in the order they are compared
comparison_engines = {
    "KG vector-based": kg_query_engine,
    "KG keyword-based": kg_keyword_query_engine,
    "KG hybrid": kg_hybrid_query_engine,
    "Vector index": vector_query_engine,
    "Custom (vector + KG)": custom_query_engine,
    "KnowledgeGraphQueryEngine": kgqe_query_engine,
    "KnowledgeGraphRAGRetriever": kg_rag_query_engine,
}

# NOTE(review): the notebook prose spells the player's name "Trea Turner";
# confirm which spelling the loaded documents use before changing the question.
comparison_questions = [
    "Tell me about Bryce Harper.",
    "How did the standing ovation Trey Turner received change his season?",
    "Tell me some facts about the current stadium of Philadelphia Phillies.",
]


def show_answer(engine_name, engine, question):
    """Query one engine, render a labelled answer, and return the response.

    Args:
        engine_name: label shown above the answer.
        engine: any object exposing ``query(question)``.
        question: the natural-language question to ask.

    Returns:
        Whatever ``engine.query(question)`` returned.
    """
    response = engine.query(question)
    display(Markdown(f"**{engine_name}** — {question}"))
    display(Markdown(f"<b>{response}</b>"))
    return response


# Same order as asking each question to every engine in turn
for question in comparison_questions:
    for engine_name, engine in comparison_engines.items():
        response = show_answer(engine_name, engine, question)