In [1]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars
from langchain_experimental.graph_transformers import LLMGraphTransformer

from neo4j import GraphDatabase

from pydantic import BaseModel, Field
from rich import print as pprint
import os

from dotenv import load_dotenv
load_dotenv(".env")

# Mirror the LC_-prefixed Neo4j settings onto the plain NEO4J_* variable names.
for _suffix in ("URI", "USERNAME", "PASSWORD"):
    os.environ[f"NEO4J_{_suffix}"] = os.environ[f"LC_NEO4J_{_suffix}"]

from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings

# Chat model shared by the rest of the notebook: graph extraction, entity
# extraction, question answering and the evaluation judge all use `gpt4o`.
# All connection settings come from environment variables.
gpt4o = AzureChatOpenAI(
    api_key=os.environ["MY_AZURE_OPENAI_API_KEY"],
    azure_endpoint = os.environ["MY_AZURE_OPENAI_ENDPOINT"],
    azure_deployment = os.environ["MY_AZURE_OPENAI_DEPLOYMENT_NAME_CHAT"],
    openai_api_version = os.environ["AZURE_OPENAI_API_VERSION"],
    temperature=0
)

# Embedding model handed to the Neo4j vector index later in the notebook.
# NOTE(review): this client reads the un-prefixed AZURE_OPENAI_ENDPOINT and is
# given no api_key, whereas the chat client above reads MY_-prefixed variables.
# Confirm both point at the intended Azure resource and that a key is supplied
# some other way.
embeddings = AzureOpenAIEmbeddings(
    azure_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"],
    azure_deployment = os.environ["AZURE_OPENAI_DEPLOYMENT_NAME_EMBEDDINGS"],
    openai_api_version = os.environ["AZURE_OPENAI_API_VERSION"],
)


In [2]:
#@markdown <a id="connect_db" name="connect_db"></a>
#@markdown # **步驟 3：連線圖形資料庫** 📃
#@markdown 根據剛剛所提供的金鑰，讓程式連上資料庫
#@markdown ---

from langchain_neo4j import Neo4jGraph, Neo4jVector

# Neo4j connection used by the later cells to run Cypher queries and to store
# the extracted graph documents; credentials are the LC_-prefixed variables.
graph = Neo4jGraph(
    url = os.environ["LC_NEO4J_URI"],
    username = os.environ["LC_NEO4J_USERNAME"],
    password = os.environ["LC_NEO4J_PASSWORD"]
)

In [3]:
#@markdown <a id="reset_db" name="reset_db"></a>
#@markdown # **步驟(視情況決定做不做)：刪除圖形資料庫的所有資料** 📃
#@markdown 如果換了一篇文章，就需要執行
#@markdown ---

# Cypher statement that deletes every node together with its relationships
graph.query("MATCH (n) DETACH DELETE n")
# Cypher statement that drops the index named `vector`, if it exists
graph.query("DROP INDEX vector IF EXISTS")

[]

In [4]:
from langchain_community.document_loaders import TextLoader, MergedDataLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# One plain-text file per chapter (files 90-92); each gets its own loader.
chapter_paths = (
    "./data/90-Romance-of-the-Three-Kingdoms.txt",
    "./data/91-Romance-of-the-Three-Kingdoms.txt",
    "./data/92-Romance-of-the-Three-Kingdoms.txt",
)
loader_1, loader_2, loader_3 = (TextLoader(path) for path in chapter_paths)

# Load all three files into a single list of Document objects.
merged_loader = MergedDataLoader(loaders=[loader_1, loader_2, loader_3])
docs = merged_loader.load()

# To work on a single file instead, load it directly:
# loader = TextLoader(file_path="./data/92-Romance-of-the-Three-Kingdoms.txt")
# docs = loader.load()    # returns Document objects

# Split the loaded text into overlapping chunks for graph extraction.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=250, chunk_overlap=24)
documents = text_splitter.split_documents(documents=docs)

In [5]:
# Peek at the first two chunks to sanity-check the split.
print(documents[0:2])

[Document(metadata={'source': './data/90-Romance-of-the-Three-Kingdoms.txt'}, page_content='第九十回：驅巨獸六破蠻兵，燒藤甲七擒孟獲\n卻說孔明放了孟獲等一干人，楊鋒父子皆封官爵，重賞洞兵。楊鋒等拜謝而去。孟獲等連夜奔回銀坑洞。那洞外有三江：乃是瀘水、甘南水、西城水。三路水會合，故為三江。其洞北近平坦二百餘里，多產萬物；洞西二百餘里，有鹽井；西南二百里，直抵瀘、甘；正南三百里，乃是梁都洞。洞中有山，環抱其洞；山上出銀礦，故名為銀坑山。山中置宮殿樓臺，以為蠻王巢穴。'), Document(metadata={'source': './data/90-Romance-of-the-Three-Kingdoms.txt'}, page_content='其中建一祖廟，名曰「家鬼」。四時殺牛宰馬享祭。名曰「卜鬼」。每年常以蜀人并外鄉之人祭之。若人患病，不肯服藥，只禱師巫，名為「藥鬼。」其處無刑法，但犯罪即斬。有女長成，卻於溪中沐浴，男女自相混淆，任其自配，父母不禁，名為「學藝」。年歲雨水均調，則種稻穀；倘若不熟，殺蛇為羹，煮象為飯。每方隅之中，上戶號曰：「洞主」，次日「酋長」。每月初一十五兩日，皆在三江城中買賣，轉易貨物。其風俗如此。')]


In [6]:
#@markdown <a id="build_kg" name="build_kg"></a>
#@markdown # **步驟5：建立知識圖譜** 📃
#@markdown 將文本透過LLM建立知識圖譜
#@markdown ---

# Transformer that asks the chat model to extract a graph from text.
llm_transformer = LLMGraphTransformer(llm=gpt4o)

# Convert the chunks one at a time so that a failure on a single chunk is
# reported and skipped instead of aborting the whole run.
graph_documents = []
for chunk in documents:
    try:
        graph_documents.extend(llm_transformer.convert_to_graph_documents([chunk]))
    except Exception as err:
        print(f"Error processing document chunk: {err}")

# `graph_documents` now holds the merged results of every chunk that succeeded
# and is ready for the downstream knowledge-graph steps.
print(f"Total number of graph documents: {len(graph_documents)}")

# Persist the list of GraphDocuments into the database
graph.add_graph_documents(
    graph_documents,            # the GraphDocument list to store
    baseEntityLabel=True,       # give entity nodes a common base label; the full-text index built later in this cell targets `__Entity__` nodes
    include_source=True         # also keep each entity's source document, so results can be traced back to the text they came from
)

# Build a full-text index over the `id` property of every `__Entity__` node.
# The driver uses the same credentials as `graph` (the username is read from
# the environment rather than hard-coded).
driver = GraphDatabase.driver(
    uri=os.environ["LC_NEO4J_URI"],
    auth=(os.environ["LC_NEO4J_USERNAME"], os.environ["LC_NEO4J_PASSWORD"]),
)


def create_fulltext_index(tx):
    """Create the `fulltext_entity_id` index inside the transaction `tx`.

    `IF NOT EXISTS` makes the statement a no-op when the index is already
    present, so re-running this cell does not fail.
    """
    query = '''
    CREATE FULLTEXT INDEX `fulltext_entity_id` IF NOT EXISTS
    FOR (n:__Entity__)
    ON EACH [n.id];
    '''
    tx.run(query)


def create_index():
    """Run `create_fulltext_index` in a write transaction and report completion."""
    with driver.session() as session:
        session.execute_write(create_fulltext_index)
        print("Fulltext index created successfully.")


# Index creation stays best-effort, but failures are reported instead of being
# silently swallowed, and the driver is always closed.
try:
    create_index()
except Exception as e:
    print(f"Failed to create fulltext index: {e}")
finally:
    driver.close()

Total number of graph documents: 98


In [7]:
# Pretty-print the first extracted graph document for inspection.
pprint(graph_documents[0:1])

In [8]:
#@markdown <a id="build_llm_app" name="build_llm_app"></a>
#@markdown # **步驟6：建立LLM應用** 📃
#@markdown 建立一個可以用來查詢知識圖譜的LLM應用
#@markdown ---

# Schema describing the entities to extract from a piece of text.
# NOTE(review): the class docstring and the field description are presumably
# part of the schema passed to the model, so treat them as prompt text.
class Entities(BaseModel):
    """Identifying information about entities."""

    # Required list of entity names (`...` means the field has no default).
    names: list[str] = Field(
        ...,
        description="All the person, organization, or business entities that "
        "appear in the text",
    )

# Bind the schema to the chat model; `graph_retriever` reads `.names` from the
# object this chain returns.
entity_chain = gpt4o.with_structured_output(Entities)

# # 這段程式碼的目的是將輸入的文字轉換成適合 Lucene 搜索引擎的查詢字串，並使用模糊搜索來提高搜索的靈活性。
# def generate_full_text_query(input: str) -> str:
#     words = [el for el in remove_lucene_chars(input).split() if el]
#     if not words:
#         return ""
#     full_text_query = " AND ".join([f"{word}~2" for word in words])
#     print(f"Generated Query: {full_text_query}")
#     return full_text_query.strip()


def graph_retriever(question: str) -> str:
    """
    Collect the graph neighborhood of the entities mentioned in the question.

    Args:
        question: The user's question as plain text.

    Returns:
        One line per relationship found, formatted as
        ``source - REL_TYPE -> target``, plus one line per entity that produced
        no results or an error. If no entity was extracted at all, a fixed
        "no relevant data" sentence is returned instead.
    """
    entities = entity_chain.invoke(question)    # extract the entity names

    # For each entity:
    #   1. look it up in the `fulltext_entity_id` index (at most 5 nodes);
    #   2. list the relationships touching those nodes in either direction,
    #      skipping MENTIONS relationships, up to 10 lines.
    # The index only covers `__Entity__` nodes, so `Document` nodes are never
    # returned by this lookup.
    parts = []
    for entity in entities.names:
        # The index query string is parsed as Lucene syntax, so characters such
        # as + - ! ( ) : " in an entity name could break or alter the query.
        lucene_query = remove_lucene_chars(entity)
        if not lucene_query:
            # Nothing searchable is left after sanitising.
            parts.append(f"No results found for entity '{entity}'\n")
            continue
        try:
            response = graph.query(
                """CALL db.index.fulltext.queryNodes('fulltext_entity_id', $query, {limit:5})
                YIELD node,score
                CALL (node) {
                WITH node
                MATCH (node)-[r:!MENTIONS]->(neighbor)
                RETURN node.id + ' - ' + type(r) + ' -> ' + neighbor.id AS output
                UNION ALL
                WITH node
                MATCH (node)<-[r:!MENTIONS]-(neighbor)
                RETURN neighbor.id + ' - ' + type(r) + ' -> ' +  node.id AS output
                }
                RETURN output LIMIT 10
                """,
                {"query": lucene_query},
            )
            if response:
                parts.append("\n".join([el['output'] for el in response]) + "\n")
            else:
                parts.append(f"No results found for entity '{entity}'\n")
        except Exception as e:
            # Keep going with the remaining entities; the error is reported in
            # the returned text.
            parts.append(f"Error encountered for entity '{entity}': {e}\n")

    result = "".join(parts)
    return result if result else "No relevant data found in the knowledge graph."


# Vector index over the `text` property of `Document` nodes, with embeddings
# kept in their `embedding` property and `search_type` set to "hybrid".
# NOTE(review): no connection arguments are passed here; presumably the
# NEO4J_* environment variables set in the first cell are picked up — confirm.
vector_index = Neo4jVector.from_existing_graph(
    embeddings,
    search_type="hybrid",
    node_label="Document",
    text_node_properties=["text"],
    embedding_node_property="embedding"
)
# Retriever wrapper around the index, with default settings.
vector_retriever = vector_index.as_retriever()

# Combine the node/relationship lookup with the vector text search
def full_retriever(question: str) -> str:
    """Build the context text for a question from both retrievers.

    Args:
        question: The user's question as plain text.

    Returns:
        A string with a "Graph data:" section (the output of `graph_retriever`)
        followed by a "vector data:" section holding the retrieved passages
        separated by blank lines.
    """
    graph_data = graph_retriever(question)
    vector_data = [el.page_content for el in vector_retriever.invoke(question)]
    # Join outside the f-string: a backslash inside an f-string expression is a
    # SyntaxError before Python 3.12.
    vector_text = "\n\n".join(vector_data)
    final_data = f"""Graph data:
{graph_data}
vector data:
{vector_text}
    """
    return final_data

# Baseline chain: the chat model on its own (no retrieval), returning plain text.
native_llm_chain = ( gpt4o | StrOutputParser() )

# Prompt for the retrieval-augmented chain; `{context}` and `{question}` are
# filled in by `KG_chain` below.
template = """Use the following knowledge graph context and vector text to answer the question comprehensively. Base your answer strictly on the context provided.
If additional information seems relevant but isn't in the context, indicate that it's not present in the knowledge graph. Answer in Traditional Chinese.

Context:
{context}

Question: {question}

Provide a clear, thorough answer based on the context above:
Answer:"""

prompt = ChatPromptTemplate.from_template(template)

# Retrieval-augmented chain: retrieve context, fill the prompt, ask the model.
KG_chain = (
    {
        "context": full_retriever,  # call full_retriever to fetch the graph + vector context
        "question": RunnablePassthrough(),  # pass the question through unchanged
    }
    | prompt
    | gpt4o  # let the LLM answer from the filled-in prompt
    | StrOutputParser()  # parse the LLM reply into plain text
)

In [9]:
# Show which entities the structured-output chain extracts from a sample question.
pprint(entity_chain.invoke("孔明和孟獲之間發生過什麼事？"))

In [10]:
# Show the combined graph + vector context retrieved for the same question.
pprint(full_retriever("孔明和孟獲之間發生過什麼事？"))

In [11]:
# Answer the question with the retrieval-augmented chain.
pprint(KG_chain.invoke(input="孔明和孟獲之間發生過什麼事？"))

In [12]:
# Answer the same question with the bare model (no retrieval) for comparison.
pprint(native_llm_chain.invoke(input="孔明和孟獲之間發生過什麼事？"))

# Evaluation

In [13]:
from ragas import EvaluationDataset

# Evaluation questions paired with their reference answers.
qa_pairs = [
    (
        "誰是祝融夫人？",
        "祝融夫人是孟獲的妻子，南蠻祝融氏之後，善使飛刀，百發百中。",
    ),
    (
        "孟獲被諸葛亮第幾次擒住後才真心歸降？",
        "孟獲被諸葛亮第七次擒住後才真心歸降，並誓不再反。",
    ),
    (
        "孔明用什麼方法平定了烏戈國的藤甲軍？",
        "孔明利用火攻之計，在盤蛇谷用火藥和火砲燒毀了烏戈國的藤甲軍。",
    ),
    (
        "趙雲在鳳鳴山與韓德及其四子交戰的結果如何？",
        "趙雲在鳳鳴山與韓德及其四子交戰，最終斬殺了韓德及其三子，並生擒了次子韓瑤。",
    ),
    (
        "孔明如何智取南安城？",
        "孔明利用崔諒和楊陵的內應計策，讓關興和張苞扮作安定軍馬進入南安城，最終擒住了夏侯楙。",
    ),
]
sample_queries = [question for question, _ in qa_pairs]
expected_responses = [answer for _, answer in qa_pairs]

# One record per question: the passages from the vector retriever, the chain's
# answer, and the reference answer.
# NOTE(review): `retrieved_contexts` holds only the vector passages, while
# `KG_chain` answers from graph + vector context — confirm this is intended for
# the context-based metrics.
dataset = []
for question, gold_answer in zip(sample_queries, expected_responses):
    contexts = [doc.page_content for doc in vector_retriever.invoke(question)]
    answer = KG_chain.invoke(input=question)
    record = {
        "user_input": question,
        "retrieved_contexts": contexts,
        "response": answer,
        "reference": gold_answer,
    }
    dataset.append(record)

evaluation_dataset = EvaluationDataset.from_list(dataset)

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness

# The judge for the metrics is the same chat model that produced the answers.
evaluator_llm = LangchainLLMWrapper(gpt4o)

# Metrics to compute over the evaluation dataset.
eval_metrics = [LLMContextRecall(), Faithfulness(), FactualCorrectness()]

result = evaluate(dataset=evaluation_dataset, metrics=eval_metrics, llm=evaluator_llm)

# Trailing expression so the notebook displays the scores.
result

Evaluating: 100%|██████████| 15/15 [00:33<00:00,  2.21s/it]


{'context_recall': 1.0000, 'faithfulness': 0.9235, 'factual_correctness': 0.4300}