# 更加可靠的检索

本文介绍一种更加可靠的检索方法：在常规检索流程的基础上增加文档相关性检查和幻觉检测两个步骤，以确保检索信息和最终答案的准确性与相关性。

In [34]:
import re
import os
from dotenv import load_dotenv
load_dotenv()

# Verify that the MiniMax credentials are present in the process environment.
# The previous `os.environ[name] = os.getenv(name)` was a no-op when the
# variable existed and raised "TypeError: str expected, not NoneType" when it
# did not, which hid the name of the missing variable.
_missing_vars = [
    name
    for name in ("MINIMAX_GROUP_ID", "MINIMAX_API_KEY")
    if os.getenv(name) is None
]
if _missing_vars:
    raise EnvironmentError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_vars)
        + ". Define them in the shell or in the .env file read by load_dotenv()."
    )

from langchain_qdrant import QdrantVectorStore
from langchain_community.chat_models import MiniMaxChat
from langchain_community.embeddings import MiniMaxEmbeddings
from langchain_community.document_loaders import TextLoader
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.output_parsers import StrOutputParser

### 第一步 文档加载和分块
加载基于txt的文档并将其拆分为更小的、可管理的块，以促进高效的向量编码和检索。

In [19]:
# Read the UTF-8 text file into LangChain documents.
file_path = '../data/问答题.txt'
loader = TextLoader(file_path=file_path, encoding="utf-8")
documents = loader.load()
# Last expression of the cell: the notebook displays the loaded documents.
documents

[Document(metadata={'source': '../data/问答题.txt'}, page_content='挂失解挂\n挂失解挂，快速响应，保证号卡安全\n一键解挂，轻松恢复\n服务链接：https://wapah.189.cn/fjcfw/zzgsjg.wap\n\n\n资费专区\n套餐资费一目了然，选择更轻松\n资费专区，各类套餐资费信息抢先看\t\n服务链接：https://wapact.189.cn:9001/provincialarea/provincialarea.html?provCode=600301\n\n\n补换卡\t线上换卡，足不出户\t号卡用不了？不用急，专属补换卡服务帮您迅速搞定！\t\n服务链接：https://ah.189.cn/cms/r/cms/ah/default/v2020/replaceCard/index.html\n\n\n已定业务\t\n服务链接：https://wapah.189.cn/busQueryAuth/newBusPackageInit.wap\n\n\n充值话费\t\n服务链接：http://wapah.189.cn/index/rechargeIndex.wap?fromSource=QYWX\n\n\n订购流量\t\n服务链接：http://wapah.189.cn/handle/packageIndex.shtml?fromSource=QYWX\n\n\n积分兑换\t\n服务链接：http://wapah.189.cn/exchange/exchangeInit.shtml?fromSource=QYWX\n\n\n套餐余量\n服务链接：http://wapah.189.cn/used/newUsedAmountQry.wap?m=-1&fromSource=QYWX\n\n\n服务进度查询\t\n服务链接：http://ah.189.cn/cms/r/cms/ah/default/v2018/repairSheet/querySheet.html?B9wgfv37VqnX=1714291687044\n\n\n宽带远程停复机\n服务链接：http://wapah.189.cn/ua/toLogin.shtml\n\n\n已办业务查询\t\n服务链接：http://wapah.1

使用正则表达式根据特定分隔符分割文本

In [24]:
# Entries in the loaded text are separated by blank lines.
separator = "\n\n"
source_path = documents[0].metadata["source"]
raw_pieces = re.split(separator, documents[0].page_content)
# Trim every piece and drop the ones that are empty after trimming.
chunks = [piece.strip() for piece in raw_pieces if piece.strip()]
# Wrap each chunk in a Document that keeps the originating file path.
docs = [
    Document(page_content=chunk, metadata={"source": source_path})
    for chunk in chunks
]
# Last expression of the cell: the notebook displays the resulting chunks.
docs

[Document(metadata={'source': '../data/问答题.txt'}, page_content='挂失解挂\n挂失解挂，快速响应，保证号卡安全\n一键解挂，轻松恢复\n服务链接：https://wapah.189.cn/fjcfw/zzgsjg.wap'),
 Document(metadata={'source': '../data/问答题.txt'}, page_content='资费专区\n套餐资费一目了然，选择更轻松\n资费专区，各类套餐资费信息抢先看\t\n服务链接：https://wapact.189.cn:9001/provincialarea/provincialarea.html?provCode=600301'),
 Document(metadata={'source': '../data/问答题.txt'}, page_content='补换卡\t线上换卡，足不出户\t号卡用不了？不用急，专属补换卡服务帮您迅速搞定！\t\n服务链接：https://ah.189.cn/cms/r/cms/ah/default/v2020/replaceCard/index.html'),
 Document(metadata={'source': '../data/问答题.txt'}, page_content='已定业务\t\n服务链接：https://wapah.189.cn/busQueryAuth/newBusPackageInit.wap'),
 Document(metadata={'source': '../data/问答题.txt'}, page_content='充值话费\t\n服务链接：http://wapah.189.cn/index/rechargeIndex.wap?fromSource=QYWX'),
 Document(metadata={'source': '../data/问答题.txt'}, page_content='订购流量\t\n服务链接：http://wapah.189.cn/handle/packageIndex.shtml?fromSource=QYWX'),
 Document(metadata={'source': '../data/问答题.txt'}, page_conten

### 第二步 创建向量存储
利用 MiniMaxEmbeddings 将文档块转换为向量，并存入 Qdrant 向量存储中，从而实现高效的基于相似性的检索。

In [25]:
embeddings = MiniMaxEmbeddings()
# Connection and collection settings for the Qdrant server on localhost.
qdrant_options = {
    "url": "http://localhost:6333/",
    "prefer_grpc": True,
    "collection_name": "007",
}
# Embed every chunk and store the vectors in the collection.
# NOTE(review): re-running this cell presumably inserts the same chunks into
# the existing collection again — confirm, and consider recreating the
# collection if duplicate search hits appear.
vectorstore = QdrantVectorStore.from_documents(docs, embeddings, **qdrant_options)

### 第三步 问题查询
使用检索器来查询问题，并从向量存储中检索相关文档块。

In [27]:
# Build a similarity-search retriever that returns 5 chunks per query.
search_kwargs = {'k': 5}
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs=search_kwargs)

# Run the user question through the retriever.
question = "存费送权益的活动有哪些？"
context = retriever.invoke(question)
# Last expression of the cell: the notebook displays the retrieved chunks.
context

[Document(metadata={'source': '../data/问答题.txt', '_id': '0c45d39c-1f3f-4c9f-9cec-5ce82ef9b306', '_collection_name': '007'}, page_content='问：存费送权益活动期多久\n答：您好，2024-08-01 00:00:00 开始到 2024-12-31 23:59:59 结束，如果您需要请及时参加。'),
 Document(metadata={'source': '../data/问答题.txt', '_id': '53c0d279-ec97-456e-a577-6866518ac391', '_collection_name': '007'}, page_content='问：存费送权益的活动内容是什么样的\n答：活动1：预存300元，协议期6个月，分6个月返还话费，每个月返50元话费，并赠送大权益包A包。\n活动2：预存600元，协议期12个月，分12个月返还话费，每个月返50元话费，并赠送大权益包B包。'),
 Document(metadata={'source': '../data/问答题.txt', '_id': 'f9a7ee88-3680-4b80-a74b-a9bed4bc40c0', '_collection_name': '007'}, page_content='问：存费送权益我能参加吗\n答：您好，目前这个活动针对安徽电信翼支付预付费、后付费手机用户均可办理'),
 Document(metadata={'source': '../data/问答题.txt', '_id': '5658881a-a4be-49ea-91c4-bcc136815caa', '_collection_name': '007'}, page_content='问：存费送权益的的权益包是什么\n答：大权益包A包：以下权益每月N选2：爱奇艺周卡/优酷周卡/腾讯周卡/5元水电煤缴费券/5元中石油加油券/5元商户券\n大权益包B包：以下权益每月N选1：爱奇艺月卡/优酷月卡/腾讯月卡/15元水电煤缴费券（5元*3张）/15元中石油元券/15元商户券（5元*3张）\n大权益包在总价值不减少的前提下，包含的内容可能会不定期进行更新调整，具体见翼支付

### 第四步 检查文档相关性
使用 MiniMaxChat 语言模型逐一判断检索到的文档块与问题的相关性，并过滤掉不相关的文档块。

In [28]:

# Data model
# Structured-output schema for the relevance grader: one string field that the
# model is asked to fill with 'yes' or 'no'.
# NOTE(review): the class docstring (it reads "check the relevance of the
# retrieved documents") is deliberately left untouched — structured-output
# tooling may forward it to the model as the schema description; confirm
# before editing or translating it.
class GradeDocuments(BaseModel):
    """对检索到的文档进行相关性检查"""

    # Relevance verdict for a single retrieved document.
    binary_score: str = Field(
        description="Documents are relevant to the question, 'yes' or 'no'"
    )

# Chat model shared by the grading and generation steps below.
llm = MiniMaxChat()
# Variant of the model whose output is parsed into a GradeDocuments object.
structured_llm_grader = llm.with_structured_output(GradeDocuments)

# Prompt
# System prompt: the model acts as a lenient relevance grader. A document is
# relevant when it shares keywords or meaning with the question; the goal is
# only to filter out erroneous retrievals. The answer must be 'yes' or 'no'.
system = """
    You are a grader assessing relevance of a retrieved document to a user question. \n 
    If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant. \n
    It does not need to be a stringent test. The goal is to filter out erroneous retrievals. \n
    Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question.
"""

# The human message supplies one retrieved document and the user question.
grade_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "Retrieved document: \n\n {document} \n\n User question: {question}"),
    ]
)

# Pipeline: fill the prompt, then ask the structured-output model for a verdict.
retrieval_grader = grade_prompt | structured_llm_grader

# Grade every retrieved chunk and keep only those the model marks as relevant.
docs_to_use = []
for doc in context:
    print(doc.page_content, '\n', '-'*50)
    res = retrieval_grader.invoke({"question": question, "document": doc.page_content})
    print(res,'\n')
    # The verdict is model output: tolerate a missing result and variants such
    # as "Yes" or " yes " instead of requiring the exact string 'yes'.
    verdict = (getattr(res, "binary_score", "") or "").strip().lower()
    if verdict == 'yes':
        docs_to_use.append(doc)

问：存费送权益活动期多久
答：您好，2024-08-01 00:00:00 开始到 2024-12-31 23:59:59 结束，如果您需要请及时参加。 
 --------------------------------------------------
binary_score='yes' 

问：存费送权益的活动内容是什么样的
答：活动1：预存300元，协议期6个月，分6个月返还话费，每个月返50元话费，并赠送大权益包A包。
活动2：预存600元，协议期12个月，分12个月返还话费，每个月返50元话费，并赠送大权益包B包。 
 --------------------------------------------------
binary_score='yes' 

问：存费送权益我能参加吗
答：您好，目前这个活动针对安徽电信翼支付预付费、后付费手机用户均可办理 
 --------------------------------------------------
binary_score='yes' 

问：存费送权益的的权益包是什么
答：大权益包A包：以下权益每月N选2：爱奇艺周卡/优酷周卡/腾讯周卡/5元水电煤缴费券/5元中石油加油券/5元商户券
大权益包B包：以下权益每月N选1：爱奇艺月卡/优酷月卡/腾讯月卡/15元水电煤缴费券（5元*3张）/15元中石油元券/15元商户券（5元*3张）
大权益包在总价值不减少的前提下，包含的内容可能会不定期进行更新调整，具体见翼支付客户端权益专区展示内容。 
 --------------------------------------------------
binary_score='yes' 

问：你们最近是有个存费送权益的活动吗
答：您好，感谢您对安徽电信的关注，目前是有预存300元/600元，协议期6/12个月，分6/12个月返还话费，每个月返50元话费，并赠送对应的大权益包活动。 
 --------------------------------------------------
binary_score='yes' 



### 第五步 生成答案
使用相关文档块来生成对用户查询的简明答案。

In [30]:

# Prompt
# The assistant must answer from the retrieved documents only, in at most
# three to five concise sentences. The earlier wording ("based upon your
# knowledge") invited answers from the model's own memory, which defeats the
# purpose of a grounded retrieval pipeline.
system = """
    You are an assistant for question-answering tasks. Answer the question using only the retrieved documents provided.
    If the documents do not contain the answer, say that you don't know.
    Use three-to-five sentences maximum and keep the answer concise.
"""

# The human message carries the formatted documents and the user question.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "Retrieved documents: \n\n <docs>{documents}</docs> \n\n User question: <question>{question}</question>"),
    ]
)

def format_docs(docs) -> str:
    """Render documents as numbered ``<docN>`` blocks for insertion into a prompt.

    Args:
        docs: Iterable of document-like objects exposing a ``metadata`` mapping
            and a ``page_content`` string.

    Returns:
        One block per document, numbered from 1 and separated by a blank line;
        an empty string when ``docs`` is empty. A document without a
        ``"source"`` metadata entry is labelled ``unknown`` instead of raising
        ``KeyError``.
    """
    blocks = []
    for number, doc in enumerate(docs, start=1):
        tag = f"doc{number}"
        source = doc.metadata.get("source", "unknown")
        blocks.append(
            f"<{tag}>:\nSource:{source}\nContent:{doc.page_content}\n</{tag}>\n"
        )
    return "\n".join(blocks)

# Chain: prompt -> chat model -> plain string output
rag_chain = prompt | llm | StrOutputParser()

# Generate the answer from the documents that passed the relevance check
rag_inputs = {"question": question, "documents": format_docs(docs_to_use)}
generation = rag_chain.invoke(rag_inputs)
print(generation)

您好，存费送权益的活动有两种方案：活动1是预存300元，协议期6个月，分6个月返还话费，每个月返50元话费，并赠送大权益包A包；活动2是预存600元，协议期12个月，分12个月返还话费，每个月返50元话费，并赠送大权益包B包。


### 第六步 幻觉检测

专用的幻觉检测步骤确保生成的答案基于检索到的文档，防止包含不支持或错误的信息。

In [35]:
# Hallucination detection uses a simple yes/no score: 'yes' means the answer is
# based on the retrieved documents, 'no' means it may contain hallucinations.
# Structured-output schema for the hallucination grader.
class GradeHallucinations(BaseModel):
    # Required verdict field; the description asks the model for 'yes' or 'no'.
    binary_score: str = Field(
        ...,
        description="Answer is grounded in the facts, 'yes' or 'no'"
    )


# Rebind the grader name to a model whose output is parsed into GradeHallucinations.
structured_llm_grader = llm.with_structured_output(GradeHallucinations)

# Prompt
# System prompt: the model judges whether a generated answer is supported by
# the supplied facts and replies 'yes' (grounded) or 'no'.
system = """
    You are a grader assessing whether an LLM generation is grounded in / supported by a set of retrieved facts. \n 
    Give a binary score 'yes' or 'no'. 'Yes' means that the answer is grounded in / supported by the set of facts.
"""

# The human message carries the formatted documents and the generated answer.
hallucination_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "Set of facts: \n\n <facts>{documents}</facts> \n\n LLM generation: <generation>{generation}</generation>"),
    ]
)

# Pipeline: fill the prompt, then ask the structured-output model for a verdict.
hallucination_grader = hallucination_prompt | structured_llm_grader

# Grade the generated answer against the documents it was generated from and show the verdict.
response = hallucination_grader.invoke({"documents": format_docs(docs_to_use), "generation": generation})
print(response)

binary_score='yes'


### 最后 显示答案使用的文档块
识别并突出显示检索到的文档中用于生成最终答案的确切片段。


In [32]:
from typing import List
from langchain.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate

# Schema for the specific parts of the documents that were used to answer the
# question. It holds three lists; the display loop in a later cell zips them,
# so entries at the same index are treated as one record.
class HighlightDocuments(BaseModel):
    # Identifiers of the documents used for the answer.
    id: List[str] = Field(
        ...,
        description="List of id of docs used to answers the question"
    )

    # Source of each document used for the answer.
    source: List[str] = Field(
        ...,
        description="List of sources used to answers the question"
    )

    # Text segments taken from the used documents.
    segment: List[str] = Field(
        ...,
        description="List of direct segements from used documents that answers the question"
    )

# parser
# Parses the model's text output into a HighlightDocuments object.
parser = PydanticOutputParser(pydantic_object=HighlightDocuments)

# Prompt
# Single-template prompt: asks the model to extract verbatim segments from the
# supplied documents that support the generated answer. The parser's format
# instructions are injected through the {format_instructions} placeholder.
system = """
You are an advanced assistant for document search and retrieval. You are provided with the following:
1. A question.
2. A generated answer based on the question.
3. A set of documents that were referenced in generating the answer.

Your task is to identify and extract the exact inline segments from the provided documents that directly correspond to the content used to 
generate the given answer. The extracted segments must be verbatim snippets from the documents, ensuring a word-for-word match with the text 
in the provided documents.

Ensure that:
- (Important) Each segment is an exact match to a part of the document and is fully contained within the document text.
- The relevance of each segment to the generated answer is clear and directly supports the answer provided.
- (Important) If you didn't used the specific document don't mention it.

Used documents: <docs>{documents}</docs> \n\n User question: <question>{question}</question> \n\n Generated answer: <answer>{generation}</answer>

<format_instruction>
{format_instructions}
</format_instruction>
"""


# This rebinds `prompt`, which an earlier cell used for the answer-generation
# chat prompt. The format instructions are fixed once here; the remaining
# three variables are supplied at invoke time.
prompt = PromptTemplate(
    template= system,
    input_variables=["documents", "question", "generation"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

# Chain
# Pipeline: fill the prompt, call the chat model, parse the reply.
doc_lookup = prompt | llm | parser

# Run
# Look up the segments that support `generation` within the relevant documents.
lookup_response = doc_lookup.invoke({"documents":format_docs(docs_to_use), "question": question, "generation": generation})

In [33]:
# Print the document id, source file and text segment of every highlighted record.
# `doc_id` is used instead of `id` so the built-in id() is not overwritten in
# the notebook's global namespace after this cell runs.
# Note: zip() stops at the shortest list, so records are dropped without an
# error if the model returned lists of different lengths.
for doc_id, source, segment in zip(lookup_response.id, lookup_response.source, lookup_response.segment):
    print(f"ID: {doc_id}\nSource: {source}\nText Segment: {segment}\n")

ID: doc2
Source: ../data/问答题.txt
Text Segment: 活动1：预存300元，协议期6个月，分6个月返还话费，每个月返50元话费，并赠送大权益包A包。

ID: doc5
Source: ../data/问答题.txt
Text Segment: 活动2：预存600元，协议期12个月，分12个月返还话费，每个月返50元话费，并赠送大权益包B包。

