
HyDE 是一种创新方法，它将查询问题转换为包含答案的假设文档，旨在弥合向量空间中查询和文档分布之间的差距。

In [None]:
import os
from dotenv import load_dotenv
# Load variables from a local .env file (if one exists) into os.environ.
load_dotenv()

# Copying os.getenv(name) back into os.environ[name] would be a no-op when the
# variable is set and would raise "TypeError: str expected, not NoneType" when
# it is not. Check explicitly instead so a missing credential is reported by name.
# NOTE(review): presumably MiniMaxChat / MiniMaxEmbeddings read these
# variables from the environment -- confirm against the library docs.
_missing_env_vars = [
    name
    for name in ("MINIMAX_GROUP_ID", "MINIMAX_API_KEY")
    if os.getenv(name) is None
]
if _missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_env_vars)
    )

from langchain_qdrant import QdrantVectorStore
from langchain_community.chat_models import MiniMaxChat
from langchain_community.embeddings import MiniMaxEmbeddings
from langchain_community.document_loaders import Docx2txtLoader
from langchain.prompts import PromptTemplate
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [None]:
# Load the .docx file at `path` with Docx2txtLoader; the result is kept in
# `documents` for the splitting step that follows.
path = "../data/二十届三中全会.docx"
loader = Docx2txtLoader(path)
documents = loader.load()
# Bare expression: in a notebook cell this displays the loaded documents.
documents

In [None]:
# Split the loaded documents into chunks of size 500 with an overlap of 100.
# Sizes are measured with `len`, i.e. in characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
    length_function=len
)
docs = text_splitter.split_documents(documents)
# Number of chunks produced by the splitter.
chunk_count = len(docs)

In [None]:
# Embed the chunks with MiniMaxEmbeddings and store them in the Qdrant
# collection "011" on the server at localhost:6333.
# NOTE(review): presumably re-running this cell adds the same chunks to the
# collection again -- confirm before executing it more than once.
embeddings = MiniMaxEmbeddings()
vectorstore = QdrantVectorStore.from_documents(
    docs,
    embeddings,
    url="http://localhost:6333/",
    prefer_grpc=True,
    collection_name="011"
)

In [None]:
# Chat model that writes the hypothetical answer document.
llm = MiniMaxChat()

# HyDE prompt: asks the model to write a document that answers `query`, with a
# target length of `chunk_size` characters.
# The instruction previously read "has be exactly", a grammatical slip that was
# sent to the model verbatim; it now reads "has to be exactly".
hyde_prompt = PromptTemplate(
    input_variables=["query", "chunk_size"],
    template="""Given the question '{query}', generate a hypothetical document that directly answers this question. The document should be detailed and in-depth.
    the document size has to be exactly {chunk_size} characters.""",
)

# Compose prompt and model: invoking the chain with a dict of the two prompt
# variables formats the prompt and passes it to the chat model.
hyde_chain = hyde_prompt | llm

In [None]:
def generate_hypothetical_document(self, query, chunk_size=500):
    """Generate a hypothetical document that answers *query*.

    Args:
        self: Object exposing a ``hyde_chain`` attribute whose ``invoke``
            method accepts a dict with ``"query"`` and ``"chunk_size"`` keys
            and returns an object with a ``content`` attribute.
        query: The user question to answer hypothetically.
        chunk_size: Target document length forwarded to the prompt.
            Defaults to 500, the value that was previously hard-coded.

    Returns:
        The ``content`` of the chain's response.
    """
    prompt_inputs = {"query": query, "chunk_size": chunk_size}
    return self.hyde_chain.invoke(prompt_inputs).content

def retrieve(self, query, k=3):
    """Search the vector store using a generated hypothetical document.

    The hypothetical document produced by
    ``self.generate_hypothetical_document(query)`` is used as the search text
    for ``self.vectorstore.similarity_search`` in place of the raw query.

    Args:
        self: Object exposing ``generate_hypothetical_document`` and
            ``vectorstore``.
        query: The user question.
        k: Number of results requested from the similarity search.

    Returns:
        A ``(similar_docs, hypothetical_doc)`` tuple.
    """
    hyde_text = self.generate_hypothetical_document(query)
    matches = self.vectorstore.similarity_search(hyde_text, k=k)
    return matches, hyde_text
