# 检索器优化：查询重写
本质上查询重写就是使用多重查询来改写问题

In [1]:
! pip install langchain_chroma

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting langchain_chroma
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/08/bc/591e416a134f9fb766579d1ca9739f32b9d79c607a5053d34ad2b8fa7246/langchain_chroma-0.2.6-py3-none-any.whl (12 kB)
Collecting chromadb>=1.0.20 (from langchain_chroma)
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/00/39/5969bec252d6b174eeb68a5b23c88cbe4913a1e20d6b313ec628e5079c74/chromadb-1.1.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (19.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m917.5 kB/s[0m  [33m0:00:21[0m0:00:01[0m00:01[0m
[?25hCollecting build>=1.0.3 (from chromadb>=1.0.20->langchain_chroma)
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/cb/8c/2b30c12155ad8de0cf641d76a8b396a16d2c36bc6d50b621a62b7c4567c1/build-1.3.0-py3-none-any.whl (23 kB)
Collecting pybase64>=1.4.1 (from chromadb>=1.0.20->langchain_chroma)
  Downloading https://pypi.tuna.tsinghua

In [2]:
import os
from dotenv import load_dotenv

# 加载 .env 文件中的环境变量
load_dotenv(override=True)  # 使用 override=True 确保加载最新的 .env 数据

True

In [3]:
from langchain_openai import OpenAIEmbeddings

embeddings_model = OpenAIEmbeddings(
    model="BAAI/bge-m3",
    # https://api.siliconflow.cn/v1/embeddings
    base_url=os.environ.get("SILICONFLOW_API_BASE"),
    api_key=os.environ.get("SILICONFLOW_API_KEY"),
)

In [4]:
from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

page_url = "https://python.langchain.com/docs/how_to/MultiQueryRetriever/"

# 加载网页
loader = WebBaseLoader(web_paths=[page_url])
data = loader.load()

# Split
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
splits = text_splitter.split_documents(data)

# VectorDB
vectordb = Chroma.from_documents(documents=splits, embedding=embeddings_model)

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [None]:
# 使用多重查询
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_openai import ChatOpenAI
import os

llm = ChatOpenAI(
    model=os.environ.get("OPENAPI_MODEL"),
    base_url=os.environ.get("OPENAPI_API_BASE"),
    api_key=os.environ.get("OPENAPI_API_KEY"),
    temperature=0,
)


# 多重查询检索器
# 传入向量数据库默认的检索器和LLM，最终多重问题的质量跟这里传入的模型有很大的关系
retriever_from_llm = MultiQueryRetriever.from_llm(
    retriever=vectordb.as_retriever(), llm=llm
)

In [6]:
import logging

logging.basicConfig()
logging.getLogger("langchain.retrievers.multi_query").setLevel(logging.INFO)

In [None]:
question = "如何让用户查询更准确？"

unique_docs = retriever_from_llm.invoke(question)
print(unique_docs)
len(unique_docs)

INFO:langchain.retrievers.multi_query:Generated queries: ['1. 有哪些方法可以提高用户查询的精准度？  ', '2. 用户搜索时如何减少不相关的结果？  ', '3. 优化查询准确性的技术和策略有哪些？']


[Document(id='09750ca3-495c-434b-87db-a67d20c0d8c5', metadata={'language': 'en', 'title': 'How to use the MultiQueryRetriever | 🦜️🔗 LangChain', 'description': 'Distance-based vector database retrieval embeds (represents) queries in high-dimensional space and finds similar embedded documents based on a distance metric. But, retrieval may produce different results with subtle changes in query wording, or if the embeddings do not capture the semantics of the data well. Prompt engineering / tuning is sometimes done to manually address these problems, but can be tedious.', 'source': 'https://python.langchain.com/docs/how_to/MultiQueryRetriever/'}, page_content='OpenAIEmbeddings()vectordb = Chroma.from_documents(documents=splits, embedding=embedding)'), Document(id='d14c1373-6efd-48f1-804b-686609738abd', metadata={'title': 'How to use the MultiQueryRetriever | 🦜️🔗 LangChain', 'description': 'Distance-based vector database retrieval embeds (represents) queries in high-dimensional space and fi

6