## 进阶RAG检索：MultiQueryRetriever

### 准备工作（加载数据、定义embedding模型、向量库）

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma


# Load the PDF. The relative path uses forward slashes, which resolve on
# Windows, macOS and Linux alike; a backslash-separated path is only
# understood on Windows.
loader = PyPDFLoader("../../baichuan.pdf")
data = loader.load()

# Split only the first six loaded documents (PyPDFLoader yields one per page)
# into chunks capped at chunk_size=500 with no overlap between neighbours.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
splits = text_splitter.split_documents(data[:6])



In [6]:
import os
from getpass import getpass

# Reuse a key that is already exported in the environment and only prompt
# when it is missing, so re-running the cell (or "Restart & Run All") does not
# block on input unnecessarily. The explicit prompt replaces getpass's default
# "Password: " text, which does not say what is being asked for.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# VectorDB
# NOTE(review): base_url="..." looks like a redacted placeholder -- set it to
# the OpenAI-compatible endpoint actually in use before running.
embedding = OpenAIEmbeddings(base_url="...")
# Embed every chunk in `splits` and index it in a Chroma vector store.
vectordb = Chroma.from_documents(documents=splits, embedding=embedding)

### MultiQueryRetriever

In [7]:
# Enable INFO-level records for the multi-query retriever's module logger so
# that the queries it generates are printed when the retriever runs.
import logging

logging.basicConfig()
multi_query_logger = logging.getLogger("langchain.retrievers.multi_query")
multi_query_logger.setLevel(logging.INFO)

In [11]:
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.chat_models import ChatOpenAI

# The user question; the multi-query retriever rephrases it into several
# search queries before hitting the vector store.
question = "what is baichuan2 ?"

# Chat model created with temperature=0; it is handed to the retriever below
# and reused later for answering.
# NOTE(review): base_url="..." looks like a redacted placeholder -- confirm.
llm = ChatOpenAI(temperature=0, base_url="...")

# Wrap the vector store's plain retriever with the LLM-driven multi-query one.
base_retriever = vectordb.as_retriever()
retriever_from_llm = MultiQueryRetriever.from_llm(
    llm=llm,
    retriever=base_retriever,
)


In [12]:
# Retrieve through the Runnable `invoke` entry point; `get_relevant_documents`
# is deprecated in current LangChain releases. The generated query variants
# are logged at INFO level while this runs.
docs = retriever_from_llm.invoke(question)
# Cell output: how many documents were returned.
len(docs)

INFO:langchain.retrievers.multi_query:Generated queries: ['1. Can you provide information on baichuan2?', '2. What can you tell me about baichuan2?', '3. Could you explain the concept of baichuan2 to me?']


5

In [13]:
# Display the retrieved Document objects (metadata and page_content) as the cell output.
docs

[Document(metadata={'author': '', 'creationdate': '2025-04-18T00:32:55+00:00', 'creator': 'LaTeX with hyperref', 'keywords': '', 'moddate': '2025-04-18T00:32:55+00:00', 'page': 1, 'page_label': '2', 'producer': 'pdfTeX-1.40.25', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'source': '..\\..\\baichuan.pdf', 'subject': '', 'title': '', 'total_pages': 28, 'trapped': '/False'}, page_content='evaluations, Baichuan 2 nearly doubles the results\nof the Baichuan 1. In addition, Baichuan 2 also\ndemonstrates strong performance on medical and\nlegal domain tasks. On benchmarks such as\nMedQA (Jin et al., 2021) and JEC-QA (Zhong\net al., 2020), Baichuan 2 outperforms other open-\nsource models, making it a suitable foundation\nmodel for domain-specific optimization.\nAdditionally, we also released two chat\nmodels, Baichuan 2-7B-Chat and Baichuan 2-'),
 Document(metadata={'author': '', 'creationdate': '2025-04-18T00:32:55+00:00', 'cr

In [14]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain



# Prompt (written in Chinese) instructing the model to answer only from the
# supplied content and to reply "我不知道" ("I don't know") when the content
# does not contain the answer.
template = """基于以下提供的内容回答问题，如果内容中不包含问题的答案，请回答“我不知道”
内容：
{contexts}

问题： {query}
"""

# `contexts` and `query` are the two placeholders filled in at call time.
mulitquery_PROMPT = PromptTemplate(
    template=template,
    input_variables=["query", "contexts"],
)

# Chain that pairs the prompt above with the chat model.
qa_chain = LLMChain(prompt=mulitquery_PROMPT, llm=llm)

  qa_chain = LLMChain(llm=llm, prompt=mulitquery_PROMPT)


In [15]:
# Concatenate the retrieved passages, separated by a "---" line, into the
# single string the prompt's {contexts} placeholder expects.
retrieved_context = "\n---\n".join(d.page_content for d in docs)

# Use `invoke` rather than calling the chain object directly: the direct call
# form is deprecated in LangChain. The result is a dict that echoes the inputs
# alongside the generated answer.
out = qa_chain.invoke({"query": question, "contexts": retrieved_context})

out

  out = qa_chain(  inputs={"query": question,


{'query': 'what is baichuan2 ?',
 'contexts': 'evaluations, Baichuan 2 nearly doubles the results\nof the Baichuan 1. In addition, Baichuan 2 also\ndemonstrates strong performance on medical and\nlegal domain tasks. On benchmarks such as\nMedQA (Jin et al., 2021) and JEC-QA (Zhong\net al., 2020), Baichuan 2 outperforms other open-\nsource models, making it a suitable foundation\nmodel for domain-specific optimization.\nAdditionally, we also released two chat\nmodels, Baichuan 2-7B-Chat and Baichuan 2-\n---\nWith such a massive amount of training data,\nBaichuan 2 achieves significant improvements over\nBaichuan 1. On general benchmarks like MMLU\n(Hendrycks et al., 2021a), CMMLU (Li et al.,\n2023), and C-Eval (Huang et al., 2023), Baichuan\n2-7B achieves nearly 30% higher performance\ncompared to Baichuan 1-7B. Specifically, Baichuan\n2 is optimized to improve performance on math\nand code problems. On the GSM8K (Cobbe\net al., 2021) and HumanEval (Chen et al., 2021)\n---\nBaichuan 1-1