# 4种文档处理链

- stuff
- refine
- map reduce
- map rerank

1. stuffChain

最常见文档链，直接将文档塞进 prompt 中

In [None]:
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.document_loaders import  PyPDFLoader
from langchain.chat_models import ChatOpenAI

# Stuff chain: the loaded documents are placed directly into a single
# summarisation prompt.

# Load the PDF into documents.
loader = PyPDFLoader("../assets/loader.pdf")
docs = loader.load()

# Summarisation prompt; {text} is the placeholder the chain fills with the
# document contents.
prompt_template = "对以下文字做简洁的总结:\n{text}\n简洁的总结:"
prompt = PromptTemplate.from_template(prompt_template)

llm = ChatOpenAI(model="gpt-4-1106-preview", temperature=0)
llm_chain = LLMChain(prompt=prompt, llm=llm)

# document_variable_name names the prompt placeholder ("text") that
# receives the documents.
stuff_chain = StuffDocumentsChain(
    document_variable_name="text",
    llm_chain=llm_chain,
)

summary = stuff_chain.run(docs)
print(summary)



1.2 封装好的 load_summarize_chain

In [None]:
from langchain.document_loaders import  PyPDFLoader
from langchain.chat_models import ChatOpenAI
from langchain.chains.summarize import load_summarize_chain

# Same "stuff" summarisation, built with the load_summarize_chain helper
# instead of wiring the chain by hand.
llm = ChatOpenAI(model="gpt-4-1106-preview", temperature=0)

loader = PyPDFLoader("../assets/loader.pdf")
docs = loader.load()

chain = load_summarize_chain(llm=llm, chain_type="stuff", verbose=True)

# Left as the cell's final expression so the notebook displays the result.
chain.run(docs)

2. refine

通过循环调用 LLM，将文档块依次投喂并不断更新中间答案；适合前后逻辑有上下文关联的文档，不适合存在交叉引用的文档

In [None]:
from langchain.prompts import  PromptTemplate
from langchain.document_loaders import PyPDFLoader
from langchain.chat_models import ChatOpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains.summarize import load_summarize_chain

# Refine chain: summarise the first chunk, then feed each further chunk to
# the LLM together with the existing answer so it can be refined.

# Load the PDF.
loader = PyPDFLoader("../assets/loader.pdf")
docs = loader.load()

# Split into chunks (chunk_size=1000 measured with the tiktoken encoder,
# no overlap between chunks).
text_split = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=0,
    chunk_size=1000,
)
split_docs = text_split.split_documents(docs)

# Prompt passed as question_prompt (initial summary).
prompt_template = "对以下文字做简洁的总结:\n{text}\n简洁的总结:"
prompt = PromptTemplate.from_template(prompt_template)

# Prompt passed as refine_prompt; it receives the running answer
# ({existing_answer}) and the next chunk ({text}).
refine_template = "\n".join(
    [
        "你的任务是产生最终摘要",
        "我们已经提供了一个到某个特定点的现有回答:{existing_answer}",
        "我们有机会通过下面的一些更多上下文来完善现有的回答(仅在需要时使用).",
        "------------",
        "{text}",
        "------------",
        "根据新的上下文，用中文完善原始回答.",
        "如果上下文没有用处,返回原始回答.",
    ]
)
refine_prompt = PromptTemplate.from_template(refine_template)

llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

chain = load_summarize_chain(
    llm=llm,
    chain_type="refine",
    question_prompt=prompt,
    refine_prompt=refine_prompt,
    return_intermediate_steps=True,
    input_key="documents",
    output_key="output_text",
)

# Run the chain; the input dict uses the "documents" key configured above.
result = chain({"documents": split_docs}, return_only_outputs=True)

# Final summary.
print(result["output_text"])

# First three intermediate answers, separated by blank lines.
first_steps = result["intermediate_steps"][:3]
print("\n\n".join(first_steps))


3. map reduce chain

In [None]:
from langchain.chains import MapReduceDocumentsChain
from langchain.chains import ReduceDocumentsChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter

# Map-reduce chain: summarise every chunk independently (map), then merge
# the per-chunk summaries into one final summary (reduce).

# Load the PDF.
loader = PyPDFLoader("../assets/loader.pdf")
docs = loader.load()

# Split the text into chunks (chunk_size=1000 measured with the tiktoken
# encoder, no overlap between chunks).
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1000,
    chunk_overlap=0,
)
split_docs = text_splitter.split_documents(docs)

# Map step: summarise a single chunk, injected as {content}.
map_template = """对以下文字做简洁的总结:
"{content}"
简洁的总结:"""
map_prompt = PromptTemplate.from_template(map_template)
llm = ChatOpenAI(
    temperature=0,
    model="gpt-3.5-turbo",
)
map_chain = LLMChain(
    llm=llm,
    prompt=map_prompt,
)

# Reduce step: merge the collected summaries, injected as {doc_summaries}.
reduce_template = """以下是一个摘要集合:
{doc_summaries}
将上述摘要与所有关键细节进行总结.
总结:"""
reduce_prompt = PromptTemplate.from_template(reduce_template)
reduce_chain = LLMChain(
    prompt=reduce_prompt,
    llm=llm,
)
stuff_chain = StuffDocumentsChain(
    llm_chain=reduce_chain,
    document_variable_name="doc_summaries",
)
reduce_final_chain = ReduceDocumentsChain(
    combine_documents_chain=stuff_chain,
    # Once the summaries exceed token_max (4000) tokens, they are handed to
    # the next stuff_chain pass via collapse_documents_chain.
    collapse_documents_chain=stuff_chain,
    token_max=4000,
)

# Full map-reduce chain: map_chain per chunk, then reduce_final_chain.
map_reduce_chain = MapReduceDocumentsChain(
    llm_chain=map_chain,
    document_variable_name="content",
    reduce_documents_chain=reduce_final_chain,
)

# Run the chain on the chunks; previously the chain was built but never
# executed, so this cell produced no summary.
result = map_reduce_chain.run(split_docs)
print(result)

4. map rerank

先将每个文档或文档块投喂给LLM,并对每个文档或文档块生成问题的答案进行打分，然后将打分最高的文档或文档块作为最终答案返回

In [None]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.prompts import PromptTemplate

# Map-rerank chain: each chunk is sent to the LLM, which answers the
# question and scores its answer; the highest-scoring answer is returned.

llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-16k")

# Load the PDF.
loader = PyPDFLoader("../assets/loader.pdf")
docs = loader.load()

# Split into chunks (chunk_size=500 measured with the tiktoken encoder,
# no overlap between chunks).
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=500, chunk_overlap=0
)
split_docs = text_splitter.split_documents(docs)

# Use the llm configured above; previously it was created but ignored and
# a second default ChatOpenAI instance was passed to the chain instead.
chain = load_qa_with_sources_chain(
    llm,
    chain_type="map_rerank",
    metadata_keys=["source"],
    return_intermediate_steps=True,
)

query = "what is this document talk about?answer by chinese"
result = chain({"input_documents": split_docs, "question": query})

# Left as the cell's final expression so the notebook displays the result.
result