# Combine Docs

PermChain is a great choice for implementing workflows that operate over longer documents, because of its recursive nature.

In [1]:
from operator import itemgetter

from langchain.chat_models.openai import ChatOpenAI
from langchain.prompts import SystemMessagePromptTemplate, ChatPromptTemplate, PromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.runnables.openai_functions import OpenAIFunctionsRouter
from langchain.schema.runnable import RunnableMap
from langchain.schema.document import Document
from langchain.schema import format_document

from permchain.connection_inmemory import InMemoryPubSubConnection
from permchain.pubsub import PubSub
from permchain.topic import Topic

## Stuff Documents

Stuff Documents is simple: it is just a single chain that puts all the documents into one prompt.

In [2]:
# Default prompt for rendering one document: the template consists only of "{page_content}".
# NOTE(review): assumes format_document() fills "page_content" from the document — confirm.
DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")
def _combine_documents(
    docs,
    document_prompt=DEFAULT_DOCUMENT_PROMPT,
    document_separator="\n\n",
):
    """Render every document and join the results into a single string.

    Args:
        docs: Iterable of documents accepted by ``format_document``.
        document_prompt: Prompt passed to ``format_document`` for each document.
        document_separator: String placed between consecutive rendered documents.

    Returns:
        The rendered documents joined by ``document_separator``.
    """
    # NOTE(review): assumes format_document() returns a str — confirm.
    return document_separator.join(
        format_document(document, document_prompt) for document in docs
    )

In [77]:
# Two tiny example documents used by the chains below.
docs = [
    Document(page_content="Harrison used to work at Kensho"),
    Document(page_content="Ankush worked at Facebook"),
]

In [4]:
# "Stuff" chain: put every document into one prompt and ask the model once.
# Input: a mapping with "question" and "docs" keys.
stuff_chain = (
    {
        # Pass the question through unchanged.
        "question": lambda inputs: inputs["question"],
        # Render all documents into a single context string.
        "context": lambda inputs: _combine_documents(inputs["docs"]),
    }
    | ChatPromptTemplate.from_messages(
        [
            ("system", "Answer user questions based on the following documents:\n\n{context}"),
            ("human", "{question}"),
        ]
    )
    | ChatOpenAI()
)

In [5]:
# Expects "question" and "docs" keys (both are read by the lambdas in stuff_chain).
# Bare expression: the call's result is what the cell displays.
stuff_chain.invoke({"question": "where did harrison work", "docs": docs})

AIMessage(content='Harrison worked at Kensho.', additional_kwargs={}, example=False)

## Reduce Documents

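Reduce Documents tries to merge documents recursively: it splits the documents into groups that fit a length limit, runs the Stuff Documents chain on each group, and repeats on the resulting answers until a single group is left, which is then answered directly.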

In [78]:
# Repeat the two example documents 5 times (10 entries) so they no longer fit into one group
# under _split_list_of_docs' default max_length=70 (31 + 25 characters fit, a third does not).
# List repetition reuses the same Document objects; it does not copy them.
many_docs = docs * 5

In [86]:
def _split_list_of_docs(docs, max_length=70):
    new_result_doc_list = []
    _sub_result_docs = []
    for doc in docs:
        _sub_result_docs.append(doc)
        _num_tokens = sum([len(d.page_content) for d in _sub_result_docs])
        if _num_tokens > max_length:
            if len(_sub_result_docs) == 1:
                raise ValueError(
                    "A single document was longer than the context length,"
                    " we cannot handle this."
                )
            new_result_doc_list.append(_sub_result_docs[:-1])
            _sub_result_docs = _sub_result_docs[-1:]
    new_result_doc_list.append(_sub_result_docs)
    return new_result_doc_list

In [87]:
# Just to show what the split looks like: many_docs partitioned with the default max_length.
split_docs = _split_list_of_docs(many_docs)
split_docs

[[Document(page_content='Harrison used to work at Kensho', metadata={}),
  Document(page_content='Ankush worked at Facebook', metadata={})],
 [Document(page_content='Harrison used to work at Kensho', metadata={}),
  Document(page_content='Ankush worked at Facebook', metadata={})],
 [Document(page_content='Harrison used to work at Kensho', metadata={}),
  Document(page_content='Ankush worked at Facebook', metadata={})],
 [Document(page_content='Harrison used to work at Kensho', metadata={}),
  Document(page_content='Ankush worked at Facebook', metadata={})],
 [Document(page_content='Harrison used to work at Kensho', metadata={}),
  Document(page_content='Ankush worked at Facebook', metadata={})]]

In [88]:
# Topics that the chains below call .subscribe() / .publish() on.
# NOTE(review): what a Topic does beyond carrying this name is defined by permchain — confirm.
reduce_inbox = Topic("reduce_inbox")
# NOTE(review): finish_inbox is not referenced anywhere else in this section (decide_end uses
# Topic.OUT.publish() instead) — confirm whether it is still needed.
finish_inbox = Topic("finish_inbox")
collapse_inbox = Topic("collapse_inbox")

In [97]:
def decide_end(plan):
    """Choose the next step after the documents have been split into groups.

    Args:
        plan: Mapping with a "docs" entry; ``collapse_chain`` puts the list of
            groups produced by ``_split_list_of_docs`` there.

    Returns:
        ``reduce_inbox.publish()`` when there is more than one group (reduce one
        more step); otherwise a chain that feeds the first group and the
        question to ``stuff_chain`` and pipes the answer into
        ``Topic.OUT.publish()``.
    """
    more_than_one_group = len(plan["docs"]) > 1
    if more_than_one_group:
        return reduce_inbox.publish()

    # Only one group is left: answer from it and finish.
    single_group_inputs = {
        "docs": lambda state: state["docs"][0],
        "question": lambda state: state["question"],
    }
    return single_group_inputs | stuff_chain | Topic.OUT.publish()

# Collapse step: split the incoming documents into groups, then hand the result to
# decide_end, which picks between another reduce step and finishing.
collapse_chain = (
    collapse_inbox.subscribe()
    | RunnableMap(
        {
            "docs": lambda state: _split_list_of_docs(state["docs"]),
            "question": lambda state: state["question"],
        }
    )
    | decide_end
)

# Call the stuff chain on all elements and then pass to collapse
# TODO: the commented-out version below causes some errors, I believe because of where
# Topic.IN.current() sits in the chain (see the inner TODO).
# reduce_chain = (
#     reduce_inbox.subscribe() 
#     | (lambda x: [{"docs": d, "question": x["question"]} for d in x['docs']])
#     | stuff_chain.map() 
#     | {
#         "docs": lambda x: [Document(page_content=m.content) for m in x],
#         # TODO: this causes some errors
#         "question": Topic.IN.current() | itemgetter("question"),
#     }
#     | collapse_inbox.publish()
# )
reduce_chain = (
    reduce_inbox.subscribe()
    | {
        "docs": (
            # One stuff_chain input per group, each carrying the same question ...
            (
                lambda state: [
                    {"docs": group, "question": state["question"]}
                    for group in state["docs"]
                ]
            )
            # ... stuff_chain applied to the list of inputs via .map() ...
            | stuff_chain.map()
            # ... and each answer's content wrapped in a Document for the next collapse.
            | (lambda messages: [Document(page_content=message.content) for message in messages])
        ),
        "question": lambda state: state["question"],
    }
    | collapse_inbox.publish()
)
# Entry point: pipes Topic.IN.subscribe() straight into collapse_inbox.publish(), the topic
# that collapse_chain subscribes to.
# NOTE(review): assumes Topic.IN carries the value passed to invoke() — confirm against permchain.
start_chain = Topic.IN.subscribe() | collapse_inbox.publish()

In [98]:
# Bundle the three chains into one PubSub object backed by an in-memory connection.
reduce_agent = PubSub(
    processes=(start_chain, collapse_chain, reduce_chain),
    connection=InMemoryPubSubConnection(),
)

In [99]:
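# TODO: why does reduce_agent.invoke(...) return a list (see the output of the next cell),
# even though stuff_chain.invoke(...) returns a single AIMessage?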

In [101]:
# Run the whole reduce workflow on the 10 example documents.
# Bare expression: the call's result is what the cell displays.
reduce_agent.invoke({"question": "where did harrison work", "docs": many_docs})

[AIMessage(content='Harrison used to work at Kensho.', additional_kwargs={}, example=False)]