# Chat With Documents

In [378]:
import os
from dotenv import load_dotenv
_ = load_dotenv('../.env')

from langchain_community.document_loaders import TextLoader, PyPDFLoader
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.memory import ConversationBufferWindowMemory
from langchain.chains import LLMChain

## Split Document

In [349]:
def split_document(doc_path):
    """Load a ``.txt`` or ``.pdf`` file and split it into overlapping chunks.

    Args:
        doc_path: Path to the document. The loader is picked from the file
            extension, compared case-insensitively.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents`` on the
        loaded document, configured with ``chunk_size=1000`` and
        ``chunk_overlap=100`` (both measured with ``len``).

    Raises:
        ValueError: If the extension is neither ``.txt`` nor ``.pdf``.
    """
    ext = os.path.splitext(doc_path)[1].lower()

    if ext == '.txt':
        doc = TextLoader(doc_path).load()
    elif ext == '.pdf':
        doc = PyPDFLoader(doc_path).load()
    else:
        # Without this branch `doc` stayed None and the splitter failed later
        # with an error that said nothing about the file type.
        raise ValueError(
            f"Unsupported document type {ext!r} for {doc_path!r}; expected .txt or .pdf"
        )

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
        length_function=len,
        is_separator_regex=False
    )

    return text_splitter.split_documents(doc)

In [4]:
# Smoke test: split the monkey PDF and preview the first five chunks.
split_document("./docs/monkey.pdf")[:5]

[Document(page_content='Title: "Monkeys: Guardians of the Jungle"  \n \nIn the heart of the lush rainforests, amidst the vibrant tapestry of life, dwell the remarkable \ncreatures known as monkeys. Their antics and intelligence have captivated the imaginations of \nhumans for centuries, and their role in the delicate balance of the jungle ecosystem is \nparamount.  \n \nChapter 1: The Primate Puzzle  \nMonkeys, with their diverse species and fascinating behaviors, present an intricate puzzle for', metadata={'source': './docs/monkey.pdf', 'page': 0}),
 Document(page_content='Monkeys, with their diverse species and fascinating behaviors, present an intricate puzzle for \nscientists and enthusiasts alike. From the acrobatic Gibbons swinging through the canopy to the \nmischievous Capuchins foraging for food, each species contributes  to the rich tapestry of \nprimate life.  \n \nChapter 2: Social Structures  \nAt the core of monkey society lies intricate social structures that rival those

## Store and query document

In [350]:
# Shared embedding client: created once and reused for every collection.
embedding_function = OpenAIEmbeddings()


def store_doc_collection(documents, collection_name, persist_directory='./chroma_db'):
    """Embed ``documents`` and store them in a named Chroma collection.

    Args:
        documents: Documents to embed and store (e.g. the output of
            ``split_document``).
        collection_name: Name of the Chroma collection to write to.
        persist_directory: Directory passed to Chroma for persistence.

    Returns:
        The ``Chroma`` vector store returned by ``Chroma.from_documents``.
    """
    # Reuse the module-level client instead of constructing a new
    # OpenAIEmbeddings instance on every call.
    return Chroma.from_documents(
        documents,
        embedding=embedding_function,
        collection_name=collection_name,
        persist_directory=persist_directory,
    )

In [351]:
def query_db_collection(db, query, k=2):
    """Return the ``k`` entries of ``db`` that best match ``query``.

    Thin wrapper around ``db.similarity_search``; the result is passed back
    unchanged.
    """
    return db.similarity_search(query, k=k)

In [352]:
def split_and_store_doc(doc_path, collection_name, persist_directory='./chroma_db'):
    """Split the document at ``doc_path`` and store its chunks in Chroma.

    Args:
        doc_path: Path of the document handed to ``split_document``.
        collection_name: Name of the Chroma collection to write to.
        persist_directory: Directory in which the collection is persisted.

    Returns:
        The vector store returned by ``store_doc_collection``.
    """
    docs = split_document(doc_path)
    # Forward persist_directory: it used to be accepted here and then dropped,
    # so every collection landed in the default directory regardless.
    return store_doc_collection(docs, collection_name, persist_directory=persist_directory)

In [353]:
# setup collections
#
# Rebuilding a collection starts by dropping the persisted one. Chroma is
# expected to add to an existing collection of the same name, so skipping the
# delete would leave duplicate chunks behind. The collection is opened by name
# rather than through `walt_db`, because `walt_db` does not exist yet on a
# fresh kernel (the previous `walt_db.delete_collection()` raised NameError).
#
# The apple/monkey/relu lines stay disabled; enable them to (re)build those
# collections. The "query collections" cell below needs apple_db, monkey_db
# and relu_db to exist.
# apple_db.delete_collection()
# monkey_db.delete_collection()
# relu_db.delete_collection()
Chroma(collection_name="walt_col", persist_directory="./chroma_db").delete_collection()
# apple_db = split_and_store_doc(doc_path="./docs/apple.pdf",
#                                collection_name="apple_col")
# monkey_db = split_and_store_doc(doc_path="./docs/monkey.pdf",
#                                collection_name="monkey_col")
# relu_db = split_and_store_doc(doc_path="./docs/relu.pdf",
#                                collection_name="relu_col")
walt_db = split_and_store_doc(doc_path="./docs/WaltDisney.pdf",
                               collection_name="walt_col")

In [540]:
# query collections
# NOTE(review): apple_db, monkey_db and relu_db are only created by lines that
# are commented out in the setup cell above; make sure they exist first.
apple_hits = query_db_collection(apple_db, "how does snow white relate to apples?")
print(apple_hits, "\n\n")

monkey_hits = query_db_collection(monkey_db, "what are modern monkey threats?")
print(monkey_hits, "\n\n")

relu_hits = query_db_collection(relu_db, "what is in the conclusion section??")
print(relu_hits, "\n\n")

[Document(page_content='paintings to literary classics like "Snow White," examine the diverse ways apples have captured \nthe imagination of artists and storytellers throughout history.', metadata={'page': 1, 'source': './docs/apple.pdf'}), Document(page_content="crisps to innovative salads and savory dishes, there's a delectable apple creation for every \npalate.  \n \nChapter 5: Folklore and Symbolism", metadata={'page': 0, 'source': './docs/apple.pdf'})] 


[Document(page_content='Chapter 5: Threats and Conservation  \nDespite their resilience, monkeys face an array of threats in the modern world. Habitat loss,', metadata={'page': 0, 'source': './docs/monkey.pdf'}), Document(page_content='poaching, and disease pose significant challenges to their survival. Conservation efforts strive to \nmitigate these pressures and secure a future where monkeys con tinue to thrive in their natural', metadata={'page': 0, 'source': './docs/monkey.pdf'})] 


[Document(page_content='the conclusions of

## RetrievalQA with Memory

In [13]:
import langchain
# Switch off LangChain's global debug flag (the [chain/start] / [llm/start]
# traces); set it to True to see every chain and LLM call.
langchain.debug = False

In [14]:
from langchain_openai import ChatOpenAI
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA, ConversationalRetrievalChain

In [546]:
# ConversationBufferMemory is not imported by any earlier cell (only
# ConversationBufferWindowMemory is), so this cell raised NameError on a
# fresh kernel.
from langchain.memory import ConversationBufferMemory

# Objects shared by the RetrievalQA helpers defined below.
llm = ChatOpenAI(temperature=1)
embedding_function=OpenAIEmbeddings()
# output_key="result" tells the memory which chain output to record.
memory = ConversationBufferMemory(memory_key="chat_history", output_key="result")
# NOTE(review): relu_db must already exist; its setup line is commented out above.
retriever = relu_db.as_retriever(search_type="mmr")

In [543]:
def combine_prompt_with_history(input_prompt):
    """Ask the LLM to rewrite ``input_prompt`` in light of the chat history.

    The history is read from the module-level ``memory`` object under the
    ``"chat_history"`` key. A new ``ChatOpenAI`` client and ``LLMChain`` are
    built on every call.

    Returns:
        The ``"text"`` entry of the chain result, i.e. the rewritten prompt.
    """
    # The continuation lines keep their four leading spaces: they are part of
    # the prompt text that is sent to the model.
    rewrite_template = '''You are a prompt engineer that generates a more fitting prompt based on some input prompt and past conversation history.
    The new prompt will be used in a RetrievalQA chain which retrieves context from some insightful text document.
    Be sure to respond in only one sentence.
    Input Prompt: {input_prompt}
    History: {chat_history}
    New Prompt:'''

    rewrite_chain = LLMChain(
        llm=ChatOpenAI(),
        prompt=PromptTemplate.from_template(rewrite_template),
    )

    history = memory.load_memory_variables({})["chat_history"]
    outcome = rewrite_chain.invoke(
        {"input_prompt": input_prompt, "chat_history": history}
    )
    return outcome["text"]

In [530]:
# Try the prompt rewriter on a deliberately terse question.
combine_prompt_with_history("what relu?")

[32;1m[1;3m[chain/start][0m [1m[1:chain:LLMChain] Entering Chain run with input:
[0m{
  "input_prompt": "what relu?",
  "chat_history": ""
}
[32;1m[1;3m[llm/start][0m [1m[1:chain:LLMChain > 2:llm:ChatOpenAI] Entering LLM run with input:
[0m{
  "prompts": [
    "Human: You are a prompt engineer that generates a more fitting prompt based on some input prompt and past conversation history.\n    The new prompt will be used in a RetrievalQA chain which retrieves context from some insightful text document.\n    Be sure to respond in only one sentence.\n    Input Prompt: what relu?\n    History: \n    New Prompt:"
  ]
}
[36;1m[1;3m[llm/end][0m [1m[1:chain:LLMChain > 2:llm:ChatOpenAI] [614ms] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "Can you explain the role of ReLU activation function in neural networks?",
        "generation_info": {
          "finish_reason": "stop",
          "logprobs": null
        },
        "type": "ChatGeneratio

'Can you explain the role of ReLU activation function in neural networks?'

In [544]:
def retrieval_prompt(input_prompt):
    """Answer ``input_prompt`` with a RetrievalQA chain that also sees the chat history.

    The question is first rewritten by ``combine_prompt_with_history`` (the
    rewrite is printed), then a "stuff" RetrievalQA chain is built around the
    module-level ``llm``, ``retriever`` and ``memory`` and invoked with the
    original question.

    Args:
        input_prompt: The user's question.

    Returns:
        The ``"result"`` entry of the chain output (the answer text).
    """
    def _literal(text):
        # History and the rewritten question are pasted into text that
        # PromptTemplate parses afterwards. Doubling the braces keeps any
        # "{" or "}" in them literal instead of becoming a template variable.
        return str(text).replace("{", "{{").replace("}", "}}")

    better_prompt = combine_prompt_with_history(input_prompt)
    print(better_prompt)
    chat_history = memory.load_memory_variables({})["chat_history"]
    # Only {context} and {question} remain as template variables; they are
    # filled in by the chain.
    question_template = f"""Use the following chat history and pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible.
    History: {_literal(chat_history)}
    Context: {{context}}

    Question: {{question}}
    Question Reframed: {_literal(better_prompt)}
    Helpful Answer:"""
    question_prompt_template = PromptTemplate.from_template(template=question_template)

    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        return_source_documents=True,
        memory=memory,
        chain_type="stuff",
        chain_type_kwargs={
            "prompt": question_prompt_template
        }
    )
    result = qa_chain.invoke({"query": input_prompt})["result"]
    return result

In [549]:
# End-to-end check of the RetrievalQA path; the rewritten question is printed
# before the answer is returned.
retrieval_prompt("what is softmax?")

[32;1m[1;3m[chain/start][0m [1m[1:chain:LLMChain] Entering Chain run with input:
[0m{
  "input_prompt": "what is softmax?",
  "chat_history": ""
}
[32;1m[1;3m[llm/start][0m [1m[1:chain:LLMChain > 2:llm:ChatOpenAI] Entering LLM run with input:
[0m{
  "prompts": [
    "Human: You are a prompt engineer that generates a more fitting prompt based on some input prompt and past conversation history.\n    The new prompt will be used in a RetrievalQA chain which retrieves context from some insightful text document.\n    Be sure to respond in only one sentence.\n    Input Prompt: what is softmax?\n    History: \n    New Prompt:"
  ]
}
[36;1m[1;3m[llm/end][0m [1m[1:chain:LLMChain > 2:llm:ChatOpenAI] [610ms] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "Can you explain the mathematical formula behind softmax activation function?",
        "generation_info": {
          "finish_reason": "stop",
          "logprobs": null
        },
        "type"

'Softmax is a mathematical function that converts a vector of real values into a probability distribution. It is calculated by taking the exponential of each element in the input vector and then normalizing these values to sum up to 1. The formula for softmax activation function is: softmax(x_i) = e^(x_i) / Σ(e^(x_j)) where x_i is the i-th element of the input vector.'

In [508]:
# Inspect what the conversation memory currently holds.
memory.load_memory_variables({})

{'relu_history': "Human: what softmax?\nAI: Softmax is a function used in machine learning to convert the output of a neural network into probabilities. It is commonly used in classification tasks to determine the most likely class for a given input. Softmax ensures that the output probabilities sum up to 1, making it easier to interpret the results.\nHuman: list bullet points why\nAI: Softmax is crucial in classification tasks as it converts neural network outputs into probabilities, making it easier to interpret and determine the most likely class for a given input. It ensures that the output probabilities sum up to 1, providing a clear and reliable way to assess the model's predictions. Additionally, softmax allows for a more robust and accurate classification process by optimizing the model's ability to assign probabilities to different classes.\nHuman: list it in bullet point form\nAI: - Softmax converts neural network outputs into probabilities, aiding in interpreting and determi

## Chat with document

In [354]:
def generate_query(user_input, memory):
    """Turn ``user_input`` into a retrieval query, using ``memory`` for context.

    Args:
        user_input: The user's (possibly vague) question.
        memory: Conversation memory; its ``"chat_history"`` variable is
            inserted into the prompt.

    Returns:
        The ``"text"`` entry of the chain result, i.e. the rewritten query.
    """
    # The continuation lines keep their four leading spaces: they are part of
    # the prompt text that is sent to the model.
    query_template = '''You will be provided with an initial prompt and some chat history. If the initial prompt is too vague, add more context to it with the help of the chat history.
    be sure that the new prompt is concise in one sentence, to the point, and highly relevant to be used in querying information from an insightful text document.
    Input Prompt: {user_input}
    History: {chat_history}
    New Prompt:'''

    query_chain = LLMChain(
        llm=ChatOpenAI(),
        prompt=PromptTemplate.from_template(query_template),
    )

    history = memory.load_memory_variables({})["chat_history"]
    outcome = query_chain.invoke({"user_input": user_input, "chat_history": history})
    return outcome["text"]

def generate_context(db, query, k=4):
    """Build a context string from the chunks of ``db`` most similar to ``query``.

    Args:
        db: Vector store exposing ``similarity_search(query, k=...)``.
        query: Text to search for.
        k: Number of chunks to retrieve. Defaults to 4, the value that was
            previously hard-coded.

    Returns:
        The ``page_content`` of the retrieved chunks joined with newlines.
    """
    matches = db.similarity_search(query, k=k)
    return '\n'.join(doc.page_content for doc in matches)
    
def generate_final_response(context, user_input, memory):
    """Ask the LLM to answer ``user_input`` from ``context`` and the chat history.

    Args:
        context: Retrieved text to ground the answer.
        user_input: The user's original question.
        memory: Conversation memory; its ``"chat_history"`` variable is
            inserted into the prompt.

    Returns:
        The full chain result (a mapping); the answer itself is under ``"text"``.
    """
    # The whitespace inside the template, including the indented blank lines,
    # is part of the prompt text.
    answer_template = """Use the following chat history and pieces of context to give a helpful answer at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible.
    
    History: {chat_history}
    
    Context: {context}

    Question: {user_input}
    
    Helpful Answer:"""

    answer_chain = LLMChain(
        llm=ChatOpenAI(),
        prompt=PromptTemplate.from_template(answer_template)
    )

    history = memory.load_memory_variables({})["chat_history"]
    return answer_chain.invoke(
        {"chat_history": history, "context": context, "user_input": user_input}
    )

def prompt_db_doc(db, user_input, memory, verbose=0):
    """Answer ``user_input`` from ``db`` and record the exchange in ``memory``.

    Steps: rewrite the question into a retrieval query, fetch context for it,
    generate the answer, then save the question/answer pair to ``memory``.

    Args:
        db: Vector store to retrieve context from.
        user_input: The user's question.
        memory: Conversation memory, read for history and updated afterwards.
        verbose: When equal to 1, print the query, the context and the raw
            chain result.

    Returns:
        The answer text.
    """
    retrieval_query = generate_query(user_input, memory)
    retrieved_context = generate_context(db, retrieval_query)
    llm_result = generate_final_response(retrieved_context, user_input, memory)
    answer = llm_result["text"]
    memory.save_context({"input": user_input}, {"output": answer})
    if verbose == 1:
        debug_items = (
            ("Query", retrieval_query),
            ("Context", retrieved_context),
            ("Invoke", llm_result),
        )
        for label, value in debug_items:
            print(f"{label}: {value}\n")
    return answer

In [367]:
# Windowed chat memory (k=3), exposed to the prompts as "chat_history".
memory = ConversationBufferWindowMemory(memory_key="chat_history", k=3)
# Seed the conversation with the document summary so follow-up questions have a subject.
# NOTE(review): `summary` is assigned in the "Document summarization" section
# further down, so a fresh top-to-bottom run fails here with NameError unless
# that section is executed first.
memory.save_context({"input": "What is this document about?"}, {"output": summary})

In [368]:
# "he" is never named in the question; generate_query has to resolve it from
# the chat history held in `memory`. verbose=1 prints the intermediate steps.
prompt_db_doc(walt_db, "why was he accused of being antisemetic and racist?", memory, verbose=1)

Query: What evidence supports the accusations of antisemitism and racism against Walt Disney?

Context: in 1944, that was rumored to have anti-Semitic undertones. Gabler
concludes that "...though Walt himself, in my estimation, was not anti-
Semitic, nevertheless, he willingly allied himself with people who were anti-
Semitic, and that reputation stuck. He was never really able to expung e it
throughout his life."[195] Disney distanced himself from the Motion Picture
Alliance, and had no involvement with the organization after 1947.[196]
According to Disney's daughter Diane Disney-Miller , her sister Sharon
dated a Jewish boyfriend for a perio d of time, to which her father raised no
objections and even reportedly said, "Sharon, I think it's wonderful how
these Jewish families have accepted you."[172]
Disney has also been accused of other forms of racism because some of his
productions released between the 1930s and 1950s contain racially
insensitive material.[197][ac] Gabler argues th

"Walt Disney was accused of being anti-Semitic and racist due to his association with the Motion Picture Alliance for the Preservation of American Ideals, which was rumored to have anti-Semitic undertones. Additionally, some of Disney's productions released between the 1930s and 1950s contained racially insensitive material, leading to accusations of racism. However, available evidence suggests that Disney himself was not anti-Semitic, and he had Jewish employees in influential positions."

In [369]:
# Vague follow-up: relies on the history-aware query rewrite to supply the subject.
prompt_db_doc(walt_db, "what about the good he did?", memory, verbose=1)

Query: What were the lasting impacts of Walt Disney's creations on American culture and the entertainment industry?

Context: Crowther  argu es that Disne y's "achievement as a creator of entertainment
for an almost unlimited public and as a highly ingenious merchandiser of
his wares  can rightly be compared to the most successful industrialists  in
history."[5] Correspo ndent Alistair Cooke  calls Disney a "folk-hero ... the
Pied Piper of Holly wood",[213] while Gabler cons iders Disney "reshaped the
culture and the American consciousness".[214] In the American Dictionary
of National Biography , Langer writes:
Disney remains the central figure in the history of animation.
Through technological innovations and alliances with
governments and corporations, he transformed a minor studio
in a marginal form of communication into a multinational
leisure industry giant. Despite his critics, his vision of a modern,
corporate utopia as an extension of traditional American values
has possibly ga

"Walt Disney is credited with transforming the animation industry and creating iconic characters that have had a lasting impact on American culture. His technological innovations and entertainment ventures have brought joy to audiences worldwide, and his vision of a modern, corporate utopia aligned with traditional American values continues to influence popular culture today. Despite criticisms and controversies, Disney's work is widely recognized for its entertainment value and cultural significance."

In [370]:
# Another pronoun-only question, asked against the same `memory`.
prompt_db_doc(walt_db, "did he have a rough childhood?", memory, verbose=1)

Query: Was Walt Disney's childhood marked by adversity and hardship?

Context: Disney's childhood homeDisney was a shy, self-deprecating and insecure man in private but adopted
a warm and outg oing public persona. He had high standards and high
expectations of those with whom he worked. Although there have been
accusations that he was racist  or antisemitic , they have been contradicted
by many who knew him. Historiography of Disney has taken a variety of
perspectives, rang ing from views of him as a purveyor of homely patriotic
values  to being a repres entative of American cultural imperialism . Wid ely
considered to be one of the most influential cultural figures of the 20th
century, Disney remains an important presence in the history of animation
and in the cultural history of the United States , where he is acknowledged
as a national cultural icon . His film work continues to be shown and
adapted, the Disney theme parks have grown in size and number to attract
visitors in several 

"Yes, Walt Disney had a challenging childhood, marked by financial struggles and frequent moves. His family's hardships influenced his work ethic and creativity, shaping his future success in the animation industry. Despite facing adversity, Disney's resilience and imagination helped him overcome obstacles and achieve his dreams."

In [372]:
# "that" has no referent in the question itself; the rewritten query must take
# it from the turns held in `memory`.
prompt_db_doc(walt_db, "how much did that cost?", memory, verbose=1)

Query: What was the cost of Walt Disney's technological innovations and entertainment ventures?

Context: distance the project from the studio  — which might attract the criticism of
shareholders — Disney formed WED Enterprises (now Walt Disney
Imagineering ) and used his own money to fund a group of designers and
animators to work on the plans;[107][108] those involved became known as
"Imagineers".[109] After obtaining bank funding  he invited other
stockholders, American Broadcasting-Paramount Theatres — part of
American Broadc asting Company  (ABC) — and Western Printing  and
Lithographing Company .[58] In mid-1954, Disney sent his Imagineers to
every amusement park in the U.S. to analyze what worked and what pitfa lls
or proble ms there were in the various locations and incorporated their
findings into his design.[110] Construc tion work started in July 1954, and
Disneyland  opened in July 1955; the opening ceremony was broadcast on
ABC, which reached 70 million viewers.[111] The p

'Walt Disney used his own money to initially fund the development of Disneyland in the 1950s, with additional support from bank funding and other stockholders such as ABC and Western Printing and Lithographing Company. The exact cost is not specified in the provided context.'

In [373]:
# Elliptical follow-up ("its") that only makes sense given the previous turn.
prompt_db_doc(walt_db, "and its earnings?", memory, verbose=1)

Query: What was the financial impact of Walt Disney funding the development of Disneyland in the 1950s?

Context: distance the project from the studio  — which might attract the criticism of
shareholders — Disney formed WED Enterprises (now Walt Disney
Imagineering ) and used his own money to fund a group of designers and
animators to work on the plans;[107][108] those involved became known as
"Imagineers".[109] After obtaining bank funding  he invited other
stockholders, American Broadcasting-Paramount Theatres — part of
American Broadc asting Company  (ABC) — and Western Printing  and
Lithographing Company .[58] In mid-1954, Disney sent his Imagineers to
every amusement park in the U.S. to analyze what worked and what pitfa lls
or proble ms there were in the various locations and incorporated their
findings into his design.[110] Construc tion work started in July 1954, and
Disneyland  opened in July 1955; the opening ceremony was broadcast on
ABC, which reached 70 million viewers.[11

'The exact earnings of Disneyland are not specified in the provided context.'

In [374]:
# Open-ended follow-up; verbose=1 shows the rewritten query and the retrieved context.
prompt_db_doc(walt_db, "what else did he make?", memory, verbose=1)

Query: What other projects or ventures did Walt Disney invest in or create?

Context: live-action films  followed  after World War II, including the critically
successful Cinderella  (1950), Sleeping Beauty  (1959) and Mary Poppins
(1964), the last of which received five Academy Awards.
In the 1950s, Disn ey expanded into the amusement park  industry, and in
July 1955 he opened Disneyland  in Anaheim, California . To fund the proje ct
he diversified into television programs, such as Walt Disney's Disneyland
and The Mick ey Mous e Club . He was also involved in planning the 1959
Moscow Fair, the 1960 Win ter Olympics , and the 1964 New  York World's
Fair. In 1965,  he began development of another theme park, Disney World ,
the heart of which was to be a new type of city, the "Experimental Prototype
Community of Tomorrow " (EPCOT). Disney was a heavy smoker
throughout his life and died of lung cancer in 1966 before either the park or
the EPCOT project were completed.
distance the project

'Walt Disney also developed plans for a ski resort.'

In [375]:
# verbose=0: nothing is printed, only the answer text is returned.
prompt_db_doc(walt_db, "how did his journey begin?", memory, verbose=0)

"Disney's journey began with the formation of WED Enterprises (now Walt Disney Imagineering) using his own money to fund a group of designers and animators to work on plans for Disneyland. He later obtained bank funding and invited other stockholders to support the project. The development of Disneyland began in the mid-1950s after research and analysis of existing amusement parks in the U.S."

In [28]:
# This question names its subject explicitly, unlike the pronoun-based follow-ups.
prompt_db_doc(walt_db, "did walt disney die?", memory, verbose=0)

'Yes, Walt Disney died on December 15, 1966, at the age of 65 in Burbank, California, U.S.'

In [29]:
# Pronoun follow-up to the previous question; verbose=1 prints the rewritten
# query and the retrieved context.
prompt_db_doc(walt_db, "how did he die?", memory, verbose=1)

Query: How did Walt Disney die?

Context: -article-1.2381618)  on May 21, 2016 . Retrieved May 21,  2016 .
158. "Walt Disney dies of cancer at 65"  (https://news.google.com/newspaper
s?id=D7peAAAAIBAJ&pg=4762%2C3438544) . Lewiston Morning
Tribune . (Idaho). Associated Press. December 16, 1966. p. 1.
159. Gabler 2006 , p. 544.
160. Watts 2013 , p. 352.
161. Barrier 2007 , pp. 102, 131.
162. Mosley 1990 , p. 169; Gabler 2006 , p. 280.
163. Thomas 1994 , p. 196; Watts 2013 , p. 352.
164. Broggie 2006 , pp. 7, 109.
-article-1.2381618)  on May 21, 2016 . Retrieved May 21,  2016 .
158. "Walt Disney dies of cancer at 65"  (https://news.google.com/newspaper
s?id=D7peAAAAIBAJ&pg=4762%2C3438544) . Lewiston Morning
Tribune . (Idaho). Associated Press. December 16, 1966. p. 1.
159. Gabler 2006 , p. 544.
160. Watts 2013 , p. 352.
161. Barrier 2007 , pp. 102, 131.
162. Mosley 1990 , p. 169; Gabler 2006 , p. 280.
163. Thomas 1994 , p. 196; Watts 2013 , p. 352.
164. Broggie 2006 , pp. 7, 109.
Disney h

'Walt Disney died of lung cancer on December 15, 1966, at the age of 65, due to circulatory collapse caused by the cancer.'

In [376]:
# Pronoun-only question; the subject has to come from the chat history.
prompt_db_doc(walt_db, "did he have family?", memory, verbose=1)

Query: did Walt Disney have any family members involved in his journey to create Disneyland?

Context: town to an attraction.[151] At the inauguration in 1971, Roy dedicated Walt
Disney World to his brother.[152][v] Walt Disney Wor ld expanded with the
opening of Epcot Center  in 1982;  Walt Disney's vision of a functional city
was replaced by a park more akin to a permanent world's fair.[154] In 2009,
the Walt Disney Family Museum, designed by Disney's daughter Diane and
her son Walter E. D. Miller, opened  in the Presidio of San Francisco .[155]
Thousands of artifacts from Disney's life and career are on display,
including numerous awards that he received.[156] In 2014, the Disne y theme
parks around the world hosted approximately 134 million visitors.[157]
Early in 1925, Disney hired an ink
artist, Lillian Bounds . They married in
July of that year, at her brothe r's
house in her home town of Lewiston,
Idaho .[158] The marriage was generally
happy, according to Lillian, although
acc

'Yes, Walt Disney had a wife named Lillian Bounds and two daughters, Diane and Sharon. They lived in Los Angeles and kept their daughters out of the public eye as much as possible.'

In [377]:
# The question has no subject at all; it is answerable only if the query
# rewrite recovers one from the previous turns.
prompt_db_doc(walt_db, "both adopted or just one?", memory, verbose=1)

Query: Were both of Walt Disney's daughters adopted or just one?

Context: Disney family at Schiphol Airport
(1951)henpecked he is'."[159][w] Lillian had little interest in films or the Hollywood
social scene and she was, in the words of the historian Steven Watts,
"content with household management and providing support for her
husband".[160] Their marriage produced two daughters, Diane  (born
December 1933) and Sharon (adopted in December 1936, born six weeks
previously).[161][x] Within the famil y, neither Disney nor his wife hid the
fact Sharon had been adopted, although they became annoyed if people
outside the family raised the point.[162] The Disneys were  careful to keep
their daughters out of the public eye as much as possible, particularly in the
light of the Lindbergh kidnap ping ; Disney took steps to ensure his
daughters were not photographed by the press.[163]
In 1949, Disney and his family move d to
a new home in the Holmby Hills  district
of Los Angeles. With the help o

"One of Disney's daughters, Sharon, was adopted, while the other daughter, Diane, was not adopted."

## Document summarization

In [334]:
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders.merge import MergedDataLoader
from langchain.chains.combine_documents.reduce import collapse_docs
from langchain.docstore.document import Document
from pypdf import PdfReader

In [335]:
# Extract the text of every page and join the pages with newlines.
reader = PdfReader("./docs/WaltDisney.pdf")
text = '\n'.join([page.extract_text() for page in reader.pages])
# Preview the first 100 characters only; displaying `text` itself dumps the
# whole PDF into the cell output.
text[:100]



In [336]:
# Wrap the extracted text in a single Document tagged with its source path.
doc = Document(page_content=text, metadata={"source": "./docs/WaltDisney.pdf"})
# Preview the first 100 characters only; displaying `doc` itself dumps the
# whole document into the cell output.
doc.page_content[:100]



In [337]:
# Chunks for summarization: chunk_size=5000 (the Q&A splitter in
# split_document uses 1000), overlap of 100, sizes measured with len.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=5000,
    chunk_overlap=100,
    length_function=len,
    is_separator_regex=False
)

# The whole PDF is a single Document, hence the one-element list.
documents = text_splitter.split_documents([doc])

In [338]:
# Number of chunks produced by the splitter.
len(documents)

20

In [339]:
from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain
from langchain_text_splitters import CharacterTextSplitter

# temperature=0 model for the summarization chains.
# NOTE: this rebinds the module-level `llm` (created earlier with
# temperature=1) that retrieval_prompt reads.
llm = ChatOpenAI(temperature=0)

# Map
# Prompt for the map step; {docs} is the variable the chain fills with
# document text.
map_template = """The following is a set of documents
{docs}
Based on this list of docs, please identify generally what the document is about
Helpful Answer:"""
map_prompt = PromptTemplate.from_template(map_template)
map_chain = LLMChain(llm=llm, prompt=map_prompt)

In [340]:
# Reduce
# Prompt for the reduce step; {docs} receives the summaries produced by the
# map step.
reduce_template = """The following is an unordered set of summaries based on parts of a document:
{docs}
Take these and write a summary of the document.
Helpful Answer:"""
reduce_prompt = PromptTemplate.from_template(reduce_template)

In [341]:
# Run chain
# StuffDocumentsChain is otherwise imported only in the later "End to end"
# cell, so this cell raised NameError when the notebook ran top to bottom.
from langchain.chains.combine_documents.stuff import StuffDocumentsChain

# LLM call that turns a batch of partial summaries into one summary.
reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)

# Takes a list of documents, combines them into a single string, and passes this to an LLMChain
combine_documents_chain = StuffDocumentsChain(
    llm_chain=reduce_chain, document_variable_name="docs"
)

# Combines and iteratively reduces the mapped documents
reduce_documents_chain = ReduceDocumentsChain(
    # This is final chain that is called.
    combine_documents_chain=combine_documents_chain,
    # If documents exceed context for `StuffDocumentsChain`
    collapse_documents_chain=combine_documents_chain,
    # The maximum number of tokens to group documents into.
    token_max=4000,
)

In [342]:
# Combining documents by mapping a chain over them, then combining results
# (map_chain is applied first, reduce_documents_chain merges its outputs).
map_reduce_chain = MapReduceDocumentsChain(
    # Map chain
    llm_chain=map_chain,
    # Reduce chain
    reduce_documents_chain=reduce_documents_chain,
    # The variable name in the llm_chain to put the documents in
    document_variable_name="docs",
    # Return the results of the map steps in the output
    # (disabled here: only the final summary is returned)
    return_intermediate_steps=False,
)

In [343]:
# NOTE: despite the name, this is not a random sample; it is simply the first
# five chunks of the document.
rand_selection = documents[:5]
rand_selection

[Document(page_content='Walt Disney\nDisney in 1946\nBorn December 5, 1901\nChicago, Illinois, U.S.\nDied December 15, 1966\n(aged 65)\nBurbank, California,\nU.S.\nOccupationsAnimator · film producer\n· voice actor ·\nWalt Disney\n(Redirected from Walt disney )\nWalter Elias Disney  (/ ˈd ɪzni/;[2]\nDecember 5, 1901 – December 15,\n1966) was an American animator,\nfilm producer, and entrepreneur. A\npioneer of the American animation\nindustry , he introduced  several\ndevelopments in the production of\ncartoons . As a film producer, he\nholds the record for most Academy\nAwards  earned and nominations by\nan individ ual. He was presented with\ntwo Golden Globe  Special\nAchievement Awards and an Emmy\nAward , among other honors. Several\nof his films are included in the\nNational Film Registry  by the\nLibrary of Congress  and have also\nbeen nam ed as some of the greatest\nfilms ever  by the American Film\nInstitute .\nBorn in Chicago in 1901, Disney\ndeveloped an early interest in\nd

In [344]:
# Map-reduce summary of the selected chunks. `summary` is also what seeds the
# chat memory in the "Chat with document" section.
summary = map_reduce_chain.run(rand_selection)

In [345]:
# Display the generated summary.
summary

"The document provides a comprehensive overview of Walt Disney's life and career, detailing his early years, achievements in animation, film production, and entrepreneurship. It covers his creation of iconic characters like Mickey Mouse, the challenges faced in the animation industry, and the establishment of The Walt Disney Company. Additionally, it discusses Disney's involvement in World War II, financial struggles, and his ventures in television and theme parks, highlighting his lasting impact on American culture and the entertainment industry."

# End to end

In [13]:
import os
from dotenv import load_dotenv
_ = load_dotenv('../.env')

from langchain_community.document_loaders import TextLoader, PyPDFLoader
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.memory import ConversationBufferWindowMemory
from langchain.chains import LLMChain

from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders.merge import MergedDataLoader
from langchain.chains.combine_documents.reduce import collapse_docs
from langchain.docstore.document import Document
from pypdf import PdfReader

from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain
from langchain_text_splitters import CharacterTextSplitter
from langchain_core.prompts.prompt import PromptTemplate
from langchain.chains.combine_documents.stuff import StuffDocumentsChain

## 1 - PDF to Document

In [2]:
# Extract the text of every page and join the pages with newlines.
reader = PdfReader("./docs/WaltDisney.pdf")
text = '\n'.join([page.extract_text() for page in reader.pages])
# Preview only the first 100 characters rather than the whole text.
text[:100]

'Walt Disney\nDisney in 1946\nBorn December 5, 1901\nChicago, Illinois, U.S.\nDied December 15, 1966\n(age'

In [3]:
# Wrap the extracted text in a single Document tagged with its source path.
doc = Document(page_content=text, metadata={"source": "./docs/WaltDisney.pdf"})
# Preview only the first 100 characters of the content.
doc.page_content[:100]

'Walt Disney\nDisney in 1946\nBorn December 5, 1901\nChicago, Illinois, U.S.\nDied December 15, 1966\n(age'

In [4]:
def pdf_to_document(pdf_path):
    """Load a PDF and return its full text as a single ``Document``.

    Args:
        pdf_path: Path to the PDF file. It is also stored under
            ``metadata["source"]`` of the returned document.

    Returns:
        A ``Document`` whose ``page_content`` is the extracted text of every
        page, in page order, joined with newlines.
    """
    page_texts = []
    for page in PdfReader(pdf_path).pages:
        page_texts.append(page.extract_text())

    return Document(
        page_content='\n'.join(page_texts),
        metadata={"source": pdf_path},
    )

In [5]:
# Rebuild the Document through the helper; this rebinds `doc`, which the later cells split and summarise.
doc = pdf_to_document("./docs/WaltDisney.pdf")
# NOTE(review): a bare `doc` displays the whole Document; a preview such as doc.page_content[:100] would be lighter.
doc



## 2 - Generate Summary

In [6]:
# Splitter for the summarisation step: chunk_size=3000 and chunk_overlap=100,
# both measured in characters because length_function is len.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=3000,
    chunk_overlap=100,
    length_function=len,
    is_separator_regex=False
)

# Split the single PDF Document into a list of chunk Documents.
documents = text_splitter.split_documents([doc])

In [7]:
# Number of chunks produced by the split.
len(documents)

34

In [14]:
# One chat model (temperature=0) shared by the map and the reduce steps.
llm = ChatOpenAI(temperature=0)

# Map: prompt used by the map chain; {docs} is the variable the chunk text is placed in.
map_template = """The following is a set of documents
{docs}
Based on this list of docs, please identify generally what the document is about
Helpful Answer:"""
map_prompt = PromptTemplate.from_template(map_template)
map_chain = LLMChain(llm=llm, prompt=map_prompt)

# Reduce: prompt that turns the per-chunk summaries into one summary of the document.
reduce_template = """The following is an unordered set of summaries based on parts of a document:
{docs}
Take these and write a summary of the document.
Helpful Answer:"""
reduce_prompt = PromptTemplate.from_template(reduce_template)

# Build (not run) the LLM chain for the reduce step.
reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)

# Takes a list of documents, combines them into a single string, and passes this to an LLMChain
combine_documents_chain = StuffDocumentsChain(
    llm_chain=reduce_chain, document_variable_name="docs"
)

# Combines and iteratively reduces the mapped documents
reduce_documents_chain = ReduceDocumentsChain(
    # This is final chain that is called.
    combine_documents_chain=combine_documents_chain,
    # If documents exceed context for `StuffDocumentsChain`
    # (the same stuff chain is reused for the collapse step)
    collapse_documents_chain=combine_documents_chain,
    # The maximum number of tokens to group documents into.
    token_max=4000,
)

# Combining documents by mapping a chain over them, then combining results
map_reduce_chain = MapReduceDocumentsChain(
    # Map chain
    llm_chain=map_chain,
    # Reduce chain
    reduce_documents_chain=reduce_documents_chain,
    # The variable name in the llm_chain to put the documents in
    document_variable_name="docs",
    # Do not include the results of the map steps in the output
    return_intermediate_steps=False,
)

In [15]:
# Summarise only the first five chunks rather than the whole document.
docs_to_summarize = documents[:5]

In [16]:
# Run the map-reduce summarisation over the selected chunks and display the result.
summary = map_reduce_chain.invoke(docs_to_summarize)
summary

  warn_deprecated(


"The document provides a comprehensive look at the life and career of Walt Disney, detailing his early interests in drawing, the founding of The Walt Disney Company, the development of iconic characters like Mickey Mouse, and the success of his animated films. It also covers Disney's expansion into the amusement park industry with Disneyland, his personal traits, and his involvement in various projects and awards. Additionally, it discusses Disney's struggles and successes in starting his own businesses, his experimentation with animation techniques, and the establishment of Laugh-O-Gram Studio. The document also delves into the challenges faced by Disney and his studio during the creation and early success of Mickey Mouse, including the introduction of synchronized sound in animation, collaborations with distributors, and the introduction of new cartoon characters."

In [17]:
def summarize_doc(doc, chunk_size=3000, chunk_overlap=100, max_chunks=5, token_max=4000):
    """Summarise a document with a map-reduce chain over its chunks.

    The document is split into character-based chunks, each selected chunk is
    described by the map prompt, and the per-chunk results are merged by the
    reduce prompt into one summary.

    Args:
        doc: The ``Document`` to summarise.
        chunk_size: Chunk size passed to ``RecursiveCharacterTextSplitter``
            (measured with ``len``).
        chunk_overlap: Overlap between consecutive chunks, passed to the
            splitter.
        max_chunks: How many leading chunks are summarised. The default of 5
            keeps the original behaviour; pass ``None`` to summarise every
            chunk of the document.
        token_max: ``token_max`` passed to ``ReduceDocumentsChain``.

    Returns:
        The ``'output_text'`` entry of the map-reduce chain's result.

    Raises:
        ValueError: If ``max_chunks`` is given and is smaller than 1.
    """
    if max_chunks is not None and max_chunks < 1:
        raise ValueError("max_chunks must be a positive integer or None")

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False
    )

    documents = text_splitter.split_documents([doc])

    # One chat model (temperature=0) shared by the map and the reduce steps.
    llm = ChatOpenAI(temperature=0)

    # Map: prompt used by the map chain; {docs} receives the chunk text.
    # NOTE: the continuation lines of both templates carry the function's
    # indentation inside the string; the prompt text is left unchanged.
    map_template = """The following is a set of documents
    {docs}
    Based on this list of docs, please identify generally what the document is about
    Helpful Answer:"""
    map_prompt = PromptTemplate.from_template(map_template)
    map_chain = LLMChain(llm=llm, prompt=map_prompt)

    # Reduce: prompt that merges the per-chunk summaries into one summary.
    reduce_template = """The following is an unordered set of summaries based on parts of a document:
    {docs}
    Take these and write a summary of the document.
    Helpful Answer:"""
    reduce_prompt = PromptTemplate.from_template(reduce_template)

    # Build the LLM chain for the reduce step.
    reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)

    # Takes a list of documents, combines them into a single string, and passes this to an LLMChain
    combine_documents_chain = StuffDocumentsChain(
        llm_chain=reduce_chain, document_variable_name="docs"
    )

    # Combines and iteratively reduces the mapped documents
    reduce_documents_chain = ReduceDocumentsChain(
        # This is final chain that is called.
        combine_documents_chain=combine_documents_chain,
        # If documents exceed context for `StuffDocumentsChain`
        collapse_documents_chain=combine_documents_chain,
        # The maximum number of tokens to group documents into.
        token_max=token_max,
    )

    # Combining documents by mapping a chain over them, then combining results
    map_reduce_chain = MapReduceDocumentsChain(
        # Map chain
        llm_chain=map_chain,
        # Reduce chain
        reduce_documents_chain=reduce_documents_chain,
        # The variable name in the llm_chain to put the documents in
        document_variable_name="docs",
        # Do not include the results of the map steps in the output
        return_intermediate_steps=False,
    )

    # A slice bound of None keeps every chunk, so max_chunks=None means "all".
    docs_to_summarize = documents[:max_chunks]

    summary = map_reduce_chain.invoke(docs_to_summarize)
    return summary['output_text']

In [18]:
# Summarise the document with the helper; this rebinds `summary` to the text the helper returns.
summary = summarize_doc(doc)

In [23]:
# Display the generated summary.
summary

"The document provides a comprehensive look at the life and career of Walt Disney, focusing on his childhood, early influences, and the establishment of his animation studio. It covers his struggles and successes, including the creation of iconic characters like Mickey Mouse and the development of animated films. The document also delves into Disney's personal life, including his marriage and battle with lung cancer. Overall, it offers a detailed biography of Walt Disney and his significant contributions to the entertainment industry."

## 3 - Build Chatbot

In [24]:
# Re-split the document into smaller chunks (chunk_size=1000, chunk_overlap=100) for retrieval.
# This rebinds `text_splitter` and `documents` used earlier for summarisation.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
    is_separator_regex=False
)

documents = text_splitter.split_documents([doc])
# Number of retrieval chunks.
len(documents)

109

In [25]:
# Initialise the vector-store handle so the next cell's existing-collection check works on a fresh kernel.
db = None

In [26]:
# Embed the retrieval chunks and index them in a Chroma collection.
embedding_function = OpenAIEmbeddings()
collection_name = "default_collection"

# When this cell is re-run, drop the collection created by the previous run
# before building a new one.
if db:
    db.delete_collection()

# Reuse the embedding client created above instead of constructing a second one.
db = Chroma.from_documents(documents,
                           embedding=embedding_function,
                           collection_name=collection_name)

In [27]:
# Count the ids stored in the collection (expected to equal the number of chunks indexed).
len(db.get()["ids"])

109

In [28]:
def generate_query(user_input, memory):
    """Rewrite the user's message into a retrieval query using the chat history.

    Args:
        user_input: The user's latest message.
        memory: Conversation memory object; the value under
            ``load_memory_variables({})["chat_history"]`` is passed to the
            prompt as the history.

    Returns:
        The ``"text"`` entry of the LLM chain's result.
    """
    template = '''You will be provided with an initial prompt and some chat history. If the initial prompt is too vague, add more context to it with the help of the chat history.
    be sure that the new prompt is concise in one sentence, to the point, and highly relevant to be used in querying information from an insightful text document.
    Input Prompt: {user_input}
    History: {chat_history}
    New Prompt:'''

    rewrite_prompt = PromptTemplate.from_template(template)
    rewrite_chain = LLMChain(llm=ChatOpenAI(), prompt=rewrite_prompt)

    # Pull the stored conversation and hand it to the chain with the new message.
    history = memory.load_memory_variables({})["chat_history"]
    chain_inputs = {"user_input": user_input, "chat_history": history}
    result = rewrite_chain.invoke(chain_inputs)
    return result["text"]

def generate_context(db, query, k=4):
    """Retrieve the chunks most similar to ``query`` and join them into one string.

    Args:
        db: Vector store exposing ``similarity_search(query, k=...)``.
        query: Text to search for.
        k: Number of chunks to retrieve. Defaults to 4, the value that was
            previously hard-coded.

    Returns:
        The ``page_content`` of the retrieved documents, in the order the
        store returned them, joined with newlines.
    """
    matches = db.similarity_search(query, k=k)
    context = '\n'.join(match.page_content for match in matches)
    return context
    
def generate_final_response(context, user_input, memory):
    """Answer the user's question from the retrieved context and the chat history.

    Args:
        context: Retrieved text placed in the prompt's ``{context}`` slot.
        user_input: The user's question.
        memory: Conversation memory object; the value under
            ``load_memory_variables({})["chat_history"]`` fills the
            ``{chat_history}`` slot.

    Returns:
        The full result returned by the LLM chain's ``invoke`` (callers read
        its ``"text"`` entry), not just the answer string.
    """
    # Same prompt text as before, written out line by line so the indentation
    # that is part of the string is visible.
    template = (
        "Use the following chat history and pieces of context to give a helpful answer at the end. "
        "If you don't know the answer, just say that you don't know, don't try to make up an answer. "
        "Use three sentences maximum. Keep the answer as concise as possible.\n"
        "    \n"
        "    History: {chat_history}\n"
        "    \n"
        "    Context: {context}\n"
        "\n"
        "    Question: {user_input}\n"
        "    \n"
        "    Helpful Answer:"
    )

    answer_prompt = PromptTemplate.from_template(template)
    answer_chain = LLMChain(llm=ChatOpenAI(), prompt=answer_prompt)

    history = memory.load_memory_variables({})["chat_history"]
    chain_inputs = {
        "chat_history": history,
        "context": context,
        "user_input": user_input,
    }
    return answer_chain.invoke(chain_inputs)

def prompt_db_doc(db, user_input, memory, verbose=0):
    """Run one chat turn: rewrite the query, retrieve context, answer, and remember.

    Args:
        db: Vector store handed to ``generate_context``.
        user_input: The user's message.
        memory: Conversation memory; read by the query and answer steps and
            updated with this exchange via ``save_context``.
        verbose: When equal to 1, print the rewritten query, the retrieved
            context and the raw chain result after the exchange is saved.

    Returns:
        The answer text (the ``"text"`` entry of the final response).
    """
    query = generate_query(user_input, memory)
    context = generate_context(db, query)
    result = generate_final_response(context, user_input, memory)

    answer = result["text"]
    # Record the exchange so the next turn sees it in the history.
    memory.save_context({"input": user_input}, {"output": answer})

    if verbose != 1:
        return answer

    print(f"Query: {query}\n")
    print(f"Context: {context}\n")
    print(f"Invoke: {result}\n")
    return answer

In [29]:
# Conversation memory exposed under the "chat_history" key with a window of k=3
# (presumably the most recent three exchanges; confirm against the LangChain docs).
memory = ConversationBufferWindowMemory(memory_key="chat_history", k=3)
# Seed the history with a first exchange whose answer is the generated summary.
memory.save_context({"input": "What is this document about?"}, {"output": summary})

In [30]:
# Ask a question about the document through the rewrite -> retrieve -> answer pipeline.
prompt_db_doc(db, "where did walt grow up?", memory)

'Walt Disney grew up in Chicago, Illinois, up until his family moved to Marceline, Missouri when he was four years old. After that, the family moved to Kansas City, Missouri in 1911.'