# Summary of a long PDF document

In [None]:
import os
import openai
from IPython.display import display, HTML, Markdown
from pprint import pprint

from dotenv import load_dotenv, find_dotenv
# Load the variables from the .env file located by find_dotenv(); the return
# value is bound to `_` because it is not used afterwards.
_ = load_dotenv(find_dotenv()) # read local .env file
# Hand the key to the openai module. The subscript lookup raises KeyError if
# OPENAI_API_KEY is still missing from the environment at this point.
openai.api_key = os.environ['OPENAI_API_KEY']

In [None]:
from langchain.callbacks import OpenAICallbackHandler

# One callback handler instance, passed to every chain call in this notebook
# and printed after each run (presumably it accumulates token usage and cost
# across calls -- confirm against the LangChain callback documentation).
totals_cb = OpenAICallbackHandler()

# Show the handler's state before any request has been made.
print(totals_cb)

---

## Mind the costs!!!

This notebook sends many requests to OpenAI LLM endpoints, each carrying a very large number of tokens.

Running it with the latest and greatest models may cost several dollars per summarized PDF!!!

---

The AI papers used as examples:

- [Eight Things to Know about Large Language Models](https://arxiv.org/pdf/2304.00612.pdf)
- [Plan-and-Solve Prompting: Improving Zero-Shot Chain-of-Thought Reasoning by Large Language Models](https://arxiv.org/pdf/2305.04091.pdf)
- [ChatGPT for Digital Forensic Investigation: The Good, The Bad, and The Unknown](https://arxiv.org/pdf/2307.10195.pdf)
- [ReAct: Synergizing Reasoning and Acting in Language Models](https://arxiv.org/pdf/2210.03629.pdf)
- [Challenges and Applications of Large Language Models](https://arxiv.org/pdf/2307.10169.pdf) - 72 pages

Check **LangChain**'s Python API reference for [document_loaders](https://api.python.langchain.com/en/latest/api_reference.html?highlight=document_loaders#module-langchain.document_loaders)

In [None]:
from langchain.document_loaders import PyPDFLoader

# Example papers, referenced by URL. The trailing comment on each entry
# records the approximate total token usage noted for that paper.
papers=[
    "https://arxiv.org/pdf/2304.00612.pdf", # total tokens used - ca. 48k
    "https://arxiv.org/pdf/2305.04091.pdf", # total tokens used - ca. 47k
    "https://arxiv.org/pdf/2307.10195.pdf", # total tokens used - ca. 38k
    "https://arxiv.org/pdf/2210.03629.pdf", # total tokens used - ca. 82k
    "https://arxiv.org/pdf/2307.10169.pdf", # total tokens used - ca. 238k
]

# Change the index to summarize a different paper from the list above.
loader = PyPDFLoader(papers[0])

# `docs` is treated as one entry per PDF page by the rest of this notebook.
docs = loader.load()

print(f"Document has {len(docs)} pages\n")
# Inspect the first page to see what a loaded entry looks like.
pprint(docs[0])

In [None]:
# Inspect the second loaded page as well (index 1 requires at least two pages).
pprint(docs[1])

In [None]:
from langchain.chat_models import ChatOpenAI

# Model instance used here only for its token counter.
llm = ChatOpenAI()

# Count the tokens on every page and keep a running total, so the size of the
# document (and hence the expected request volume) is visible before any
# summarization chain is run.
total_tokens = 0
for n, page in enumerate(docs):
    tokens = llm.get_num_tokens(page.page_content)
    total_tokens += tokens
    # Right-align the count in a fixed-width column. The previous spec ">"
    # carried no width, so it aligned nothing and the column was ragged.
    print(f"Page {n+1:2d}: {tokens:>6}")

print(f"Total number of tokens in document: {total_tokens}")

### Map-Reduce Chain

In [None]:
from langchain.chains.summarize import load_summarize_chain

# Map-reduce summarization chain with no prompt overrides, i.e. using the
# prompts that load_summarize_chain supplies itself; temperature is pinned to 0.0.
summary_chain_default = load_summarize_chain(
    llm=ChatOpenAI(temperature=0.0),
    chain_type="map_reduce",
)

![%E2%80%8EPresentation.%E2%80%8E002.jpeg](attachment:%E2%80%8EPresentation.%E2%80%8E002.jpeg)

In [None]:
# Map Chain prompt template: the prompt text held by the default chain's
# `llm_chain`. As the last expression in the cell, its value is displayed.

summary_chain_default.llm_chain.prompt.template

In [None]:
# Reduce Chain prompt template: the prompt text held by the default chain's
# `combine_document_chain`. As the last expression in the cell, its value is
# displayed.

summary_chain_default.combine_document_chain.llm_chain.prompt.template

In [None]:
# Run the default map-reduce chain over all loaded pages. The pages are passed
# under the explicit "input_documents" key, matching how the custom chain is
# invoked later in this notebook, instead of relying on a bare positional value
# being mapped onto the chain's single input key. totals_cb records the usage.
summary_default = summary_chain_default({"input_documents": docs}, callbacks=[totals_cb])

In [None]:
# The final summary text is stored under the "output_text" key of the result.
print(summary_default["output_text"])

In [None]:
# Show the callback handler's state after the default-prompt run.
pprint(totals_cb)

### Custom prompts to improve summary quality

In [None]:
from langchain import PromptTemplate

# Custom prompt for the map step. {text} is its only template variable; the
# wording assumes it is filled with the content of a single page. The model is
# asked for a one-to-three-sentence summary, or a fixed marker sentence when
# the page holds nothing but references.
map_prompt_template = """
You are provided with one page from a long document.


<page>
{text}
</page>


Scan the page for main themes, ideas, definitions and theses, focusing on titles, subtitles, 
first sentences of paragraphs, and any highlighted or bolded information. Skip all literature references.
If the page contains only references, then output 'Only references, skip this summary.'
Write a one to three-sentence summary of key points.
"""
map_prompt = PromptTemplate(template=map_prompt_template, input_variables=["text"])

# Custom prompt for the reduce (combine) step. {text} is its only template
# variable; the wording assumes it is filled with the list of per-page
# summaries. The model is asked to merge them into one coherent, multi-paragraph
# document summary that leaves out reference information.
reduce_prompt_template = """
You are provided with a list of summaries, one summary per page of the long document.


<summaries>
{text}
</summaries>


Analyse the summaries to identify recurring themes and links between different points. 
Based on this write a coherent final document summary including all important items from the document, 
but skipping all info on references.
Review final summary, edit for brevity, rephrase into several paragraphs to ensure it 
sounds like a document summary, not a summary of pages and 
accurately represents the main topics and maintains coherence. 
"""
reduce_prompt = PromptTemplate(template=reduce_prompt_template, input_variables=["text"])


# Map-reduce summarization chain that replaces the stock prompts with the
# custom page-level (map) and final (combine) prompts defined above.
summary_chain_custom = load_summarize_chain(
    llm=ChatOpenAI(temperature=0.0),
    chain_type="map_reduce",
    map_prompt=map_prompt,
    combine_prompt=reduce_prompt,
    return_intermediate_steps=True,  # keep the Map Chain results in the output
)

In [None]:
# Run the custom-prompt chain over all loaded pages, passing them under the
# "input_documents" key. The same totals_cb instance is reused here.
summary_custom = summary_chain_custom({"input_documents": docs}, callbacks=[totals_cb])

In [None]:
# Map-step outputs, present because the chain was built with
# return_intermediate_steps=True.
pprint(summary_custom["intermediate_steps"])

In [None]:
# The final summary produced with the custom prompts.
print(summary_custom["output_text"])

In [None]:
# Show the callback handler's state again; the same instance was passed to
# both the default and the custom chain runs.
pprint(totals_cb)