<a href="https://colab.research.google.com/github/kswamy15/langchain_experiments/blob/main/Langchain_amazon_pdf_multiquery_types_test1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install the Langchain, openai, chromadb libraries

In [None]:
!pip3 install langchain
!pip3 install pypdf
!pip3 install chromadb
!pip3 install openai
!pip3 install tiktoken
# !pip3 install deeplake
!pip3 install lark

# Import the Langchain libraries

In [26]:
import os
os.environ["OPENAI_API_KEY"] = ""

from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import PagedPDFSplitter
from langchain.vectorstores import DeepLake
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain
# from langchain.memory import ConversationBufferMemory
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA, RetrievalQAWithSourcesChain
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.retrievers import ContextualCompressionRetriever

# Download Amazon's historical Financial quarterly release pages

In [3]:
import requests
import tqdm
from typing import List

# financial reports of amamzon, but can be replaced by any URLs of pdfs
urls = [
        'https://s2.q4cdn.com/299287126/files/doc_financials/2020/Q1/AMZN-Q1-2020-Earnings-Release.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_financials/2020/q2/Q2-2020-Amazon-Earnings-Release.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_financials/2020/q3/AMZN-Q3-2020-Earnings-Release.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_financials/2020/q4/Amazon-Q4-2020-Earnings-Release.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_financials/2021/q1/Amazon-Q1-2021-Earnings-Release.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_financials/2021/q2/AMZN-Q2-2021-Earnings-Release.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_financials/2021/q3/Q3-2021-Earnings-Release.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_financials/2020/q4/Amazon-Q4-2020-Earnings-Release.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_financials/2022/q1/Q1-2022-Amazon-Earnings-Release.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_financials/2022/q2/Q2-2022-Amazon-Earnings-Release.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_financials/2022/q3/Q3-2022-Amazon-Earnings-Release.pdf',
        'https://s2.q4cdn.com/299287126/files/doc_financials/2022/q4/Q4-2022-Amazon-Earnings-Release.pdf'
        ]

def load_reports(urls: List[str]) -> List[str]:
    """ Load pages from a list of urls"""
    pages = []

    for url in tqdm.tqdm(urls):
        r = requests.get(url)
        path = url.split('/')[-1]
        quarter = url.split('/')[-2]
        year = url.split('/')[-3]
        with open(path, 'wb') as f:
            f.write(r.content)
        loader = PagedPDFSplitter(path)
        local_pages = loader.load_and_split()
        for local_page in local_pages:
          local_page.page_content = quarter + " "+year+" "+path.split(".")[0]+" " +local_page.page_content
          local_page.metadata["company"]="Amazon"
          local_page.metadata['year'] = int(year)
          local_page.metadata['quarter'] = quarter
        # local_pages = [path.split(".")[0]+" " +local_page for local_page in local_pages]
        pages.extend(local_pages)
    return pages

pages = load_reports(urls)

100%|██████████| 12/12 [00:34<00:00,  2.87s/it]


# Split the documents into chunks

In [4]:
# split the documents into chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1500,
    chunk_overlap = 150
)

splits = text_splitter.split_documents(pages)
print('No of pages', len(pages), 'No of splits', len(splits))

No of pages 287 No of splits 580


# Build a ChromaDB vector store from the split documents

In [5]:
persist_directory = 'docs/chroma/'
!rm -rf ./docs/chroma  # remove old database files if any
embedding = OpenAIEmbeddings()
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=persist_directory
)
vectordb.persist()

# Build some functions and meta data field info for retriever

In [6]:
def pretty_print_docs(docs):
    print(f"\n{'-' * 100}\n".join([f"Document {i+1}:\n\n" + d.page_content for i, d in enumerate(docs)]))


In [7]:
metadata_field_info = [
    AttributeInfo(
        name="year",
        description="The Year of the earnings release",
        type="integer",
    ),
    AttributeInfo(
        name="quarter",
        description="The quarter of the earnings release",
        type="string",
    ),
    AttributeInfo(
        name="page",
        description="The page from the earnings release",
        type="integer",
    ),
]
document_content_description = "Earnings release"

# Define the various types of query Retrievers

In [43]:
llm = ChatOpenAI(temperature=0)
multi_query_retriever_from_llm = MultiQueryRetriever.from_llm(
    retriever=vectordb.as_retriever(search_type = "mmr"), llm=llm
)
self_query_retriever = SelfQueryRetriever.from_llm(
    llm,
    vectordb,
    document_content_description,
    metadata_field_info,
    verbose=True
)
compressor = LLMChainExtractor.from_llm(llm)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vectordb.as_retriever(search_type = "mmr")
)

In [71]:
qa_multi_query = RetrievalQA.from_chain_type(llm=llm, chain_type='stuff', retriever=multi_query_retriever_from_llm, verbose=True, return_source_documents=False)
qa_compress_query = RetrievalQA.from_chain_type(llm=llm, chain_type='stuff', retriever=compression_retriever, verbose=True, return_source_documents=False)
qa_self_query = RetrievalQAWithSourcesChain.from_chain_type(llm=llm, chain_type='stuff', retriever=self_query_retriever, verbose=True, return_source_documents=False)
qa_vectordb_query = RetrievalQA.from_chain_type(llm=llm, chain_type='stuff', retriever=vectordb.as_retriever(), verbose=True, return_source_documents=False)
qa_vectordb_mmr_query = RetrievalQA.from_chain_type(llm=llm, chain_type='stuff', retriever=vectordb.as_retriever(search_type = "mmr"), verbose=True, return_source_documents=False)

# Run a list of questions using the various query Retriever types

In [72]:
query_list = ["What is the free cash flow for Q1 2022?", "What is the net profit for Q2 2022?", 'What is the operating income for 2022?', 'What is the sales for Q2 2021']

In [74]:
result_list = []
for query in query_list:
  print(query)
  result_multi_query = qa_multi_query({"query": query})
  result_compress_query = qa_compress_query({"query": query})
  result_self_query = qa_self_query({"question": query.lower()})
  result_vectordb_query =  qa_vectordb_query({"query": query})
  result_vectordb_mmr_query =  qa_vectordb_query({"query": query})
  result_list.append([query, result_multi_query['result'], result_compress_query['result'], result_self_query['answer'], result_vectordb_query['result'],result_vectordb_mmr_query['result']])

What is the free cash flow for Q1 2022?


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m





[1m> Finished chain.[0m


[1m> Entering new RetrievalQAWithSourcesChain chain...[0m
query='free cash flow' filter=Operation(operator=<Operator.AND: 'and'>, arguments=[Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='quarter', value='q1'), Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='year', value=2022)]) limit=None

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
What is the net profit for Q2 2022?


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQAWithSourcesChain chain...[0m
query='net profit' filter=Operation(operator=<Operator.AND: 'and'>, arguments=[Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='quarter', value='q2'), Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='year', value=2022)]) limit=Non




[1m> Finished chain.[0m


[1m> Entering new RetrievalQAWithSourcesChain chain...[0m
query='sales' filter=Operation(operator=<Operator.AND: 'and'>, arguments=[Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='quarter', value='q2'), Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='year', value=2021)]) limit=None

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


In [77]:
result_list

[['What is the free cash flow for Q1 2022?',
  'The free cash flow for Q1 2022 is -$18,627 million.',
  'Based on the provided information, we do not have the specific free cash flow amount for Q1 2022. The information only includes data for the trailing twelve months up until December 31, 2021.',
  'The free cash flow for Q1 2022 is not provided in the given document.\n',
  'The free cash flow for Q1 2022 is an outflow of $18.6 billion.',
  'The free cash flow for Q1 2022 is an outflow of $18.6 billion.'],
 ['What is the net profit for Q2 2022?',
  'The net profit for Q2 2022 is -$2.028 billion.',
  'The net loss for Q2 2022 is $2.7 billion, or $0.27 per diluted share.',
  'The net profit for Q2 2022 is a loss of $2.0 billion.\n',
  'The information provided does not mention the net profit for Q2 2022.',
  'The information provided does not mention the net profit for Q2 2022.'],
 ['What is the operating income for 2022?',
  'The operating income for 2022 is $12.2 billion.',
  'The ope

In [78]:
import pandas as pd
pd.set_option('display.max_colwidth', None)

In [79]:
pd.DataFrame(data=result_list, columns=['question','multi_query','compress_query','self_query','vectordb_query','vectordb_mmr_query']).head()

Unnamed: 0,question,multi_query,compress_query,self_query,vectordb_query,vectordb_mmr_query
0,What is the free cash flow for Q1 2022?,"The free cash flow for Q1 2022 is -$18,627 million.","Based on the provided information, we do not have the specific free cash flow amount for Q1 2022. The information only includes data for the trailing twelve months up until December 31, 2021.",The free cash flow for Q1 2022 is not provided in the given document.\n,The free cash flow for Q1 2022 is an outflow of $18.6 billion.,The free cash flow for Q1 2022 is an outflow of $18.6 billion.
1,What is the net profit for Q2 2022?,The net profit for Q2 2022 is -$2.028 billion.,"The net loss for Q2 2022 is $2.7 billion, or $0.27 per diluted share.",The net profit for Q2 2022 is a loss of $2.0 billion.\n,The information provided does not mention the net profit for Q2 2022.,The information provided does not mention the net profit for Q2 2022.
2,What is the operating income for 2022?,The operating income for 2022 is $12.2 billion.,The operating income for 2022 is $12.2 billion.,I don't know the answer.,The operating income for 2022 is $12.2 billion.,The operating income for 2022 is $12.2 billion.
3,What is the sales for Q2 2021,The net sales for Q2 2021 were $113.1 billion.,The sales for Q2 2021 were $113.1 billion.,The sales for Q2 2021 are not provided in the given document.\n,The net sales for Q2 2021 were $113.1 billion.,The net sales for Q2 2021 were $113.1 billion.


# Some query results from other experiments

In [80]:
query = "Summarize the financial results for Q1 2022?"
result_multi_query = qa_multi_query({"query": query})



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


In [81]:
result_multi_query

{'query': 'Summarize the financial results for Q1 2022?',
 'result': 'In Q1 2022, Amazon.com reported a 7% increase in net sales, reaching $116.4 billion compared to $108.5 billion in Q1 2021. However, the operating cash flow decreased by 41% to $39.3 billion for the trailing twelve months, compared to $67.2 billion in the previous year. Free cash flow also decreased to an outflow of $18.6 billion for the trailing twelve months, compared to an inflow of $26.4 billion in the previous year. Additionally, the company reported an outflow of $29.3 billion in free cash flow less principal repayments of finance leases and financing obligations for the trailing twelve months, compared to an inflow of $14.9 billion in the previous year.'}

In [55]:
query = "What is the free cash flow for Q1 2022?"
# self_query = "What is the net profit for q2 2022?"
# query = "What is the operating income for 2022?"
result_multi_query = qa_multi_query({"query": query})
result_compress_query = qa_compress_query({"query": query})
result_self_query = qa_self_query({"question": query.lower()})
result_vectordb_query =  qa_vectordb_query({"query": query})



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m





[1m> Finished chain.[0m


[1m> Entering new RetrievalQAWithSourcesChain chain...[0m
query='free cash flow' filter=Operation(operator=<Operator.AND: 'and'>, arguments=[Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='quarter', value='q1'), Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='year', value=2022)]) limit=None

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


In [56]:
print(result_multi_query)
print(result_compress_query)
print(result_self_query)
print(result_vectordb_query)

{'query': 'What is the free cash flow for Q1 2022?', 'result': 'The free cash flow for Q1 2022 is -$18,627 million.'}
{'query': 'What is the free cash flow for Q1 2022?', 'result': 'Based on the provided information, we do not have the specific free cash flow amount for Q1 2022. The information only includes data for the trailing twelve months up until December 31, 2021.'}
{'question': 'what is the free cash flow for q1 2022?', 'answer': 'The free cash flow for Q1 2022 is not provided in the given information.\n', 'sources': 'Q1-2022-Amazon-Earnings-Release.pdf'}
{'query': 'What is the free cash flow for Q1 2022?', 'result': 'The free cash flow for Q1 2022 is an outflow of $18.6 billion.'}


In [46]:
query = "What is the revenue for Q1 2021?"
result_multi_query = qa_multi_query({"query": query})
result_compress_query = qa_compress_query({"query": query})
result_self_query = qa_self_query({"question": query.lower()})
result_vectordb_query =  qa_vectordb_query({"query": query})



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQAWithSourcesChain chain...[0m
query='revenue' filter=Operation(operator=<Operator.AND: 'and'>, arguments=[Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='quarter', value='q1'), Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='year', value=2021)]) limit=None

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


In [47]:
print(result_multi_query)
print(result_compress_query)
print(result_self_query)
print(result_vectordb_query)

{'query': 'What is the revenue for Q1 2021?', 'result': 'The revenue for Q1 2021 was $108.5 billion.', 'source_documents': [Document(page_content='q1 2021 Amazon-Q1-2021-Earnings-Release AMAZON.COM ANNOUNCES FIRST QUARTER RESULTS \n \nSEATTLE—(BUSINESS WIRE) April 29, 2021—Amazon.com, Inc. (NASDAQ: AMZN) today announced financial results \nfor its first quarter ended March 31, 2021.  \n \n• Operating cash flow  increased 69% to $67.2 billion for the trailing twelve months, compared with $39.7 billion for \nthe trailing twelve months ended March 31, 2020.  \n• Free cash flow  increased to $26.4 billion for the trailing twelve months, compared with $24.3 billion for the trailing \ntwelve months ended March 31, 2020.  \n• Free cash flow less principal repayments of finance leases and financing obligations  increased to $14.9 billion for \nthe trailing twelve months, compared with $14.3 billion for the trailing twelve months ended March 31, 2020.  \n• Free cash flow less equipment finance 