In [1]:
%pip install -U --quiet llmsherpa graphviz aspose-words langchain_experimental python-docx fpdf pdfkit pdf2image

Note: you may need to restart the kernel to use updated packages.


In [94]:
from llmsherpa.readers import LayoutPDFReader
from llmsherpa.readers import LayoutReader
from llmsherpa.readers.layout_reader import ListItem, Paragraph, Table
from pprint import pprint
from IPython.display import display, HTML, IFrame
from graphviz import Digraph
import subprocess
import os
import aspose.words as aw
from langchain.docstore.document import Document as LangchainDocument
from langchain_community.document_loaders.llmsherpa import LLMSherpaFileLoader
from langchain_community.vectorstores import Neo4jVector
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from docx import Document as docxDocument
from fpdf import FPDF
from pdfkit import from_file
from pdf2image import convert_from_path
from PIL import ImageDraw, Image
import fitz
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import FlashrankRerank
from langchain_community.llms import Ollama
from langchain.chains import RetrievalQA
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate



**Focus Area: Improving the usefulness of RAG, now that a brief POC of the architecture has been achieved.**<br>
- Performance of RAG depends heavily on chunking since RAG just uses LLM to stitch together information.<br>
- Previously treated as a black box.<br>
- Determine how best for users to verify documents used in answer generation.

##### Input file here (we are limiting to pdf for now)

In [3]:
file_path = "documents/report.pdf"

### Preliminary chunking using LLMSherpa

##### LLMSherpa Setup

In [4]:
db_config={"ollama_base_url": "http://localhost:11434",
        "llm_name": "llama3",
        # "nlm_url": "http://localhost:5010/api/parseDocument?renderFormat=all&applyOcr=yes&useNewIndentParser=yes",
        "nlm_url": "http://localhost:5010/api/parseDocument?renderFormat=all&applyOcr=yes",
        "neo4j_url": "bolt://localhost:7687",
        "neo4j_username": "neo4j",
        "neo4j_password": "password",
        "bbox_correction_multiple": 2.78
        }

In [5]:
try:
    reader = LayoutPDFReader(db_config["nlm_url"])
    print(file_path)
    parsed_doc = reader.read_pdf(file_path)
    layout_root = parsed_doc.root_node
    json_doc = parsed_doc.json
except Exception as e:
    print("Error!:", e)

documents/report.pdf


In [6]:
pprint(json_doc[4])

{'bbox': [104.16, 282.09, 399.6, 294.09],
 'block_class': 'cls_0',
 'block_idx': 4,
 'level': 2,
 'page_idx': 0,
 'sentences': ['*',
               ' ',
               '   North      America      segment      sales      '
               'increased      12%      year-over-year      to      $86.3      '
               'billion.'],
 'tag': 'list_item'}


In [7]:
for i in range(10):
    pprint(json_doc[i])

{'bbox': [199.68, 73.22, 199.68, 85.22],
 'block_class': 'cls_0',
 'block_idx': 0,
 'level': 0,
 'page_idx': 0,
 'sentences': ['amazon'],
 'tag': 'para'}
{'bbox': [170.88, 161.33, 396.0, 173.33],
 'block_class': 'cls_0',
 'block_idx': 1,
 'level': 0,
 'page_idx': 0,
 'sentences': ['AMAZON.COM      ANNOUNCES      FIRST      QUARTER      '
               'RESULTS'],
 'tag': 'header'}
{'bbox': [49.92, 195.42, 525.6, 219.42],
 'block_class': 'cls_0',
 'block_idx': 2,
 'level': 1,
 'page_idx': 0,
 'sentences': ['SEATTLE—(BUSINESS      WIRE)      April      30,      '
               '2024—Amazon.com,      Inc.      (NASDAQ:      AMZN)      '
               'today      announced      financial      results  ',
               ' ',
               '  for      its      first      quarter      ended      '
               'March      31,      2024.'],
 'tag': 'para'}
{'bbox': [68.16, 241.03, 539.76, 277.28],
 'block_class': 'cls_0',
 'block_idx': 3,
 'level': 1,
 'page_idx': 0,
 'sentences': [' ',


Draw bounding box

In [8]:
def draw_bounding_box_on_pdf_image(pdf_path, coordinates, page_number=0, dpi=200):
    # Convert PDF page to image
    images = convert_from_path(pdf_path, first_page=page_number + 1, last_page=page_number + 1, dpi=dpi)
    
    # Assuming we have only one image since we specified a single page
    img = images[0]
    
    # Get the size of the PDF page in points (1 point = 1/72 inches)
    pdf_doc = fitz.open(pdf_path)
    page = pdf_doc.load_page(page_number)
    page_width, page_height = page.rect.width, page.rect.height
    
    # Scale coordinates to match the image resolution
    scale_x = img.width / page_width
    print(scale_x)
    scale_y = img.height / page_height
    print(scale_y)
    scaled_coordinates = tuple(int(coord * scale_x) for coord in coordinates)

    # Manually tune the right limit more
    scaled_coordinates = (scaled_coordinates[0], scaled_coordinates[1], scaled_coordinates[2] + 60, scaled_coordinates[3])
    
    # Draw the bounding box on the image
    draw = ImageDraw.Draw(img)
    draw.rectangle(scaled_coordinates, outline="red", width=2)

    # Image file name
    # take the pdf_path and add page_number
    # image_path = pdf_path.replace(".pdf", f"_page_{page_number}_{coordinates[1]}.png")
    image_path = pdf_path.split("/")[-1].replace(".pdf", f"_page_{page_number}_{coordinates[1]}.png")
    
    # Save the image with the bounding box
    img.save(f"output/{image_path}")

In [9]:
draw_bounding_box_on_pdf_image(file_path, json_doc[4]["bbox"], json_doc[4]["page_idx"])

2.7777777777777777
2.7777777777777777


Visualise layout tree

In [10]:
def visualize_block_tree(root, max_level=-1, graph=None):
    if graph is None:
        graph = Digraph()
    
    node_id = str(id(root))
    graph.node(node_id, f"{root.tag} (level {root.level})")

    for child in root.children:
        child_id = str(id(child))
        max_level = max(max_level, child.level)
        graph.node(child_id, f"{child.tag} (level {child.level})")
        graph.edge(node_id, child_id)
        # reset the root node and recurse
        _, lvl = visualize_block_tree(child, max_level, graph)
        max_level = max(max_level, lvl)
    
    return graph, max_level

In [11]:
graph, max_level = visualize_block_tree(layout_root)
graph.render('block_tree', view=True)
print(max_level)

3


In [12]:
def parent_chain(node):
    """
    Returns the parent chain of the block consisting of all the parents of the block until the root.
    """
    chain = []
    parent = node.parent
    while parent:
        chain.append(parent)
        parent = parent.parent
    chain.reverse()
    return chain

def parent_text(node):
    """
    Returns the text of the parent chain of the block. This is useful for adding section information to the text.
    """
    chain = parent_chain(node)
    header_texts = []
    para_texts = []
    for p in chain:
        if p.tag == "header":
            header_texts.append(p.to_text()) 
        elif p.tag in ['list_item', 'para']:
            para_texts.append(p.to_text())
    text = "\n>\n".join(header_texts)
    if len(para_texts) > 0:
        text +="\n\n".join(para_texts)
    return text
   
def to_context_text(node, include_section_info=True):
    """
    This is a customised function largely derived from layout_reader.py of the llmsherpa library
    Returns the text of the block with section information. This provides context to the text.
    """
    text = "Metadata:\n"
    if include_section_info and parent_text(node) != "":
        text += parent_text(node) + "  >\n"
    text += "Content:\n"
    if node.tag in ['list_item', 'para']:
        text += node.to_text(include_children=True, recurse=True)
    elif node.tag == 'table':
        text += node.to_html()
    else:
        print("Tag anomaly!")
        print("TAG IS:", node.tag)
        text += node.to_text()
    return text

In [13]:
def is_use_semantic_chunking(leaf_nodes):
    # Returns true if more than 50% of paragraphs have only one line
    count = 0
    num_paras = len(leaf_nodes)
    
    for node in leaf_nodes:
        if node.tag == "para":
            txt = node.to_text().strip()

            lines = txt.split("\n")
            if len(lines) == 1:
                count += 1
                print("Single line para:", txt)

    print("Count is:", count)
    return count > num_paras/2

In [14]:
leaf_nodes = []

def find_leaf_nodes(node):
    if len(node.children) == 0:
        leaf_nodes.append(node)
    for child in node.children:
        find_leaf_nodes(child)

find_leaf_nodes(layout_root)

print(f"Found {len(leaf_nodes)} leaf nodes")

if is_use_semantic_chunking(leaf_nodes):
    # Perform semantic chunking

    ## Load embeddings
    embeddings = OllamaEmbeddings(
        base_url=db_config["ollama_base_url"],	
        model=db_config["llm_name"]
    )
    ## Chunk documents using semantic chunker
    text_splitter = SemanticChunker(
        embeddings, breakpoint_threshold_type="percentile"
    )

    full_text = ""
    for child in layout_root.children:
        full_text += child.to_text(include_children=True, recurse=True) + "\n"

    docs = text_splitter.create_documents([full_text[:20]])
    print(docs[0])
else:
    print("Using llmsherpa")
    # Use chunks from llmsherpa
    # Each chunk is each leaf_node with to_context_text()
    collated_pg_content = [to_context_text(node) for node in leaf_nodes]

    print(len(layout_root.chunks()))
    

    # assert len(collated_pg_content) == len(leaf_nodes)
    
    # Convert to Langchain documents
    # langchainDocument metadata limited to bbox, page_idx, and level for now
    docs = [LangchainDocument(page_content=collated_pg_content[i], metadata={key: leaf_nodes[i].block_json[key] for key in ('bbox', 'page_idx', 'level')}) for i in range(len(collated_pg_content))]

Found 139 leaf nodes
Single line para: amazon
Single line para: Cost      of      sales
Single line para: Technology      and      infrastructure Sales      and      marketing General      and      administrative
Single line para: Total      operating      expenses Operating      income Interest      income Interest      expense Other      income      (expense),      net Total      non-operating      expense Income      before      income      taxes Provision      for      income      taxes
Single line para: Net      income Basic      earnings      per      share Diluted      earnings      per      share
Single line para: 2023      2024
Single line para: 10,250      10,393
Single line para: Net      income Other      comprehensive      income      (loss):
Single line para: Foreign      currency      translation      adjustments,      net      of      tax      of      $(10)      and      $30 Available-for-sale      debt      securities:
Single line para: 386      (1,096)
Single line par

Testing with LLM

In [15]:
embeddings = OllamaEmbeddings(
    base_url=db_config["ollama_base_url"],
    model=db_config["llm_name"]
)

In [16]:
hybrid_db = Neo4jVector.from_documents(
    docs,
    embedding=embeddings,
    url=db_config["neo4j_url"],
    username=db_config["neo4j_username"],
    password=db_config["neo4j_password"],
    search_type="hybrid",
    pre_delete_collection=True,
)



In [17]:
index_name = "vector"  # default index name
keyword_index_name = "keyword"  # default keyword index name

store = Neo4jVector.from_existing_index(
    embeddings,
    url=db_config["neo4j_url"],
    username=db_config["neo4j_username"],
    password=db_config["neo4j_password"],
    index_name=index_name,
    keyword_index_name=keyword_index_name,
    search_type="hybrid",
)

In [18]:
retriever = store.as_retriever(search_kwargs={'k': 25, 'fetch_k': 50, 'score_threshold': 0.8})

In [19]:
query = "What is the Depreciation and amortization of property and equipment and capitalized content costs, operating lease assets, and other for year 2023"

In [20]:
resp = retriever.invoke(query)[0]
print("PageContent:", resp.page_content)
print("Metadata:", resp.metadata)

PageContent: Metadata:
AMAZON.COM      ANNOUNCES      FIRST      QUARTER      RESULTS
>
Second      Quarter      2024      Guidance  >
Content:
 
 
  Consolidated      Statements      of      Cash      Flows  
 
  (in      millions)  
 
  (unaudited)  
 
  Three      Months      Ended      Twelve      Months      Ended  
 
  March      31,      March      31,  
 
  2023      2024      2023      2024 CASH,      CASH      EQUIVALENTS,      AND      RESTRICTED      CASH,      BEGINNING      OF      PERIOD $
 
   54,253   
 
$      73,890   
 
$      36,599      §$      49,734  
 
  OPERATING      ACTIVITIES:  
 
  Net      income      3,172      10,431      4,294      37,684  
 
  Adjustments      to      reconcile      net      income      to      net      cash      from      operating      activities:  
 
  Depreciation      and      amortization      of      property      and      equipment      and      capitalized      content      costs,      operating      lease      assets,  
 
  

In [21]:
draw_bounding_box_on_pdf_image(file_path, resp.metadata["bbox"], resp.metadata["page_idx"])

2.7777777777777777
2.7777777777777777


Rerank (shown to return better results)

In [78]:
llm = Ollama(model="llama3", temperature=0, base_url="http://localhost:11434")

# top_n gives us the top 10 results after reranking the initial 25 results
compressor = FlashrankRerank(top_n=5)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever, 
)

compressed_docs = compression_retriever.invoke(
    query
)

In [23]:
compressed_docs[0]

Document(page_content='Metadata:\nY/Y      %  >\nContent:\n(3)      Free      cash      flow      is      cash      flow      from      operations      reduced      by      “Purchases      of      property      and      equipment,      net      of      proceeds      from      sales      and      incentives.”', metadata={'level': 2, 'page_idx': 13, 'bbox': [63.36, 559.37, 436.08, 571.37], 'relevance_score': 0.9992182})

In [24]:
def pretty_print_docs(docs):
    print(
        f"\n{'-' * 100}\n".join(
            [
                f"Document {i+1}:\n\n{d.page_content}\nMetadata: {d.metadata}"
                for i, d in enumerate(docs)
            ]
        )
    )

In [25]:
pretty_print_docs(compressed_docs)

Document 1:

Metadata:
Y/Y      %  >
Content:
(3)      Free      cash      flow      is      cash      flow      from      operations      reduced      by      “Purchases      of      property      and      equipment,      net      of      proceeds      from      sales      and      incentives.”
Metadata: {'level': 2, 'page_idx': 13, 'bbox': [63.36, 559.37, 436.08, 571.37], 'relevance_score': 0.9992182}
----------------------------------------------------------------------------------------------------
Document 2:

Metadata:
AMAZON.COM      ANNOUNCES      FIRST      QUARTER      RESULTS
>
Diluted       March      31,  >
Content:
Change      in      net      unrealized      gains      (losses),      net      of      tax      of      $(29)      and      $(158) Less:      reclassification      adjustment      for      losses      (gains)      included      in      “Other      income  
 
  (expense),      net,”      net      of      tax      of      $(10)      and      $0 Net      change O

In [26]:
draw_bounding_box_on_pdf_image(file_path, compressed_docs[0].metadata["bbox"], compressed_docs[0].metadata["page_idx"])

2.7777777777777777
2.7777777777777777


In [27]:
layout_root.children

[<llmsherpa.readers.layout_reader.Paragraph at 0x7f11ec2cd430>,
 <llmsherpa.readers.layout_reader.Section at 0x7f11ec2cdf40>,
 <llmsherpa.readers.layout_reader.Section at 0x7f107fff68e0>,
 <llmsherpa.readers.layout_reader.Section at 0x7f107fff6a90>,
 <llmsherpa.readers.layout_reader.Section at 0x7f107fff6b80>,
 <llmsherpa.readers.layout_reader.Section at 0x7f107fff6d90>,
 <llmsherpa.readers.layout_reader.Section at 0x7f107fff6ee0>,
 <llmsherpa.readers.layout_reader.Section at 0x7f107fffd0a0>]

#### Hard code the part that returns the documents used to answer said question.
#### The `return_source_documents` parameter is useful in this case.

In [28]:
# Without reranker
resp = retriever.invoke(query)
print(len(resp))
for doc in resp:
    print(doc)
    print()

25
page_content='Metadata:\nAMAZON.COM      ANNOUNCES      FIRST      QUARTER      RESULTS\n>\nSecond      Quarter      2024      Guidance  >\nContent:\n \n \n  Consolidated      Statements      of      Cash      Flows  \n \n  (in      millions)  \n \n  (unaudited)  \n \n  Three      Months      Ended      Twelve      Months      Ended  \n \n  March      31,      March      31,  \n \n  2023      2024      2023      2024 CASH,      CASH      EQUIVALENTS,      AND      RESTRICTED      CASH,      BEGINNING      OF      PERIOD $\n \n   54,253   \n \n$      73,890   \n \n$      36,599      §$      49,734  \n \n  OPERATING      ACTIVITIES:  \n \n  Net      income      3,172      10,431      4,294      37,684  \n \n  Adjustments      to      reconcile      net      income      to      net      cash      from      operating      activities:  \n \n  Depreciation      and      amortization      of      property      and      equipment      and      capitalized      content      costs,      opera

In [29]:
compressed_docs = compression_retriever.invoke(
    query, 
)
print(len(compressed_docs))
for doc in compressed_docs:
    print(doc)
    print()

10
page_content='Metadata:\nY/Y      %  >\nContent:\n(3)      Free      cash      flow      is      cash      flow      from      operations      reduced      by      “Purchases      of      property      and      equipment,      net      of      proceeds      from      sales      and      incentives.”' metadata={'level': 2, 'page_idx': 13, 'bbox': [63.36, 559.37, 436.08, 571.37], 'relevance_score': 0.9992182}

page_content='Metadata:\nAMAZON.COM      ANNOUNCES      FIRST      QUARTER      RESULTS\n>\nDiluted       March      31,  >\nContent:\nChange      in      net      unrealized      gains      (losses),      net      of      tax      of      $(29)      and      $(158) Less:      reclassification      adjustment      for      losses      (gains)      included      in      “Other      income  \n \n  (expense),      net,”      net      of      tax      of      $(10)      and      $0 Net      change Other,      net      of      tax      of      $0      and      $(1) Total      other  

We note that the number of documents has reduced from 4 to 3

In [88]:
query = "Give me the key highlights of financial results for the first quarter ended March 31, 2024"
chain = RetrievalQA.from_chain_type(llm=llm, retriever=compression_retriever, return_source_documents=True,)
resp = chain.invoke(query)


In [31]:
# print(resp)
print(resp["result"])
print()
for doc in (resp["source_documents"]):
    print(doc)

Based on the provided context, here are the key highlights of Amazon's financial results for the first quarter ended March 31, 2024:

* Net sales increased to $95 billion in Q1 2024 from $536 million in Q1 2023.
* Operating income was $67.8 billion, up from $72.6 billion in Q1 2023.
* Net income was $20.9 billion, down from $22.3 billion in Q1 2023.
* Earnings per share (EPS) were $0.31, compared to $1.00 in Q1 2023.

These highlights suggest that Amazon's financial performance has improved slightly compared to the same period last year, with a slight increase in operating income and net income, but a decrease in EPS.

page_content='Metadata:\nAMAZON.COM      ANNOUNCES      FIRST      QUARTER      RESULTS\n>\nHighlights  \n \n*      Launched   \n \na      Disaster      Relief      Hub      in      Rheinberg,      Germany—Amazon’s      first      Hub      in      Europe      and      the      company’s      13th  \n \n  around      the      world.\nThe      21,000-square-foot      Hub  

In [37]:
for doc in resp["source_documents"]:
    draw_bounding_box_on_pdf_image(file_path, doc.metadata["bbox"], doc.metadata["page_idx"])

2.7777777777777777
2.7777777777777777
2.7777777777777777
2.7777777777777777
2.7777777777777777
2.7777777777777777
2.7777777777777777
2.7777777777777777
2.7777777777777777
2.7777777777777777
2.7777777777777777
2.7777777777777777
2.7777777777777777
2.7777777777777777
2.7777777777777777
2.7777777777777777
2.7777777777777777
2.7777777777777777
2.7777777777777777
2.7777777777777777


In [33]:
layout_root.children

[<llmsherpa.readers.layout_reader.Paragraph at 0x7f11ec2cd430>,
 <llmsherpa.readers.layout_reader.Section at 0x7f11ec2cdf40>,
 <llmsherpa.readers.layout_reader.Section at 0x7f107fff68e0>,
 <llmsherpa.readers.layout_reader.Section at 0x7f107fff6a90>,
 <llmsherpa.readers.layout_reader.Section at 0x7f107fff6b80>,
 <llmsherpa.readers.layout_reader.Section at 0x7f107fff6d90>,
 <llmsherpa.readers.layout_reader.Section at 0x7f107fff6ee0>,
 <llmsherpa.readers.layout_reader.Section at 0x7f107fffd0a0>]

In [34]:
for node in layout_root.chunks():
    print(node.tag)
    print(node.level)
    print(node.to_text())
    print("---" * 5)

para
0
amazon
---------------
para
1
SEATTLE—(BUSINESS      WIRE)      April      30,      2024—Amazon.com,      Inc.      (NASDAQ:      AMZN)      today      announced      financial      results  
 
  for      its      first      quarter      ended      March      31,      2024.
---------------
para
1
 
 
 
¢      Net      sales      increased      13%      to      $143.3      billion      in      the      first      quarter,      compared      with      $127.4      billion      in      first      quarter      2023.  
 
  Excluding      the      $0.2      billion      unfavorable      impact      from      year-over-year      changes      in      foreign      exchange      rates      throughout      the  
 
  quarter,      net      sales      increased      13%      compared      with      first      quarter      2023.
---------------
list_item
2
*
 
   North      America      segment      sales      increased      12%      year-over-year      to      $86.3      billion.
------------

In [35]:
for node in leaf_nodes:
    print(node.tag)
    print(node.level)
    # print(node.to_text(include_children=True, recurse=True))
    print(to_context_text(node))
    print("---" * 5)

para
0
Metadata:
Content:
amazon
---------------
para
1
Metadata:
AMAZON.COM      ANNOUNCES      FIRST      QUARTER      RESULTS  >
Content:
SEATTLE—(BUSINESS      WIRE)      April      30,      2024—Amazon.com,      Inc.      (NASDAQ:      AMZN)      today      announced      financial      results  
 
  for      its      first      quarter      ended      March      31,      2024.
---------------
para
1
Metadata:
AMAZON.COM      ANNOUNCES      FIRST      QUARTER      RESULTS  >
Content:
 
 
 
¢      Net      sales      increased      13%      to      $143.3      billion      in      the      first      quarter,      compared      with      $127.4      billion      in      first      quarter      2023.  
 
  Excluding      the      $0.2      billion      unfavorable      impact      from      year-over-year      changes      in      foreign      exchange      rates      throughout      the  
 
  quarter,      net      sales      increased      13%      compared      with      first   

In [36]:
print(to_context_text(leaf_nodes[3]))

Metadata:
AMAZON.COM      ANNOUNCES      FIRST      QUARTER      RESULTS  >
Content:
*
 
   North      America      segment      sales      increased      12%      year-over-year      to      $86.3      billion.


This is a test to see if iterating through each document and seeing how they answer the question will help

In [89]:
query = "Give me the key highlights of financial results for the first quarter ended March 31, 2024"

In [90]:
llm1_manager = Ollama(model="llama3", temperature=0, format='json', base_url="http://localhost:11434")
llm2_answerer = Ollama(model="llama3", temperature=0.3, base_url="http://localhost:11434")
# retrieve documents here
llm3_summarizer = Ollama(model="llama3", temperature=0.3, base_url="http://localhost:11434")

In [98]:
# llm1 will disect the initial prompt into more specific subprompts
llm1_manager_task = PromptTemplate(
    template="""<|begin_of_text|>
    <|start_header_id|>system<|end_header_id|>
    Your task is to dissect 
    the following question into 5 more specific queries 
    for prompting the vector database. In each of your sub-prompts, 
    be specific in your use of keyword.

    <|eot_id|><|start_header_id|>user<|end_header_id|>
    Question: {query}

    Answer format:
    'query1': 'prompt', 'query2': 'prompt',
    <|eot_id|><|start_header_id|>assistant<|end_header_id|>
    """,

    # these refer to the items in {} in the template above
    input_variables=["query"],
)

pipeline = llm1_manager_task | llm1_manager | JsonOutputParser()
llm1_manager_resp = pipeline.invoke({"query":query})
print(llm1_manager_resp)


{'query1': 'What were the total revenues reported by the company for the first quarter ended March 31, 2024?', 'query2': 'What was the net income of the company for the first quarter ended March 31, 2024?', 'query3': "What was the gross profit margin percentage for the company's products and services for the first quarter ended March 31, 2024?", 'query4': 'What were the operating expenses reported by the company for the first quarter ended March 31, 2024?', 'query5': 'What was the cash flow from operations for the company for the first quarter ended March 31, 2024?'}


In [99]:
doc_prompts = [llm1_manager_resp[query] for query in llm1_manager_resp]
print(doc_prompts)

['What were the total revenues reported by the company for the first quarter ended March 31, 2024?', 'What was the net income of the company for the first quarter ended March 31, 2024?', "What was the gross profit margin percentage for the company's products and services for the first quarter ended March 31, 2024?", 'What were the operating expenses reported by the company for the first quarter ended March 31, 2024?', 'What was the cash flow from operations for the company for the first quarter ended March 31, 2024?']


In [100]:
set_of_docs = []
for doc_prompt in doc_prompts:
    compressed_docs = compression_retriever.invoke(doc_prompt)
    set_of_docs.append(compressed_docs)

In [101]:
print((set_of_docs))

[[Document(page_content='Metadata:\nY/Y      %  >\nContent:\n(1)   \n \n_      For      the      twelve      months      ended      March      31,      2023      and      2024,      this      amount      relates      to      equipment      included      in      “Property      and      equipment      acquired      under      finance      leases,      net      of  \n \n  remeasurements      and      modifications”      of      $517      million      and      $676      million.', metadata={'level': 2, 'page_idx': 13, 'bbox': [63.36, 525.76, 534.24, 545.92], 'relevance_score': 0.9993714}), Document(page_content='Metadata:\nY/Y      %  >\nContent:\n(2)      For      the      twelve      months      ended      March      31,      2023      and      2024,      this      amount      relates      to      property      included      in      “Principal      repayments      of      finance      leases”      of      $6,544      million      and      $3,774  \n \n  million.', metadata={'level': 2, '

In [112]:
# Flatten array, remove duplicates
flat_docs = [doc for docs in set_of_docs for doc in docs]

In [113]:
len(flat_docs)

25

In [128]:
responses = []
# For each document, return yes or no depending on whether the document is relevant to the original query
for doc in flat_docs:
    llm2_answerer_task = PromptTemplate(
        template="""
        <|begin_of_text|>
        <|start_header_id|>system<|end_header_id|>
        Your task is to use the document to answer the original query.
        ASSUME THAT THE DOCUMENTS ARE ALL RELEVANT TO THE QUERY.
        The document WILL NOT answer the full query, but only highlight the key points of how it can contribute to an answer.
        Keep to at most 5 sentences.
        EXCLUDE ANY PREAMBLE.
        <|eot_id|><|start_header_id|>user<|end_header_id|>
        Document: {doc}
        Original Query: {query}
        <|eot_id|><|start_header_id|>assistant<|end_header_id|>
        """,
        input_variables=["doc", "query"],
    )
    pipeline = llm2_answerer_task | llm2_answerer | StrOutputParser()
    llm2_answerer_resp = pipeline.invoke({"doc": doc.page_content, "query": query})
    print(llm2_answerer_resp)
    responses.append(llm2_answerer_resp)

The document highlights that for the twelve months ended March 31, 2023 and 2024, equipment acquired under finance leases, net of remeasurements and modifications, was $676 million. This information provides a snapshot of the company's financial performance over a longer period, including the first quarter ended March 31, 2024.
The document provides information on principal repayments of finance leases for the 12 months ended March 31, 2023 and 2024. For the period ended March 31, 2024, the amount relates to property included in "Principal repayments of finance leases" of $3,774 million. This suggests that there were significant principal repayments made on finance leases during the first quarter of 2024.
This document does not provide specific financial results for the first quarter ended March 31, 2024. However, it can contribute to an answer by highlighting Amazon's focus on customer obsession, passion for invention, operational excellence, and long-term thinking, which may be refle

In [124]:
print(responses)

['Unfortunately, there are no specific financial results mentioned in the provided document. However, it does provide information on equipment acquired under finance leases for the 12 months ended March 31, 2023 and 2024, which were $517 million and $676 million, respectively. There is no mention of first-quarter financial results specifically.', "Unfortunately, there is no information provided about the financial results for the first quarter ended March 31, 2024. The document only mentions a comparison between two years (2023 and 2024) regarding principal repayments of finance leases. If you're looking for specific financial highlights, I'd be happy to help you with that once more information is provided.", "I apologize, but there are no financial results or key highlights mentioned in the provided document. The text appears to be an introduction to Amazon and its values, rather than a report on its financial performance. If you're looking for information on Amazon's financial result

In [134]:
llm3_summarizer_task = PromptTemplate(
    template="""
    <|begin_of_text|>
    <|start_header_id|>system<|end_header_id|>
    YOU ARE A SUMMARIZER. YOUR OPINION ON THE ANSWERS OR QUERY IS NOT NEEDED.
    SUMMARISE THE ANSWERS TO THE QUERY IN AS MANY SENTENCES AS YOU NEED.
    <|eot_id|><|start_header_id|>user<|end_header_id|>
    Answers: {answers}
    Query: {query}
    <|eot_id|><|start_header_id|>assistant<|end_header_id|>
    """,
    input_variables=["answers", "query"],
)

pipeline = llm3_summarizer_task | llm3_summarizer | StrOutputParser()
# array to text
responses_str = "; ".join(responses)
# print(responses_str)
llm3_summarizer_resp = pipeline.invoke({"answers": responses_str, "query": query})
print(llm3_summarizer_resp)

Here are the key highlights of financial results for the first quarter ended March 31, 2024:

* Net sales expected to be between $144.0 billion and $146.5 billion.
* Operating income is expected to be between $3.2 billion and $3.7 billion.
* Net income is expected to be between $2.1 billion and $2.6 billion.
* Diluted earnings per share (EPS) are expected to be between $4.35 and $5.15.

Note: These financial results are based on Amazon's guidance provided in their press release, but do not include actual figures as the query is asking for key highlights of financial results for the first quarter ended March 31, 2024, which are not available in the provided document.


... this seems kinda useless