In [1]:
%pip install -U --quiet llmsherpa graphviz aspose-words langchain_experimental python-docx fpdf pdfkit pdf2image

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os

import fitz
from flashrank import Ranker, RerankRequest
from graphviz import Digraph
from IPython.display import display, Markdown
from langchain.docstore.document import Document as LangchainDocument
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.llms import Ollama
from langchain_community.vectorstores import Neo4jVector
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_experimental.text_splitter import SemanticChunker
from llmsherpa.readers import LayoutPDFReader
from pdf2image import convert_from_path
from PIL import ImageDraw

**Focus Area: Improving the usefulness of RAG, now that a brief POC of the architecture has been achieved.**<br>
- Performance of RAG depends heavily on chunking since RAG just uses LLM to stitch together information.<br>
- Chunking was previously treated as a black box.<br>
- Determine how best for users to verify documents used in answer generation.

##### Input file here (limited to PDF for now)

In [3]:
# Path of the PDF to ingest. It is parsed into chunks below and recorded in each chunk's metadata.
file_path = "documents/report.pdf"

### Preliminary chunking using LLMSherpa

##### LLMSherpa Setup

In [4]:
# Connection settings for the local services used by this notebook.
# Each value can be overridden with an environment variable so that credentials
# (notably the Neo4j password) do not have to be stored in the notebook itself;
# the defaults are the values of a local development setup.
db_config = {
    # Ollama server that hosts both the embedding model and the LLM.
    "ollama_base_url": os.environ.get("OLLAMA_BASE_URL", "http://localhost:11434"),
    "llm_name": os.environ.get("LLM_NAME", "llama3"),
    # Document-parsing endpoint handed to LayoutPDFReader.
    "nlm_url": os.environ.get(
        "NLM_URL",
        "http://localhost:5010/api/parseDocument?renderFormat=all&applyOcr=yes&useNewIndentParser=yes",
    ),
    "neo4j_url": os.environ.get("NEO4J_URL", "bolt://localhost:7687"),
    "neo4j_username": os.environ.get("NEO4J_USERNAME", "neo4j"),
    "neo4j_password": os.environ.get("NEO4J_PASSWORD", "password"),
}

In [5]:
# Parse the PDF with the layout-aware reader.
# A failure is re-raised with context instead of being printed and ignored: every later cell
# needs `layout_root`, so continuing would only surface as a confusing NameError further down.
reader = LayoutPDFReader(db_config["nlm_url"])
print(file_path)
try:
    parsed_doc = reader.read_pdf(file_path)
except Exception as e:
    raise RuntimeError(
        f"Failed to parse {file_path!r} using the parser at {db_config['nlm_url']}"
    ) from e

# Root of the layout tree (walked by the chunking cells) and the raw JSON returned by the parser.
layout_root = parsed_doc.root_node
json_doc = parsed_doc.json

documents/report.pdf


Draw bounding box

In [8]:
def draw_bounding_box_on_pdf_image(pdf_path, coordinates, page_number=0, dpi=200, right_padding=60, output_dir="output"):
    """
    Render one PDF page to an image, draw a red rectangle around a bounding box and save the image.

    Args:
        pdf_path: Path to the PDF file.
        coordinates: Four numbers (x0, y0, x1, y1) describing the box in the page's own units,
            i.e. the units of the page rectangle reported by PyMuPDF.
            NOTE(review): assumes the same origin as the rendered image (top-left) - confirm against the parser's bbox output.
        page_number: Zero-based page index.
        dpi: Resolution used when rasterising the page.
        right_padding: Extra pixels added to the right edge of the box (manual tuning; defaults to the original 60).
        output_dir: Directory the PNG is written to; created if missing.

    The file is saved as ``<pdf name>_page_<page_number>_<y0>.png`` inside ``output_dir``.
    """
    # Convert only the requested page to an image (pdf2image page numbers are 1-based).
    images = convert_from_path(pdf_path, first_page=page_number + 1, last_page=page_number + 1, dpi=dpi)

    # A single page was requested, so there is exactly one image.
    img = images[0]

    # Read the page size, then close the document so the file handle is not leaked.
    pdf_doc = fitz.open(pdf_path)
    try:
        page = pdf_doc.load_page(page_number)
        page_width, page_height = page.rect.width, page.rect.height
    finally:
        pdf_doc.close()

    # Scale each axis with its own factor so the box stays aligned even when the
    # horizontal and vertical ratios differ slightly after rasterisation.
    scale_x = img.width / page_width
    scale_y = img.height / page_height
    x0, y0, x1, y1 = coordinates
    scaled_coordinates = (
        int(x0 * scale_x),
        int(y0 * scale_y),
        int(x1 * scale_x) + right_padding,  # manually tuned right limit
        int(y1 * scale_y),
    )

    # Draw the bounding box on the image.
    draw = ImageDraw.Draw(img)
    draw.rectangle(scaled_coordinates, outline="red", width=2)

    # File name: the PDF's base name plus the page number and the box's top coordinate.
    image_name = os.path.basename(pdf_path).replace(".pdf", f"_page_{page_number}_{coordinates[1]}.png")

    # Make sure the target directory exists before saving.
    os.makedirs(output_dir, exist_ok=True)
    img.save(os.path.join(output_dir, image_name))

Visualise layout tree (workings)

In [271]:
def visualize_block_tree(root, max_level=-1, graph=None):
    """
    Recursively add a layout tree to a Graphviz graph and track the deepest level seen.

    Args:
        root: Node to start from; must expose ``tag``, ``level`` and ``children``.
        max_level: Deepest level found so far (used by the recursion).
        graph: Graph to draw into; a new ``Digraph`` is created when omitted.

    Returns:
        ``(graph, max_level)`` where ``max_level`` is the largest ``level`` among all
        descendants of ``root`` (or the value passed in when there are none).
    """
    if graph is None:
        graph = Digraph()

    # Each node is declared exactly once: when it is the root of the current call.
    # (Declaring children here as well would emit every non-root node twice.)
    root_id = str(id(root))
    graph.node(root_id, f"{root.tag} (level {root.level})")

    for child in root.children:
        max_level = max(max_level, child.level)
        graph.edge(root_id, str(id(child)))
        # The recursive call declares the child node and returns a level >= the one passed in.
        _, max_level = visualize_block_tree(child, max_level, graph)

    return graph, max_level

In [272]:
# Build the Graphviz tree for the parsed document and get the deepest block level found.
graph, max_level = visualize_block_tree(layout_root)
# Render the graph to a file named 'block_tree'; view=True also asks Graphviz to open the result in a viewer.
graph.render('block_tree', view=True)
print(max_level)

3


In [273]:
def parent_chain(node):
    """
    Collect every ancestor of ``node``, ordered from the outermost ancestor down to the direct parent.
    The node itself is not included; a node without a parent yields an empty list.
    """
    ancestors = []
    current = node.parent
    # Walk upwards (nearest parent first) until there is no further parent.
    while current:
        ancestors.append(current)
        current = current.parent
    # Flip so the result reads top-down.
    return ancestors[::-1]

def parent_text(node):
    """
    Returns the text of the parent chain of the block. This is useful for adding section information to the text.

    Header ancestors are joined with "\\n>\\n"; ancestors tagged 'list_item' or 'para' are joined
    with a blank line and placed after the headers. When both kinds are present they are separated
    by "\\n>\\n" so the last header and the first paragraph do not run together.
    Ancestors with any other tag are ignored. Returns "" when there is nothing to report.
    """
    header_texts = []
    para_texts = []
    for ancestor in parent_chain(node):
        if ancestor.tag == "header":
            header_texts.append(ancestor.to_text())
        elif ancestor.tag in ('list_item', 'para'):
            para_texts.append(ancestor.to_text())

    text = "\n>\n".join(header_texts)
    if para_texts:
        if text:
            # Same delimiter as between headers, so the context reads as one breadcrumb trail.
            text += "\n>\n"
        text += "\n\n".join(para_texts)
    return text
   
def to_context_text(node, include_section_info=True):
    """
    This is a customised function largely derived from layout_reader.py of the llmsherpa library
    Returns the text of the block with section information. This provides context to the text.

    The result has the form "Metadata:\\n<section info>  >\\nContent:\\n<block text>"; the section
    line is omitted when ``include_section_info`` is False or the block has no parent text.
    Tables are emitted as HTML, every other block as plain text including its children.
    """
    text = "Metadata:\n"
    # Compute the parent text once; it is skipped entirely when section info is not wanted.
    section_info = parent_text(node) if include_section_info else ""
    if section_info != "":
        text += section_info + "  >\n"
    text += "Content:\n"
    if node.tag == 'table':
        text += node.to_html()
    else:
        text += node.to_text(include_children=True, recurse=True)
    return text

In [276]:
def is_use_semantic_chunking(leaf_nodes):
    """
    Decide whether to fall back to semantic chunking.

    Returns True when more than half of the 'para' nodes in ``leaf_nodes`` consist of a single
    line (after stripping surrounding whitespace). Both counts are printed for inspection.
    """
    num_paras = sum(1 for node in leaf_nodes if node.tag == "para")

    # A stripped paragraph is single-line exactly when it contains no newline.
    stripped_paras = (node.to_text().strip() for node in leaf_nodes if node.tag == "para")
    single_line_total = sum(1 for para in stripped_paras if "\n" not in para)

    print("Number of single line para:", single_line_total)
    print("Number of paragraphs:", num_paras)
    return single_line_total > num_paras/2

In [277]:


def find_leaf_nodes(node):
    """
    Return every leaf (node without children) in the subtree rooted at ``node``,
    in depth-first, left-to-right order. A node with no children is its own only leaf.
    """
    if len(node.children) == 0:
        return [node]

    leaf_nodes = []
    for child in node.children:
        # Collect the leaves found below each child; the recursive result must not be discarded.
        leaf_nodes.extend(find_leaf_nodes(child))
    return leaf_nodes
    

# The leaves of the layout tree are the candidate chunks.
leaf_nodes = find_leaf_nodes(layout_root)

if not is_use_semantic_chunking(leaf_nodes):
    print("Using llmsherpa")
    # One chunk per leaf node: its content prefixed with the section context from to_context_text().
    collated_pg_content = [to_context_text(node) for node in leaf_nodes]

    # Wrap each chunk as a LangChain document, carrying over layout metadata plus the source file path.
    docs = [
        LangchainDocument(
            page_content=chunk_text,
            metadata={key: leaf.block_json[key] for key in ('bbox', 'page_idx', 'level')} | {"file_path": file_path},
        )
        for chunk_text, leaf in zip(collated_pg_content, leaf_nodes)
    ]
else:
    # Mostly single-line paragraphs: re-chunk the whole document text semantically instead.
    # NOTE(review): documents produced by this branch carry no 'bbox' / 'page_idx' / 'file_path'
    # metadata, which draw_bbox_for_final_docs reads later in the notebook.
    embeddings = OllamaEmbeddings(
        base_url=db_config["ollama_base_url"],
        model=db_config["llm_name"],
    )
    text_splitter = SemanticChunker(embeddings, breakpoint_threshold_type="percentile")

    # Concatenate the text of every top-level block, each followed by a newline.
    full_text = "".join(
        child.to_text(include_children=True, recurse=True) + "\n"
        for child in layout_root.children
    )

    docs = text_splitter.create_documents([full_text])

Number of single line para: 26
Number of paragraphs: 104
Using llmsherpa


Testing with LLM

In [245]:
# Embedding model served by Ollama; it is handed to the Neo4j vector store in the cells below.
embeddings = OllamaEmbeddings(
    base_url=db_config["ollama_base_url"],
    model=db_config["llm_name"]
)

In [246]:
# Generation model. Model name and endpoint are read from db_config (same values as before)
# so the LLM and the embeddings cannot drift apart when the configuration changes.
llm = Ollama(model=db_config["llm_name"], temperature=0, base_url=db_config["ollama_base_url"])

In [247]:
# Embed the chunked documents and store them in Neo4j with hybrid search enabled.
hybrid_db = Neo4jVector.from_documents(
    docs,
    embedding=embeddings,
    url=db_config["neo4j_url"],
    username=db_config["neo4j_username"],
    password=db_config["neo4j_password"],
    search_type="hybrid",
    # TODO: Remove later
    # NOTE(review): presumably clears previously stored chunks so every run starts from an empty collection - confirm.
    pre_delete_collection=True,
)

INFO:neo4j.notifications:Received notification from DBMS server: {severity: INFORMATION} {code: Neo.ClientNotification.Schema.IndexOrConstraintAlreadyExists} {category: SCHEMA} {title: `CREATE CONSTRAINT IF NOT EXISTS FOR (e:Chunk) REQUIRE (e.id) IS UNIQUE` has no effect.} {description: `CONSTRAINT constraint_1dc138a FOR (e:Chunk) REQUIRE (e.id) IS UNIQUE` already exists.} for query: 'CREATE CONSTRAINT IF NOT EXISTS FOR (n:`Chunk`) REQUIRE n.id IS UNIQUE;'


In [248]:
# Names of the vector and keyword indexes to attach to.
index_name = "vector"  # default index name
keyword_index_name = "keyword"  # default keyword index name

# Open the existing indexes as a hybrid store; this `store` (not `hybrid_db`) is what the retriever below is built from.
store = Neo4jVector.from_existing_index(
    embeddings,
    url=db_config["neo4j_url"],
    username=db_config["neo4j_username"],
    password=db_config["neo4j_password"],
    index_name=index_name,
    keyword_index_name=keyword_index_name,
    search_type="hybrid",
)

In [249]:
# First-stage retriever: fetch a generous candidate set (k=25) that the reranker narrows down afterwards.
# NOTE(review): no search_type is passed, so the default is used; `fetch_k` and `score_threshold` may have
# no effect in that mode - confirm against the as_retriever documentation.
retriever = store.as_retriever(search_kwargs={'k': 25, 'fetch_k': 50, 'score_threshold': 0.6})

Rerank (shown to return better results)

In [250]:
# Question used to exercise retrieval, reranking and answer generation in the cells below.
query = "What are ALL the sales figure changes in the three months ended mar31, 2023"

In [251]:
# FlashRank reranker using the ms-marco-MiniLM-L-12-v2 model, with "cache" as its cache directory.
ranker = Ranker(model_name="ms-marco-MiniLM-L-12-v2", cache_dir="cache")

In [252]:
# Retrieve candidate chunks for the query.
# NOTE: this rebinds `docs`, which until now held the chunked documents that were indexed;
# re-running the indexing cell after this one would index the retrieved subset instead.
docs = retriever.invoke(query)

In [253]:
# Sanity check: number of candidates returned by the retriever before reranking.
print("Number of preliminary docs retrieved:", len(docs))

Number of preliminary docs retrieved: 25


In [254]:
def docs_to_passages(docs):
    """
    Convert documents into the passage dictionaries passed to the reranker.

    Each passage gets a running ``id`` (0-based position in ``docs``), the document's
    ``page_content`` as ``text`` and its ``metadata`` as ``meta``.
    """
    return [
        {"id": position, "text": doc.page_content, "meta": doc.metadata}
        for position, doc in enumerate(docs)
    ]

In [255]:
# Second stage: score every retrieved candidate against the query with the reranker.
rerankrequest = RerankRequest(query=query, passages=docs_to_passages(docs))
ranked_passages = ranker.rerank(rerankrequest)
print("Number of reranked docs:", len(ranked_passages))

Number of reranked docs: 25


In [256]:
# Keep only passages the reranker scored at 0.8 or above.
filtered_ranked_passages = [doc for doc in ranked_passages if doc['score'] >= 0.8]
# If the query isn't specific enough, the scores will be very low. When 3 or fewer passages pass the
# threshold, fall back to the first 10 entries of ranked_passages instead.
filtered_ranked_passages = filtered_ranked_passages if len(filtered_ranked_passages) > 3 else ranked_passages[:10]

In [257]:
# Number of passages that will be passed on as context for the answer.
print(len(filtered_ranked_passages))

10


In [258]:
def passages_to_langchainDocument(passages):
    """
    Turn reranker passages back into LangChain documents.

    Each passage's ``text`` becomes the page content and its ``meta`` becomes the metadata;
    any other keys of the passage are not carried over.
    """
    return [
        LangchainDocument(page_content=entry['text'], metadata=entry['meta'])
        for entry in passages
    ]

In [259]:
# Documents used as context for the answer; also the ones whose bounding boxes are drawn afterwards.
final_docs = passages_to_langchainDocument(filtered_ranked_passages)

In [260]:
def pretty_print_docs(docs):
    """
    Print each document's content and metadata under a numbered "Document N:" heading,
    separating consecutive documents with a line of 100 dashes.
    """
    separator = f"\n{'-' * 100}\n"
    rendered = []
    for number, doc in enumerate(docs, start=1):
        rendered.append(f"Document {number}:\n\n{doc.page_content}\nMetadata: {doc.metadata}")
    print(separator.join(rendered))

In [261]:
# Inspect the chunks that will be given to the LLM.
pretty_print_docs(final_docs)

Document 1:

Metadata:
AWS
>
Three      Months      Ended       March      31,       2023      2024  >
Content:
$
 
   127,358   
 
$      143,313 122,584      128,006 4,774      15,307 (655)      (2,324)  
 
  (948)      (2,467) 1
 
   (85) $
 
   3,172   
 
$      10,431
Metadata: {'level': 2, 'page_idx': 11, 'bbox': [441.6, 376.19, 536.88, 478.94], 'file_path': 'documents/report.pdf'}
----------------------------------------------------------------------------------------------------
Document 2:

Metadata:
AWS
>
Three      Months      Ended       March      31,       2023      2024  >
Content:
$
 
   29,123   
 
$      31,935 30,370      31,032 $
 
   (1,247)   
 
$      903
Metadata: {'level': 2, 'page_idx': 11, 'bbox': [441.6, 230.71, 535.2, 273.44], 'file_path': 'documents/report.pdf'}
----------------------------------------------------------------------------------------------------
Document 3:

Metadata:
AWS
>
Three      Months      Ended       March      31,       2023      2

In [262]:
# Prompt for the answering step, written with Llama 3 chat markers.
# The template opens with exactly three quotes: a fourth quote would become a stray '"'
# character at the very start of every prompt.
llm_prompt = PromptTemplate(
    template="""
        <|begin_of_text|>
        <|start_header_id|>system<|end_header_id|>
        You are a highly knowledgeable and structured Retrieval QA model. You are given a query and a set of documents.
        Your task is to provide a detailed and well-structured answer based on the documents provided.
        The documents have all been pre-processed and are determined by your overlords to be relevant to the query -- do not second-guess them.
        Please ensure that your answer is clear, concise, and divided into the following sections:

        1. **Introduction**: Briefly summarize the query and the context.
        2. **Key Information from Documents**: Highlight the most relevant information from the documents that directly addresses the query.
        3. **Detailed Answer**: Provide a thorough and detailed answer to the query, integrating information from the documents.
        4. **Conclusion**: Summarize the key points and provide any additional insights or recommendations if relevant.

        Remember to keep your answers concise and structured.
        <|eot_id|><|start_header_id|>user<|end_header_id|>
        Query: {query}
        Documents: {documents}

        <|eot_id|><|start_header_id|>assistant<|end_header_id|>
        """,
    # these refer to the items in {} in the template above
    input_variables=["query", "documents"],
)

# prompt -> LLM -> plain string
pipeline = llm_prompt | llm | StrOutputParser()

# limit (in characters) due to limited context length of llm
limit = 7000

# Join the text of the reranked documents into one context string.
# A dedicated name is used so the `docs` list of documents from earlier cells is not overwritten.
context_chunks = [doc.page_content for doc in final_docs]
context = "\n\n---\n\n".join(context_chunks)
# Note: the full context is printed here, before truncation.
print(context)

# remove all text past limit
context = context[:limit]


Metadata:
AWS
>
Three      Months      Ended       March      31,       2023      2024  >
Content:
$
 
   127,358   
 
$      143,313 122,584      128,006 4,774      15,307 (655)      (2,324)  
 
  (948)      (2,467) 1
 
   (85) $
 
   3,172   
 
$      10,431

---

Metadata:
AWS
>
Three      Months      Ended       March      31,       2023      2024  >
Content:
$
 
   29,123   
 
$      31,935 30,370      31,032 $
 
   (1,247)   
 
$      903

---

Metadata:
AWS
>
Three      Months      Ended       March      31,       2023      2024  >
Content:
60   
 
%      60   
 
% 23      22 17      18

---

Metadata:
AMAZON.COM      ANNOUNCES      FIRST      QUARTER      RESULTS
>
Diluted       March      31,
>
Three      Months      Ended       March      31,       2023      2024  >
Content:
Net      sales Operating      expenses Operating      income

---

Metadata:
AMAZON.COM      ANNOUNCES      FIRST      QUARTER      RESULTS
>
Diluted       March      31,
>
Three      Months      Ended   

In [263]:
def draw_bbox_for_final_docs(docs):
    """
    For every document, print its metadata and save a page image with the document's
    bounding box drawn on it. Reads 'file_path', 'bbox' and 'page_idx' from the metadata.
    """
    for doc in docs:
        meta = doc.metadata
        print(meta)
        draw_bounding_box_on_pdf_image(meta["file_path"], meta["bbox"], meta["page_idx"])

In [264]:
# Generate the answer from the truncated context and render it as Markdown.
output = pipeline.invoke({"query": query, "documents": context})
display(Markdown(output))
print("---" * 5)
# Show the source chunks and save a page image with each chunk's bounding box,
# so the answer can be checked against the original PDF.
print(final_docs)
draw_bbox_for_final_docs(final_docs)

**Introduction**

The query asks for the sales figure changes in the three months ended March 31, 2023. The provided documents are metadata and content related to Amazon's first quarter results.

**Key Information from Documents**

From the documents, we can extract the following key information:

* Net sales increased 13% to $143.3 billion in the first quarter of 2024 compared to $127.4 billion in the same period of 2023.
* Excluding the $0.2 billion unfavorable impact from year-over-year changes in foreign exchange rates, net sales increased 13% compared with the first quarter of 2023.

**Detailed Answer**

Based on the provided documents, we can conclude that the sales figure changes in the three months ended March 31, 2023 are as follows:

* Net sales increased by $15.9 billion (13%) from $127.4 billion to $143.3 billion.
* This increase is attributed to a combination of factors, including year-over-year changes in foreign exchange rates and other market conditions.

**Conclusion**

In conclusion, the sales figure changes in the three months ended March 31, 2023 show an increase of 13% from the same period in 2023. This growth can be attributed to various factors, including changes in foreign exchange rates and market conditions.

---------------
[Document(page_content='Metadata:\nAWS\n>\nThree      Months      Ended       March      31,       2023      2024  >\nContent:\n$\n \n   127,358   \n \n$      143,313 122,584      128,006 4,774      15,307 (655)      (2,324)  \n \n  (948)      (2,467) 1\n \n   (85) $\n \n   3,172   \n \n$      10,431', metadata={'level': 2, 'page_idx': 11, 'bbox': [441.6, 376.19, 536.88, 478.94], 'file_path': 'documents/report.pdf'}), Document(page_content='Metadata:\nAWS\n>\nThree      Months      Ended       March      31,       2023      2024  >\nContent:\n$\n \n   29,123   \n \n$      31,935 30,370      31,032 $\n \n   (1,247)   \n \n$      903', metadata={'level': 2, 'page_idx': 11, 'bbox': [441.6, 230.71, 535.2, 273.44], 'file_path': 'documents/report.pdf'}), Document(page_content='Metadata:\nAWS\n>\nThree      Months      Ended       March      31,       2023      2024  >\nContent:\n60   \n \n%      60   \n \n% 23      22 17      18', metadata={'level': 2, 'page_idx': 11, 'bbox':

### Archive

This is an experiment to see whether asking the LLM to answer the question from each retrieved document individually, and then summarising those answers, gives better results.

In [None]:
# Broader question used for the archived experiment (this rebinds `query` from the section above).
query = "Give me the key highlights of financial results for the first quarter ended March 31, 2024"

In [None]:
# Three roles for the experiment, all backed by the same llama3 model:
# the manager splits the question into sub-queries (JSON output, temperature 0),
# the answerer responds per document, and the summarizer merges the answers (both temperature 0.3).
llm1_manager = Ollama(model="llama3", temperature=0, format='json', base_url="http://localhost:11434")
llm2_answerer = Ollama(model="llama3", temperature=0.3, base_url="http://localhost:11434")
# retrieve documents here
llm3_summarizer = Ollama(model="llama3", temperature=0.3, base_url="http://localhost:11434")

In [None]:
# llm1 will disect the initial prompt into more specific subprompts
llm1_manager_task = PromptTemplate(
    template="""<|begin_of_text|>
    <|start_header_id|>system<|end_header_id|>
    Your task is to dissect 
    the following question into 5 more specific queries 
    for prompting the vector database. In each of your sub-prompts, 
    be specific in your use of keyword.

    <|eot_id|><|start_header_id|>user<|end_header_id|>
    Question: {query}

    Answer format:
    'query1': 'prompt', 'query2': 'prompt',
    <|eot_id|><|start_header_id|>assistant<|end_header_id|>
    """,

    # these refer to the items in {} in the template above
    input_variables=["query"],
)

pipeline = llm1_manager_task | llm1_manager | JsonOutputParser()
llm1_manager_resp = pipeline.invoke({"query":query})
print(llm1_manager_resp)


{'query1': 'What were the total revenues reported by the company for the first quarter ended March 31, 2024?', 'query2': 'What was the net income of the company for the first quarter ended March 31, 2024?', 'query3': "What was the gross profit margin percentage for the company's products and services for the first quarter ended March 31, 2024?", 'query4': 'What were the operating expenses reported by the company for the first quarter ended March 31, 2024?', 'query5': 'What was the cash flow from operations for the company for the first quarter ended March 31, 2024?'}


In [None]:
# The manager's answer maps 'queryN' keys to sub-prompts; only the sub-prompts are needed, in order.
doc_prompts = list(llm1_manager_resp.values())
print(doc_prompts)

['What were the total revenues reported by the company for the first quarter ended March 31, 2024?', 'What was the net income of the company for the first quarter ended March 31, 2024?', "What was the gross profit margin percentage for the company's products and services for the first quarter ended March 31, 2024?", 'What were the operating expenses reported by the company for the first quarter ended March 31, 2024?', 'What was the cash flow from operations for the company for the first quarter ended March 31, 2024?']


In [None]:
# Retrieve documents for every sub-prompt; results are kept as one list per sub-prompt.
# NOTE(review): `compression_retriever` is not defined in any cell visible in this notebook -
# it must be created before this cell can run on a fresh kernel.
set_of_docs = []
for doc_prompt in doc_prompts:
    compressed_docs = compression_retriever.invoke(doc_prompt)
    set_of_docs.append(compressed_docs)

In [None]:
# Dump the raw retrieval results (one inner list per sub-prompt) for inspection.
print((set_of_docs))

[[Document(page_content='Metadata:\nY/Y      %  >\nContent:\n(1)   \n \n_      For      the      twelve      months      ended      March      31,      2023      and      2024,      this      amount      relates      to      equipment      included      in      “Property      and      equipment      acquired      under      finance      leases,      net      of  \n \n  remeasurements      and      modifications”      of      $517      million      and      $676      million.', metadata={'level': 2, 'page_idx': 13, 'bbox': [63.36, 525.76, 534.24, 545.92], 'relevance_score': 0.9993714}), Document(page_content='Metadata:\nY/Y      %  >\nContent:\n(2)      For      the      twelve      months      ended      March      31,      2023      and      2024,      this      amount      relates      to      property      included      in      “Principal      repayments      of      finance      leases”      of      $6,544      million      and      $3,774  \n \n  million.', metadata={'level': 2, '

In [None]:
# Flatten array, remove duplicates
flat_docs = [doc for docs in set_of_docs for doc in docs]

In [None]:
# Number of documents that will each be sent to the answerer model.
len(flat_docs)

25

In [None]:
# The prompt and the chain are the same for every document, so they are built once, outside the loop.
llm2_answerer_task = PromptTemplate(
        template="""
        <|begin_of_text|>
        <|start_header_id|>system<|end_header_id|>
        Your task is to use the document to answer the original query.
        ASSUME THAT THE DOCUMENTS ARE ALL RELEVANT TO THE QUERY.
        The document WILL NOT answer the full query, but only highlight the key points of how it can contribute to an answer.
        Keep to at most 5 sentences.
        EXCLUDE ANY PREAMBLE.
        <|eot_id|><|start_header_id|>user<|end_header_id|>
        Document: {doc}
        Original Query: {query}
        <|eot_id|><|start_header_id|>assistant<|end_header_id|>
        """,
        input_variables=["doc", "query"],
    )
pipeline = llm2_answerer_task | llm2_answerer | StrOutputParser()

responses = []
# For each document, ask the answerer how that document contributes to the original query
# and collect the free-text answers.
for doc in flat_docs:
    llm2_answerer_resp = pipeline.invoke({"doc": doc.page_content, "query": query})
    print(llm2_answerer_resp)
    responses.append(llm2_answerer_resp)

The document highlights that for the twelve months ended March 31, 2023 and 2024, equipment acquired under finance leases, net of remeasurements and modifications, was $676 million. This information provides a snapshot of the company's financial performance over a longer period, including the first quarter ended March 31, 2024.
The document provides information on principal repayments of finance leases for the 12 months ended March 31, 2023 and 2024. For the period ended March 31, 2024, the amount relates to property included in "Principal repayments of finance leases" of $3,774 million. This suggests that there were significant principal repayments made on finance leases during the first quarter of 2024.
This document does not provide specific financial results for the first quarter ended March 31, 2024. However, it can contribute to an answer by highlighting Amazon's focus on customer obsession, passion for invention, operational excellence, and long-term thinking, which may be refle

In [None]:
# All per-document answers collected by the loop above.
print(responses)

['Unfortunately, there are no specific financial results mentioned in the provided document. However, it does provide information on equipment acquired under finance leases for the 12 months ended March 31, 2023 and 2024, which were $517 million and $676 million, respectively. There is no mention of first-quarter financial results specifically.', "Unfortunately, there is no information provided about the financial results for the first quarter ended March 31, 2024. The document only mentions a comparison between two years (2023 and 2024) regarding principal repayments of finance leases. If you're looking for specific financial highlights, I'd be happy to help you with that once more information is provided.", "I apologize, but there are no financial results or key highlights mentioned in the provided document. The text appears to be an introduction to Amazon and its values, rather than a report on its financial performance. If you're looking for information on Amazon's financial result

In [None]:
# Final step of the experiment: merge the per-document answers into a single response to the query.
llm3_summarizer_task = PromptTemplate(
    template="""
    <|begin_of_text|>
    <|start_header_id|>system<|end_header_id|>
    YOU ARE A SUMMARIZER. YOUR OPINION ON THE ANSWERS OR QUERY IS NOT NEEDED.
    SUMMARISE THE ANSWERS TO THE QUERY IN AS MANY SENTENCES AS YOU NEED.
    <|eot_id|><|start_header_id|>user<|end_header_id|>
    Answers: {answers}
    Query: {query}
    <|eot_id|><|start_header_id|>assistant<|end_header_id|>
    """,
    input_variables=["answers", "query"],
)

# prompt -> summarizer LLM -> plain string (rebinds `pipeline` once more)
pipeline = llm3_summarizer_task | llm3_summarizer | StrOutputParser()
# array to text: the individual answers are joined with "; " into one string
responses_str = "; ".join(responses)
# print(responses_str)
llm3_summarizer_resp = pipeline.invoke({"answers": responses_str, "query": query})
print(llm3_summarizer_resp)

Here are the key highlights of financial results for the first quarter ended March 31, 2024:

* Net sales expected to be between $144.0 billion and $146.5 billion.
* Operating income is expected to be between $3.2 billion and $3.7 billion.
* Net income is expected to be between $2.1 billion and $2.6 billion.
* Diluted earnings per share (EPS) are expected to be between $4.35 and $5.15.

Note: These financial results are based on Amazon's guidance provided in their press release, but do not include actual figures as the query is asking for key highlights of financial results for the first quarter ended March 31, 2024, which are not available in the provided document.


... this seems kinda useless