In [1]:
%pip install -U --quiet llmsherpa graphviz aspose-words langchain_experimental python-docx fpdf pdfkit pdf2image

Note: you may need to restart the kernel to use updated packages.


In [2]:
from llmsherpa.readers import LayoutPDFReader
from llmsherpa.readers import LayoutReader
from llmsherpa.readers.layout_reader import ListItem, Paragraph, Table
from pprint import pprint
from IPython.display import display, HTML, IFrame
from graphviz import Digraph
import subprocess
import os
import aspose.words as aw
from langchain.docstore.document import Document as LangchainDocument
from langchain_community.document_loaders.llmsherpa import LLMSherpaFileLoader
from langchain_community.vectorstores import Neo4jVector
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from docx import Document as docxDocument
from fpdf import FPDF
from pdfkit import from_file
from pdf2image import convert_from_path
from PIL import ImageDraw, Image
import fitz
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import FlashrankRerank
from langchain_community.llms import Ollama

Set the input file here (only PDF input is supported for now).

In [3]:
# Path of the PDF that the following cells parse, chunk, index and annotate.
file_path = "documents/word.pdf"

### Preliminary chunking using LLMSherpa

##### LLMSherpa Setup

In [4]:
# Connection settings for the services used by this notebook.
# Service URLs and Neo4j credentials can be overridden through environment
# variables so that real secrets never have to be written into the notebook;
# the fallbacks are the local-development defaults.
db_config = {
    "ollama_base_url": os.environ.get("OLLAMA_BASE_URL", "http://localhost:11434"),
    "llm_name": "llama3",
    "nlm_url": os.environ.get(
        "NLM_INGESTOR_URL",
        "http://localhost:5010/api/parseDocument?renderFormat=all&applyOcr=yes&useNewIndentParser=yes",
    ),
    "neo4j_url": os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
    "neo4j_username": os.environ.get("NEO4J_USERNAME", "neo4j"),
    "neo4j_password": os.environ.get("NEO4J_PASSWORD", "password"),
    # NOTE(review): not read by any cell visible in this notebook section — confirm it is still needed.
    "bbox_correction_multiple": 2.78,
}

In [5]:
# Parse the PDF through the LLMSherpa ingestor endpoint configured in db_config.
reader = LayoutPDFReader(db_config["nlm_url"])
print(file_path)
try:
    parsed_doc = reader.read_pdf(file_path)
except Exception as e:
    # Every later cell needs parsed_doc, so report the failure and stop here
    # instead of continuing and failing later with a confusing NameError.
    print(f"Failed to parse {file_path!r} via {db_config['nlm_url']}: {e}")
    raise

# Root of the layout tree and the raw per-block JSON, both used by later cells.
layout_root = parsed_doc.root_node
json_doc = parsed_doc.json

documents/word.pdf


In [6]:
# Inspect the raw JSON of one parsed block (index 4) to see which fields are available.
pprint(json_doc[4])

{'bbox': [58.56, 312.09, 394.8, 740.1399999999999],
 'block_class': 'cls_0',
 'block_idx': 4,
 'level': 1,
 'page_idx': 0,
 'sentences': [' ',
               ' ',
               '  In      January      1941--President      Franklin      '
               'Roosevelt      came      to      this      chamber      to  ',
               ' ',
               '  speak      to      the      Nation.',
               'He      said-----I      address      you      at   ',
               ' ',
               'a      moment  ',
               ' ',
               '  unprecedented [[Page      $2273]]  ',
               ' ',
               '  in      the      history      of      the      Union".',
               'Hitler      was      on      the      march.',
               'War      was      raging  ',
               ' ',
               '  in      Europe.',
               "President      Roosevelt\\\\\\'s      purpose      was      "
               'to      wake      up      the      Congress  ',
     

Draw bounding box

In [7]:
def draw_bounding_box_on_pdf_image(pdf_path, coordinates, page_number=0, dpi=200,
                                   right_padding=60, output_path="output/bbox.png"):
    """
    Render one PDF page to an image, outline a bounding box on it in red, and save the image.

    Parameters:
        pdf_path: path of the PDF file.
        coordinates: four numbers (x0, y0, x1, y1) describing the box.
            Assumed to be in the same units as the PyMuPDF page rectangle (PDF points) — TODO confirm
            against the parser that produced them.
        page_number: zero-based index of the page to render.
        dpi: resolution passed to pdf2image when rasterising the page.
        right_padding: extra pixels added to the right edge of the box (manual tuning).
        output_path: where the annotated image is written; missing parent directories are created.

    Prints the horizontal and vertical scale factors. Returns None.
    """
    # Rasterise only the requested page; pdf2image page numbers are 1-based.
    images = convert_from_path(pdf_path, dpi=dpi, first_page=page_number + 1, last_page=page_number + 1)

    # A single page was requested, so there is exactly one image.
    img = images[0]

    # Read the page size, making sure the document handle is released afterwards.
    pdf_doc = fitz.open(pdf_path)
    try:
        page_rect = pdf_doc.load_page(page_number).rect
        page_width, page_height = page_rect.width, page_rect.height
    finally:
        pdf_doc.close()

    # Page units -> image pixels, computed independently for each axis.
    scale_x = img.width / page_width
    print(scale_x)
    scale_y = img.height / page_height
    print(scale_y)

    # Horizontal values use scale_x, vertical values use scale_y.
    x0, y0, x1, y1 = coordinates
    scaled_coordinates = (
        int(x0 * scale_x),
        int(y0 * scale_y),
        int(x1 * scale_x) + right_padding,  # manually tuned right limit
        int(y1 * scale_y),
    )

    # Draw the bounding box on the image
    draw = ImageDraw.Draw(img)
    draw.rectangle(scaled_coordinates, outline="red", width=2)

    # Save the image with the bounding box, creating the target directory if needed.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    img.save(output_path)

In [8]:
# Outline parsed block 4 on its page; the annotated page image is saved to output/bbox.png.
draw_bounding_box_on_pdf_image(file_path, json_doc[4]["bbox"], json_doc[4]["page_idx"])

2.7777777777777777
2.7777777777777777


Visualise layout tree

In [9]:
def visualize_block_tree(root, max_level=-1, graph=None):
    """
    Build a graph of the layout tree rooted at ``root``.

    Each block becomes one node labelled "<tag> (level <level>)" and each
    parent/child relation becomes one edge. Every node is declared exactly once:
    by the call that visits it.

    Parameters:
        root: block to start from; must expose ``tag``, ``level`` and ``children``.
        max_level: running maximum of the levels seen so far (``root``'s own level is not counted).
        graph: graph object to add to; a new ``Digraph`` is created when omitted.

    Returns:
        (graph, max_level) where max_level is the largest ``level`` among all
        descendants of ``root``, or the value passed in if that is larger.
    """
    if graph is None:
        graph = Digraph()

    node_id = str(id(root))
    graph.node(node_id, f"{root.tag} (level {root.level})")

    for child in root.children:
        max_level = max(max_level, child.level)
        # The recursive call declares the child's node and its whole subtree;
        # it returns a value that is never smaller than the max_level passed in.
        _, max_level = visualize_block_tree(child, max_level, graph)
        graph.edge(node_id, str(id(child)))

    return graph, max_level

In [10]:
# Build the layout-tree graph, render it to a file named after 'block_tree'
# (view=True additionally asks graphviz to open the result in a viewer),
# and print the deepest block level found in the tree.
graph, max_level = visualize_block_tree(layout_root)
graph.render('block_tree', view=True)
print(max_level)

1


In [11]:
def parent_chain(node):
    """
    Collects every ancestor of the block, ordered from the outermost ancestor down to the direct parent.
    The block itself is not included; a block without a parent yields an empty list.
    """
    ancestors = []
    current = node.parent
    # Walk upwards, gathering ancestors nearest-first.
    while current:
        ancestors.append(current)
        current = current.parent
    # Flip so the outermost ancestor comes first.
    return ancestors[::-1]

def parent_text(node):
    """
    Returns the text of the parent chain of the block. This is useful for adding section information to the text.

    Header ancestors are joined with "\\n\\n >> next header >> \\n\\n"; 'list_item' and 'para'
    ancestors are joined with a blank line. When both kinds are present, the header part and
    the paragraph part are separated by a blank line so they do not run into each other.
    Ancestors with any other tag are ignored. Returns "" when there is nothing to report.
    """
    header_texts = []
    para_texts = []
    for ancestor in parent_chain(node):
        if ancestor.tag == "header":
            header_texts.append(ancestor.to_text())
        elif ancestor.tag in ('list_item', 'para'):
            para_texts.append(ancestor.to_text())

    sections = []
    if header_texts:
        sections.append("\n\n >> next header >> \n\n".join(header_texts))
    if para_texts:
        sections.append("\n\n".join(para_texts))
    # Blank line between the header trail and the paragraph trail.
    return "\n\n".join(sections)
   
def to_context_text(node, include_section_info=True):
    """
    This is a customised function largely derived from layout_reader.py of the llmsherpa library
    Returns the text of the block with section information. This provides context to the text.

    The result starts with "**CONTENT HEADER**", followed by the parent-chain text (when
    include_section_info is true and that text is non-empty), then "**CONTENT**" and the block's
    own text. For 'list_item', 'para' and 'table' blocks the children's text is included too.
    """
    # Compute the section text once; skip the ancestor walk entirely when it is not wanted.
    section_text = parent_text(node) if include_section_info else ""

    text = "**CONTENT HEADER**"
    if section_text != "":
        text += section_text + "\n\n >> next header >> \n\n"
    else:
        text += "\n\n"
    text += "**CONTENT**\n"
    if node.tag in ['list_item', 'para', 'table']:
        text += node.to_text(include_children=True, recurse=True)
    else:
        text += node.to_text()
    return text

In [12]:
def is_use_semantic_chunking(leaf_nodes):
    """
    Returns True if more than 50% of the paragraph leaves consist of a single line.

    Only nodes tagged "para" are considered, both when counting single-line paragraphs and
    when computing the 50% threshold. Returns False when there are no paragraphs at all.
    Prints each single-line paragraph and the final count as a diagnostic.
    """
    para_count = 0
    single_line_count = 0

    for node in leaf_nodes:
        if node.tag != "para":
            continue
        para_count += 1

        txt = node.to_text().strip()
        if len(txt.split("\n")) == 1:
            single_line_count += 1
            print("Text is:", txt)

    print("Count is:", single_line_count)
    # Compare against the number of paragraphs, not the number of all leaf nodes.
    return single_line_count > para_count / 2

In [13]:
# Module-level accumulator that find_leaf_nodes() appends to.
leaf_nodes = []

def find_leaf_nodes(node):
    """
    Appends every childless block at or below ``node`` to the module-level ``leaf_nodes`` list,
    in depth-first, left-to-right order.
    """
    pending = [node]
    while pending:
        current = pending.pop()
        if len(current.children) == 0:
            leaf_nodes.append(current)
        else:
            # Push children reversed so the leftmost child is visited first.
            pending.extend(reversed(current.children))

find_leaf_nodes(layout_root)

print(f"Found {len(leaf_nodes)} leaf nodes")

if is_use_semantic_chunking(leaf_nodes):
    # Perform semantic chunking

    ## Load embeddings
    embeddings = OllamaEmbeddings(
        base_url=db_config["ollama_base_url"],
        model=db_config["llm_name"]
    )
    ## Chunk documents using semantic chunker
    text_splitter = SemanticChunker(
        embeddings, breakpoint_threshold_type="percentile"
    )

    # Concatenate the text of every top-level block, one block per line.
    full_text = "".join(
        child.to_text(include_children=True, recurse=True) + "\n"
        for child in layout_root.children
    )

    # Chunk the complete document text, not a truncated prefix of it.
    docs = text_splitter.create_documents([full_text])
    if docs:
        print(docs[0])
else:
    # Use chunks from llmsherpa
    # Each chunk is each leaf_node with to_context_text()
    collated_pg_content = [to_context_text(node) for node in leaf_nodes]

    # Convert to Langchain documents
    # langchainDocument metadata limited to bbox, page_idx, and level for now
    docs = [
        LangchainDocument(
            page_content=content,
            metadata={key: node.block_json[key] for key in ('bbox', 'page_idx', 'level')},
        )
        for node, content in zip(leaf_nodes, collated_pg_content)
    ]

Found 19 leaf nodes
Count is: 0


Testing with LLM

In [14]:
# Embedding client pointing at the Ollama server and model named in db_config;
# the cells below pass it to the Neo4j vector store.
embeddings = OllamaEmbeddings(
    base_url=db_config["ollama_base_url"],
    model=db_config["llm_name"]
)

In [15]:
# Show the embedding client's configuration as a sanity check.
print(embeddings)

base_url='http://localhost:11434' model='llama3' embed_instruction='passage: ' query_instruction='query: ' mirostat=None mirostat_eta=None mirostat_tau=None num_ctx=None num_gpu=None num_thread=None repeat_last_n=None repeat_penalty=None temperature=None stop=None tfs_z=None top_k=None top_p=None show_progress=False headers=None model_kwargs=None


In [16]:
# Embed the chunks in `docs` and store them in the Neo4j instance from db_config,
# with hybrid search requested.
# NOTE(review): re-running this cell presumably inserts the same chunks again — confirm
# and clear the database between runs if duplicates matter.
hybrid_db = Neo4jVector.from_documents(
    docs,
    embedding=embeddings,
    url=db_config["neo4j_url"],
    username=db_config["neo4j_username"],
    password=db_config["neo4j_password"],
    search_type="hybrid",
)



In [17]:
# Open a vector store over indexes that already exist in Neo4j, addressed by name.
# NOTE(review): presumably these are the indexes created by the from_documents call
# in the previous cell — confirm the names match.
index_name = "vector"  # default index name
keyword_index_name = "keyword"  # default keyword index name

store = Neo4jVector.from_existing_index(
    embeddings,
    url=db_config["neo4j_url"],
    username=db_config["neo4j_username"],
    password=db_config["neo4j_password"],
    index_name=index_name,
    keyword_index_name=keyword_index_name,
    search_type="hybrid",
)

In [18]:
# Wrap the store as a retriever; used directly below and as the base retriever for reranking.
retriever = store.as_retriever()

In [47]:
# Test question used to compare plain retrieval with reranked retrieval.
query = "Does the speaker believe Roe v Wade got it right?"

In [48]:
# Take the first document returned by the retriever and show its text and metadata.
# Indexing with [0] raises IndexError if the retriever returns nothing.
resp = retriever.invoke(query)[0]
print("PageContent:", resp.page_content)
print("Metadata:", resp.metadata)

PageContent: **CONTENT HEADER**PRESIDENTIAL      MESSAGE

 >> next header >> 

**CONTENT**
 
 
  Between      those      who      want      to      pull      America      back      to      the      past      and      those      who  
 
  want      to      move      America      into      the      future.
Metadata: {'level': 1, 'page_idx': 9, 'bbox': [58.8, 449.66, 379.92, 472.7]}


In [49]:
# Outline the first retrieved chunk on its PDF page, using the bbox and page index stored in its metadata.
draw_bounding_box_on_pdf_image(file_path, resp.metadata["bbox"], resp.metadata["page_idx"])

2.7777777777777777
2.7777777777777777


Rerank the retrieved chunks (in the runs below, reranking surfaces the relevant passage that plain hybrid retrieval did not rank first)

In [50]:
# LLM client built from the shared configuration rather than repeated literals.
# It is not used by the reranking step in this cell.
llm = Ollama(
    model=db_config["llm_name"],
    temperature=0,
    base_url=db_config["ollama_base_url"],
)

# Rerank the base retriever's results with Flashrank.
compressor = FlashrankRerank()
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever
)

# Reuse the `query` defined earlier so the plain and reranked results answer the same question.
compressed_docs = compression_retriever.invoke(query)

In [51]:
# Display the top reranked document.
compressed_docs[0]

Document(page_content="**CONTENT HEADER**PRESIDENTIAL      MESSAGE\n\n >> next header >> \n\n**CONTENT**\n \n \n  So      tonight--let\\\\\\'s      stand      up      for      families      like      hers!\nTo      my      friends  \n \n  across      the      aisle--don\\\\\\'t      keep      families      waiting      any      longer.\nGuarantee      the  \n \n  right      to      IVF      nationwide!\n \n \n  Like      most      Americans--I      believe--Roe      v.      Wade      got      it      right.\nAnd   \n \nI      thank  \n \n  Vice      President      Harris      for      being      an      incredible      leader      defending  \n \n  reproductive      freedom      and      so      much      more.\n \n \n  But      my      predecessor--came      to      office      determined      to      see      Roe      v.      Wade  \n \n  overturned.\nHe\\\\\\'s      the      reason      it      was.\nIn      fact--he      brags      about      it.\nLook      at  \n \n  the      chao

In [52]:
def pretty_print_docs(docs):
    """
    Prints each document as "Document <n>:", a blank line, its page content and its metadata,
    with a line of 100 dashes between consecutive documents. Numbering starts at 1.
    """
    divider = f"\n{'-' * 100}\n"
    rendered = []
    for position, doc in enumerate(docs):
        rendered.append(
            f"Document {position+1}:\n\n{doc.page_content}\nMetadata: {doc.metadata}"
        )
    print(divider.join(rendered))

In [53]:
# Print every reranked document with its metadata.
pretty_print_docs(compressed_docs)

Document 1:

**CONTENT HEADER**PRESIDENTIAL      MESSAGE

 >> next header >> 

**CONTENT**
 
 
  So      tonight--let\\\'s      stand      up      for      families      like      hers!
To      my      friends  
 
  across      the      aisle--don\\\'t      keep      families      waiting      any      longer.
Guarantee      the  
 
  right      to      IVF      nationwide!
 
 
  Like      most      Americans--I      believe--Roe      v.      Wade      got      it      right.
And   
 
I      thank  
 
  Vice      President      Harris      for      being      an      incredible      leader      defending  
 
  reproductive      freedom      and      so      much      more.
 
 
  But      my      predecessor--came      to      office      determined      to      see      Roe      v.      Wade  
 
  overturned.
He\\\'s      the      reason      it      was.
In      fact--he      brags      about      it.
Look      at  
 
  the      chaos.
 
 
  Joining      us      tonight      is      K

In [54]:
# Outline the top reranked chunk on its PDF page, using the bbox and page index stored in its metadata.
draw_bounding_box_on_pdf_image(file_path, compressed_docs[0].metadata["bbox"], compressed_docs[0].metadata["page_idx"])

2.7777777777777777
2.7777777777777777
