In [1]:
pip install openparse

Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import openparse

In [2]:
# Loading latest version of pdf_parser
# auto reloads the module if it has been changed

from pdf_parser import PdfParser

parser = PdfParser()

  from .autonotebook import tqdm as notebook_tqdm


Initializing Docment Parser......
Loading Surya models......
Loaded detection model vikp/surya_layout3 on device mps with dtype torch.float16
Loaded detection model vikp/surya_det3 on device mps with dtype torch.float16
Loaded reading order model vikp/surya_order on device mps with dtype torch.float16
Initializing Tesseract......


In [41]:
pdf_bytes = open('11_AuditedStatements2021_JPProcurement.pdf', 'rb').read()

pdf_layout = parser.parse_pdf(pdf_bytes)
pdf_layout.to_excel('output_1002_v2.xlsx', index=False)
pdf_layout

AttributeError: 'DocumentParser' object has no attribute 'parse_pdf'

In [44]:
def extract_table_data_from_excel(file_path):
    """
    Extract only the rows labeled as 'Table' from an Excel file and return the data in matrix format
    containing page_idx, position, bbox, label, and text.
    """
    # Load the Excel file
    df = pd.read_excel(file_path)
    
    # Filter for rows labeled as 'Table'
    table_df = df[df['label'] == 'Table']
    
    # Select necessary columns
    table_matrix = table_df[['page_idx', 'bbox', 'label']]
    
    return table_matrix

# Call the function with the uploaded file path

excel_file_path = 'output_1002_v2.xlsx'  # Path to the Excel file
filtered_table_matrix = extract_table_data_from_excel(excel_file_path)
filtered_table_matrix
page_idx_vector = filtered_table_matrix['page_idx'].tolist()
page_idx_vector

[1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]

In [47]:
import fitz  # PyMuPDF
import os

# Define a function to extract and save individual pages as separate PDFs
def extract_pages_from_pdf(input_pdf_path, output_folder, table_pages):
    # Open the PDF document
    doc = fitz.open(input_pdf_path)
    
    # List to keep track of saved PDF paths
    saved_pdfs = []

    # Loop through the provided page indices and extract the corresponding pages
    for page_idx in table_pages:
        # Load the page by index
        page = doc.load_page(int(page_idx))

        # Create a new PDF with just this single page
        single_page_pdf = fitz.open()  # Initialize a new empty PDF
        single_page_pdf.insert_pdf(doc, from_page=page_idx, to_page=page_idx)  # Insert only the single page

        # Define the path for the output PDF
        output_pdf_path = os.path.join(output_folder, f"table_page_{page_idx + 1}.pdf")
        single_page_pdf.save(output_pdf_path)
        single_page_pdf.close()

        # Append the path to the list
        saved_pdfs.append(output_pdf_path)

    # Close the original PDF document
    doc.close()

    return saved_pdfs

# Example usage:
input_pdf_path = "11_AuditedStatements2021_JPProcurement.pdf"
output_folder = "extracted_table_v3"
table_pages = page_idx_vector
saved_pdfs = extract_pages_from_pdf(input_pdf_path, output_folder, table_pages)


In [43]:
import fitz  # PyMuPDF
import os

def extract_and_save_table_pages_separately(pdf_path, table_pages, output_folder):
    """
    Extract each page containing a table and save it as a separate PDF.

    Parameters:
    - pdf_path: Path to the original PDF file.
    - table_pages: List of page indices where tables are located (ensure it's integers).
    - output_folder: Folder to save the individual page PDFs.
    
    Returns:
    - A list of paths to the saved individual PDFs.
    """
    # Ensure the output folder exists
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Open the original PDF
    doc = fitz.open(pdf_path)
    
    # List to store the paths of saved PDFs
    saved_pdfs = []

    # Loop through the provided page indices and extract the corresponding pages
    for page_idx in table_pages:
        page = doc.load_page(int(page_idx))  # Load the page by index

        # Create a new PDF with just this single page
        single_page_pdf = fitz.open()
        single_page_pdf.new_page(width=page.rect.width, height=page.rect.height)
        single_page_pdf[-1].show_pdf_page(page.rect, doc, page_idx)  # Copy the page content
        
        # Define the path for the output PDF
        output_pdf_path = os.path.join(output_folder, f"table_page_{page_idx + 1}.pdf")
        single_page_pdf.save(output_pdf_path)
        single_page_pdf.close()

        # Append the path to the list
        saved_pdfs.append(output_pdf_path)

    doc.close()

    return saved_pdfs

# Example usage:
pdf_path = "11_AuditedStatements2021_JPProcurement.pdf"
table_pages = filtered_table_matrix['page_idx'].unique().tolist()  # Convert to list of integers if necessary
output_folder = "extracted_table_v2"  # Folder where individual PDFs will be saved

# Extract and save each page containing tables as separate PDFs
saved_pdfs = extract_and_save_table_pages_separately(pdf_path, table_pages, output_folder)

print(f"Extracted PDFs saved at: {saved_pdfs}")


Extracted PDFs saved at: ['extracted_table_v2/table_page_2.pdf', 'extracted_table_v2/table_page_3.pdf', 'extracted_table_v2/table_page_4.pdf', 'extracted_table_v2/table_page_6.pdf', 'extracted_table_v2/table_page_7.pdf', 'extracted_table_v2/table_page_8.pdf', 'extracted_table_v2/table_page_9.pdf', 'extracted_table_v2/table_page_10.pdf', 'extracted_table_v2/table_page_11.pdf', 'extracted_table_v2/table_page_12.pdf', 'extracted_table_v2/table_page_13.pdf', 'extracted_table_v2/table_page_14.pdf', 'extracted_table_v2/table_page_15.pdf']


In [6]:
import openparse

def parse_extracted_pdfs(saved_pdfs):
    """
    Parse each of the extracted single-page PDFs containing tables.
    
    Parameters:
    - saved_pdfs: List of paths to the extracted single-page PDFs.
    
    Returns:
    - A list of parsed documents for further processing.
    """
    parser = openparse.DocumentParser()
    parsed_docs = []

    # Iterate over each saved PDF and parse it
    for pdf_path in saved_pdfs:
        parsed_doc = parser.parse(pdf_path)  # Parse the single-page PDF
        parsed_docs.append(parsed_doc)
        print(f"Parsed document from {pdf_path}")

    return parsed_docs

# Example usage:
# Assuming `saved_pdfs` is the output from Step 1
parsed_docs = parse_extracted_pdfs(saved_pdfs)

# Now you can analyze or print the parsed content from each document
for i, parsed_doc in enumerate(parsed_docs):
    print(f"\nParsed content from document {i + 1} ({saved_pdfs[i]}):")
    for node in parsed_doc.nodes:
        print(node)  # Printing the parsed nodes for each page

Parsed document from extracted_table_pages/table_page_2.pdf
Parsed document from extracted_table_pages/table_page_3.pdf
Parsed document from extracted_table_pages/table_page_4.pdf
Parsed document from extracted_table_pages/table_page_6.pdf
Parsed document from extracted_table_pages/table_page_7.pdf
Parsed document from extracted_table_pages/table_page_8.pdf
Parsed document from extracted_table_pages/table_page_9.pdf
Parsed document from extracted_table_pages/table_page_10.pdf
Parsed document from extracted_table_pages/table_page_11.pdf
Parsed document from extracted_table_pages/table_page_12.pdf
Parsed document from extracted_table_pages/table_page_13.pdf
Parsed document from extracted_table_pages/table_page_14.pdf
Parsed document from extracted_table_pages/table_page_15.pdf

Parsed content from document 1 (extracted_table_pages/table_page_2.pdf):

Parsed content from document 2 (extracted_table_pages/table_page_3.pdf):

Parsed content from document 3 (extracted_table_pages/table_page_

In [21]:
import openparse

def parse_extracted_pdfs_and_save_nodes(saved_pdfs):
    """
    Parse each of the extracted single-page PDFs containing tables and save the nodes.

    Parameters:
    - saved_pdfs: List of paths to the extracted single-page PDFs.
    
    Returns:
    - A dictionary where the key is the PDF file path and the value is the list of parsed nodes for that PDF.
    """
    parser = openparse.DocumentParser()
    parsed_nodes_by_pdf = {}

    # Iterate over each saved PDF and parse it
    for pdf_path in saved_pdfs:
        parsed_doc = parser.parse(pdf_path)  # Parse the single-page PDF
        nodes = [node for node in parsed_doc.nodes]  # Extract nodes from the parsed document
        
        # Save the nodes for this particular PDF
        parsed_nodes_by_pdf[pdf_path] = nodes
        print(f"Parsed document from {pdf_path} with {len(nodes)} nodes")

    return parsed_nodes_by_pdf

# Example usage:
# Assuming `saved_pdfs` is the output from Step 1
parsed_nodes_by_pdf = parse_extracted_pdfs_and_save_nodes(saved_pdfs)

# Now you can access and analyze the nodes from each PDF
for pdf_path, nodes in parsed_nodes_by_pdf.items():
    print(f"\nParsed content from {pdf_path}:")
    for node in nodes:
        print(node)  # Printing each node for this specific PDF


Parsed document from extracted_table_pages/table_page_2.pdf with 0 nodes
Parsed document from extracted_table_pages/table_page_8.pdf with 0 nodes
Parsed document from extracted_table_pages/table_page_9.pdf with 0 nodes
Parsed document from extracted_table_pages/table_page_10.pdf with 0 nodes
Parsed document from extracted_table_pages/table_page_11.pdf with 0 nodes
Parsed document from extracted_table_pages/table_page_13.pdf with 0 nodes
Parsed document from extracted_table_pages/table_page_14.pdf with 0 nodes
Parsed document from extracted_table_pages/table_page_15.pdf with 0 nodes
Parsed document from extracted_table_pages/table_page_16.pdf with 0 nodes
Parsed document from extracted_table_pages/table_page_17.pdf with 0 nodes
Parsed document from extracted_table_pages/table_page_18.pdf with 0 nodes

Parsed content from extracted_table_pages/table_page_2.pdf:

Parsed content from extracted_table_pages/table_page_8.pdf:

Parsed content from extracted_table_pages/table_page_9.pdf:

Parse

In [14]:
def parse_pdfs_and_save_nodes_to_df(saved_pdfs):
    """
    Parse each of the extracted single-page PDFs containing tables and save their nodes in a DataFrame.
    
    Parameters:
    - saved_pdfs: List of paths to the extracted single-page PDFs.
    
    Returns:
    - A DataFrame containing the parsed nodes with their document index and content.
    """
    parser = openparse.DocumentParser()
    parsed_data = []

    # Iterate over each saved PDF and parse it
    for i, pdf_path in enumerate(saved_pdfs):
        parsed_doc = parser.parse(pdf_path)  # Parse the single-page PDF
        print(f"Parsed document from {pdf_path}")
        
        # Iterate over the nodes in the parsed document
        for node in parsed_doc.nodes:
            # Save the document index and node content in the parsed data list
            parsed_data.append({
                'document_index': i + 1,  # Document number (1-based index)
                #'pdf_path': pdf_path,
                'node_content': str(node)
            })

    # Convert the parsed data into a DataFrame
    parsed_df = pd.DataFrame(parsed_data)
    
    return parsed_df

In [18]:

parsed_nodes_df = parse_pdfs_and_save_nodes_to_df(saved_pdfs)
parsed_nodes_df

Parsed document from extracted_table_v2/table_page_2.pdf
Parsed document from extracted_table_v2/table_page_3.pdf
Parsed document from extracted_table_v2/table_page_4.pdf
Parsed document from extracted_table_v2/table_page_6.pdf
Parsed document from extracted_table_v2/table_page_7.pdf
Parsed document from extracted_table_v2/table_page_8.pdf
Parsed document from extracted_table_v2/table_page_9.pdf
Parsed document from extracted_table_v2/table_page_10.pdf
Parsed document from extracted_table_v2/table_page_11.pdf
Parsed document from extracted_table_v2/table_page_12.pdf
Parsed document from extracted_table_v2/table_page_13.pdf
Parsed document from extracted_table_v2/table_page_14.pdf
Parsed document from extracted_table_v2/table_page_15.pdf


In [19]:
import openparse
import pandas as pd

def parse_pdfs_and_save_nodes_per_page(saved_pdfs):
    """
    Parse each of the extracted single-page PDFs containing tables and save their nodes in a separate DataFrame per page.
    
    Parameters:
    - saved_pdfs: List of paths to the extracted single-page PDFs.
    
    Returns:
    - A list of DataFrames, each containing the parsed nodes for a specific PDF page.
    """
    parser = openparse.DocumentParser()
    parsed_dataframes = []  # To store the DataFrame for each page
    
    # Iterate over each saved PDF and parse it
    for i, pdf_path in enumerate(saved_pdfs):
        print(pdf_path)
        parsed_doc = parser.parse(pdf_path) 
        #print(f"Parsed document from {pdf_path}")
        
        # Collect node content for each page into a list of dictionaries
        parsed_data = []
        for node in parsed_doc.nodes:
            print(node)
            #parsed_data.append({
            #    'document_index': i + 1,  # Document number (1-based index)
            #    'pdf_path': pdf_path,
            #    'node_content': str(node)
            #})
        
        # Convert the parsed data into a temporary DataFrame for this PDF page
        temp_df = pd.DataFrame(parsed_data)
        parsed_dataframes.append(temp_df)
        
        # Display the temporary DataFrame for the current PDF page
        #print(f"Temporary DataFrame for document {i + 1} ({pdf_path}):")
    print(parsed_dataframes)
    return parsed_dataframes  # Return a list of DataFrames for each page

# Parse and save the nodes in separate DataFrames per page
parsed_dataframes = parse_pdfs_and_save_nodes_per_page(saved_pdfs)


extracted_table_v2/table_page_2.pdf
extracted_table_v2/table_page_3.pdf
extracted_table_v2/table_page_4.pdf
extracted_table_v2/table_page_6.pdf
extracted_table_v2/table_page_7.pdf
extracted_table_v2/table_page_8.pdf
extracted_table_v2/table_page_9.pdf
extracted_table_v2/table_page_10.pdf
extracted_table_v2/table_page_11.pdf
extracted_table_v2/table_page_12.pdf
extracted_table_v2/table_page_13.pdf
extracted_table_v2/table_page_14.pdf
extracted_table_v2/table_page_15.pdf
[Empty DataFrame
Columns: []
Index: [], Empty DataFrame
Columns: []
Index: [], Empty DataFrame
Columns: []
Index: [], Empty DataFrame
Columns: []
Index: [], Empty DataFrame
Columns: []
Index: [], Empty DataFrame
Columns: []
Index: [], Empty DataFrame
Columns: []
Index: [], Empty DataFrame
Columns: []
Index: [], Empty DataFrame
Columns: []
Index: [], Empty DataFrame
Columns: []
Index: [], Empty DataFrame
Columns: []
Index: [], Empty DataFrame
Columns: []
Index: [], Empty DataFrame
Columns: []
Index: []]


In [40]:
import openparse
basic_doc_path = "extracted_table_v2/table_page_15.pdf"
parser = openparse.DocumentParser()
parsed_basic_doc=parser.parse(basic_doc_path)

for node in parsed_basic_doc.nodes:
    print(node)

In [32]:
import openparse

def parse_extracted_pdfs_and_calculate_bbox(saved_pdfs, exclude_texts=None):
    """
    Parse each of the extracted single-page PDFs containing tables, save the nodes, and calculate the bounding box.

    Parameters:
    - saved_pdfs: List of paths to the extracted single-page PDFs.
    - exclude_texts: List of texts to exclude when calculating the bounding box (optional).

    Returns:
    - A dictionary where the key is the PDF file path and the value is a tuple:
      (list of parsed nodes, calculated bounding box).
    """
    parser = openparse.DocumentParser()
    parsed_nodes_and_bbox_by_pdf = {}

    # Iterate over each saved PDF and parse it
    for pdf_path in saved_pdfs:
        parsed_doc = parser.parse(pdf_path)  # Parse the single-page PDF
        nodes = [node for node in parsed_doc.nodes]  # Extract nodes from the parsed document

        # Calculate the bounding box using the nodes for the current page
        max_bbox = calculate_maximum_bounding_box(nodes, exclude_texts)  # Use the function you provided
        
        # Save both the nodes and the calculated bounding box for this particular PDF
        parsed_nodes_and_bbox_by_pdf[pdf_path] = (nodes, max_bbox)
        print(f"Parsed document from {pdf_path} with {len(nodes)} nodes and calculated bounding box")

    return parsed_nodes_and_bbox_by_pdf


In [39]:
parse_extracted_pdfs_and_calculate_bbox(saved_pdfs)


TypeError: '<' not supported between instances of 'float' and 'NoneType'

In [38]:

def calculate_maximum_bounding_box(nodes, exclude_texts=None):
    """
    Calculate the maximum bounding box from a list of parsed nodes for a specific page.
    
    Parameters:
    - nodes: List of parsed nodes for a specific page.
    - exclude_texts: List of texts to exclude when calculating the bounding box.
    
    Returns:
    - A dictionary containing the label and max_bbox.
    """
    # Initialize variables to store the minimum x0, y0 and maximum x1, y1
    x0_min = float('inf')
    y0_min = float('inf')
    x1_max = float('-inf')
    y1_max = float('-inf')

    # Initialize variables for page width and height (assuming all pages have the same dimensions)
    page_width = None
    page_height = None
    
    # Initialize an empty list to store the bbox coordinates
    bbox_data = []

    # Iterate through all the nodes in the parsed document
    for node in nodes:
        # Check if the node has a bbox attribute
        if hasattr(node, 'bbox'):
            # Check if the node contains any of the excluded text
            if exclude_texts and any(exclude_text in node.text for exclude_text in exclude_texts):
                continue  # Skip this node if it contains the excluded text

            bbox_list = node.bbox  # Assuming bbox is a list of Bbox objects

            # Iterate over bbox_list (in case there are multiple Bbox objects)
            for bbox in bbox_list:
                # Extract page dimensions if not already set
                if page_width is None or page_height is None:
                    page_width = bbox.page_width
                    page_height = bbox.page_height

                # Extract x0, y0, x1, y1 from the Bbox object
                x0, y0, x1, y1 = bbox.x0, bbox.y0, bbox.x1, bbox.y1

                # Append the values to bbox_data list
                bbox_data.append([x0, y0, x1, y1])

                # Update the min and max values for x0, y0, x1, y1
                x0_min = min(x0_min, x0)
                y0_min = min(y0_min, y0)
                x1_max = max(x1_max, x1)
                y1_max = max(y1_max, y1)

    # Ensure the min and max bounding box values are bounded by the page dimensions
    x0_min_bounded = max(0, x0_min)  # x0_min should not be less than 0
    y0_min_bounded = max(0, y0_min)  # y0_min should not be less than 0
    x1_max_bounded = min(page_width, x1_max)  # x1_max should not exceed page_width
    y1_max_bounded = min(page_height, y1_max)  # y1_max should not exceed page_height

    # Optionally, adjust the bounding box by a margin (e.g., 100 units)
    x0_min_adjusted = max(0, x0_min_bounded)
    y0_min_adjusted = max(0, y0_min_bounded)
    x1_max_adjusted = min(page_width, x1_max_bounded)
    y1_max_adjusted = min(page_height, y1_max_bounded)

    # Create the max bounding box as a list
    max_bbox = [x0_min_adjusted, y0_min_adjusted, x1_max_adjusted, y1_max_adjusted]

    # Create the object with the label "table" and the max_bbox
    objects = [{'label': 'table', 'bbox': max_bbox}]

    # Return the objects list
    print(objects)

In [40]:
def calculate_maximum_bounding_box(nodes, exclude_texts=None):
    """
    Calculate the maximum bounding box from a list of parsed nodes for a specific page.
    
    Parameters:
    - nodes: List of parsed nodes for a specific page.
    - exclude_texts: List of texts to exclude when calculating the bounding box.
    
    Returns:
    - A dictionary containing the label and max_bbox.
    """
    # Initialize variables to store the minimum x0, y0 and maximum x1, y1
    x0_min = float('inf')
    y0_min = float('inf')
    x1_max = float('-inf')
    y1_max = float('-inf')

    # Initialize variables for page width and height
    page_width = None
    page_height = None

    # Initialize an empty list to store the bbox coordinates
    bbox_data = []

    # Iterate through all the nodes in the parsed document
    for node in nodes:
        # Check if the node has a bbox attribute
        if hasattr(node, 'bbox'):
            # Check if the node contains any of the excluded text
            if exclude_texts and any(exclude_text in node.text for exclude_text in exclude_texts):
                continue  # Skip this node if it contains the excluded text

            bbox_list = node.bbox  # Assuming bbox is a list of Bbox objects or a single Bbox object

            # Handle both cases: whether `bbox_list` is a list or a single bbox object
            if not isinstance(bbox_list, list):
                bbox_list = [bbox_list]

            # Iterate over bbox_list (to handle multiple bbox objects)
            for bbox in bbox_list:
                # Extract page dimensions (only once, if not already set)
                if page_width is None or page_height is None:
                    page_width = bbox.page_width
                    page_height = bbox.page_height

                # Extract x0, y0, x1, y1 from the Bbox object
                x0, y0, x1, y1 = bbox.x0, bbox.y0, bbox.x1, bbox.y1

                # Append the values to bbox_data list
                bbox_data.append([x0, y0, x1, y1])

                # Update the min and max values for x0, y0, x1, y1
                x0_min = min(x0_min, x0)
                y0_min = min(y0_min, y0)
                x1_max = max(x1_max, x1)
                y1_max = max(y1_max, y1)

    # Ensure the min and max bounding box values are bounded by the page dimensions
    if page_width is not None and page_height is not None:
        x0_min_bounded = max(0, x0_min)  # x0_min should not be less than 0
        y0_min_bounded = max(0, y0_min)  # y0_min should not be less than 0
        x1_max_bounded = min(page_width, x1_max)  # x1_max should not exceed page_width
        y1_max_bounded = min(page_height, y1_max)  # y1_max should not exceed page_height
    else:
        raise ValueError("Page dimensions could not be extracted from bbox.")

    # Optionally, adjust the bounding box by a margin (you can customize this)
    x0_min_adjusted = max(0, x0_min_bounded)
    y0_min_adjusted = max(0, y0_min_bounded)
    x1_max_adjusted = min(page_width, x1_max_bounded)
    y1_max_adjusted = min(page_height, y1_max_bounded)

    # Create the max bounding box as a list
    max_bbox = [x0_min_adjusted, y0_min_adjusted, x1_max_adjusted, y1_max_adjusted]

    # Create the object with the label "table" and the max_bbox
    objects = [{'label': 'table', 'bbox': max_bbox}]

    return objects

# Example usage of the function
exclude_texts = ['Statement of Financial Position', 'The financial statements and accounting policies on pages 6']
parsed_nodes_for_page = parsed_nodes_by_pdf[saved_pdfs[0]]  # Example parsed nodes for a specific page
calculated_bbox = calculate_maximum_bounding_box(parsed_nodes_for_page, exclude_texts)

# Output the calculated bounding box
print(calculated_bbox)


ValueError: Page dimensions could not be extracted from bbox.

In [22]:
def calculate_maximum_bounding_box(nodes, exclude_texts=None):
    """
    Calculate the maximum bounding box from a list of parsed nodes for a specific page.
    
    Parameters:
    - nodes: List of parsed nodes for a specific page.
    - exclude_texts: List of texts to exclude when calculating the bounding box.
    
    Returns:
    - A dictionary containing the label and max_bbox.
    """
    # Initialize variables to store the minimum x0, y0 and maximum x1, y1
    x0_min = float('inf')
    y0_min = float('inf')
    x1_max = float('-inf')
    y1_max = float('-inf')

    # Initialize variables for page width and height
    page_width = None
    page_height = None

    # Initialize an empty list to store the bbox coordinates
    bbox_data = []

    # Iterate through all the nodes in the parsed document
    for node in nodes:
        # Check if the node has a bbox attribute
        if hasattr(node, 'bbox'):
            # Check if the node contains any of the excluded text
            if exclude_texts and any(exclude_text in node.text for exclude_text in exclude_texts):
                continue  # Skip this node if it contains the excluded text

            bbox_list = node.bbox  # Assuming bbox is a list of Bbox objects or a single Bbox object

            # Handle both cases: whether `bbox_list` is a list or a single bbox object
            if not isinstance(bbox_list, list):
                bbox_list = [bbox_list]

            # Iterate over bbox_list (to handle multiple bbox objects)
            for bbox in bbox_list:
                # Extract page dimensions (only once, if not already set)
                if page_width is None or page_height is None:
                    if hasattr(bbox, 'page_width') and hasattr(bbox, 'page_height'):
                        page_width = bbox.page_width
                        page_height = bbox.page_height
                    else:
                        # Skip this node if page dimensions are missing
                        print(f"Skipping bbox on page {bbox.page} due to missing page dimensions.")
                        continue

                # Extract x0, y0, x1, y1 from the Bbox object
                x0, y0, x1, y1 = bbox.x0, bbox.y0, bbox.x1, bbox.y1

                # Append the values to bbox_data list
                bbox_data.append([x0, y0, x1, y1])

                # Update the min and max values for x0, y0, x1, y1
                x0_min = min(x0_min, x0)
                y0_min = min(y0_min, y0)
                x1_max = max(x1_max, x1)
                y1_max = max(y1_max, y1)

    # Ensure the min and max bounding box values are bounded by the page dimensions
    if page_width is not None and page_height is not None:
        x0_min_bounded = max(0, x0_min)  # x0_min should not be less than 0
        y0_min_bounded = max(0, y0_min)  # y0_min should not be less than 0
        x1_max_bounded = min(page_width, x1_max)  # x1_max should not exceed page_width
        y1_max_bounded = min(page_height, y1_max)  # y1_max should not exceed page_height
    else:
        print("Warning: Page dimensions not found for any nodes. Returning empty bounding box.")
        return []

    # Optionally, adjust the bounding box by a margin (you can customize this)
    x0_min_adjusted = max(0, x0_min_bounded)
    y0_min_adjusted = max(0, y0_min_bounded)
    x1_max_adjusted = min(page_width, x1_max_bounded)
    y1_max_adjusted = min(page_height, y1_max_bounded)

    # Create the max bounding box as a list
    max_bbox = [x0_min_adjusted, y0_min_adjusted, x1_max_adjusted, y1_max_adjusted]

    # Create the object with the label "table" and the max_bbox
    objects = [{'label': 'table', 'bbox': max_bbox}]

    return objects

# Example usage of the function
exclude_texts = ['Statement of Financial Position', 'The financial statements and accounting policies on pages 6']
parsed_nodes_for_page = parsed_nodes_by_pdf[saved_pdfs[0]]  # Example parsed nodes for a specific page
calculated_bbox = calculate_maximum_bounding_box(parsed_nodes_for_page, exclude_texts)

# Output the calculated bounding box
print(calculated_bbox)


NameError: name 'parsed_nodes_by_pdf' is not defined

In [23]:
# Example usage:
# Assuming `saved_pdfs` is the output from Step 1
exclude_texts = ['Statement of Financial Position', 'The financial statements and accounting policies on pages 6']
parsed_nodes_and_bbox_by_pdf = parse_extracted_pdfs_and_calculate_bbox(saved_pdfs, exclude_texts)

# Now you can access the parsed nodes and calculated bounding box from each PDF
for pdf_path, (nodes, bbox) in parsed_nodes_and_bbox_by_pdf.items():
    print(f"\nParsed content and bounding box from {pdf_path}:")
    print(f"Nodes: {nodes}")
    print(f"Bounding Box: {bbox}")

NameError: name 'parse_extracted_pdfs_and_calculate_bbox' is not defined