In [1]:
pip install openparse

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import openparse

In [3]:
# Loading latest version of pdf_parser
# auto reloads the module if it has been changed

from pdf_parser import PdfParser

parser = PdfParser()

  from .autonotebook import tqdm as notebook_tqdm


Initializing Docment Parser......
Loading Surya models......
Loaded detection model vikp/surya_layout3 on device mps with dtype torch.float16
Loaded detection model vikp/surya_det3 on device mps with dtype torch.float16
Loaded reading order model vikp/surya_order on device mps with dtype torch.float16
Initializing Tesseract......


In [4]:
test_pdf = 'pic2.pdf'

In [5]:
pdf_bytes = open('pic2.pdf', 'rb').read()

pdf_layout = parser.parse_pdf(pdf_bytes)
pdf_layout.to_excel('output_pic2.xlsx', index=False)


Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  1.78it/s]
Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  2.56it/s]
Finding reading order: 100%|██████████| 1/1 [00:01<00:00,  1.02s/it]


In [6]:
pdf_layout

Unnamed: 0,page_idx,position,bbox,label,text
0,0,0,"(162, 150, 567, 218)",Section-header,Mountain Beverages Limited\nBusiness Report an...
1,0,1,"(162, 244, 466, 267)",Section-header,Statement of Financial Position\n
2,0,2,"(152, 252, 1076, 1289)",Table,SLALOMeHt OF PInahtial PUSIIOH\n\nASSETS Notes...
3,0,3,"(161, 1297, 1051, 1344)",Text,The financial statements and accounting polici...
4,0,4,"(369, 1406, 456, 1429)",Text,Director\n
5,0,5,"(584, 1556, 605, 1579)",Text,


In [9]:
def extract_table_data_from_excel(file_path):
    """
    Extract only the rows labeled as 'Table' from an Excel file and return the data in matrix format
    containing page_idx, position, bbox, label, and text.
    """
    # Load the Excel file
    df = pd.read_excel(file_path)
    
    # Filter for rows labeled as 'Table'
    table_df = df[df['label'] == 'Table']
    
    # Select necessary columns
    table_matrix = table_df[['page_idx', 'bbox', 'label']]
    
    return table_matrix

# Call the function with the uploaded file path

excel_file_path = 'output_pic2.xlsx'  # Path to the Excel file
filtered_table_matrix = extract_table_data_from_excel(excel_file_path)
filtered_table_matrix

Unnamed: 0,page_idx,bbox,label
2,0,"(152, 252, 1076, 1289)",Table


In [21]:
import pandas as pd
import fitz  # PyMuPDF

def get_maximum_bounding_box(parsed_basic_doc, table_pages, exclude_texts):
    """
    Get the maximum bounding box across all the bounding boxes for each table page.

    Parameters:
    - parsed_basic_doc: The parsed document object.
    - table_pages: A list of page indices where tables are located.
    - exclude_texts: A list of text snippets to exclude when calculating the bounding box.
    
    Returns:
    - A dictionary with page indices as keys and max bounding boxes as values.
    """
    # Dictionary to store the maximum bounding box for each page
    page_bboxes = {}

    # Iterate through all the nodes in the parsed document
    for node in parsed_basic_doc.nodes:
        page_idx = node['page_idx']

        # Only process nodes from pages containing tables
        if page_idx in table_pages:
            # Skip nodes that contain excluded texts
            if any(exclude_text in node.text for exclude_text in exclude_texts):
                continue

            # Initialize variables to store the minimum x0, y0 and maximum x1, y1
            x0_min, y0_min = float('inf'), float('inf')
            x1_max, y1_max = float('-inf'), float('-inf')

            # Check if the node has a bbox attribute
            if hasattr(node, 'bbox'):
                bbox_list = node.bbox  # Assuming bbox is a list of Bbox objects

                # Iterate over bbox_list (in case there are multiple Bbox objects)
                for bbox in bbox_list:
                    x0, y0, x1, y1 = bbox.x0, bbox.y0, bbox.x1, bbox.y1

                    # Update the min and max values for x0, y0, x1, y1
                    x0_min = min(x0_min, x0)
                    y0_min = min(y0_min, y0)
                    x1_max = max(x1_max, x1)
                    y1_max = max(y1_max, y1)

                # Store the bounding box for the current page
                page_bboxes[page_idx] = [x0_min, y0_min, x1_max, y1_max]

    return page_bboxes

def crop_and_save_table_pages(pdf_path, table_matrix, parsed_basic_doc, output_png_prefix, exclude_texts):
    """
    Crop the pages that contain tables based on their bounding box and save them as PNGs.

    Parameters:
    - pdf_path: Path to the original PDF file.
    - table_matrix: A DataFrame containing page_idx of table locations.
    - parsed_basic_doc: The parsed document object.
    - output_png_prefix: Prefix for naming the output PNG files.
    - exclude_texts: A list of text snippets to exclude when calculating the bounding box.
    """
    # Get the unique page numbers where tables are located
    table_pages = table_matrix['page_idx'].unique()

    # Get the maximum bounding box for each table-containing page
    max_bboxes = get_maximum_bounding_box(parsed_basic_doc, table_pages, exclude_texts)

    # Open the PDF with PyMuPDF (fitz)
    doc = fitz.open(pdf_path)

    # Set the desired image resolution (in DPI)
    dpi = 1200  # You can adjust this for higher or lower quality
    zoom = dpi / 72  # 72 DPI is the default resolution for PDF pages
    mat = fitz.Matrix(zoom, zoom)  # Create a transformation matrix for the resolution

    # Loop through each page that contains tables and apply the crop
    for page_idx, bbox in max_bboxes.items():
        page = doc.load_page(page_idx)  # Load the page based on the index

        # Define a rectangle for the crop (left, top, right, bottom)
        rect = fitz.Rect(bbox[0], bbox[1], bbox[2], bbox[3])

        # Set the page crop box to this rectangle
        page.set_cropbox(rect)

        # Render the cropped page as an image with the desired resolution
        pix = page.get_pixmap(matrix=mat, alpha=False)  # alpha=False to avoid transparency

        # Define the output path for the PNG
        output_png_path = f"{output_png_prefix}_page_{page_idx + 1}.png"

        # Save the image as PNG
        pix.save(output_png_path)

        print(f"Cropped and saved page {page_idx + 1} as {output_png_path}")

    # Close the document
    doc.close()

# Example usage:

# Step 2: Extract table page data from the Excel file
excel_file_path = 'output_pic2.xlsx'  # Path to the Excel file generated earlier
filtered_table_matrix = extract_table_data_from_excel(excel_file_path)

# Step 4: Crop the pages and save as PNG
pdf_path = 'pic1.pdf'  # Path to your original PDF
exclude_texts = ['Statement of Financial Position', 'The financial statements and accounting policies on pages 6']
output_png_prefix = "page_cropped"

crop_and_save_table_pages(pdf_path, filtered_table_matrix, test_pdf, output_png_prefix, exclude_texts)


AttributeError: 'str' object has no attribute 'nodes'

In [19]:
def get_maximum_bounding_box(parsed_basic_doc, table_pages):
    """
    Get the maximum bounding box across all the bounding boxes for each table page.

    Parameters:
    - parsed_basic_doc: The parsed document object.
    - table_pages: A list of page indices where tables are located.
    
    Returns:
    - A dictionary with page indices as keys and max bounding boxes as values.
    """
    # Initialize a dictionary to store max bounding boxes for each page
    page_bboxes = {}

    # Iterate through all the nodes in the parsed document
    for node in parsed_basic_doc.nodes:
        page_idx = node['page_idx']
        
        if page_idx in table_pages:  # Process only pages with tables
            # Initialize variables to store the minimum x0, y0 and maximum x1, y1
            x0_min, y0_min = float('inf'), float('inf')
            x1_max, y1_max = float('-inf'), float('-inf')

            # Extract bounding box list from node
            if hasattr(node, 'bbox'):
                bbox_list = node.bbox  # Assuming bbox is a list of Bbox objects

                # Iterate over bbox_list (in case there are multiple Bbox objects)
                for bbox in bbox_list:
                    x0, y0, x1, y1 = bbox.x0, bbox.y0, bbox.x1, bbox.y1
                    
                    # Update the min and max values for x0, y0, x1, y1
                    x0_min = min(x0_min, x0)
                    y0_min = min(y0_min, y0)
                    x1_max = max(x1_max, x1)
                    y1_max = max(y1_max, y1)
                
                # Store the bounding box for the current page
                page_bboxes[page_idx] = (x0_min, y0_min, x1_max, y1_max)

    return page_bboxes

# Example usage for Step 3:
table_pages = filtered_table_matrix['page_idx'].unique()  # Get unique pages containing tables
max_bboxes = get_maximum_bounding_box(test_pdf, table_pages)


AttributeError: 'str' object has no attribute 'nodes'

In [14]:
def iterate_table_pages(pdf_layout):
    """
    Iterate over pages in the pdf_layout DataFrame that contain a table and print the page number.
    Returns a list of page numbers with tables.
    """
    # Filter for rows labeled as 'Table'
    table_df = pdf_layout[pdf_layout['label'] == 'Table']
    
    # Get unique page numbers where tables are located
    table_pages = table_df['page_idx'].unique()
    
    # Iterate over the pages containing tables
    for page in table_pages:
        print(f"Processing page: {page}")
    
    return table_pages

# Call the function
table_pages = iterate_table_pages(filtered_table_matrix)


Processing page: 0


In [11]:

parser = openparse.DocumentParser()
parsed_basic_doc=parser.parse(filtered_table_matrix)

for node in parsed_basic_doc.nodes:
    print(node)

AttributeError: 'DataFrame' object has no attribute 'pages'

In [11]:
import openparse
import pypdf

def extract_table_pages(pdf_path, table_pages):
    """
    Extract pages that contain tables based on the table_pages indices from the original PDF.

    Parameters:
    - pdf_path: Path to the original PDF file.
    - table_pages: List of page indices where tables are located.
    
    Returns:
    - Path to the temporary extracted PDF containing only the table pages.
    """
    # Open the original PDF file
    with open(pdf_path, 'rb') as file:
        pdf_reader = pypdf.PdfReader(file)
        pdf_writer = pypdf.PdfWriter()

        # Extract and write only the table pages
        for page_idx in table_pages:
            page = pdf_reader.pages[page_idx]  # Extract the specific page
            pdf_writer.add_page(page)
        
        # Save the new PDF containing only the table pages
        output_pdf_path = "table_pages_extracted.pdf"
        with open(output_pdf_path, 'wb') as output_pdf:
            pdf_writer.write(output_pdf)

    return output_pdf_path

# Example usage:
pdf_path = "pic2.pdf"
table_pages = filtered_table_matrix['page_idx'].unique()  # Assuming this contains the table page indices

# Extract only the table pages from the original PDF
extracted_pdf_path = extract_table_pages(pdf_path, table_pages)


Ignoring wrong pointing object 5 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)


TypeError: sequence indices must be integers

In [13]:
import fitz  # PyMuPDF
import os

def extract_and_save_table_pages_separately(pdf_path, table_pages, output_folder):
    """
    Extract each page containing a table and save it as a separate PDF.

    Parameters:
    - pdf_path: Path to the original PDF file.
    - table_pages: List of page indices where tables are located (ensure it's integers).
    - output_folder: Folder to save the individual page PDFs.
    
    Returns:
    - A list of paths to the saved individual PDFs.
    """
    # Ensure the output folder exists
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Open the original PDF
    doc = fitz.open(pdf_path)
    
    # List to store the paths of saved PDFs
    saved_pdfs = []

    # Loop through the provided page indices and extract the corresponding pages
    for page_idx in table_pages:
        page = doc.load_page(int(page_idx))  # Load the page by index

        # Create a new PDF with just this single page
        single_page_pdf = fitz.open()
        single_page_pdf.insert_page(-1, from_page=page)

        # Define the path for the output PDF
        output_pdf_path = os.path.join(output_folder, f"table_page_{page_idx + 1}.pdf")
        single_page_pdf.save(output_pdf_path)
        single_page_pdf.close()

        # Append the path to the list
        saved_pdfs.append(output_pdf_path)

    doc.close()

    return saved_pdfs

# Example usage:
pdf_path = "pic2.pdf"
table_pages = filtered_table_matrix['page_idx'].unique().tolist()  # Convert to list of integers if necessary
output_folder = "extracted_table_pages"  # Folder where individual PDFs will be saved

# Extract and save each page containing tables as separate PDFs
saved_pdfs = extract_and_save_table_pages_separately(pdf_path, table_pages, output_folder)

print(f"Extracted PDFs saved at: {saved_pdfs}")


TypeError: insert_page() got an unexpected keyword argument 'from_page'

In [14]:
import fitz  # PyMuPDF
import os

def extract_and_save_table_pages_separately(pdf_path, table_pages, output_folder):
    """
    Extract each page containing a table and save it as a separate PDF.

    Parameters:
    - pdf_path: Path to the original PDF file.
    - table_pages: List of page indices where tables are located (ensure it's integers).
    - output_folder: Folder to save the individual page PDFs.
    
    Returns:
    - A list of paths to the saved individual PDFs.
    """
    # Ensure the output folder exists
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Open the original PDF
    doc = fitz.open(pdf_path)
    
    # List to store the paths of saved PDFs
    saved_pdfs = []

    # Loop through the provided page indices and extract the corresponding pages
    for page_idx in table_pages:
        page = doc.load_page(int(page_idx))  # Load the page by index

        # Create a new PDF with just this single page
        single_page_pdf = fitz.open()
        single_page_pdf.new_page(width=page.rect.width, height=page.rect.height)
        single_page_pdf[-1].show_pdf_page(page.rect, doc, page_idx)  # Copy the page content
        
        # Define the path for the output PDF
        output_pdf_path = os.path.join(output_folder, f"table_page_{page_idx + 1}.pdf")
        single_page_pdf.save(output_pdf_path)
        single_page_pdf.close()

        # Append the path to the list
        saved_pdfs.append(output_pdf_path)

    doc.close()

    return saved_pdfs

# Example usage:
pdf_path = "pic2.pdf"
table_pages = filtered_table_matrix['page_idx'].unique().tolist()  # Convert to list of integers if necessary
output_folder = "extracted_table_pages"  # Folder where individual PDFs will be saved

# Extract and save each page containing tables as separate PDFs
saved_pdfs = extract_and_save_table_pages_separately(pdf_path, table_pages, output_folder)

print(f"Extracted PDFs saved at: {saved_pdfs}")


Extracted PDFs saved at: ['extracted_table_pages/table_page_1.pdf']


In [32]:
import PyPDF2

ModuleNotFoundError: No module named 'PyPDF2'

In [31]:
import openparse
import pyPdf2

def extract_table_pages(pdf_path, table_pages):
    """
    Extract pages that contain tables based on the table_pages indices from the original PDF.

    Parameters:
    - pdf_path: Path to the original PDF file.
    - table_pages: List of page indices where tables are located.
    
    Returns:
    - Path to the temporary extracted PDF containing only the table pages.
    """
    # Open the original PDF file
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        pdf_writer = PyPDF2.PdfWriter()

        # Extract and write only the table pages
        for page_idx in table_pages:
            page = pdf_reader.pages[page_idx]  # Extract the specific page
            pdf_writer.add_page(page)
        
        # Save the new PDF containing only the table pages
        output_pdf_path = "table_pages_extracted.pdf"
        with open(output_pdf_path, 'wb') as output_pdf:
            pdf_writer.write(output_pdf)

    return output_pdf_path

# Example usage:
pdf_path = "pic2.pdf"
table_pages = filtered_table_matrix['page_idx'].unique()  # Assuming this contains the table page indices

# Extract only the table pages from the original PDF
extracted_pdf_path = extract_table_pages(pdf_path, table_pages)


ModuleNotFoundError: No module named 'pyPdf2'