In [8]:
pip install openparse

Note: you may need to restart the kernel to use updated packages.


In [9]:
import pandas as pd
import openparse

In [3]:
# Loading latest version of pdf_parser
# auto reloads the module if it has been changed

from pdf_parser import PdfParser

parser = PdfParser()

  from .autonotebook import tqdm as notebook_tqdm


Initializing Docment Parser......
Loading Surya models......
Loaded detection model vikp/surya_layout3 on device mps with dtype torch.float16
Loaded detection model vikp/surya_det3 on device mps with dtype torch.float16
Loaded reading order model vikp/surya_order on device mps with dtype torch.float16
Initializing Tesseract......


In [6]:
pdf_bytes = open('pic2.pdf', 'rb').read()

pdf_layout = parser.parse_pdf(pdf_bytes)
pdf_layout.to_excel('output_pic2.xlsx', index=False)

Detecting bboxes:   0%|          | 0/1 [00:00<?, ?it/s]

Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  3.14it/s]
Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  3.81it/s]
Finding reading order: 100%|██████████| 1/1 [00:00<00:00,  3.63it/s]


In [13]:
def extract_table_data_from_excel(file_path):
    """
    Extract only the rows labeled as 'Table' from an Excel file and return the data in matrix format
    containing page_idx, position, bbox, label, and text.
    """
    # Load the Excel file
    df = pdf_layout
    
    # Filter for rows labeled as 'Table'
    table_df = df[df['label'] == 'Table']
    
    # Select necessary columns
    table_matrix = table_df[['page_idx', 'bbox', 'label']]
    
    return table_matrix

# Call the function with the uploaded file path

filtered_table_matrix = extract_table_data_from_excel(pdf_layout)
filtered_table_matrix

Unnamed: 0,page_idx,bbox,label
2,0,"(152, 252, 1076, 1289)",Table


In [14]:
def iterate_table_pages(pdf_layout):
    """
    Iterate over pages in the pdf_layout DataFrame that contain a table and print the page number.
    Returns a list of page numbers with tables.
    """
    # Filter for rows labeled as 'Table'
    table_df = pdf_layout[pdf_layout['label'] == 'Table']
    
    # Get unique page numbers where tables are located
    table_pages = table_df['page_idx'].unique()
    
    # Iterate over the pages containing tables
    for page in table_pages:
        print(f"Processing page: {page}")
    
    return table_pages

# Call the function
table_pages = iterate_table_pages(filtered_table_matrix)


Processing page: 0


In [11]:

parser = openparse.DocumentParser()
parsed_basic_doc=parser.parse(filtered_table_matrix)

for node in parsed_basic_doc.nodes:
    print(node)

AttributeError: 'DataFrame' object has no attribute 'pages'

In [None]:
import openparse
import PyPDF2

# Step 2: Parse the PDF using openparse
def extract_table_pages_with_openparse(pdf_path, table_label='Table'):
    """
    Parse the PDF and extract the page indices of nodes labeled as 'Table' using openparse.
    
    Parameters:
    - pdf_path: Path to the original PDF file.
    - table_label: Label to identify table nodes (default is 'Table').
    
    Returns:
    - A list of page indices where tables are located.
    """
    parser = openparse.DocumentParser()
    parsed_basic_doc = parser.parse(pdf_path)
    
    # Step 3: Extract the page indices where tables are found
    table_pages = set()  # Using a set to avoid duplicate pages
    for node in parsed_basic_doc.nodes:
        if node['label'] == table_label:  # Check if the node label is 'Table'
            table_pages.add(node['page_idx'])  # Add page index to the set
    
    return list(table_pages)  # Convert to list

# Step 4: Connect to the function to extract and save the table pages
def extract_and_save_table_pages(pdf_path, table_pages, output_pdf_path):
    """
    Extract pages containing tables based on table_pages list and save them as a single-page PDF.
    
    Parameters:
    - pdf_path: Path to the original PDF file.
    - table_pages: A list of page indices containing tables.
    - output_pdf_path: Path to save the extracted table pages as separate PDFs.
    """
    # Open the PDF file
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        
        # Create a PDF writer object to save individual table pages
        pdf_writer = PyPDF2.PdfWriter()

        # Loop through the table pages
        for page_num in table_pages:
            page = pdf_reader.pages[page_num - 1]  # Subtract 1 because page numbers are 0-indexed
            pdf_writer.add_page(page)
        
        # Write the extracted pages to the new PDF
        with open(output_pdf_path, 'wb') as output_pdf:
            pdf_writer.write(output_pdf)

# Example usage:
basic_doc_path = 'input.pdf'  # Path to your original PDF
output_pdf_path = 'output_table_pages.pdf'  # Path to save the PDF with only table pages

# Extract the page indices containing tables using openparse
table_pages = extract_table_pages_with_openparse(basic_doc_path)

# Save the extracted table pages to a new PDF
extract_and_save_table_pages(basic_doc_path, table_pages, output_pdf_path)


In [15]:
import openparse
import pandas as pd

# Step 1: Parse the PDF and save layout to Excel (This part assumes you already have this done)
pdf_bytes = open('pic2.pdf', 'rb').read()
parser = openparse.DocumentParser()
parsed_basic_doc = parser.parse(pdf_bytes)
parsed_basic_doc.to_excel('output_pic2.xlsx', index=False)

# Step 2: Extract table page data from the Excel file
def extract_table_data_from_excel(file_path):
    """
    Extract only the rows labeled as 'Table' from an Excel file and return the data in matrix format
    containing page_idx, bbox, label.
    """
    # Load the Excel file
    df = pd.read_excel(file_path)
    
    # Filter for rows labeled as 'Table'
    table_df = df[df['label'] == 'Table']
    
    # Select necessary columns
    table_matrix = table_df[['page_idx', 'bbox', 'label']]
    
    return table_matrix

# Step 3: Get the maximum bounding box for pages containing tables
def get_maximum_bounding_box(parsed_basic_doc, table_pages):
    """
    Get the maximum bounding box across all the bounding boxes for each table page.

    Parameters:
    - parsed_basic_doc: The parsed document object.
    - table_pages: A list of page indices where tables are located.
    
    Returns:
    - A dictionary with page indices as keys and max bounding boxes as values.
    """
    # Initialize a dictionary to store max bounding boxes for each page
    page_bboxes = {}

    # Iterate through all the nodes in the parsed document
    for node in parsed_basic_doc.nodes:
        page_idx = node['page_idx']
        
        if page_idx in table_pages:  # Process only pages with tables
            # Initialize variables to store the minimum x0, y0 and maximum x1, y1
            x0_min, y0_min = float('inf'), float('inf')
            x1_max, y1_max = float('-inf'), float('-inf')

            # Extract bounding box list from node
            if hasattr(node, 'bbox'):
                bbox_list = node.bbox  # Assuming bbox is a list of Bbox objects

                # Iterate over bbox_list (in case there are multiple Bbox objects)
                for bbox in bbox_list:
                    x0, y0, x1, y1 = bbox.x0, bbox.y0, bbox.x1, bbox.y1
                    
                    # Update the min and max values for x0, y0, x1, y1
                    x0_min = min(x0_min, x0)
                    y0_min = min(y0_min, y0)
                    x1_max = max(x1_max, x1)
                    y1_max = max(y1_max, y1)
                
                # Store the bounding box for the current page
                page_bboxes[page_idx] = (x0_min, y0_min, x1_max, y1_max)

    return page_bboxes

# Step 4: Save the extracted table pages into a new PDF
def extract_and_save_table_pages_with_bbox(pdf_path, table_matrix, parsed_basic_doc, output_pdf_path):
    """
    Extract pages containing tables based on the table_matrix and save them as a single-page PDF.
    
    Parameters:
    - pdf_path: Path to the original PDF file.
    - table_matrix: A DataFrame containing page_idx and other information about the table locations.
    - parsed_basic_doc: The parsed document object from openparse.
    - output_pdf_path: Path to save the extracted table pages as separate PDFs.
    """
    # Open the PDF file
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        
        # Create a PDF writer object to save individual table pages
        pdf_writer = PyPDF2.PdfWriter()

        # Get the unique page numbers where tables are located
        table_pages = table_matrix['page_idx'].unique()

        # Get the maximum bounding box for each table-containing page
        max_bboxes = get_maximum_bounding_box(parsed_basic_doc, table_pages)

        # Loop through the table pages
        for page_num in table_pages:
            page = pdf_reader.pages[page_num]  # Use the page index directly
            pdf_writer.add_page(page)
            
            # Optionally, print or log the bounding box information
            if page_num in max_bboxes:
                print(f"Page {page_num} Max Bounding Box:", max_bboxes[page_num])
        
        # Write the extracted pages to the new PDF
        with open(output_pdf_path, 'wb') as output_pdf:
            pdf_writer.write(output_pdf)

# Example usage:
excel_file_path = 'output_pic2.xlsx'  # Path to the Excel file generated earlier
pdf_path = 'pic2.pdf'  # Path to your original PDF
output_pdf_path = 'output_table_pages_with_bbox.pdf'  # Path to save the PDF with only table pages

# Step 5: Extract table page information from the Excel file
filtered_table_matrix = extract_table_data_from_excel(excel_file_path)

# Step 6: Save the extracted table pages into a new PDF and get bounding box info
extract_and_save_table_pages_with_bbox(pdf_path, filtered_table_matrix, parsed_basic_doc, output_pdf_path)

print(f"Table pages have been extracted and saved to {output_pdf_path}")


AttributeError: 'bytes' object has no attribute 'pages'

Cropped PDF saved at: pic1_output.pdf
