In [1]:
pip install openparse

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import openparse

In [3]:
# Loading latest version of pdf_parser
# auto reloads the module if it has been changed

from pdf_parser import PdfParser

parser = PdfParser()

  from .autonotebook import tqdm as notebook_tqdm


Initializing Docment Parser......
Loading Surya models......
Loaded detection model vikp/surya_layout3 on device mps with dtype torch.float16
Loaded detection model vikp/surya_det3 on device mps with dtype torch.float16
Loaded reading order model vikp/surya_order on device mps with dtype torch.float16
Initializing Tesseract......


In [16]:
# Read the source PDF into memory; the context manager closes the file handle
# as soon as the bytes are read instead of leaving it to garbage collection.
with open('11_AuditedStatements.pdf', 'rb') as pdf_file:
    pdf_bytes = pdf_file.read()

# Parse the PDF and write the parser's result to Excel for the later cells
pdf_layout = parser.parse_pdf(pdf_bytes)
pdf_layout.to_excel('output_1002.xlsx', index=False)


Detecting bboxes: 100%|██████████| 2/2 [00:07<00:00,  3.54s/it]
Detecting bboxes: 100%|██████████| 2/2 [00:07<00:00,  3.85s/it]
Finding reading order: 100%|██████████| 3/3 [00:13<00:00,  4.46s/it]


In [17]:
def extract_table_data_from_excel(file_path):
    """
    Load an Excel sheet and return only its rows labeled 'Table'.

    Parameters:
    - file_path: Path to an Excel file that has at least the columns
      'page_idx', 'bbox' and 'label'.

    Returns:
    - A DataFrame with the columns 'page_idx', 'bbox' and 'label' (in that
      order) for every row whose 'label' equals 'Table'. The original row
      index is preserved; any other columns in the sheet are dropped.

    Raises:
    - KeyError: if one of the three required columns is missing.
    """
    # Load the Excel file
    df = pd.read_excel(file_path)

    # Row filter and column projection in a single .loc call
    is_table = df['label'] == 'Table'
    return df.loc[is_table, ['page_idx', 'bbox', 'label']]

# Re-load the spreadsheet written by the parsing cell and keep only its 'Table' rows

excel_file_path = 'output_1002.xlsx'  # Same file name the parsing cell passed to to_excel()
filtered_table_matrix = extract_table_data_from_excel(excel_file_path)
filtered_table_matrix

Unnamed: 0,page_idx,bbox,label
2,1,"(176, 141, 1074, 710)",Table
80,7,"(96, 198, 1101, 755)",Table
83,8,"(89, 176, 1121, 1331)",Table
88,9,"(90, 174, 1204, 621)",Table
91,10,"(89, 146, 1126, 1051)",Table
121,12,"(153, 1078, 635, 1218)",Table
130,13,"(113, 194, 1122, 1500)",Table
135,14,"(147, 209, 1100, 425)",Table
137,14,"(135, 492, 1115, 917)",Table
141,14,"(147, 1041, 1100, 1237)",Table


In [19]:
import fitz  # PyMuPDF
import os

def extract_and_save_table_pages_separately(pdf_path, table_pages, output_folder):
    """
    Extract each page containing a table and save it as a separate single-page PDF.

    Parameters:
    - pdf_path: Path to the original PDF file.
    - table_pages: Iterable of page indices where tables are located. Each value
      is converted with int() once and then used for loading the page, copying
      it and naming the output file, so integer-valued floats or NumPy integers
      are accepted. The index is passed to doc.load_page() as-is; the output
      file name uses index + 1.
    - output_folder: Folder to save the individual page PDFs (created if missing).

    Returns:
    - A list of paths to the saved individual PDFs, in the order of table_pages.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists() test
    os.makedirs(output_folder, exist_ok=True)

    # List to store the paths of saved PDFs
    saved_pdfs = []

    # Open the original PDF; try/finally guarantees it is closed even if a page fails
    doc = fitz.open(pdf_path)
    try:
        for raw_page_idx in table_pages:
            # Normalise once so load_page, show_pdf_page and the file name all
            # see the same plain int (previously only load_page got the cast).
            page_idx = int(raw_page_idx)
            page = doc.load_page(page_idx)

            # Create a new PDF with just this single page
            single_page_pdf = fitz.open()
            try:
                target_page = single_page_pdf.new_page(
                    width=page.rect.width, height=page.rect.height
                )
                # Copy the source page content onto the blank page
                target_page.show_pdf_page(page.rect, doc, page_idx)

                output_pdf_path = os.path.join(output_folder, f"table_page_{page_idx + 1}.pdf")
                single_page_pdf.save(output_pdf_path)
            finally:
                single_page_pdf.close()

            # Only record paths that were actually written
            saved_pdfs.append(output_pdf_path)
    finally:
        doc.close()

    return saved_pdfs

# Example usage:
pdf_path = "11_AuditedStatements.pdf"
table_pages = filtered_table_matrix['page_idx'].unique().tolist()  # Distinct page indices as a plain list, so a page with several tables is exported once
output_folder = "extracted_table_pages"  # Folder where individual PDFs will be saved

# Extract and save each page containing tables as separate PDFs
saved_pdfs = extract_and_save_table_pages_separately(pdf_path, table_pages, output_folder)

print(f"Extracted PDFs saved at: {saved_pdfs}")


Extracted PDFs saved at: ['extracted_table_pages/table_page_2.pdf', 'extracted_table_pages/table_page_8.pdf', 'extracted_table_pages/table_page_9.pdf', 'extracted_table_pages/table_page_10.pdf', 'extracted_table_pages/table_page_11.pdf', 'extracted_table_pages/table_page_13.pdf', 'extracted_table_pages/table_page_14.pdf', 'extracted_table_pages/table_page_15.pdf', 'extracted_table_pages/table_page_16.pdf', 'extracted_table_pages/table_page_17.pdf', 'extracted_table_pages/table_page_18.pdf']
