In [1]:
# For parsing the PDF
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfpage import PDFPage
from pdfminer.pdftypes import resolve1
from pdfminer.psparser import PSLiteral, PSKeyword
from pdfminer.utils import decode_text

# For drawing boxes on the PDF
import fitz

In [2]:
# Helper Method to draw bounding boxes on PDF
def draw_boxes_on_pdf(input_pdf_path: str, output_pdf_path: str, pages_data: dict):
    """
    Draws a red rectangle and the field ID for every form field on a PDF document

    The annotated document is written to ``output_pdf_path``. The document
    handle is closed even when drawing or saving raises.

    Args:
        input_pdf_path: The path to the input PDF document
        output_pdf_path: The path to the output PDF document with rectangles
        pages_data: A dictionary keyed by 1-based page number. Each page entry should have:
            - 'fields': list, A list of dictionaries containing form field data. Each dictionary should have:
                - 'field_id': Identifier for the field; its str() form is written as the label
                - 'rect': dict, The coordinates {'x1', 'y1', 'x2', 'y2'} defining the field's rectangle;
                  y values are flipped against the page height before drawing
            Any other keys of a page entry (e.g. 'metadata') are not read by this function.

    Raises:
        KeyError: If a page entry has no 'fields' key or a field lacks 'rect'/'field_id'
    """
    red = (1, 0, 0)

    # Open the PDF
    doc = fitz.open(input_pdf_path)
    try:
        # Process each page
        for page_number, page_data in pages_data.items():
            page = doc[page_number - 1]  # Convert to 0-based page numbers
            # NOTE(review): the flip below assumes an unrotated page whose
            # box origin is (0, 0) -- confirm for rotated/offset pages.
            page_height = page.rect.height

            # For each field on this page, draw a rectangle and add a number
            for field in page_data['fields']:
                rect = field['rect']
                # Convert coordinates (flip vertically)
                box = fitz.Rect(
                    rect['x1'],                    # left
                    page_height - rect['y2'],      # top (flipped)
                    rect['x2'],                    # right
                    page_height - rect['y1']       # bottom (flipped)
                )

                # Draw red rectangle
                page.draw_rect(box, color=red, width=1)

                # The label is inserted at the (flipped) centre point of the box
                center_x = (rect['x1'] + rect['x2']) / 2
                center_y = page_height - ((rect['y1'] + rect['y2']) / 2)

                page.insert_text(
                    point=(center_x, center_y),
                    text=str(field['field_id']),  # Use field's ID
                    color=red,
                    fontsize=12
                )

        # Save the modified PDF
        doc.save(output_pdf_path)
    finally:
        # Always release the document, including on the error path
        doc.close()

# Minimal Example

In [3]:
# Minimal walk-through: collect name, value and rectangle of every widget
# annotation, grouped by 1-based page number, into `pages_data`.
pdf_path = 'sample-form.pdf'
pages_data = {}

with open(pdf_path, 'rb') as file:
    # Step 1: Parse the PDF document.
    parser = PDFParser(file)
    doc = PDFDocument(parser)

    if 'AcroForm' not in doc.catalog:
        # Report and skip extraction, leaving `pages_data` empty.
        # exit() is deliberately not used: inside a Jupyter kernel it requests
        # a kernel shutdown instead of cleanly stopping this cell.
        print("No AcroForm found in the PDF document.")
    else:
        # Step 2: Extract form fields from the AcroForm section of the document catalog.
        # (`fields` is kept for inspection; the loop below walks page annotations instead.)
        field_id = 1
        fields = resolve1(doc.catalog['AcroForm']).get('Fields', [])
        # Step 3: Iterate through pages.
        for page_num, page in enumerate(PDFPage.create_pages(doc)):
            page_number = page_num + 1
            # Initialize a dictionary to store field data for this page
            pages_data[page_number] = {
                'page_number': page_number,
                'fields': []
            }
            # Step 4: Process annotations on the page.
            # Step 5: Resolve annotation references.
            annots = resolve1(page.annots) if page.annots else None
            if not isinstance(annots, list):
                continue

            for annot in annots:
                # Step 6: Check if the annotation is of type 'Widget'.
                annot_obj = resolve1(annot)
                subtype = annot_obj.get('Subtype')
                if not (isinstance(subtype, PSLiteral) and subtype.name == 'Widget'):
                    continue

                # Step 7: Determine the field object.
                # Only the immediate parent is consulted in this minimal version.
                parent = annot_obj.get('Parent')
                field_obj = resolve1(parent) if parent else annot_obj

                # Step 8: Extract the field name ('T') and field value ('V')
                field_name = field_obj.get('T')
                field_value = field_obj.get('V')

                # The rectangle may be stored as an indirect reference
                rect = resolve1(annot_obj.get('Rect'))

                # Step 9: Decode and store extracted details.
                field = {
                    'field_id': field_id,
                    # A widget without a name yields None instead of crashing decode_text
                    'form_field': decode_text(field_name) if field_name else None,
                    'value': resolve1(field_value) if field_value else None,
                    'rect': {
                        'x1': int(rect[0]),
                        'y1': int(rect[1]),
                        'x2': int(rect[2]),
                        'y2': int(rect[3]),
                    }
                }

                pages_data[page_number]['fields'].append(field)
                field_id += 1

In [4]:
from pprint import pprint
# Show the per-page field dictionary built by the minimal example
pprint(pages_data)

{1: {'fields': [{'field_id': 1,
                 'form_field': 'textbox_1',
                 'rect': {'x1': 67, 'x2': 239, 'y1': 597, 'y2': 619},
                 'value': b'Just some text'},
                {'field_id': 2,
                 'form_field': 'textbox_with_format',
                 'rect': {'x1': 249, 'x2': 420, 'y1': 597, 'y2': 619},
                 'value': b'1223423424'},
                {'field_id': 3,
                 'form_field': 'dropdown',
                 'rect': {'x1': 427, 'x2': 512, 'y1': 597, 'y2': 619},
                 'value': b'option B'},
                {'field_id': 4,
                 'form_field': 'date',
                 'rect': {'x1': 67, 'x2': 217, 'y1': 541, 'y2': 563},
                 'value': b'12/10/24'},
                {'field_id': 5,
                 'form_field': 'radio button group',
                 'rect': {'x1': 236, 'x2': 254, 'y1': 542, 'y2': 560},
                 'value': /'4'},
                {'field_id': 6,
                 'for

In [5]:
# Write a copy of the PDF with every extracted field outlined in red and labelled with its field_id
draw_boxes_on_pdf(pdf_path, "form_with_bounding_boxes.pdf", pages_data)

# Full Extraction

In [6]:
# Helper Methods
def decode_value(value):
    """
    Decodes a value according to the PDF specification

    PSLiteral/PSKeyword objects are replaced by their ``name``; bytes are
    decoded with ``decode_text`` (falling back to a hex string when decoding
    fails); dicts and lists are decoded recursively. Anything else is
    returned unchanged.

    Args:
        value: The value to decode

    Returns:
        The decoded value
    """
    # decode PSLiteral, PSKeyword (the name itself may still be bytes,
    # which the next step handles)
    if isinstance(value, (PSLiteral, PSKeyword)):
        value = value.name

    # decode bytes
    if isinstance(value, bytes):
        try:
            return decode_text(value)
        except Exception:
            # Fallback to hex if decode fails. Only ordinary errors are
            # caught so KeyboardInterrupt/SystemExit still propagate.
            return value.hex()

    # handle nested structures
    if isinstance(value, dict):
        return {k: decode_value(v) for k, v in value.items()}
    if isinstance(value, list):
        return [decode_value(item) for item in value]

    return value

def get_field_type(field_obj):
    """
    Gets detailed field type information including subtypes for buttons, choice and text fields

    The subtype is derived from the field flags ('Ff'). Flag positions follow
    the interactive-form section of the PDF specification, where "bit n"
    corresponds to the mask ``1 << (n - 1)``.

    Args:
        field_obj: A PDF field object (dict-like) containing field properties and metadata

    Returns:
        dict: A dictionary with the keys
            - 'code': the raw field type ('Btn', 'Ch', 'Tx', 'Sig', ...) or None when 'FT' is absent
            - 'readable': a human-readable description of the subtype
            - 'subtype': one of 'push', 'radio', 'checkbox', 'combo', 'combo_editable',
              'list', 'list_multi', 'rich', 'file', 'password', 'multiline', 'plain',
              'signature', or None for unknown types
    """
    # Button field flags
    btn_radio = 1 << 15          # bit 16
    btn_pushbutton = 1 << 16     # bit 17
    # Choice field flags
    ch_combo = 1 << 17           # bit 18
    ch_edit = 1 << 18            # bit 19
    ch_multi_select = 1 << 21    # bit 22
    # Text field flags
    tx_multiline = 1 << 12       # bit 13
    tx_password = 1 << 13        # bit 14
    tx_file_select = 1 << 20     # bit 21
    tx_rich_text = 1 << 25       # bit 26

    ft = field_obj.get('FT')
    if not ft:
        return {'code': None, 'readable': 'Unknown', 'subtype': None}

    ft_name = ft.name if hasattr(ft, 'name') else ft

    # Coerce the flags to an int; anything unusable is treated as "no flags"
    raw_flags = field_obj.get('Ff', 0)
    if hasattr(raw_flags, 'name'):
        raw_flags = raw_flags.name
    try:
        flags = int(raw_flags)
    except (TypeError, ValueError):
        flags = 0

    # Button fields (Btn)
    if ft_name == 'Btn':
        if flags & btn_pushbutton:
            return {'code': 'Btn', 'readable': 'Push Button', 'subtype': 'push'}
        if flags & btn_radio:
            return {'code': 'Btn', 'readable': 'Radio Button', 'subtype': 'radio'}
        return {'code': 'Btn', 'readable': 'Checkbox', 'subtype': 'checkbox'}

    # Choice fields (Ch)
    if ft_name == 'Ch':
        if flags & ch_combo:  # Combo box (dropdown)
            if flags & ch_edit:
                return {'code': 'Ch', 'readable': 'Combo Box (Editable)', 'subtype': 'combo_editable'}
            return {'code': 'Ch', 'readable': 'Combo Box', 'subtype': 'combo'}
        if flags & ch_multi_select:
            return {'code': 'Ch', 'readable': 'List Box (Multi-select)', 'subtype': 'list_multi'}
        return {'code': 'Ch', 'readable': 'List Box', 'subtype': 'list'}

    # Text fields (Tx)
    if ft_name == 'Tx':
        if flags & tx_rich_text:
            return {'code': 'Tx', 'readable': 'Rich Text', 'subtype': 'rich'}
        if flags & tx_file_select:
            return {'code': 'Tx', 'readable': 'File Select', 'subtype': 'file'}
        if flags & tx_password:
            return {'code': 'Tx', 'readable': 'Password', 'subtype': 'password'}
        if flags & tx_multiline:
            return {'code': 'Tx', 'readable': 'Multiline Text', 'subtype': 'multiline'}
        return {'code': 'Tx', 'readable': 'Text', 'subtype': 'plain'}

    # Signature fields
    if ft_name == 'Sig':
        return {'code': 'Sig', 'readable': 'Signature', 'subtype': 'signature'}

    return {'code': ft_name, 'readable': f'Unknown ({ft_name})', 'subtype': None}

def get_field_options(field_obj):
    """
    Gets the available options for choice fields

    Args:
        field_obj: A PDF field object (dict-like) containing field properties and metadata

    Returns:
        list: The decoded entries of the field's 'Opt' array for choice fields
              (combo boxes and list boxes). Each entry is passed through
              ``decode_value``, so an entry that is itself an array stays a list.
              Returns None if the field is not a choice field or has no options
    """
    ft = field_obj.get('FT')
    # Same tolerant name lookup as get_field_type: FT may not carry a .name
    ft_name = ft.name if hasattr(ft, 'name') else ft
    if ft_name != 'Ch':
        return None

    options = resolve1(field_obj.get('Opt'))
    if not options or not isinstance(options, (list, tuple)):
        return None

    # Individual entries may be indirect references too, so resolve each one
    return [decode_value(resolve1(option)) for option in options]

In [7]:
# Extraction Method
def extract_form_fields(pdf_path):
    """
    Extracts form fields and their metadata from a PDF document, including page-specific details

    Every 'Widget' annotation on every page becomes one field entry. Field
    attributes that a widget does not carry itself ('FT', 'Ff', 'V', 'Opt')
    are looked up along its 'Parent' chain, and the field name is the
    period-joined chain of 'T' entries from the outermost ancestor down to
    the widget.

    Args:
        pdf_path: The path to the PDF document containing the form fields

    Returns:
        dict: Maps each 1-based page number to
            - 'metadata': page dimensions, mediabox, cropbox, rotation and page number
            - 'fields': list of field dictionaries with the keys 'field_id',
              'form_field', 'value', 'field_type', 'field_type_readable',
              'field_subtype', 'field_subtype_readable', 'options' and 'rect'
        An empty dict is returned when the document has no AcroForm.
    """
    # Labels for the top-level field type codes
    type_labels = {'Btn': 'Button', 'Tx': 'Text', 'Ch': 'Choice', 'Sig': 'Signature'}

    pages_data = {}
    field_id = 1  # running counter across all pages

    with open(pdf_path, 'rb') as file:
        parser = PDFParser(file)
        doc = PDFDocument(parser)

        if 'AcroForm' not in doc.catalog:
            return {}

        # Iterate through pages
        for page_number, page in enumerate(PDFPage.create_pages(doc), start=1):
            # Get page dimensions and rotation
            mediabox = page.mediabox
            cropbox = getattr(page, 'cropbox', mediabox)
            rotation = getattr(page, 'rotate', 0)

            # Initialize page data with metadata
            pages_data[page_number] = {
                'metadata': {
                    'dimensions': {
                        'width': int(mediabox[2] - mediabox[0]),
                        'height': int(mediabox[3] - mediabox[1])
                    },
                    'mediabox': {
                        'x1': int(mediabox[0]),
                        'y1': int(mediabox[1]),
                        'x2': int(mediabox[2]),
                        'y2': int(mediabox[3])
                    },
                    'cropbox': {
                        'x1': int(cropbox[0]),
                        'y1': int(cropbox[1]),
                        'x2': int(cropbox[2]),
                        'y2': int(cropbox[3])
                    },
                    'rotation': rotation,
                    'page_number': page_number
                },
                'fields': []
            }

            annots = resolve1(page.annots) if page.annots else None
            if not isinstance(annots, list):
                continue

            for annot in annots:
                annot_obj = resolve1(annot)
                if not isinstance(annot_obj, dict):
                    continue
                subtype = annot_obj.get('Subtype')
                if not (isinstance(subtype, PSLiteral) and subtype.name == 'Widget'):
                    continue

                # The rectangle may be an indirect reference; a widget without
                # a usable rectangle cannot be reported and is skipped.
                rect = resolve1(annot_obj.get('Rect'))
                if not rect or len(rect) < 4:
                    continue

                field_obj, field_name = _resolve_field_hierarchy(annot_obj)
                type_info = get_field_type(field_obj)
                type_code = type_info['code']
                raw_value = field_obj.get('V')

                pages_data[page_number]['fields'].append({
                    'field_id': field_id,
                    'form_field': field_name,
                    'value': decode_value(resolve1(raw_value)) if raw_value else None,
                    'field_type': type_code,
                    'field_type_readable': type_labels.get(type_code, f"Unknown ({type_code})"),
                    'field_subtype': type_info['subtype'],
                    'field_subtype_readable': type_info['readable'],
                    'options': get_field_options(field_obj),
                    'rect': {
                        'x1': int(rect[0]),
                        'y1': int(rect[1]),
                        'x2': int(rect[2]),
                        'y2': int(rect[3]),
                    }
                })
                field_id += 1

    return pages_data


def _resolve_field_hierarchy(widget_obj, max_depth=64):
    """
    Collects inherited field attributes and the qualified name for a widget

    Walks from the widget up its 'Parent' chain. For each of 'FT', 'Ff', 'V'
    and 'Opt' the value nearest to the widget wins.

    Args:
        widget_obj: The resolved widget annotation dictionary
        max_depth: Upper bound on chain length, guarding against cyclic 'Parent' links

    Returns:
        tuple: (attrs, name) where attrs is a plain dict holding the effective
               'FT'/'Ff'/'V'/'Opt' entries that were found, and name is the
               decoded 'T' entries joined with '.', or None when no 'T' exists
    """
    inherited = {}
    name_parts = []
    node = widget_obj

    for _ in range(max_depth):
        if not isinstance(node, dict):
            break
        for key in ('FT', 'Ff', 'V', 'Opt'):
            if key not in inherited and node.get(key) is not None:
                inherited[key] = node[key]
        partial_name = resolve1(node.get('T'))
        if partial_name:
            name_parts.append(decode_text(partial_name))
        node = resolve1(node.get('Parent'))

    # Parts were gathered leaf-first; the qualified name reads root-first
    name = '.'.join(reversed(name_parts)) if name_parts else None
    return inherited, name

In [8]:
# Example Usage
# Run the full extraction on the sample form, then outline the fields it found.
pdf_path = 'sample-form.pdf'
form_fields = extract_form_fields(pdf_path)
# Note: this writes to the same output path as the earlier minimal-example drawing call.
draw_boxes_on_pdf(pdf_path, "form_with_bounding_boxes.pdf", form_fields)

In [9]:
# Show the full extraction result (pprint is imported in an earlier cell)
pprint(form_fields)

{1: {'fields': [{'field_id': 1,
                 'field_subtype': 'plain',
                 'field_subtype_readable': 'Text',
                 'field_type': 'Tx',
                 'field_type_readable': 'Text',
                 'form_field': 'textbox_1',
                 'options': None,
                 'rect': {'x1': 67, 'x2': 239, 'y1': 597, 'y2': 619},
                 'value': 'Just some text'},
                {'field_id': 2,
                 'field_subtype': 'plain',
                 'field_subtype_readable': 'Text',
                 'field_type': 'Tx',
                 'field_type_readable': 'Text',
                 'form_field': 'textbox_with_format',
                 'options': None,
                 'rect': {'x1': 249, 'x2': 420, 'y1': 597, 'y2': 619},
                 'value': '1223423424'},
                {'field_id': 3,
                 'field_subtype': 'combo',
                 'field_subtype_readable': 'Combo Box',
                 'field_type': 'Ch',
                 