In [None]:
pip install openparse

Note: you may need to restart the kernel to use updated packages.


In [None]:
!pip install -q transformers
!pip install -q easyocr

## Load model

Next, we load a Table Transformer pre-trained for table detection. We use the "no_timm" version here to load the checkpoint with a Transformers-native backbone.

In [3]:
from transformers import AutoModelForObjectDetection

model = AutoModelForObjectDetection.from_pretrained("microsoft/table-transformer-detection", revision="no_timm")

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
model.config.id2label

{0: 'table', 1: 'table rotated'}

In [5]:
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
print("")




## Load image

Next, we can load a PDF image.

In [6]:
pip install torchvision

Note: you may need to restart the kernel to use updated packages.


## Prepare image for the model

Preparing the image for the model can be done as follows:

In [7]:
from torchvision import transforms

class MaxResize(object):
    def __init__(self, max_size=800):
        self.max_size = max_size

    def __call__(self, image):
        width, height = image.size
        current_max_size = max(width, height)
        scale = self.max_size / current_max_size
        resized_image = image.resize((int(round(scale*width)), int(round(scale*height))))

        return resized_image

detection_transform = transforms.Compose([
    MaxResize(800),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

In [8]:
pixel_values = detection_transform(image).unsqueeze(0)
pixel_values = pixel_values.to(device)
print(pixel_values.shape)

NameError: name 'image' is not defined

## Forward pass

Next, we forward the pixel values through the model. The model outputs logits of shape (batch_size, num_queries, num_labels + 1). The +1 is for the "no object" class.

In [None]:
import torch

with torch.no_grad():
  outputs = model(pixel_values)

In [None]:
outputs.logits.shape

torch.Size([1, 15, 3])

In [None]:
import openparse
basic_doc_path = "pic2.pdf"
parser = openparse.DocumentParser()
parsed_basic_doc=parser.parse(basic_doc_path)

for node in parsed_basic_doc.nodes:
    print(node)

Ignoring wrong pointing object 5 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)


id_='b92cea6d-1953-4779-a325-11650dfcc850' elements=(TextElement(text='**Notes**', lines=(LineElement(bbox=(285.3, 692.51, 314.28, 701.51), spans=(TextSpan(text='Notes ', is_bold=True, is_italic=False, size=8.99),), style=None, text='**Notes**'),), bbox=Bbox(page=0, page_height=842.0, page_width=595.0, x0=285.3, y0=692.51, x1=314.28, y1=701.51), variant=<NodeVariant.TEXT: 'text'>, embed_text='**Notes**'), TextElement(text='**Mountain Beverages Limited**\n**Business Report and Financial Statements**\n**For the Year ended 30 June 2021**', lines=(LineElement(bbox=(80.82, 759.59, 212.75, 768.59), spans=(TextSpan(text='Mountain Beverages Limited ', is_bold=True, is_italic=False, size=8.99),), style=None, text='**Mountain Beverages Limited**'), LineElement(bbox=(80.82, 748.65, 274.29, 757.64), spans=(TextSpan(text='Business Report and Financial Statements ', is_bold=True, is_italic=False, size=8.99),), style=None, text='**Business Report and Financial Statements**'), LineElement(bbox=(80.82,

In [None]:
import pandas as pd

# Initialize variables to store the minimum x0, y0 and maximum x1, y1
x0_min = float('inf')
y0_min = float('inf')
x1_max = float('-inf')
y1_max = float('-inf')

# Initialize variables for page width and height (assuming all pages have the same dimensions)
page_width = None
page_height = None

# Initialize an empty list to store the bbox coordinates
bbox_data = []

# Iterate through all the nodes in the parsed document
for node in parsed_basic_doc.nodes:
    # Check if the node has a bbox attribute
    if hasattr(node, 'bbox'):
        bbox_list = node.bbox  # Assuming bbox is a list of Bbox objects

        # Iterate over bbox_list (in case there are multiple Bbox objects)
        for bbox in bbox_list:
            # Extract page dimensions if not already set
            if page_width is None or page_height is None:
                page_width = bbox.page_width
                page_height = bbox.page_height
            
            # Extract x0, y0, x1, y1 from the Bbox object
            x0, y0, x1, y1 = bbox.x0, bbox.y0, bbox.x1, bbox.y1
            
            # Append the values to bbox_data list
            bbox_data.append([x0, y0, x1, y1])
            
            # Update the min and max values for x0, y0, x1, y1
            x0_min = min(x0_min, x0)
            y0_min = min(y0_min, y0)
            x1_max = max(x1_max, x1)
            y1_max = max(y1_max, y1)

# Create a DataFrame from the bbox_data list
df_bbox = pd.DataFrame(bbox_data, columns=['x0', 'y0', 'x1', 'y1'])

# Print the DataFrame
print(df_bbox)

# Ensure the min and max bounding box values are bounded by the page dimensions
x0_min_bounded = max(0, x0_min)  # x0_min should not be less than 0
y0_min_bounded = max(0, y0_min)  # y0_min should not be less than 0
x1_max_bounded = min(page_width, x1_max)  # x1_max should not exceed page_width
y1_max_bounded = min(page_height, y1_max)  # y1_max should not exceed page_height

# Optionally, adjust the bounding box by a margin (e.g., 100 units)
x0_min_adjusted = max(0, x0_min_bounded)
y0_min_adjusted = max(0, y0_min_bounded+10)
x1_max_adjusted = min(page_width, x1_max_bounded)
y1_max_adjusted = min(page_height, y1_max_bounded-100)

# Create the max bounding box as a tuple
max_bbox = (x0_min_adjusted, y0_min_adjusted, x1_max_adjusted, y1_max_adjusted)

# Print the maximum bounding box
print("Maximum Bounding Box:", max_bbox)


          x0        y0         x1         y1
0   80.82000  544.8200  314.28000  768.59000
1  362.03000  544.8200  416.52000  701.18000
2  445.26000  544.8200  499.75000  701.18000
3   80.82000  279.5900  499.75000  513.19000
4   80.82000  197.4400  506.78000  247.93000
5   80.92979  178.5909  508.86989  219.21813
Maximum Bounding Box: (80.82, 188.5909, 508.86988999999994, 668.59)


In [None]:
import fitz  # PyMuPDF

# Define the bounding box for cropping
crop_bbox = max_bbox

# Open the PDF
pdf_path = "pic1.pdf"
doc = fitz.open(pdf_path)

# Loop through each page and apply the crop
for page_num in range(doc.page_count):
    page = doc.load_page(page_num)  # Load the page
    # Define a rectangle for the crop (left, bottom, right, top)
    rect = fitz.Rect(crop_bbox[0], crop_bbox[1], crop_bbox[2], crop_bbox[3])
    
    # Set the page crop box to this rectangle
    page.set_cropbox(rect)

# Save the cropped PDF to a new file
cropped_pdf_path = "pic1_output.pdf"
cropped_pdf= doc.save(cropped_pdf_path)
doc.close()

print(f"Cropped PDF saved at: {cropped_pdf_path}")


Cropped PDF saved at: pic1_output.pdf


In [None]:
import fitz  # PyMuPDF
from PIL import Image

# Define the bounding box for cropping
crop_bbox = max_bbox  # This should be a list or tuple: (left, bottom, right, top)

# Open the PDF
pdf_path = "pic1.pdf"
doc = fitz.open(pdf_path)

# Loop through each page and apply the crop
for page_num in range(doc.page_count):
    page = doc.load_page(page_num)  # Load the page
    # Define a rectangle for the crop (left, bottom, right, top)
    rect = fitz.Rect(crop_bbox[0], crop_bbox[1], crop_bbox[2], crop_bbox[3])
    
    # Set the page crop box to this rectangle
    page.set_cropbox(rect)
    
    # Get the cropped page as a pixmap (an image representation)
    pix = page.get_pixmap()
    
    # Convert the pixmap to an image using PIL
    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    
    # Save the image as a PNG file
    png_output_path = f"pic1_cropped_page{page_num + 1}.png"
    img.save(png_output_path)

    print(f"Cropped image saved at: {png_output_path}")

# Close the document
doc.close()


Cropped image saved at: pic1_cropped_page1.png


In [None]:
doc_with_tables_path ="pic1_output.pdf"

parser = openparse.DocumentParser(
    table_args ={"parsing_algorithm":"table-transformers"}
)
parsed_doc2 =parser.parse(doc_with_tables_path)

display(parsed_doc2)
for node in parsed_doc2.nodes:
    display(node)

ParsedDocument(id_='4ac4c8d0-7f50-4abc-ab05-e7430be0a870', nodes=[Node(id_='22fa4b27-47ee-475b-b9a5-63b5ac901e02', elements=(ImageElement(text='', bbox=Bbox(page=0, page_height=842.0, page_width=595.0, x0=28.78030000000001, y0=41.0, x1=566.2197, y1=801.0), image='/9j/4AAQSkZJRgABAgEBLAEsAAD/xAAfAAABBQEBAQEBAQAAAAAAAAAAAQIDBAUGBwgJCgv/2wBDABoSFBcUEBoXFRceHBofKEIrKCQkKFE6PTBCYFVlZF9VXVtqeJmCanGRc1tdhbWHkZ6jq62rZ4C8ybqmyJmoq6X/xAC1EAACAQMDAgQDBQUEBAAAAX0BAgMABBEFEiExQQYTUWEHInEUMoGRoQgjQrHBFVLR8CQzYnKCCQoWFxgZGiUmJygpKjQ1Njc4OTpDREVGR0hJSlNUVVZXWFlaY2RlZmdoaWpzdHV2d3h5eoOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4eLj5OXm5+jp6vHy8/T19vf4+fr/2wBDARweHigjKE4rK06lbl1upaWlpaWlpaWlpaWlpaWlpaWlpaWlpaWlpaWlpaWlpaWlpaWlpaWlpaWlpaWlpaWlpaX/xAAfAQADAQEBAQEBAQEBAAAAAAAAAQIDBAUGBwgJCgv/xAC1EQACAQIEBAMEBwUEBAABAncAAQIDEQQFITEGEkFRB2FxEyIygQgUQpGhscEJIzNS8BVictEKFiQ04SXxFxgZGiYnKCkqNTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqCg4SFhoeIiYqSk5SVlpeYmZqio6Slpqeoqaqys7S1tre4

In [None]:
import fitz  # PyMuPDF
import pandas as pd
from pdf2image import convert_from_path
from PIL import Image
import openparse  # Assuming this is the library you're using for parsing and OCR

# First, extract the bounding box coordinates from your previous step
max_bbox = (x0_min_adjusted, y0_min_adjusted, x1_max_adjusted, y1_max_adjusted)

# Now, convert the PDF pages into images to apply OCR on the area within `max_bbox`
pdf_path = "pic1_output.pdf"

# Convert the PDF to images (1 image per page)
pages = convert_from_path(pdf_path)

# Loop through each page, crop to the bounding box, and perform OCR
cropped_images = []
for i, page in enumerate(pages):
    # Crop the image to the bounding box (max_bbox)
    cropped_image = page.crop((max_bbox[0], max_bbox[1], max_bbox[2], max_bbox[3]))
    
    # Optionally save or display the cropped image
    # cropped_image.show()  # If you want to view the cropped area
    
    # Append the cropped image for OCR processing
    cropped_images.append(cropped_image)

# Now, apply OCR on each cropped image using openparse or another OCR tool
parsed_results = []
for cropped_image in cropped_images:
    # Assuming openparse can take images directly, or you could save and pass it as a PDF
    # OCR parsing (this step will vary depending on how openparse handles images)
    parsed_doc = openparse.DocumentParser(
        table_args={"parsing_algorithm": "table-transformers"}
    )
    
    # Assuming parser.parse can accept images directly, if not, save the image first
    # You may need to convert image back to PDF or process as image for OCR
    # parsed_result = parsed_doc.parse(cropped_image)  # Adjust this line for actual usage
    
    parsed_results.append(parsed_doc)

# Display the OCR results
for parsed_result in parsed_results:
    display(parsed_result)  # For Jupyter, you can directly display the parsed output

# If the parsed document contains nodes with bbox, loop through and display the nodes
for parsed_doc2 in parsed_results:
    for node in parsed_doc2.nodes:
        display(node)


<openparse.doc_parser.DocumentParser at 0x17a1fc940>

AttributeError: 'DocumentParser' object has no attribute 'nodes'

In [None]:
import fitz  # PyMuPDF
import pandas as pd

# Define the bounding box for cropping
crop_bbox = max_bbox

# Open the PDF
pdf_path = "pic1.pdf"
doc = fitz.open(pdf_path)

# Initialize an empty list to store the cropping details for each page
crop_data = []

# Loop through each page and apply the crop
for page_num in range(doc.page_count):
    page = doc.load_page(page_num)  # Load the page
    # Define a rectangle for the crop (left, bottom, right, top)
    rect = fitz.Rect(crop_bbox[0], crop_bbox[1], crop_bbox[2], crop_bbox[3])
    
    # Set the page crop box to this rectangle
    page.set_cropbox(rect)
    
    # Store the crop details for the current page
    crop_data.append({
        'Page Number': page_num + 1,
        'x0': crop_bbox[0],
        'y0': crop_bbox[1],
        'x1': crop_bbox[2],
        'y1': crop_bbox[3]
    })

# Save the cropped PDF to a new file
cropped_pdf_path = "pic1_output.pdf"
doc.save(cropped_pdf_path)
doc.close()

# Create a DataFrame from the crop_data
df_crop_data = pd.DataFrame(crop_data)

# Save the DataFrame to a CSV file
output_csv_path = "pic1_crop_data.csv"
df_crop_data.to_csv(output_csv_path, index=False)

# Print the cropped PDF path and the DataFrame
print(f"Cropped PDF saved at: {cropped_pdf_path}")
print(f"Crop details saved in CSV at: {output_csv_path}")

# Display the DataFrame
print(df_crop_data)


Cropped PDF saved at: pic1_output.pdf
Crop details saved in CSV at: pic1_crop_data.csv
   Page Number     x0        y0         x1      y1
0            1  80.82  188.5909  508.86989  668.59


## Load structure recognition model

Next, we load a Table Transformer pre-trained for table structure recognition.

In [None]:
from transformers import TableTransformerForObjectDetection

# new v1.1 checkpoints require no timm anymore
structure_model = TableTransformerForObjectDetection.from_pretrained("microsoft/table-structure-recognition-v1.1-all")
structure_model.to(device)
print("")




In [None]:
structure_transform = transforms.Compose([
    MaxResize(1000),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

In [None]:
pixel_values = structure_transform(df_crop_data).unsqueeze(0)
pixel_values = pixel_values.to(device)
print(pixel_values.shape)

TypeError: cannot unpack non-iterable int object

In [None]:
# forward pass
with torch.no_grad():
  outputs = structure_model(pixel_values)

In [None]:
# update id2label to include "no object"
structure_id2label = structure_model.config.id2label
structure_id2label[len(structure_id2label)] = "no object"

cells = outputs_to_objects(outputs, cropped_table.size, structure_id2label)
print(cells)

In [9]:
## Visualize cells

We can visualize all recognized cells using PIL's ImageDraw module.

SyntaxError: unterminated string literal (detected at line 3) (127101817.py, line 3)

In [None]:
from PIL import ImageDraw

cropped_table_visualized = cropped_table.copy()
draw = ImageDraw.Draw(cropped_table_visualized)

for cell in cells:
    draw.rectangle(cell["bbox"], outline="red")

cropped_table_visualized

In [56]:
# forward pass
with torch.no_grad():
  outputs = structure_model(pixel_values)

In [58]:
# update id2label to include "no object"
structure_id2label = structure_model.config.id2label
structure_id2label[len(structure_id2label)] = "no object"

cells = outputs_to_objects(outputs, cropped_table.size, structure_id2label)
print(cells)

NameError: name 'outputs_to_objects' is not defined

## Visualize cells

We can visualize all recognized cells using PIL's ImageDraw module.

In [69]:
def get_cell_coordinates_by_row(table_data):
    # Extract rows and columns
    rows = [entry for entry in table_data if entry['label'] == 'table row']
    columns = [entry for entry in table_data if entry['label'] == 'table column']

    # Sort rows and columns by their Y and X coordinates, respectively
    rows.sort(key=lambda x: x['bbox'][1])
    columns.sort(key=lambda x: x['bbox'][0])

    # Function to find cell coordinates
    def find_cell_coordinates(row, column):
        cell_bbox = [column['bbox'][0], row['bbox'][1], column['bbox'][2], row['bbox'][3]]
        return cell_bbox

    # Generate cell coordinates and count cells in each row
    cell_coordinates = []

    for row in rows:
        row_cells = []
        for column in columns:
            cell_bbox = find_cell_coordinates(row, column)
            row_cells.append({'column': column['bbox'], 'cell': cell_bbox})

        # Sort cells in the row by X coordinate
        row_cells.sort(key=lambda x: x['column'][0])

        # Append row information to cell_coordinates
        cell_coordinates.append({'row': row['bbox'], 'cells': row_cells, 'cell_count': len(row_cells)})

    # Sort rows from top to bottom
    cell_coordinates.sort(key=lambda x: x['row'][1])

    return cell_coordinates

cell_coordinates = get_cell_coordinates_by_row(cells)

NameError: name 'cells' is not defined

In [68]:
import numpy as np
import csv
import easyocr
from tqdm.auto import tqdm

reader = easyocr.Reader(['en']) # this needs to run only once to load the model into memory

def apply_ocr(cell_coordinates):
    # let's OCR row by row
    data = dict()
    max_num_columns = 0
    for idx, row in enumerate(tqdm(cell_coordinates)):
      row_text = []
      for cell in row["cells"]:
        # crop cell out of image
        cell_image = np.array(cropped_table.crop(cell["cell"]))
        # apply OCR
        result = reader.readtext(np.array(cell_image))
        if len(result) > 0:
          # print([x[1] for x in list(result)])
          text = " ".join([x[1] for x in result])
          row_text.append(text)

      if len(row_text) > max_num_columns:
          max_num_columns = len(row_text)

      data[idx] = row_text

    print("Max number of columns:", max_num_columns)

    # pad rows which don't have max_num_columns elements
    # to make sure all rows have the same number of columns
    for row, row_data in data.copy().items():
        if len(row_data) != max_num_columns:
          row_data = row_data + ["" for _ in range(max_num_columns - len(row_data))]
        data[row] = row_data

    return data

data = apply_ocr(cell_coordinates)

for row, row_data in data.items():
    print(row_data)

NameError: name 'cell_coordinates' is not defined

In [None]:
import csv

with open('output.csv','w') as result_file:
    wr = csv.writer(result_file, dialect='excel')

    for row, row_text in data.items():
      wr.writerow(row_text)

In [None]:
import pandas as pd

df = pd.read_csv("output.csv")
df.head()