From db2f0808f3d7ca62b2da49e94b61c6f419118463 Mon Sep 17 00:00:00 2001 From: Jake Stockwin Date: Thu, 16 Apr 2020 16:13:55 +0100 Subject: [PATCH] [tables] Improve performance of extract_simple_table Closes #62 --- CHANGELOG.md | 3 +++ py_pdf_parser/tables.py | 24 ++++++++++++++---------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 97f7abd0..50ed2397 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed - Font sizes are now `float` not `int`. The `font_size_precision` in the additions defaults to 1, and as such all fonts will change to have a single decimal place. To keep the old behaviour, you can pass `font_size_precision=0` when instantiating your PDFDocument. +### Fixed +- Improved performance of `extract_simple_table`, which is now much faster. ([#65](https://github.com/optimor/py-pdf-parser/pull/65)) + ## [0.1.0] - 2019-04-08 ### Added - Initial version of the product. Note: The version is less than 1, so this product should not yet be considered stable. API changes and other breaking changes are possible, if not likely. diff --git a/py_pdf_parser/tables.py b/py_pdf_parser/tables.py index 961248c8..00bd0eef 100644 --- a/py_pdf_parser/tables.py +++ b/py_pdf_parser/tables.py @@ -82,18 +82,22 @@ def extract_simple_table( reference_element, inclusive=True, tolerance=tolerance, all_pages=True ) + reference_columns = [ + elements.vertically_in_line_with( + element, inclusive=True, tolerance=tolerance, all_pages=True + ) + for element in reference_row + ] + reference_rows = [ + elements.horizontally_in_line_with(element, inclusive=True, tolerance=tolerance) + for element in reference_column + ] + table: List[List] = [] - for reference_column_element in reference_column: + for current_row in reference_rows: row: List = [] - for reference_row_element in reference_row: - element = elements.horizontally_in_line_with( - reference_column_element, inclusive=True, tolerance=tolerance - ).vertically_in_line_with( - reference_row_element, - inclusive=True, - tolerance=tolerance, - all_pages=True, - ) + for current_column in reference_columns: + element = current_row & current_column try: row.append(element.extract_single_element()) except NoElementFoundError as err: