Merge pull request #65 from optimor/extract-table-performance

[tables] Improve performance of extract_simple_table
jstockwin · Apr 16, 2020 · c3bf3dc
2 parents fa6c847 + db2f080
commit c3bf3dc
Show file tree

Hide file tree

Showing 2 changed files with 17 additions and 10 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -12,6 +12,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Changed
 - Font sizes are now `float` not `int`. The `font_size_precision` in the additions defaults to 1, and as such all fonts will change to have a single decimal place. To keep the old behaviour, you can pass `font_size_precision=0` when instantiating your PDFDocument.
 
+### Fixed
+- Significantly improved the performance of `extract_simple_table`. ([#65](https://github.com/optimor/py-pdf-parser/pull/65))
+
 ## [0.1.0] - 2019-04-08
 ### Added
 - Initial version of the product. Note: The version is less than 1, so this product should not yet be considered stable. API changes and other breaking changes are possible, if not likely.
diff --git a/py_pdf_parser/tables.py b/py_pdf_parser/tables.py
@@ -82,18 +82,22 @@ def extract_simple_table(
         reference_element, inclusive=True, tolerance=tolerance, all_pages=True
     )
 
+    reference_columns = [
+        elements.vertically_in_line_with(
+            element, inclusive=True, tolerance=tolerance, all_pages=True
+        )
+        for element in reference_row
+    ]
+    reference_rows = [
+        elements.horizontally_in_line_with(element, inclusive=True, tolerance=tolerance)
+        for element in reference_column
+    ]
+
     table: List[List] = []
-    for reference_column_element in reference_column:
+    for current_row in reference_rows:
         row: List = []
-        for reference_row_element in reference_row:
-            element = elements.horizontally_in_line_with(
-                reference_column_element, inclusive=True, tolerance=tolerance
-            ).vertically_in_line_with(
-                reference_row_element,
-                inclusive=True,
-                tolerance=tolerance,
-                all_pages=True,
-            )
+        for current_column in reference_columns:
+            element = current_row & current_column
             try:
                 row.append(element.extract_single_element())
             except NoElementFoundError as err: