From db2f0808f3d7ca62b2da49e94b61c6f419118463 Mon Sep 17 00:00:00 2001
From: Jake Stockwin <jstockwin@gmail.com>
Date: Thu, 16 Apr 2020 16:13:55 +0100
Subject: [PATCH] [tables] Improve performance of extract_simple_table

Closes #62
---
 CHANGELOG.md            |  3 +++
 py_pdf_parser/tables.py | 24 ++++++++++++++----------
 2 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 97f7abd0..50ed2397 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Changed
 - Font sizes are now `float` not `int`. The `font_size_precision` in the additions defaults to 1, and as such all fonts will change to have a single decimal place. To keep the old behaviour, you can pass `font_size_precision=0` when instantiating your PDFDocument.
 
+### Fixed
+- Improved performance of `extract_simple_table` by computing each reference row and column once and intersecting them, instead of re-filtering the elements for every cell. ([#65](https://github.com/optimor/py-pdf-parser/pull/65))
+
 ## [0.1.0] - 2019-04-08
 ### Added
 - Initial version of the product. Note: The version is less than 1, so this product should not yet be considered stable. API changes and other breaking changes are possible, if not likely.
diff --git a/py_pdf_parser/tables.py b/py_pdf_parser/tables.py
index 961248c8..00bd0eef 100644
--- a/py_pdf_parser/tables.py
+++ b/py_pdf_parser/tables.py
@@ -82,18 +82,22 @@ def extract_simple_table(
         reference_element, inclusive=True, tolerance=tolerance, all_pages=True
     )
 
+    reference_columns = [
+        elements.vertically_in_line_with(
+            element, inclusive=True, tolerance=tolerance, all_pages=True
+        )
+        for element in reference_row
+    ]
+    reference_rows = [
+        elements.horizontally_in_line_with(element, inclusive=True, tolerance=tolerance)
+        for element in reference_column
+    ]
+
     table: List[List] = []
-    for reference_column_element in reference_column:
+    for current_row in reference_rows:
         row: List = []
-        for reference_row_element in reference_row:
-            element = elements.horizontally_in_line_with(
-                reference_column_element, inclusive=True, tolerance=tolerance
-            ).vertically_in_line_with(
-                reference_row_element,
-                inclusive=True,
-                tolerance=tolerance,
-                all_pages=True,
-            )
+        for current_column in reference_columns:
+            element = current_row & current_column
             try:
                 row.append(element.extract_single_element())
             except NoElementFoundError as err: