Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix:table detection #174

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 43 additions & 31 deletions borb/toolkit/table/table_detection_by_lines.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,23 +52,46 @@ def __init__(self):
#
# PRIVATE
#
def _determine_sorted_lines_end_points(
    self,
    lines_in_table: typing.List[LineSegment],
    min_dist: Decimal = Decimal(1),
) -> typing.Tuple[typing.List[Decimal], typing.List[Decimal]]:
    """
    Collect the end-point coordinates of the given lines and collapse
    coordinates that lie (almost) on top of each other.

    Every line contributes x0 and x1 to the x-coordinates, and y0 and y1 to
    the y-coordinates. Each collection is sorted ascending and then filtered:
    a coordinate is kept only when it is strictly more than `min_dist` larger
    than the previously kept coordinate.

    :param lines_in_table:  the lines (each exposing x0, y0, x1, y1) to inspect
    :param min_dist:        two coordinates whose difference is less than or equal
                            to this value are treated as one (default: 1)
    :return:                a tuple (xs, ys) of sorted, filtered coordinates;
                            both lists are empty when no lines are given
    """

    def _collapse(values: typing.Iterable[Decimal]) -> typing.List[Decimal]:
        # walk the values in ascending order; the comparison is made against
        # the last value that was *kept*, not against the previous raw value
        kept: typing.List[Decimal] = []
        for value in sorted(values):
            if not kept or value - kept[-1] > min_dist:
                kept.append(value)
        return kept

    # take out all xs / ys
    whole_xs: typing.List[Decimal] = []
    whole_ys: typing.List[Decimal] = []
    for line in lines_in_table:
        whole_xs.extend((Decimal(line.x0), Decimal(line.x1)))
        whole_ys.extend((Decimal(line.y0), Decimal(line.y1)))

    # filter xs / ys based on distance
    return _collapse(whole_xs), _collapse(whole_ys)
#
# PRIVATE
#

def _determine_number_of_rows_and_columns(
    self, lines_in_table: typing.List[LineSegment]
) -> typing.Tuple[int, int]:
    """
    Derive the number of rows and columns of the table spanned by the given lines.

    :param lines_in_table:  the lines that make up the table
    :return:                a tuple (number_of_rows, number_of_cols)
    """
    # determine the end points of the lines
    xs, ys = self._determine_sorted_lines_end_points(lines_in_table)

    # determine number of rows/cols:
    # N distinct boundaries delimit N - 1 cells; rows are delimited by the
    # y-coordinates, columns by the x-coordinates
    number_of_rows: int = len(ys) - 1
    number_of_cols: int = len(xs) - 1

    # return
    return number_of_rows, number_of_cols
Expand All @@ -93,32 +116,23 @@ def _determine_table_bounding_box(
def _determine_table_cell_boundaries(
self, lines_in_table: typing.List[LineSegment]
) -> Table:
# keep track of unique xs / ys (to derive number of rows/cols)
unique_xs: typing.Set[int] = set()
unique_ys: typing.Set[int] = set()

for l in lines_in_table:
unique_xs.add(int(l.x0))
unique_xs.add(int(l.x1))
unique_ys.add(int(l.y0))
unique_ys.add(int(l.y1))
# determine the end points of the lines
xs: typing.List[Decimal]
ys: typing.List[Decimal]
xs, ys = self._determine_sorted_lines_end_points(lines_in_table)

# determine number of rows and cols
number_of_rows: int = len(unique_ys) - 1
number_of_cols: int = len(unique_xs) - 1

# sort unique_xs and unique_ys
xs: typing.List[Decimal] = sorted([Decimal(x) for x in unique_xs])
ys: typing.List[Decimal] = sorted([Decimal(y) for y in unique_ys])
number_of_rows: int = len(ys) - 1
number_of_cols: int = len(xs) - 1

# find neighbouring cells and join wherever appropriate
ds: disjointset = disjointset()
for i in range(0, number_of_rows):
for j in range(0, number_of_cols):
ds.add((i, j))

for c in range(0, len(xs) - 1):
for r in range(0, len(ys) - 1):
for c in range(0, number_of_cols):
for r in range(0, number_of_rows):
if c + 2 < len(xs):
logger.debug(
"attempting to merge [%d %d] with its right neighbour" % (r, c)
Expand Down Expand Up @@ -169,9 +183,7 @@ def _determine_table_cell_boundaries(
# check whether all areas are rectangular
for i in range(min_col, max_col):
for j in range(min_row, max_row):
assert (
j * number_of_rows + i
) in v, "Non-rectangular area detected in table."
assert (i, j) in v, "Non-rectangular area detected in table."

# create TableCell
tc: TableCell = TableCell(
Expand Down