In [21]:
import tensorflow as tf
from tensorflow.keras.utils import plot_model
import matplotlib.pyplot as plt
import pydot
import graphviz
from script.network import TableNet

Paper (TableNet): https://arxiv.org/abs/2001.01469

### Parse Data

In [46]:
import os
import cv2
import json
import parse
import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
from pdf2image import convert_from_path

# Input PDFs and the three output folders (column / table / cell masks).
directory = './dataset/Fintabnet_data/input/'
final_col_directory = './dataset/Fintabnet_data/column_mask/'
final_table_directory = './dataset/Fintabnet_data/table_mask/'
final_cell_directory = './dataset/Fintabnet_data/cell_mask/'
max_tables = 0

# Ground-truth table; only two columns are consumed by the cells below:
# position 0 (source URL of each PDF) and position 3 (annotation JSON string).
df = pd.read_csv("dataset/Fintabnet_data/GT.csv", encoding='utf-8')
oss, ant = df.iloc[:, 0], df.iloc[:, 3]

# Template matched against each URL (query string stripped) to recover the file name.
format_string = "http://education-annotate.oss-cn-beijing.aliyuncs.com/table_2%2Fpdf2000%2F{}.pdf"

In [63]:
# Pages are rasterised at this multiple of the annotation coordinate space,
# so every crop box taken from `page` is scaled by the same factor.
RENDER_SCALE = 5


def _empty_column_bounds(n_cols, width, height):
	"""Return fresh (x_mins, x_maxs, y_mins, y_maxs) lists for n_cols columns.

	Minimums start at the page extent of their own axis (width for x,
	height for y) and maximums at 0, so the first cell assigned to a column
	always replaces the sentinel, and a column that never receives a cell
	yields an empty slice.
	"""
	return [width] * n_cols, [0] * n_cols, [height] * n_cols, [0] * n_cols


# Create every output folder up front so the save/open calls below cannot
# fail on a missing directory.
for out_dir in (
	final_col_directory,
	final_table_directory,
	final_cell_directory,
	os.path.join(final_cell_directory, "ocr_traindata"),
	os.path.join(directory, "ocr_input"),
):
	os.makedirs(out_dir, exist_ok=True)

for row_idx in range(len(ant)):
	y = json.loads(ant[row_idx])

	# Only non-empty annotation records are processed.
	if not y:
		continue

	# Recover the file name from the URL (query string stripped).
	parsed = parse.parse(format_string, oss[row_idx].split("?")[0])
	if parsed is None:
		# parse.parse returns None when the URL does not match format_string.
		print(f"\033[1;92m ERROR: row {row_idx} has an unexpected URL format\033[0m")
		continue
	filename = parsed[0]

	# Validate before the (expensive) PDF rasterisation.
	annotations = y["annotations"]
	if any("result" not in d for d in annotations):
		print(f"\033[1;92m ERROR: {filename} no result\033[0m")
		continue

	# Page size in annotation coordinates.
	width = y['container']["page1"]["width"]
	height = y['container']["page1"]["height"]

	# Render the first PDF page at RENDER_SCALE times the annotation size.
	page = convert_from_path(
		f'{directory}{filename}.pdf',
		size=(width * RENDER_SCALE, height * RENDER_SCALE),
	)[0]

	# Grayscale masks in annotation coordinates (0 = background, 255 = region).
	col_mask = np.zeros((height, width), dtype=np.uint8)
	table_mask = np.zeros((height, width), dtype=np.uint8)
	cell_mask = np.zeros((height, width), dtype=np.uint8)

	valid = False
	n_cols = sum(d["result"]["类型"]=="表头" for d in annotations)
	n_tables = sum(d["result"]["类型"]=="表格" for d in annotations)
	print(f"{filename}:{n_cols} columns, {n_tables} tables")
	if not n_cols:
		# No header annotations: fall back to a fixed column count.
		n_cols = 2
		print(f"\033[1;93m WARNING: {filename} no header\033[0m")
	col_x_mins, col_x_maxs, col_y_mins, col_y_maxs = _empty_column_bounds(n_cols, width, height)

	for ann_idx, d in enumerate(annotations):
		kind = d["result"]["类型"]

		# An invalid-data marker stops processing of this record.
		if kind == "无效数据":
			print(f"{filename} is invalid")
			break

		# Axis-aligned bounding box of the annotation polygon.
		xs = [corner[0] for corner in d["points"]]
		ys = [corner[1] for corner in d["points"]]
		x_min, x_max = int(min(xs)), int(max(xs))
		y_min, y_max = int(min(ys)), int(max(ys))

		if kind == "表格":
			# Flush the column extents accumulated so far into the column mask.
			for ci in range(n_cols):
				col_mask[col_y_mins[ci]:col_y_maxs[ci], col_x_mins[ci]:col_x_maxs[ci]] = 255

			table_mask[y_min:y_max, x_min:x_max] = 255

			# Export the cropped table image. The box is the polygon's bounding
			# box, so it does not depend on the order of the corner points.
			# NOTE(review): the file name has no per-table suffix, so a record
			# with several tables keeps only the last crop — confirm intended.
			table_image = page.crop((
				min(xs) * RENDER_SCALE,
				min(ys) * RENDER_SCALE,
				max(xs) * RENDER_SCALE,
				max(ys) * RENDER_SCALE,
			))
			table_file_name = directory + f"ocr_input/{filename}"
			table_image.save(f"{table_file_name}.png")

			# Start a fresh set of column extents for the next table.
			col_x_mins, col_x_maxs, col_y_mins, col_y_maxs = _empty_column_bounds(n_cols, width, height)

		else:
			valid = True
			cell_mask[y_min:y_max, x_min:x_max] = 255

			# Write the OCR training pair: <name>.gt.txt (label on one line)
			# and <name>.png (cell crop).
			cell_file_name = f"{final_cell_directory}ocr_traindata/{filename}_seg_{ann_idx}"

			label = ''.join(d["label"].splitlines())
			# Explicit UTF-8 so labels are written identically on every platform.
			with open(f"{cell_file_name}.gt.txt", "w", encoding="utf-8") as f:
				f.write(label)

			cell_image = page.crop((
				x_min * RENDER_SCALE,
				y_min * RENDER_SCALE,
				x_max * RENDER_SCALE,
				y_max * RENDER_SCALE,
			))
			cell_image.save(f"{cell_file_name}.png")

			# Cells are assigned to columns by annotation index modulo n_cols;
			# grow that column's bounding box to include this cell.
			col = ann_idx % n_cols
			col_x_mins[col] = min(col_x_mins[col], x_min)
			col_x_maxs[col] = max(col_x_maxs[col], x_max)
			col_y_mins[col] = min(col_y_mins[col], y_min)
			col_y_maxs[col] = max(col_y_maxs[col], y_max)

	# Masks are saved only when at least one cell annotation was seen.
	if valid:
		for mask, out_dir in (
			(col_mask, final_col_directory),
			(table_mask, final_table_directory),
			(cell_mask, final_cell_directory),
		):
			# A 2-D uint8 array is interpreted as an 8-bit grayscale image.
			Image.fromarray(mask).save(out_dir + filename + ".jpeg")

file0:2 columns, 1 tables
file1:7 columns, 1 tables
file3:4 columns, 1 tables
file4:5 columns, 1 tables
file5:5 columns, 1 tables
file6:7 columns, 1 tables
file7:5 columns, 1 tables
file8:18 columns, 1 tables
file9:10 columns, 1 tables
file12:6 columns, 1 tables
file13:2 columns, 1 tables
file14:12 columns, 1 tables
file15:7 columns, 1 tables
file16:6 columns, 1 tables
file18:7 columns, 1 tables
file19:4 columns, 1 tables
file20:9 columns, 1 tables
file21:4 columns, 1 tables
file22:5 columns, 1 tables
file23:5 columns, 1 tables
file24:14 columns, 1 tables
file25:16 columns, 1 tables
file26:4 columns, 1 tables
file28:4 columns, 2 tables
file29:2 columns, 1 tables
file30:2 columns, 1 tables
file31:0 columns, 0 tables
file31 is invalid
file32:2 columns, 1 tables
file33:7 columns, 1 tables
file35:5 columns, 1 tables
file36:5 columns, 1 tables
file37:8 columns, 2 tables
file38:7 columns, 1 tables
file39:5 columns, 1 tables
file40:12 columns, 1 tables
file41:18 columns, 1 tables
file42:10 co

In [22]:
# # load JSON
# x = ant[2] #img_0
# plt.imshow(page)
# # parse x:
# y = json.loads(x)

# n_cols = sum(d["result"]["类型"]=="表头" for d in y["annotations"])
# col_x_mins, col_x_maxs = [float("inf")] * n_cols, [0] * n_cols
# col_y_mins, col_y_maxs = [float("inf")] * n_cols, [0] * n_cols

# y["annotations"]

'http://education-annotate.oss-cn-beijing.aliyuncs.com/table_2%2Fpdf2000%2Ffile3.pdf?OSSAccessKeyId=LTAIqzGnw1FYDk8Q&Expires=1672542720&Signature=mQexZwf%2B05s1mDjHY4p09kD456k%3D'

In [221]:
# Run the standalone script data_preprocess/generate_mask_json.py in a
# subprocess through the IPython `!` shell escape.
!python3 data_preprocess/generate_mask_json.py

file0:2 columns, 1 tables
file1:7 columns, 1 tables
file2:4 columns, 1 tables
file3:5 columns, 1 tables
file4:5 columns, 1 tables
file5:7 columns, 1 tables
file6:5 columns, 1 tables
file7:18 columns, 1 tables
file8:10 columns, 1 tables
file9:6 columns, 1 tables
file10:2 columns, 1 tables
file11:12 columns, 1 tables
file12:7 columns, 1 tables
file13:6 columns, 1 tables
file14:7 columns, 1 tables
file15:4 columns, 1 tables
file16:9 columns, 1 tables
file17:4 columns, 1 tables
file18:5 columns, 1 tables
file19:5 columns, 1 tables
file20:14 columns, 1 tables
file21:16 columns, 1 tables
file22:4 columns, 1 tables
file23:4 columns, 2 tables
file24:2 columns, 1 tables
file25:2 columns, 1 tables
file26:0 columns, 0 tables
file26 is invalid
file27:2 columns, 1 tables
file28:7 columns, 1 tables
file29:5 columns, 1 tables
file30:5 columns, 1 tables
file31:8 columns, 2 tables
file32:7 columns, 1 tables
file33:5 columns, 1 tables
file34:12 columns, 1 tables
file35:18 columns, 1 tables
file36:10 col

In [183]:
# For debugging only: import show_sample from the preprocessing script and
# call it with 2 (presumably a record index — confirm against its definition).
from data_preprocess.generate_mask_json import show_sample
show_sample(2)

[{'id': 1,
  'points': [[42.27272727272728, 260.8264437588779],
   [101.81818181818183, 260.8264437588779],
   [101.81818181818183, 271.2809892134233],
   [42.27272727272728, 271.2809892134233]],
  'label': 'NSP-Minnesota',
  'page': 1,
  'result': {'类型': '表内容'},
  'color': 'rgba(145, 204, 51, 1)'},
 {'id': 2,
  'points': [[42.727272727272734, 273.5537164861506],
   [65.90909090909092, 273.5537164861506],
   [65.90909090909092, 282.19008012251425],
   [42.727272727272734, 282.19008012251425]],
  'label': 'PSCo ',
  'page': 1,
  'result': {'类型': '表内容'},
  'color': 'rgba(145, 204, 51, 1)'},
 {'id': 3,
  'points': [[45.00000000000001, 284.917352849787],
   [60.00000000000001, 284.917352849787],
   [60.00000000000001, 292.6446255770597],
   [45.00000000000001, 292.6446255770597]],
  'label': 'SPS ',
  'page': 1,
  'result': {'类型': '表内容'},
  'color': 'rgba(145, 204, 51, 1)'},
 {'id': 4,
  'points': [[43.63636363636364, 296.2809892134233],
   [166.36363636363637, 296.2809892134233],
   [166.

### OCR

<b>Train OCR With Tesstrain</b>

In [65]:
# Disabled one-off setup, kept for reference: create the ocr_traindata folder
# and move the *.gt.txt / *.png files from cell_mask/ into it.
# # !mkdir dataset/Fintabnet_data/cell_mask/ocr_traindata
# !mv dataset/Fintabnet_data/cell_mask/*.gt.txt dataset/Fintabnet_data/cell_mask/*.png dataset/Fintabnet_data/cell_mask/ocr_traindata/
# Copy the ocr_traindata folder into the data directory of the tesstrain checkout.
!cp -r dataset/Fintabnet_data/cell_mask/ocr_traindata  ~/eyegaze/tesstrain/data/
# Run tesstrain's `training` make target for a model named "foo", with
# data/ocr_traindata as the ground-truth directory.
!cd ~/eyegaze/tesstrain/ && make training MODEL_NAME=foo DATA_DIR=data GROUND_TRUTH_DIR=data/ocr_traindata


find -L data/ocr_traindata -name '*.gt.txt' | xargs paste -s > "data/foo/all-gt"
unicharset_extractor --output_unicharset "data/foo/unicharset" --norm_mode 2 "data/foo/all-gt"
unicharset_extractor: error while loading shared libraries: libtesseract.so.5: cannot open shared object file: No such file or directory
make: *** [Makefile:197: data/foo/unicharset] Error 127


<b>Load Model</b>

In [1]:
# Copy the trained model from tesstrain into this project's dataset folder (disabled):
# !sudo cp ~/eyegaze/tesstrain/data/foo.traineddata dataset/foo.traineddata

# Add it to Tesseract's language models under the name "fintabnet".
!sudo cp dataset/foo.traineddata /home/linuxbrew/.linuxbrew/share/tessdata/fintabnet.traineddata #REPLACE BY YOUR TESSERACT TESSDATA DIRECTORY

<b>Predict</b>

See part II.

<!-- 
Page segmentation modes:
  0    Orientation and script detection (OSD) only.
  1    Automatic page segmentation with OSD.
  2    Automatic page segmentation, but no OSD, or OCR.
  3    Fully automatic page segmentation, but no OSD. (Default)
  4    Assume a single column of text of variable sizes.
  5    Assume a single uniform block of vertically aligned text.
  6    Assume a single uniform block of text.
  7    Treat the image as a single text line.
  8    Treat the image as a single word.
  9    Treat the image as a single word in a circle.
 10    Treat the image as a single character.
 11    Sparse text. Find as much text as possible in no particular order.
 12    Sparse text with OSD.
 13    Raw line. Treat the image as a single text line,
                        bypassing hacks that are Tesseract-specific. -->