# Trying out different Table Scanners

In [1]:
import layoutparser as lp 

import matplotlib.pyplot as plt
%matplotlib inline 

import glob
import pandas as pd
import numpy as np
import cv2 
from pdf2image import convert_from_path # to convert pdf to image
from PIL import Image # same as above

## Importing the Image

In [None]:
# Rasterise the PDF into a sequence of page images (500 dpi, JPEG-encoded).
pdf_path = "/home/hennes/Downloads/AC1_Form20.pdf"
pages = convert_from_path(pdf_path, dpi=500, fmt='jpeg')

In [None]:
# Write the page at index 1 (the second page) to 'image.png' in the working directory.
pages[1].save('image.png')

In [None]:
# If image already saved, import here

# Load the previously written page image from disk as a PIL image.
pic = Image.open('image.png')

In [None]:
# Convert the PIL image into a numpy array (presumably RGB channel order for
# this PNG -- TODO confirm the image mode).
pic = np.array(pic)
# Reverse the channel axis (RGB -> BGR, the order the later COLOR_BGR2GRAY
# conversion expects). The reversed slice is only a negatively-strided view,
# so materialise it as a contiguous array before handing it to OpenCV.
pic = np.ascontiguousarray(pic[:, :, ::-1])
# Keep an independent colour copy for the layout model; a plain `copy = pic`
# would merely alias the same buffer.
copy = pic.copy()

Line Removal

In [None]:
# Grayscale image (the input array is in BGR channel order)
pic = cv2.cvtColor(pic, cv2.COLOR_BGR2GRAY)


def _erase_lines(gray, kernel_size, open_iterations, passes=2):
    """Paint over straight ruling lines of a grayscale page in white.

    Args:
        gray: single-channel image; it is drawn on in place and also returned.
        kernel_size: (width, height) of the rectangular structuring element.
            A tall, 1 px wide kernel isolates vertical lines; a wide, 1 px
            tall kernel isolates horizontal lines.
        open_iterations: number of iterations for the morphological opening.
        passes: how many times the threshold/open/erase cycle is repeated.

    Returns:
        The image with the detected lines overdrawn in white.
    """
    for _ in range(passes):
        # Thresholding (turning the image into true black and white, inverted
        # so that dark ink becomes the white foreground).
        _, binary = cv2.threshold(gray, 120, 255, cv2.THRESH_BINARY_INV)

        # MORPH_OPEN erodes and then dilates with the kernel, so only
        # structures at least as large as the kernel (i.e. the lines) survive.
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, kernel_size)
        lines_only = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel, iterations=open_iterations)

        # findContours returns 2 values on OpenCV 4 and 3 values on OpenCV 3.
        found = cv2.findContours(lines_only, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        contours = found[0] if len(found) == 2 else found[1]

        # Each contour is drawn over the picture in white (5 px thick).
        for contour in contours:
            gray = cv2.drawContours(gray, [contour], -1, (255, 255, 255), 5)
    return gray


# Remove vertical lines: kernel is 1 px wide and 40 px tall.
pic = _erase_lines(pic, kernel_size=(1, 40), open_iterations=2)

# Remove horizontal lines: kernel is 25 px wide and 1 px tall.
pic = _erase_lines(pic, kernel_size=(25, 1), open_iterations=3)

Further Pre-Processing

In [None]:
# Noise Removal

# Median filter with aperture size 5, applied to the current `pic`.
pic = cv2.medianBlur(pic,5)

In [None]:
# Threshold with Otsu's method. `pic` is already grayscale at this point; the
# threshold argument is passed as 0 together with the THRESH_OTSU flag.
otsu_flags = cv2.THRESH_BINARY + cv2.THRESH_OTSU
ret, pic = cv2.threshold(pic, 0, 255, otsu_flags)

In [None]:
# Show a down-scaled preview in a native OpenCV window and block until a key
# is pressed. The window is always closed afterwards, whichever key was
# pressed, so no orphaned window is left behind.
cv2.imshow('image', cv2.resize(pic, (1066, 800)))
k = cv2.waitKey(0) & 0xFF
cv2.destroyAllWindows()

Possible further Pre-Processing

In [None]:
# Erosion (making thick lines thinner)
# NOTE(review): the call below is cv2.dilate, not cv2.erode. Dilation grows
# bright regions; assuming dark text on a white background (TODO confirm),
# that shrinks the dark strokes, i.e. it acts as an erosion of the text.

kernel = np.ones((3,3),np.uint8)  # 3x3 structuring element
pic = cv2.dilate(pic, kernel, iterations = 1)

## Layout Parser

After removing the black lines, the output is much better, but it is still not good enough: the names in particular are not read correctly.

In [None]:
# Configuring the Model for detection of zones

# Layout model loaded from the 'lp://TableBank/...' config path; the label
# map names class id 0 "Table".
model = lp.Detectron2LayoutModel(
            config_path ='lp://TableBank/faster_rcnn_R_101_FPN_3x/config', # In model catalog
            label_map   ={0: "Table"}) # class id -> label name

In [None]:
# Using Model to detect layout of document

# Detection runs on `copy`, the colour image kept from before the
# grayscale / line-removal preprocessing.
layout = model.detect(copy)
# The detected boxes are drawn on the preprocessed image `pic`.
lp.draw_box(pic, layout, box_width=3)

In [None]:
# This step is not necessary because I am using the table model (there are only table blocks)

# Keep only the layout blocks whose type is 'Table'.
table_blocks = lp.Layout([b for b in layout if b.type == 'Table'])

In [None]:
# crop the image so that only table is inside

# Uses the first table block only, padded by 5 px (10 px at the bottom), and
# crops the preprocessed image `pic`.
# NOTE(review): indexing [0] presumably fails when no table was detected.
segment_image = (table_blocks[0]
                       .pad(left=5, right=5, top=5, bottom=10)
                       .crop_image(pic))

In [None]:
# Specify OCR agent

# Tesseract-backed OCR agent from layoutparser, created with default arguments.
ocr_agent = lp.TesseractAgent()

In [None]:
# perform OCR on cropped image

res = ocr_agent.detect(segment_image, return_response=True) # lets us directly analyse the OCR response

In [None]:
# Aggregate the raw OCR response at WORD level.
text = ocr_agent.gather_data(res, agg_level = lp.TesseractFeatureType.WORD)

In [None]:
# visualise result and compare to original picture

# Draws the recognised words, with their boxes, for the cropped table image.
lp.draw_text(segment_image, text, font_size=50, with_box_on_text=True,
             text_box_width=1)

## Image Table OCR

In [40]:
import os
from table_ocr import pdf_to_images
from table_ocr import extract_tables
from table_ocr import extract_cells
from table_ocr import ocr_image
from table_ocr import ocr_to_csv
import argparse
import pandas as pd
from io import StringIO
import glob
import time

In [211]:
# Show which installed copy of table_ocr.extract_cells is being imported.
print(extract_cells.__file__)

/home/hennes/.local/lib/python3.8/site-packages/table_ocr/extract_cells/__init__.py


In [11]:
# Split the PDF into one image per page; the displayed result below lists the
# PNG file names (one per page, zero-padded index).
pdf_to_images.pdf_to_images('AC1_Form20.pdf')

['AC1_Form20-000.png',
 'AC1_Form20-001.png',
 'AC1_Form20-002.png',
 'AC1_Form20-003.png',
 'AC1_Form20-004.png',
 'AC1_Form20-005.png',
 'AC1_Form20-006.png',
 'AC1_Form20-007.png',
 'AC1_Form20-008.png',
 'AC1_Form20-009.png',
 'AC1_Form20-010.png',
 'AC1_Form20-011.png',
 'AC1_Form20-012.png',
 'AC1_Form20-013.png']

In [47]:
# Measuring Time it takes
# perf_counter() is a monotonic, high-resolution clock meant for timing
# intervals; wall-clock time.time() can jump if the system clock is adjusted.
start = time.perf_counter()

# Preprocessing Table
pdf_to_images.preprocess_img('trial.png')

# Measuring time
end = time.perf_counter()
print(end - start)

1.527527093887329


In [48]:
# Measuring Time it takes
# perf_counter() is a monotonic, high-resolution clock meant for timing
# intervals; wall-clock time.time() can jump if the system clock is adjusted.
start = time.perf_counter()

# Extracting Table Image from PDF Page image
extract_tables.main(['trial.png'])

# Measuring time
end = time.perf_counter()
print(end - start)

0.13017511367797852


In [49]:
# Measuring Time it takes
# perf_counter() is a monotonic, high-resolution clock meant for timing
# intervals; wall-clock time.time() can jump if the system clock is adjusted.
start = time.perf_counter()

# Extract individual cell images
extract_cells.main('trial/table-000.png')

# Measuring time
end = time.perf_counter()
print(end - start)

0.10376310348510742


In [50]:
# Measuring Time it takes
# perf_counter() is a monotonic, high-resolution clock meant for timing
# intervals; wall-clock time.time() can jump if the system clock is adjusted.
start = time.perf_counter()

# perform OCR on each image
# The glob pattern itself restricts the match to .png files.
for image in glob.glob('/home/hennes/Internship/trial/cells/*.png'):
    # 'None' is passed as the second argument because this is not run from the
    # shell script; the table_ocr source code has a branch for this case.
    ocr_image.main(image, None)

# Measuring time
end = time.perf_counter()
print(end - start)

58.58324480056763


In [51]:
# Measuring Time it takes
# perf_counter() is a monotonic, high-resolution clock meant for timing
# intervals; wall-clock time.time() can jump if the system clock is adjusted.
start = time.perf_counter()

# Put OCRed str into csv
# files need to be alphabetically sorted
files = sorted(glob.glob('/home/hennes/Internship/trial/cells/ocr_data/*.txt'))
output = ocr_to_csv.main(files)
# Wrap the CSV text in an in-memory file object so pandas can read it.
csv = StringIO(output)

# Measuring time
end = time.perf_counter()
print(end - start)

0.017495155334472656


In [52]:
# Measuring Time it takes
# perf_counter() is a monotonic, high-resolution clock meant for timing
# intervals; wall-clock time.time() can jump if the system clock is adjusted.
start = time.perf_counter()

# Turning csv into dataframe
# Skipping the first two rows because they have fewer columns than rest
# Also useful for chaining of tables later
# Rewind first: `csv` is a StringIO, so after one read its position is at the
# end and re-running this cell would otherwise find no data to parse.
csv.seek(0)
df = pd.read_csv(csv, header=None, skiprows=[0, 1])

# Measuring time
end = time.perf_counter()
print(end - start)

0.00984501838684082


In [39]:
# Display the parsed table.
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,51,41,305.0,20,3,236,3,0,8,3,2,580,0,6,586.0,0.0
1,52,42,337.0,11,0,283,1,1,7,4,$,649,0,5,654.0,0.0
2,53,43,2.88,10,3,258,0,0,2,2,3,566,0,10,576.0,0.0
3,54,434),214.0,4,3,286,1,1,3,1,0,513,0,7,520.0,0.0
4,55,44,179.0,135,18,211,4,3,4,3,9,566,0,9,575.0,0.0
5,56,_45,427.0,109,9,262,6,5,1,4,5,828,0,7,835.0,0.0
6,57,46,432.0,2,2,434,4,2,3,2,7,898,0,7,905.0,0.0
7,58,47,466.0,16,4,339,3,1,3,1,2,835,0,15,850.0,0.0
8,59,48,255.0,32,3,254,3,0,3,1,3,554,0,7,561.0,0.0
9,60,484),220.0,24,3,292,3,0,8,0,4,549,0,3,552.0,0.0


## Checking Time needed for OCR

In [54]:
import math
import os
import sys

import cv2
import numpy as np
import pytesseract

def crop_to_text(image):
    """Crop a grayscale cell image to the bounding box of its text.

    The image is adaptively thresholded, long horizontal and vertical lines
    (cell borders) are removed, small specks are opened away, and the union of
    all remaining character-sized contours defines the crop. A 5 px white
    border is added around the result.

    Args:
        image: 2-D (single-channel) uint8 array, e.g. as returned by
            ``cv2.imread(path, cv2.IMREAD_GRAYSCALE)``.

    Returns:
        The cropped image with a 5 px white border. If no character-sized
        contour survives the clean-up, a blank white 20x100 image (plus
        border) is returned instead.
    """
    MAX_COLOR_VAL = 255
    BLOCK_SIZE = 15
    SUBTRACT_FROM_MEAN = -2

    # Threshold the inverted image so that ink becomes the white foreground.
    img_bin = cv2.adaptiveThreshold(
        ~image,
        MAX_COLOR_VAL,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY,
        BLOCK_SIZE,
        SUBTRACT_FROM_MEAN,
    )

    img_h, img_w = image.shape
    # getStructuringElement takes ksize as (width, height): the horizontal
    # kernel is wide and 1 px tall, the vertical kernel 1 px wide and tall.
    # max(1, ...) keeps the kernel valid for very small images.
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (max(1, int(img_w * 0.5)), 1))
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, max(1, int(img_h * 0.9))))
    horizontal_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
    vertical_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
    # Combine and subtract without uint8 wrap-around: with plain `+`,
    # 255 + 255 wraps to 254 where lines cross, which would leave residue
    # pixels behind after the subtraction.
    both = cv2.bitwise_or(horizontal_lines, vertical_lines)
    cleaned = cv2.subtract(img_bin, both)

    # Get rid of little noise.
    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
    opened = cv2.morphologyEx(cleaned, cv2.MORPH_OPEN, kernel)
    opened = cv2.dilate(opened, kernel)

    # findContours returns 2 values on OpenCV 4 and 3 values on OpenCV 3.
    found = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    contours = found[0] if len(found) == 2 else found[1]
    bounding_rects = [cv2.boundingRect(c) for c in contours]

    NUM_PX_COMMA = 6  # extra rows kept below the text box
    MIN_CHAR_AREA = 5 * 9  # boxes with a smaller area are ignored
    char_sized_bounding_rects = [(x, y, w, h) for x, y, w, h in bounding_rects if w * h > MIN_CHAR_AREA]
    if char_sized_bounding_rects:
        # Union of all character-sized boxes.
        minx = min(x for x, y, w, h in char_sized_bounding_rects)
        miny = min(y for x, y, w, h in char_sized_bounding_rects)
        maxx = max(x + w for x, y, w, h in char_sized_bounding_rects)
        maxy = max(y + h for x, y, w, h in char_sized_bounding_rects)
        cropped = image[miny:min(img_h, maxy + NUM_PX_COMMA), minx:min(img_w, maxx)]
    else:
        # If we morphed out all of the text, assume an empty image.
        cropped = MAX_COLOR_VAL * np.ones(shape=(20, 100), dtype=np.uint8)
    bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255)
    return bordered
def ocr_image(image, config):
    """Run Tesseract on ``image`` and return the recognised text.

    Args:
        image: the image handed to ``pytesseract.image_to_string``.
        config: Tesseract command-line options as a single string.

    Returns:
        Whatever ``pytesseract.image_to_string`` returns for the image.

    Note:
        This definition rebinds the name ``ocr_image``, which an earlier cell
        imports as the ``table_ocr.ocr_image`` module and calls via
        ``ocr_image.main``.
    """
    recognised = pytesseract.image_to_string(image, config=config)
    return recognised


In [65]:
image_file = '/home/hennes/Internship/trial/cells/002-004.png'
tess_args = None

# Measuring Time it takes
# perf_counter() is a monotonic, high-resolution clock meant for timing
# intervals; wall-clock time.time() can jump if the system clock is adjusted.
start = time.perf_counter()

directory, filename = os.path.split(image_file)
filename_sans_ext, ext = os.path.splitext(filename)
image = cv2.imread(image_file, cv2.IMREAD_GRAYSCALE)

# Measuring time
end = time.perf_counter()
print("Image Reading time =", end - start)

# cv2.imread signals an unreadable file by returning None instead of raising;
# fail here with a clear message rather than deep inside crop_to_text.
if image is None:
    raise FileNotFoundError(image_file)


# Measuring Time it takes
start = time.perf_counter()

cropped = crop_to_text(image)
ocr_data_dir = os.path.join(directory, "ocr_data")
os.makedirs(ocr_data_dir, exist_ok=True)
out_imagepath = os.path.join(ocr_data_dir, filename)
out_txtpath = os.path.join(ocr_data_dir, "{}.gt.txt".format(filename_sans_ext))
cv2.imwrite(out_imagepath, cropped)
if not tess_args:
    # Default arguments: point Tesseract at the "tessdata" directory inside
    # the installed table_ocr package (requires table_ocr to be imported).
    d = os.path.dirname(sys.modules["table_ocr"].__file__)
    tessdata_dir = os.path.join(d, "tessdata")
    tess_args = ["--psm", "7", "-l", "table-ocr", "--tessdata-dir", tessdata_dir]

# Measuring time
end = time.perf_counter()
print('Cropping time =', end - start)


# Measuring Time it takes
start = time.perf_counter()

txt = ocr_image(cropped, " ".join(tess_args))
with open(out_txtpath, "w") as txt_file:
    txt_file.write(txt)

# Measuring time
end = time.perf_counter()
print('OCR time =', end - start)



Image Reading time = 0.0003886222839355469
Cropping time = 0.0012357234954833984
OCR time = 0.1403186321258545


## Improving Cropping

Examples of badly cropped images:

- 003-005.png
- 002-006.png
- 002-005.png
- 003-013.png
- 011-011.png
- 012-001.png

In [1]:
import math
import os
import sys

import cv2
import numpy as np
import pytesseract

In [171]:
# Choosing the image

image_file = '/home/hennes/Internship/trial/cells/026-009.png'
directory, filename = os.path.split(image_file)
filename_sans_ext, ext = os.path.splitext(filename)
# Load the cell as a single-channel grayscale image.
image = cv2.imread(image_file, cv2.IMREAD_GRAYSCALE)

# Parameters for the adaptive threshold below.
MAX_COLOR_VAL = 255
BLOCK_SIZE = 15
SUBTRACT_FROM_MEAN = -2

# Adaptive mean threshold of the inverted image (~image); the result
# `img_bin` is the input of the line-removal cell that follows.
img_bin = cv2.adaptiveThreshold(
    ~image,
    MAX_COLOR_VAL,
    cv2.ADAPTIVE_THRESH_MEAN_C,
    cv2.THRESH_BINARY,
    BLOCK_SIZE,
    SUBTRACT_FROM_MEAN,
)

In [172]:
# It seems that the line-removal step in the next cell causes the problem: in 003-005 (and probably all other 1s), the entire number is cropped.

In [173]:
img_h, img_w = image.shape
# getStructuringElement takes ksize as (width, height): the horizontal kernel
# is wide and 1 px tall, the vertical kernel 1 px wide and tall.
# max(1, ...) keeps the kernel valid for very small images.
horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (max(1, int(img_w * 0.5)), 1))
vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, max(1, int(img_h * 0.9))))
horizontal_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
vertical_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
# Combine and subtract without uint8 wrap-around: with plain `+`, 255 + 255
# wraps to 254 where lines cross, which would leave residue pixels behind.
both = cv2.bitwise_or(horizontal_lines, vertical_lines)
cleaned = cv2.subtract(img_bin, both)

# This line-removal step is the suspected cause of the bad crops: in 003-005
# (and probably all other 1s), the entire number is cropped.

In [174]:
# Used as input directly the thresholded image, not the cleaned one.
# The result seems good. Maybe the cropping is not necessary?
# Good result with: 003-005.png, 002-006.png, 002-005.png, 

In [175]:
# Another solution is to make the kernel used for the cropping only see longer lines.
# The problem seems to have been that pixel lines in the number were wrongly identified as borders.

# Good result with: 003-013, 011-011.png, 

In [176]:
# Get rid of little noise.
kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
opened = cv2.morphologyEx(cleaned, cv2.MORPH_OPEN, kernel)
opened = cv2.dilate(opened, kernel)

# NOTE(review): two-value unpacking assumes the OpenCV 4 return signature of findContours.
contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
bounding_rects = [cv2.boundingRect(c) for c in contours]
NUM_PX_COMMA = 6  # extra rows kept below the text box
MIN_CHAR_AREA = 5 * 9  # boxes with a smaller area are ignored
char_sized_bounding_rects = [(x, y, w, h) for x, y, w, h in bounding_rects if w * h > MIN_CHAR_AREA]
if char_sized_bounding_rects:
    # Union of all character-sized boxes.
    minx, miny, maxx, maxy = math.inf, math.inf, 0, 0
    for x, y, w, h in char_sized_bounding_rects:
        minx = min(minx, x)
        miny = min(miny, y)
        maxx = max(maxx, x + w)
        maxy = max(maxy, y + h)
    x, y, w, h = minx, miny, maxx - minx, maxy - miny
    # Crop the original image, clamped to its bounds.
    cropped = image[y:min(img_h, y+h+NUM_PX_COMMA), x:min(img_w, x+w)]
else:
    # If we morphed out all of the text, assume an empty image.
    cropped = MAX_COLOR_VAL * np.ones(shape=(20, 100), dtype=np.uint8)
# Add a 5 px white border on every side.
bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255)

In [177]:
# With 002-006, the problem is different. Here the cleaning captures one pixel in the number.
# The cropping then assumes that this is the edge of the picture and takes half of the number out

In [178]:
# Preview the original cell image; block until a key is pressed, then always
# close the window (whichever key it was) so no orphaned window is left behind.
cv2.imshow('image', cv2.resize(image, (150, 100)))
k = cv2.waitKey(0) & 0xFF
cv2.destroyAllWindows()

In [179]:
# Preview the image after line removal; block until a key is pressed, then
# always close the window (whichever key it was) so no orphaned window is left behind.
cv2.imshow('image', cv2.resize(cleaned, (150, 100)))
k = cv2.waitKey(0) & 0xFF
cv2.destroyAllWindows()

In [184]:
# Preview the image after noise removal; block until a key is pressed, then
# always close the window (whichever key it was) so no orphaned window is left behind.
cv2.imshow('image', cv2.resize(opened, (150, 100)))
k = cv2.waitKey(0) & 0xFF
cv2.destroyAllWindows()

In [183]:
# Preview the final cropped and bordered image; block until a key is pressed,
# then always close the window (whichever key it was) so no orphaned window is left behind.
cv2.imshow('image', cv2.resize(bordered, (150, 100)))
k = cv2.waitKey(0) & 0xFF
cv2.destroyAllWindows()

In [187]:
# Draw the contours on a BGR copy of the cell: drawContours draws in place,
# so drawing on `image` itself would permanently alter the grayscale array the
# other cells read, and a green colour cannot be shown on a single channel.
cont = cv2.drawContours(cv2.cvtColor(image, cv2.COLOR_GRAY2BGR), contours, -1, (0, 255, 0), 3)
cv2.imshow('image', cv2.resize(cont, (150, 100)))
# Block until a key is pressed, then always close the window.
k = cv2.waitKey(0) & 0xFF
cv2.destroyAllWindows()

## Improving Cell Detection

In [None]:
# I just needed to change the minimum cell width. 

In [205]:
import cv2
import os

def extract_cell_images_from_table(image):
    """Split a grayscale table image into per-cell images, grouped by row.

    The table's ruling lines are isolated with morphological openings, the
    contours of the resulting grid are approximated by polygons, and the
    bounding rectangle of every sufficiently large contour is treated as a
    cell. The largest rectangle (assumed to be the whole table) is dropped.

    Args:
        image: 2-D (single-channel) uint8 array of the table, e.g. as returned
            by ``cv2.imread(path, cv2.IMREAD_GRAYSCALE)``.

    Returns:
        A list of rows ordered top to bottom; each row is a list of cell
        images (slices of ``image``) ordered left to right. An empty list is
        returned when no cell-sized rectangle is found.
    """
    # First the image is blurred to reduce noise.
    BLUR_KERNEL_SIZE = (17, 17)
    STD_DEV_X_DIRECTION = 0
    STD_DEV_Y_DIRECTION = 0
    blurred = cv2.GaussianBlur(image, BLUR_KERNEL_SIZE, STD_DEV_X_DIRECTION, STD_DEV_Y_DIRECTION)

    # Then thresholded (on the inverted image) to facilitate transformations.
    MAX_COLOR_VAL = 255
    BLOCK_SIZE = 15
    SUBTRACT_FROM_MEAN = -2
    img_bin = cv2.adaptiveThreshold(
        ~blurred,
        MAX_COLOR_VAL,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY,
        BLOCK_SIZE,
        SUBTRACT_FROM_MEAN,
    )

    # Finding vertical and horizontal lines.
    # A 2-D array's shape is (rows, cols), i.e. (height, width).
    SCALE = 5
    image_height, image_width = img_bin.shape
    # Kernel sizes are (width, height); max(1, ...) keeps them valid for
    # images smaller than SCALE pixels.
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (max(1, image_width // SCALE), 1))
    horizontally_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, max(1, image_height // SCALE)))
    vertically_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)

    # Lengthen the detected lines so that they join up at the grid corners.
    horizontally_dilated = cv2.dilate(horizontally_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1)))
    vertically_dilated = cv2.dilate(vertically_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (1, 60)))

    # bitwise_or avoids the uint8 wrap-around of `+` where lines cross.
    mask = cv2.bitwise_or(horizontally_dilated, vertically_dilated)

    # Finding contours of the lines.
    # findContours returns 2 values on OpenCV 4 and 3 values on OpenCV 3.
    found = cv2.findContours(mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    contours = found[0] if len(found) == 2 else found[1]

    # Approximate each contour by a polygon, with a tolerance of 5 % of its
    # perimeter, and take that polygon's bounding rectangle.
    # NOTE(review): contours are NOT filtered by vertex count here. A
    # four-vertex filter was computed in an earlier version of this code but
    # never applied, so every polygon's bounding box is kept.
    approx_polys = [cv2.approxPolyDP(c, 0.05 * cv2.arcLength(c, True), True) for c in contours]
    bounding_rects = [cv2.boundingRect(p) for p in approx_polys]

    # Filter out rectangles that are too narrow or too short.
    MIN_RECT_WIDTH = 30
    MIN_RECT_HEIGHT = 10
    bounding_rects = [
        r for r in bounding_rects if MIN_RECT_WIDTH < r[2] and MIN_RECT_HEIGHT < r[3]
    ]
    if not bounding_rects:
        # Nothing cell-sized was found; max() below would raise on an empty list.
        return []

    # The largest bounding rectangle is assumed to be the entire table.
    # Remove it from the list. We don't want to accidentally try to OCR
    # the entire table.
    largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3])
    cells = [b for b in bounding_rects if b is not largest_rect]

    def cell_in_same_row(c1, c2):
        """True when the vertical centre of c1 lies strictly inside c2's vertical extent."""
        c1_center = c1[1] + c1[3] - c1[3] / 2
        c2_bottom = c2[1] + c2[3]
        c2_top = c2[1]
        return c2_top < c1_center < c2_bottom

    # Greedily group cells into rows: take the first remaining cell, collect
    # every other cell on the same row, and repeat with what is left.
    rows = []
    while cells:
        first = cells[0]
        rest = cells[1:]
        same_row = [c for c in rest if cell_in_same_row(c, first)]
        rows.append(sorted([first] + same_row, key=lambda c: c[0]))
        cells = [c for c in rest if not cell_in_same_row(c, first)]

    # Sort rows by average height of their center.
    def avg_height_of_center(row):
        centers = [y + h - h / 2 for x, y, w, h in row]
        return sum(centers) / len(centers)

    rows.sort(key=avg_height_of_center)

    # Slice the cell images out of the original (unblurred) image.
    return [[image[y:y + h, x:x + w] for x, y, w, h in row] for row in rows]

def main(f):
    """Extract every cell of the table image ``f`` and save each as a PNG.

    The cells are written to a ``cells`` directory next to ``f`` (created if
    necessary) and named ``<row>-<column>.png`` with zero-padded three-digit
    indices.

    Args:
        f: path of the table image file.

    Returns:
        The list of written file paths, row by row.

    Raises:
        FileNotFoundError: if ``f`` cannot be read as an image.
    """
    directory, filename = os.path.split(f)
    table = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
    if table is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError(f)
    rows = extract_cell_images_from_table(table)
    cell_img_dir = os.path.join(directory, "cells")
    os.makedirs(cell_img_dir, exist_ok=True)
    paths = []
    for i, row in enumerate(rows):
        for j, cell in enumerate(row):
            cell_filename = "{:03d}-{:03d}.png".format(i, j)
            path = os.path.join(cell_img_dir, cell_filename)
            cv2.imwrite(path, cell)
            paths.append(path)
    return paths


In [209]:
# Run the cell extraction defined above on the first table image; the
# displayed result is the list of written cell image paths.
main('/home/hennes/Internship/trial/table-000.png')

['/home/hennes/Internship/trial/cells/000-000.png',
 '/home/hennes/Internship/trial/cells/000-001.png',
 '/home/hennes/Internship/trial/cells/000-002.png',
 '/home/hennes/Internship/trial/cells/000-003.png',
 '/home/hennes/Internship/trial/cells/000-004.png',
 '/home/hennes/Internship/trial/cells/000-005.png',
 '/home/hennes/Internship/trial/cells/000-006.png',
 '/home/hennes/Internship/trial/cells/000-007.png',
 '/home/hennes/Internship/trial/cells/000-008.png',
 '/home/hennes/Internship/trial/cells/001-000.png',
 '/home/hennes/Internship/trial/cells/001-001.png',
 '/home/hennes/Internship/trial/cells/001-002.png',
 '/home/hennes/Internship/trial/cells/001-003.png',
 '/home/hennes/Internship/trial/cells/001-004.png',
 '/home/hennes/Internship/trial/cells/001-005.png',
 '/home/hennes/Internship/trial/cells/001-006.png',
 '/home/hennes/Internship/trial/cells/001-007.png',
 '/home/hennes/Internship/trial/cells/001-008.png',
 '/home/hennes/Internship/trial/cells/001-009.png',
 '/home/henn