# Image Table Scanner

improving cell detection and ocr by changing kernel values

In the original modules, the smallest cells were not detected because they were smaller than the minimum threshold value for detection.
I solved this by decreasing the minimum threshold for detection.

Prior to ocr, the ocr_image module would clean the image of each cell. This led to certain numbers being cropped and others deleted entirely.
I fixed this by increasing the kernel heigth and width for detecting cell boundaries. Now numbers are not cleaned out.

In [None]:
# probably not needed

from PIL import Image # same as above
from pdf2image import convert_from_path # to convert pdf to image

In [1]:
# Importing other necessary packages
import glob
import pandas as pd
import numpy as np
import cv2 # image transformation
import os

from io import StringIO # to convert string to csv
import time # to measure time

# to add the path where to search for modules
import sys
sys.path.append('/home/hennes/Internship/table_scanner')

# Importing table_ocr modules 
from table_ocr import pdf_to_images
from table_ocr import extract_tables
from table_ocr import extract_cells
from table_ocr import ocr_image
from table_ocr import ocr_to_csv

## Creating Pipeline

In [4]:
# Importing images
pdf_to_images.pdf_to_images('AC1_Form20.pdf')

KeyboardInterrupt: 

In [None]:
# Preprocessing Table
pdf_to_images.preprocess_img('trial.png')


In [None]:
# Extracting Table Image from PDF Page image
extract_tables.main(['trial.png'])

In [None]:
# Extract individual cell images
extract_cells.main('trial/table-000.png')

In [None]:
# perform OCR on each image
for image in [x for x in glob.glob('/home/hennes/Internship/trial/cells/*') if x.endswith('.png')]:
    ocr_image.main(image, None) # have to give 'None' as argument, because not executed in shell script

In [None]:
# Put OCRed str into csv
files = [x for x in glob.glob('/home/hennes/Internship/trial/cells/ocr_data/*') if x.endswith('.txt')]
files.sort() # files need to be alphabetically sorted
output = ocr_to_csv.main(files)
csv = StringIO(output)

In [None]:
# Turning csv into dataframe
# Skipping the first two rows because they have fewer columns than rest
# Also useful for chaining of tables later
df = pd.read_csv(csv,  header = None, skiprows=[0, 1])

## Checking Time needed for OCR

In [None]:
import math
import os
import sys

import cv2
import numpy as np
import pytesseract

def crop_to_text(image):
    MAX_COLOR_VAL = 255
    BLOCK_SIZE = 15
    SUBTRACT_FROM_MEAN = -2

    img_bin = cv2.adaptiveThreshold(
        ~image,
        MAX_COLOR_VAL,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY,
        BLOCK_SIZE,
        SUBTRACT_FROM_MEAN,
    )

    img_h, img_w = image.shape
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(img_w * 0.5), 1))
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(img_h * 0.9), 1))
    horizontal_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
    vertical_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
    both = horizontal_lines + vertical_lines
    cleaned = img_bin - both

    # Get rid of little noise.
    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
    opened = cv2.morphologyEx(cleaned, cv2.MORPH_OPEN, kernel)
    opened = cv2.dilate(opened, kernel)

    contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    bounding_rects = [cv2.boundingRect(c) for c in contours]
    NUM_PX_COMMA = 6
    MIN_CHAR_AREA = 5 * 9
    char_sized_bounding_rects = [(x, y, w, h) for x, y, w, h in bounding_rects if w * h > MIN_CHAR_AREA]
    if char_sized_bounding_rects:
        minx, miny, maxx, maxy = math.inf, math.inf, 0, 0
        for x, y, w, h in char_sized_bounding_rects:
            minx = min(minx, x)
            miny = min(miny, y)
            maxx = max(maxx, x + w)
            maxy = max(maxy, y + h)
        x, y, w, h = minx, miny, maxx - minx, maxy - miny
        cropped = image[y:min(img_h, y+h+NUM_PX_COMMA), x:min(img_w, x+w)]
    else:
        # If we morphed out all of the text, assume an empty image.
        cropped = MAX_COLOR_VAL * np.ones(shape=(20, 100), dtype=np.uint8)
    bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255)
    return bordered
def ocr_image(image, config):
    return pytesseract.image_to_string(
        image,
        config=config
    )


In [None]:
image_file = '/home/hennes/Internship/trial/cells/002-004.png'
tess_args = None

# Measuring Time it takes 
start = time.time()

directory, filename = os.path.split(image_file)
filename_sans_ext, ext = os.path.splitext(filename)
image = cv2.imread(image_file, cv2.IMREAD_GRAYSCALE)

# Measuring time
end = time.time()
print("Image Reading time =", end - start)


# Measuring Time it takes 
start = time.time()

cropped = crop_to_text(image)
ocr_data_dir = os.path.join(directory, "ocr_data")
os.makedirs(ocr_data_dir, exist_ok=True)
out_imagepath = os.path.join(ocr_data_dir, filename)
out_txtpath = os.path.join(ocr_data_dir, "{}.gt.txt".format(filename_sans_ext))
cv2.imwrite(out_imagepath, cropped)
if not tess_args:
    d = os.path.dirname(sys.modules["table_ocr"].__file__)
    tessdata_dir = os.path.join(d, "tessdata")
    tess_args = ["--psm", "7", "-l", "table-ocr", "--tessdata-dir", tessdata_dir]

# Measuring time
end = time.time()
print('Cropping time =', end - start)
    
    
# Measuring Time it takes 
start = time.time()    
    
txt = ocr_image(cropped, " ".join(tess_args))
with open(out_txtpath, "w") as txt_file:
    txt_file.write(txt)

# Measuring time
end = time.time()
print('OCR time =', end - start)



## Implementing parallel instances

use concurrent.futures to run multiple instances of ocr_image at the same time

In [4]:
import concurrent.futures
import functools

# I had to use functools.partial to be able to input an iterable (imagelist) and a non-iterable 
# argument to the executor.map function.

os.environ['OMP_THREAD_LIMIT'] = '1'
p_ocr_image = functools.partial(ocr_image.main, None)

In [7]:
# set number of simultaneous threads for tesseract
os.environ['OMP_THREAD_LIMIT'] = '1'

start = time.time()

# perform OCR on each image
image_list = [x for x in glob.glob('/home/hennes/Internship/trial/cells/*') if x.endswith('.png')]
with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
    executor.map(p_ocr_image, image_list)
    
end = time.time()
print('OCR time =', end - start)

OCR time = 19.546766757965088
