# Image Table Scanner

Improving cell detection and OCR by changing kernel values.

In the original modules, the smallest cells were not detected because they were smaller than the minimum threshold value for detection.
I solved this by decreasing the minimum threshold for detection.

Prior to OCR, the ocr_image module cleans the image of each cell. This led to certain numbers being cropped and others being deleted entirely.
I fixed this by increasing the kernel height and width used for detecting cell boundaries. Now numbers are no longer cleaned out.

In [None]:
# probably not needed

from PIL import Image # same as above
from pdf2image import convert_from_path # to convert pdf to image

In [3]:
# Importing other necessary packages
import glob
import pandas as pd
import numpy as np
import cv2 # image transformation
import os
import re
import concurrent # for parallel instances
import functools # for creating partial functions

from io import StringIO # to convert string to csv
import time # to measure time

# to add the path where to search for modules
import sys
sys.path.append('/home/hennes/Internship/table_scanner')

# Importing table_ocr modules 
from table_ocr import pdf_to_images
from table_ocr import extract_tables
from table_ocr import extract_cells
from table_ocr import ocr_image
from table_ocr import ocr_to_csv

## Creating Pipeline

In [2]:
## Creating Pipeline

# Root folder of the run: the source PDFs are read from here, and later cells
# also look for the generated images and per-page sub-folders underneath it.
folder = "/home/hennes/Downloads/2021 Form 20 Digitized Data"
# Every entry directly inside `folder` whose name ends in ".pdf".
pdflist = list(filter(lambda path: path.endswith(".pdf"), glob.glob(folder+'/*')))

In [71]:
# TODO: in the final code, run the whole pipeline once per PDF file so that each constituency ends up in its own output file

In [72]:
# Importing images
#
# Runs `pdf_to_images.pdf_to_images` on every path in `pdflist`, using up to
# four worker processes.

with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
    # `Executor.map` only re-raises a worker's exception when its result is
    # read. Draining the iterator makes a failed conversion stop the notebook
    # here instead of passing unnoticed.
    list(executor.map(pdf_to_images.pdf_to_images, pdflist))

In [4]:
# Page images: every entry directly inside `folder` whose name ends in ".png".
imglist = []
for path in glob.glob(folder+'/*'):
    if path.endswith('.png'):
        imglist.append(path)

In [74]:
# Preprocessing Table
#
# Runs `pdf_to_images.preprocess_img` on every page image in `imglist`, using
# up to four worker processes.

with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
    # Drain the result iterator: `Executor.map` only re-raises a worker's
    # exception when its result is read, so without this a failed
    # preprocessing step would go unnoticed.
    list(executor.map(pdf_to_images.preprocess_img, imglist))

In [5]:
# Extracting Table Image from PDF Page image
#
# `extract_tables.main` is handed a one-element list per page image, so every
# path is wrapped in its own list. The wrapped arguments get their own name:
# overwriting `imglist` made this cell non-idempotent (each re-run nested the
# paths one level deeper) and broke earlier cells that expect plain paths.
table_extraction_args = [[img] for img in imglist]

with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
    # Drain the iterator so that an exception raised in a worker is re-raised
    # here instead of being dropped with the unused result.
    list(executor.map(extract_tables.main, table_extraction_args))

In [6]:
# Individual table images: the ".png" files found exactly one sub-folder
# below `folder`. Despite its name, `dirlist` holds image paths here, not
# directories.

dirlist = list(filter(lambda path: path.endswith('.png'), glob.glob(folder+'/*/*')))

In [7]:
# Extract individual cell images
#
# Runs `extract_cells.main` on every table image in `dirlist`, using up to
# four worker processes.

with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
    # Drain the result iterator: `Executor.map` only re-raises a worker's
    # exception when its result is read, so without this a failed extraction
    # would go unnoticed.
    list(executor.map(extract_cells.main, dirlist))

In [8]:
# Collect the cell images, one list per cells folder.
#
# The trailing slash restricts the glob to directories two levels below
# `folder`, so this cell only finds anything once the cells were extracted.
# From here on `dirlist` holds those directories (it held table images before).

dirlist = [directory for directory in glob.glob(folder+'/*/*/')]
# Keep regular files only. The OCR results are later read from
# `<page>/cells/ocr_data/`, so after an OCR run a plain `*` glob would also
# return that sub-directory and a re-run would try to OCR a directory.
celllists = [
    [path for path in glob.glob(cellfolder+'*') if os.path.isfile(path)]
    for cellfolder in dirlist
]

In [9]:
# perform OCR on each image

# Set before the pool is created so that the worker processes inherit it.
# Presumably this keeps each OCR worker single-threaded so that the four
# processes do not oversubscribe the CPU. TODO confirm.
os.environ['OMP_THREAD_LIMIT'] = '1'
# `ocr_image.main` with its first positional argument fixed to None, so that
# `map` only has to supply the image path.
p_ocr_image = functools.partial(ocr_image.main, None)

# One pool for the whole run: creating and tearing down a process pool for
# every page only added start-up cost. Pages are still processed one after
# another because each `map` is drained before the next one starts.
with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
    for image_list in celllists:
        # Draining the iterator also re-raises any exception from a worker,
        # which would otherwise be dropped with the unused result.
        list(executor.map(p_ocr_image, image_list))

In [10]:
# get the names for the individual pages
#
# A page is every entry directly inside `folder` that is neither a source PDF
# nor a page image; only its last path component is kept.

pages = sorted(
    os.path.split(path)[1]
    for path in glob.glob(folder+'/*')
    if not path.endswith(('.pdf', '.png'))
)

# create list of alphabetically ordered lists of ocred files
#
# Built from `folder` rather than a second copy of the path literal, so the
# two can no longer drift apart when the data folder changes.

ocrlists = [
    sorted(glob.glob(f'{folder}/{page}/cells/ocr_data/*.txt'))
    for page in pages
]
# Materialised as a list: a bare `zip` is a one-shot iterator, so re-running
# the cell that consumes `zippie` would silently see no pages at all.
zippie = list(zip(pages, ocrlists))

In [11]:
# Put OCRed str into csv
#
# For every (page name, OCR text files) pair in `zippie`, `ocr_to_csv.main`
# turns the page into CSV text, which is parsed into a DataFrame. All pages
# are then stacked and written out as one CSV file.
gathered_data = []

for page_name, ocr_files in zippie:
    output = ocr_to_csv.main(ocr_files)
    csv_buffer = StringIO(output)
    print("Converting into csv from" + page_name)

    # Turning csv into dataframe
    # Skipping the first two rows because they have fewer columns than rest
    # Also useful for chaining of tables later
    page_df = pd.read_csv(csv_buffer, header=None, skiprows=[0, 1])
    gathered_data.append(page_df)
    print("sucessfully appended data from" + page_name)

df = pd.concat(gathered_data)

save_folder = '/home/hennes/Internship/constituencies/'
# Page names look like "AC1_Form20-000". Taking everything before the first
# underscore keeps the whole constituency prefix; the former fixed slice
# `[0:3]` cut "AC10_..." down to "AC1" and would have overwritten that file.
constituency_name = pages[0].split('_')[0]
df.to_csv(save_folder+constituency_name+'.csv')

Converting into csv fromAC1_Form20-000
sucessfully appended data fromAC1_Form20-000
Converting into csv fromAC1_Form20-001
sucessfully appended data fromAC1_Form20-001
Converting into csv fromAC1_Form20-002
sucessfully appended data fromAC1_Form20-002
Converting into csv fromAC1_Form20-003
sucessfully appended data fromAC1_Form20-003
Converting into csv fromAC1_Form20-004
sucessfully appended data fromAC1_Form20-004
Converting into csv fromAC1_Form20-005
sucessfully appended data fromAC1_Form20-005
Converting into csv fromAC1_Form20-006
sucessfully appended data fromAC1_Form20-006
Converting into csv fromAC1_Form20-007
sucessfully appended data fromAC1_Form20-007
Converting into csv fromAC1_Form20-008
sucessfully appended data fromAC1_Form20-008
Converting into csv fromAC1_Form20-009
sucessfully appended data fromAC1_Form20-009
Converting into csv fromAC1_Form20-010
sucessfully appended data fromAC1_Form20-010
Converting into csv fromAC1_Form20-011
sucessfully appended data fromAC1_For

## Fixing Table Extraction

In [26]:
# Overlay every detected contour on the table image. `image` and `contours`
# are produced by the extraction cell further down (In [25]), so that cell
# has to be run first.
# NOTE(review): `image` is loaded as grayscale there, so presumably only the
# first colour component (0) takes effect. The drawing is applied to `image`
# itself, which also backs the cell crops sliced from it. Confirm that
# drawing on a copy is not what was intended.
ALL_CONTOURS = -1  # a negative index draws all contours
CONTOUR_COLOR = (0, 255, 0)
CONTOUR_THICKNESS = 3
cv2.drawContours(image, contours, ALL_CONTOURS, CONTOUR_COLOR, CONTOUR_THICKNESS)

array([[  0,   0,   0, ..., 255, 255, 255],
       [  0,   0,   0, ..., 255, 255, 255],
       [  0,   0,   0, ..., 255, 255, 255],
       ...,
       [247, 255, 248, ..., 188,  74,  70],
       [247, 252, 249, ..., 250, 251, 170],
       [246, 251, 251, ..., 243, 228, 170]], dtype=uint8)

In [27]:
# Show a down-scaled preview of `image` in a native OpenCV window and block
# until a key is pressed.
PREVIEW_SIZE = (1066, 800)
ESC_KEY = 27
cv2.imshow('image', cv2.resize(image, PREVIEW_SIZE))
k = cv2.waitKey(0) & 0xFF
# Only ESC closes the window; any other key leaves it open.
if k == ESC_KEY:
    cv2.destroyAllWindows()

In [25]:
# Load one extracted table as a grayscale image.
image_path = '/home/hennes/Downloads/2021 Form 20 Digitized Data/AC1_Form20-010/table-000.png'
image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
# `cv2.imread` reports a missing or unreadable file by returning None instead
# of raising; fail here with a clear message rather than deep inside OpenCV.
if image is None:
    raise FileNotFoundError(f'could not read image: {image_path}')

# Blur first to suppress noise before thresholding.
BLUR_KERNEL_SIZE = (9, 9)
STD_DEV_X_DIRECTION = 0
STD_DEV_Y_DIRECTION = 0
# The sigmas are passed by keyword: in the Python binding the fourth
# positional parameter of GaussianBlur is `dst`, not `sigmaY`, so the
# Y constant was never actually used as a sigma.
blurred = cv2.GaussianBlur(
    image,
    BLUR_KERNEL_SIZE,
    sigmaX=STD_DEV_X_DIRECTION,
    sigmaY=STD_DEV_Y_DIRECTION,
)
# Then thresholded to facilitate transformations
MAX_COLOR_VAL = 255
BLOCK_SIZE = 15
SUBTRACT_FROM_MEAN = -2

# The blurred image is inverted before thresholding.
img_bin = cv2.adaptiveThreshold(
    ~blurred,
    MAX_COLOR_VAL,
    cv2.ADAPTIVE_THRESH_MEAN_C,
    cv2.THRESH_BINARY,
    BLOCK_SIZE,
    SUBTRACT_FROM_MEAN,
)
# Finding Vertical and Horizontal Lines
#
# Independent copies; the former chained assignment bound both names to the
# very same array. Neither is used below. They are kept only in case other
# cells refer to them.
horizontal = img_bin.copy()
vertical = img_bin.copy()
SCALE = 10
# A 2-D image array has shape (rows, columns) = (height, width). The former
# unpacking had the two names swapped, so the horizontal kernel was scaled by
# the image height and the vertical kernel by the image width.
image_height, image_width = img_bin.shape
# Opening with a long, 1-pixel-thin kernel keeps only structures at least as
# long as the kernel in that direction. max(1, ...) keeps the kernel valid
# for images smaller than SCALE pixels.
horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (max(1, image_width // SCALE), 1))
horizontally_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, max(1, image_height // SCALE)))
vertically_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)

# Stretch the detected lines along their own direction (40 px horizontally,
# 60 px vertically).
horizontally_dilated = cv2.dilate(horizontally_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1)))
vertically_dilated = cv2.dilate(vertically_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (1, 60)))

# Combined line mask: non-zero wherever a horizontal or vertical line was found.
mask = horizontally_dilated + vertically_dilated

# Finding Contours of the lines
contours, heirarchy = cv2.findContours(mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

# Simplify every contour to a closed polygon, allowing a deviation of 5% of
# the contour's own perimeter.
perimeter_lengths = []
epsilons = []
approx_polys = []
for contour in contours:
    perimeter = cv2.arcLength(contour, True)
    tolerance = 0.05 * perimeter
    perimeter_lengths.append(perimeter)
    epsilons.append(tolerance)
    approx_polys.append(cv2.approxPolyDP(contour, tolerance, True))

# Polygons with exactly four corners, i.e. the rectangular candidates.
# NOTE(review): `approx_rects` is computed but not used below. The bounding
# boxes are taken from ALL polygons in `approx_polys`, so non-rectangular
# contours are not actually filtered out. Confirm which one is intended.
approx_rects = [poly for poly in approx_polys if len(poly) == 4]
bounding_rects = list(map(cv2.boundingRect, approx_polys))

# Drop boxes that are not strictly wider than MIN_RECT_WIDTH or not strictly
# taller than MIN_RECT_HEIGHT (a box is x, y, width, height).
MIN_RECT_WIDTH = 30
MIN_RECT_HEIGHT = 10
bounding_rects = [
    rect for rect in bounding_rects
    if rect[2] > MIN_RECT_WIDTH and rect[3] > MIN_RECT_HEIGHT
]

# The box with the largest area (width * height) is taken to be the outline
# of the whole table. It is removed so that the table as a whole is never
# treated as a cell and sent to OCR.
largest_rect = max(bounding_rects, key=lambda rect: rect[2] * rect[3])
bounding_rects = [rect for rect in bounding_rects if rect is not largest_rect]

# Working copy that the row-grouping loop consumes.
cells = list(bounding_rects)
def cell_in_same_row(c1, c2):
    """Tell whether the vertical centre of ``c1`` lies inside ``c2``'s vertical extent.

    Both arguments are boxes indexed as (x, y, width, height). The comparison
    is strict on both ends: a centre exactly on ``c2``'s top or bottom edge
    does not count as being in the same row.
    """
    c1_top, c1_height = c1[1], c1[3]
    c2_top, c2_height = c2[1], c2[3]
    c1_center = c1_top + c1_height - c1_height / 2
    return c2_top < c1_center < c2_top + c2_height

# Group the cells into rows. `cells` is consumed by the loop, so a snapshot
# of the full list is kept in `orig_cells`.
orig_cells = list(cells)
rows = []
while cells:
    # The first remaining cell seeds a new row; every other remaining cell
    # whose vertical centre falls inside the seed joins that row.
    first, rest = cells[0], cells[1:]
    cells_in_same_row = []
    remaining = []
    for candidate in rest:
        if cell_in_same_row(candidate, first):
            cells_in_same_row.append(candidate)
        else:
            remaining.append(candidate)
    cells_in_same_row.sort(key=lambda c: c[0])

    # Order the row left to right by x coordinate.
    row_cells = sorted([first] + cells_in_same_row, key=lambda c: c[0])
    rows.append(row_cells)
    # Continue with the cells that did not fit into this row.
    cells = remaining

# Sort rows by average height of their center.
def avg_height_of_center(row):
    """Return the mean vertical centre of the (x, y, width, height) boxes in ``row``."""
    centers = []
    for _, top, _, height in row:
        centers.append(top + height - height / 2)
    return sum(centers) / len(centers)

# Order the rows top to bottom by the mean vertical centre of their cells.
rows.sort(key=avg_height_of_center)
# Crop every cell out of `image`, keeping the row structure. The crops are
# slices of `image`, not independent copies.
cell_images_rows = [
    [image[y:y+h, x:x+w] for x, y, w, h in row]
    for row in rows
]
