# Image Table Scanner

improving cell detection and ocr by changing kernel values

In the original modules, the smallest cells were not detected because they were smaller than the minimum threshold value for detection.
I solved this by decreasing the minimum threshold for detection.

Prior to ocr, the ocr_image module would clean the image of each cell. This led to certain numbers being cropped and others deleted entirely.
I fixed this by increasing the kernel heigth and width for detecting cell boundaries. Now numbers are not cleaned out.

In [1]:
# Importing other necessary packages
import glob
import pandas as pd
import numpy as np
import cv2 # image transformation
import os
import re
import concurrent # for parallel instances
import functools # for creating partial functions
import shutil
from pathlib import Path

from io import StringIO # to convert string to csv
import time # to measure time

# to add the path where to search for modules
import sys
sys.path.append('/home/hennes/Internship/table_scanner')

# Importing table_ocr modules 
from table_ocr import pdf_to_images
from table_ocr import extract_tables
from table_ocr import extract_cells
from table_ocr import ocr_image
from table_ocr import ocr_to_csv

## Creating Pipeline

In [5]:
## Creating Pipeline

folder = "/home/hennes/Internship/pdfs"
save_folder = '/home/hennes/Internship/constituencies/'
old = '/home/hennes/Internship/old_files/'
pdflist = [pdf for pdf in glob.glob(folder+'/*')]

# only collect pdfs that have not yet been turned into csv

In [7]:
# in final code, run everything per pdf file, so that each constituency results in one file

In [4]:
# Importing images
for pdf in pdflist:
    pdf_to_images.pdf_to_images(pdf)

KeyboardInterrupt: 

In [7]:
imglist = [img for img in glob.glob(folder+'/*') if img.endswith('.png')]

In [4]:
# Preprocessing Table

with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
    executor.map(pdf_to_images.preprocess_img, imglist)

In [7]:
# Extracting Table Image from PDF Page image
imglist = [[img] for img in imglist]

with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
    executor.map(extract_tables.main, imglist)

In [10]:
# list of individual tables in subfolders 

dirlist = sorted([directory for directory in glob.glob(folder+'/*/*') if directory.endswith('.png')])

In [11]:
# Create partial function for cell extraction in which dilation is specified
dilate=None

p_extract_cells = functools.partial(extract_cells.main, dilate)

# Extract individual cell images
with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
    executor.map(p_extract_cells, dirlist)

In [3]:
# cells folder is first in list of two objects in individual image folders.
# That is why this code works (but only if executed after cells were extracted).

dirlist = [directory for directory in glob.glob(folder+'/*/*/')]
celllists = [glob.glob(cellfolder+'*') for cellfolder in dirlist]

In [12]:
thresh=True
no_noise = None
tess_args = None

for image_list in celllists:
    for x in image_list:
        ocr_image.main(tess_args, thresh, no_noise, x)

KeyboardInterrupt: 

In [44]:
# perform OCR on each image
thresh=True
no_noise = True
tess_args = None

os.environ['OMP_THREAD_LIMIT'] = '1'
p_ocr_image = functools.partial(ocr_image.main, None, thresh, no_noise)

for image_list in celllists:
    # perform OCR on each image
    with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
        executor.map(p_ocr_image, image_list)

In [3]:
def move_files(folder, old):
    # if folder already exist, delete it. Then move folders to old directory.
    [shutil.rmtree(old+'/'+e)
     for e in next(os.walk(folder))[1] if Path(old+'/'+e).is_dir()]

    [shutil.move(folder+'/'+e, old)
     for e in next(os.walk(folder))[1] if not e.endswith('.pdf')]

    # if files already exist, delete them. Then move files to old directory.
    [os.remove(old+'/'+e)
     for e in next(os.walk(folder))[2] if not e.endswith('.pdf') and Path(old+'/'+e).is_file()]

    [shutil.move(e, old)
     for e in glob.glob(folder+'/*') if not e.endswith('.pdf')]

In [4]:
# Put OCRed str into csv
try:
    gathered_data = []
    # get the names for the individual pages

    pages = sorted([filename for directory, filename in
             [os.path.split(x) for x in glob.glob(folder+'/*') if not x.endswith(('.pdf', '.png'))]])

    # create list of alphabetically ordered lists of ocred files

    ocrlists = [sorted(y) for y in
                [glob.glob(f'{folder}/{x}/cells/ocr_data/*.txt') for x in pages]]
    zippie = zip(pages, ocrlists)

    for y, x in zippie:
        output = ocr_to_csv.main(x)
        csv = StringIO(output)
        print(f'working on {y}')

        # Turning csv into dataframe
        # Skipping the first two rows because they have fewer columns than rest
        # Also useful for chaining of tables later

        df = pd.read_csv(csv,  header = None, skiprows=[0, 1])
        gathered_data.append(df)
        df = pd.concat(gathered_data)

        constituency_name = pages[0][0:5]
        df.to_csv(save_folder+constituency_name+'.csv')

    print(f'Saved {constituency_name} to folder.')

    move_files(folder, old)
except Exception as e:
    print(e)
    print('will try again ignoring one more line.')
        # get the names for the individual pages

    try:
        gathered_data = []
        pages = sorted([filename for directory, filename in
                 [os.path.split(x) for x in glob.glob(folder+'/*') if not x.endswith(('.pdf', '.png'))]])

        # create list of alphabetically ordered lists of ocred files

        ocrlists = [sorted(y) for y in
                    [glob.glob(f'{folder}/{x}/cells/ocr_data/*.txt') for x in pages]]
        zippie = zip(pages, ocrlists)

        for y, x in zippie:
            output = ocr_to_csv.main(x)
            csv = StringIO(output)
            print(f'working on {y}')
            # Turning csv into dataframe
            # Skipping the first two rows because they have fewer columns than rest
            # Also useful for chaining of tables later

            df = pd.read_csv(csv,  header = None, skiprows=[0, 1, 2])
            gathered_data.append(df)
            df = pd.concat(gathered_data)

            constituency_name = pages[0][0:5]
            df.to_csv(save_folder+constituency_name+'.csv')

        print(f'Saved {constituency_name} to folder.')

        move_files(folder, old)
        
    except Exception as e:
        print(e)
        exit()

working on AC166-01
working on AC166-02
working on AC166-03
working on AC166-04
working on AC166-05
working on AC166-06
working on AC166-07
working on AC166-08
working on AC166-09
working on AC166-10
working on AC166-11
working on AC166-12
working on AC166-13
working on AC166-14
working on AC166-15
working on AC166-16
working on AC166-17
Saved AC166 to folder.


## Another skewing code

This one seems to work reliably

In [101]:
import numpy as np
from scipy.ndimage import interpolation as inter

def correct_skew(image, delta=1, limit=5):
    def determine_score(arr, angle):
        data = inter.rotate(arr, angle, reshape=False, order=0)
        histogram = np.sum(data, axis=1)
        score = np.sum((histogram[1:] - histogram[:-1]) ** 2)
        return histogram, score

    thresh = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1] 

    scores = []
    angles = np.arange(-limit, limit + delta, delta)
    for angle in angles:
        histogram, score = determine_score(thresh, angle)
        scores.append(score)

    best_angle = angles[scores.index(max(scores))]

    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, best_angle, 1.0)
    rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, \
              borderMode=cv2.BORDER_REPLICATE)

    return best_angle, rotated

In [111]:
image = cv2.imread(('/home/hennes/Internship/pdfs/AC003-001.png'), cv2.IMREAD_GRAYSCALE)
angle, rotated = correct_skew(image)
print(angle)

-2


In [112]:
cv2.imshow('image', cv2.resize(rotated, (1065, 800)))
k = cv2.waitKey(0) & 0xFF
if k == 27:         # wait for ESC key to exit
    cv2.destroyAllWindows()

## Fixing Table Detection

In [176]:
[cv2.rectangle(image,(x,y),(x+w,y+h),(0,255,0),2) for x, y, w, h in bounding_rects]

[array([[255, 255, 255, ..., 255, 255, 255],
        [255, 255, 255, ..., 255, 255, 255],
        [255, 255, 255, ..., 255, 255, 255],
        ...,
        [255, 255, 255, ..., 255, 255, 255],
        [255, 255, 255, ..., 255, 255, 255],
        [255, 255, 255, ..., 255, 255, 255]], dtype=uint8)]

In [177]:
cv2.drawContours(image, cont, -1, (0, 255, 0), 3)

array([[255, 255, 255, ..., 255, 255, 255],
       [255, 255, 255, ..., 255, 255, 255],
       [255, 255, 255, ..., 255, 255, 255],
       ...,
       [255, 255, 255, ..., 255, 255, 255],
       [255, 255, 255, ..., 255, 255, 255],
       [255, 255, 255, ..., 255, 255, 255]], dtype=uint8)

In [248]:
cv2.imshow('image', thresh)
k = cv2.waitKey(0) & 0xFF
if k == 27:         # wait for ESC key to exit
    cv2.destroyAllWindows()

In [246]:
image = cv2.imread(('/home/hennes/Internship/pdfs/PC001-009.png'), cv2.IMREAD_GRAYSCALE)


BLUR_KERNEL_SIZE = (17, 17)
STD_DEV_X_DIRECTION = 0
STD_DEV_Y_DIRECTION = 0
blurred = cv2.GaussianBlur(image, BLUR_KERNEL_SIZE, STD_DEV_X_DIRECTION, STD_DEV_Y_DIRECTION)
MAX_COLOR_VAL = 255
BLOCK_SIZE = 15
SUBTRACT_FROM_MEAN = -2

img_bin = cv2.adaptiveThreshold(
    ~blurred,
    MAX_COLOR_VAL,
    cv2.ADAPTIVE_THRESH_MEAN_C,
    cv2.THRESH_BINARY,
    BLOCK_SIZE,
    SUBTRACT_FROM_MEAN,
)

kernel = np.ones((3,3),np.uint8)
img_bin = cv2.dilate(img_bin,kernel,iterations = 1)

vertical = horizontal = img_bin.copy()
SCALE = 8
image_width, image_height = horizontal.shape
horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(image_width / SCALE), 1))
horizontally_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(image_height / SCALE)))
vertically_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)

horizontally_dilated = cv2.dilate(horizontally_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1)))
vertically_dilated = cv2.dilate(vertically_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (1, 60)))

mask = horizontally_dilated + vertically_dilated
contours, heirarchy = cv2.findContours(
    mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
)
cont = contours.copy()

MIN_TABLE_AREA = 1e5
contours = [c for c in contours if cv2.contourArea(c) > MIN_TABLE_AREA]
perimeter_lengths = [cv2.arcLength(c, True) for c in contours]
epsilons = [0.1 * p for p in perimeter_lengths]
approx_polys = [cv2.approxPolyDP(c, e, True) for c, e in zip(contours, epsilons)]
bounding_rects = [cv2.boundingRect(a) for a in approx_polys]

txt = None

# if AC option is True, then do a full OCR of the image. Search for the count of segment in the text.
# If there are two occurrences, the AC number should be written behind the second one. If there is
# only one occurrence, then the number should be written behind 'constituency'. Extract the number,
# which should be composed of 1-3 digits. This will be used in the naming of the folder later.

if AC == True:
    try:
        thresh = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
        txt = pytesseract.image_to_string(thresh)
        if len(re.findall(r'[Ss]egment', txt)) == 2:
            txt = txt.split('egment')[-1]
            txt = re.search(r'\d{1,3}', txt).group(0)
        if len(re.findall(r'[Ss]egment', txt)) == 1:
            txt = txt.split('onstituency')[-1]
            txt = re.search(r'\d{1,3}', txt).group(0)
        print(txt)
    except:
        print('Something went wrong with the extraction of the AC number.')
        None


images = [image[y:5+y+h, x-10:x+w+5] for x, y, w, h in bounding_rects]
print('1.')
if images[0].size == 0 or len(images[0]) < 50:
    images = [image[y:10+y+h, x-5:x+w+15] for x, y, w, h in bounding_rects]
    print('2.')
    if images[0].size == 0 or len(images[0]) < 50:
        images = [image[y:5+y+h, x:x+w+5] for x, y, w, h in bounding_rects]
        print('3.')
        if images[0].size == 0 or len(images[0]) < 50:
            images = [image[y:5+y+h, x:x+w] for x, y, w, h in bounding_rects]
            print('4.')
            if images[0].size == 0 or len(images[0]) < 50:
                print('Something went wrong with the table extraction.')

24
1.


In [244]:
txt

'24'

## Fixing OCR Cell Cropping

In [241]:
import math

In [156]:
cv2.rectangle(image,(x,y),(x+w,y+h),(0,255,0),2)

array([[197, 241, 237, ..., 129,  70,   7],
       [212, 228, 244, ..., 237, 131,   7],
       [211, 248, 248, ..., 252, 121,  13],
       ...,
       [249, 249, 255, ..., 247, 109,   6],
       [228, 248, 250, ..., 255, 123,  24],
       [214, 255, 255, ..., 250, 110,  10]], dtype=uint8)

In [243]:
cv2.imshow('image', cv2.resize(bordered, (213, 160)))
k = cv2.waitKey(0) & 0xFF
if k == 27:         # wait for ESC key to exit
    cv2.destroyAllWindows()

In [201]:
cv2.drawContours(image, horizontal_lines, -1, (0, 255, 0), 3)

error: OpenCV(4.5.3) /tmp/pip-req-build-afu9cjzs/opencv/modules/imgproc/src/drawing.cpp:2501: error: (-215:Assertion failed) npoints > 0 in function 'drawContours'


In [242]:
image = cv2.imread(('/home/hennes/Internship/old_files/AC170-01/cells/002-008.png'), cv2.IMREAD_GRAYSCALE)
T, image = cv2.threshold(image, 0, 255, cv2.THRESH_OTSU)

MAX_COLOR_VAL = 255
BLOCK_SIZE = 15
SUBTRACT_FROM_MEAN = -2

img_bin = cv2.adaptiveThreshold(
    ~image,
    MAX_COLOR_VAL,
    cv2.ADAPTIVE_THRESH_MEAN_C,
    cv2.THRESH_BINARY,
    BLOCK_SIZE,
    SUBTRACT_FROM_MEAN,
)

img_h, img_w = image.shape
horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(img_w * 0.5), 1))
vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(img_h * 0.9)))
horizontal_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
vertical_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
both = horizontal_lines + vertical_lines
cleaned = img_bin - both

# Get rid of little noise.
kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (1, 1))
opened = cv2.morphologyEx(cleaned, cv2.MORPH_OPEN, kernel)
opened = cv2.dilate(opened, kernel)

contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
bounding_rects = [cv2.boundingRect(c) for c in contours]
NUM_PX_COMMA = 6
MIN_CHAR_AREA = 5 * 9
char_sized_bounding_rects = [(x, y, w, h) for x, y, w, h in bounding_rects if w * h > MIN_CHAR_AREA]
if char_sized_bounding_rects:
    minx, miny, maxx, maxy = math.inf, math.inf, 0, 0
    for x, y, w, h in char_sized_bounding_rects:
        minx = min(minx, x)
        miny = min(miny, y)
        maxx = max(maxx, x + w)
        maxy = max(maxy, y + h)
    x, y, w, h = minx, miny, maxx - minx, maxy - miny
    cropped = image[y:min(img_h, y+h+NUM_PX_COMMA), x:min(img_w, x+w)]
else:
    # If we morphed out all of the text, assume an empty image.
    cropped = MAX_COLOR_VAL * np.ones(shape=(20, 100), dtype=np.uint8)
bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255)

## Fixing Cell Extraction

In [11]:
image = cv2.imread(('/home/hennes/Internship/old_files/AC185-01/table-000.png'))

In [12]:
BLUR_KERNEL_SIZE = (5, 5)
STD_DEV_X_DIRECTION = 0
STD_DEV_Y_DIRECTION = 0
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
blurred = cv2.GaussianBlur(gray, BLUR_KERNEL_SIZE, STD_DEV_X_DIRECTION, STD_DEV_Y_DIRECTION)
# Then thresholded to facilitate transformations
MAX_COLOR_VAL = 255
BLOCK_SIZE = 15
SUBTRACT_FROM_MEAN = -2

img_bin = cv2.adaptiveThreshold(
    ~blurred,
    MAX_COLOR_VAL,
    cv2.ADAPTIVE_THRESH_MEAN_C,
    cv2.THRESH_BINARY,
    BLOCK_SIZE,
    SUBTRACT_FROM_MEAN,
)
kernel = np.ones((3,3),np.uint8)
img_bin = cv2.dilate(img_bin,kernel,iterations = 1)

# Finding Vertical and Horizontal Lines
vertical = horizontal = img_bin.copy()
SCALE = 11
image_width, image_height = horizontal.shape
horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(image_width / SCALE), 1))
horizontally_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(image_height / SCALE)))
vertically_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)

horizontally_dilated = cv2.dilate(horizontally_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1)))
vertically_dilated = cv2.dilate(vertically_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (1, 60)))

mask = horizontally_dilated + vertically_dilated

# Finding Contours of the lines
contours, heirarchy = cv2.findContours(
    mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE,
)

perimeter_lengths = [cv2.arcLength(c, True) for c in contours]
epsilons = [0.05 * p for p in perimeter_lengths]
approx_polys = [cv2.approxPolyDP(c, e, True) for c, e in zip(contours, epsilons)]

# Filter out contours that aren't rectangular. Those that aren't rectangular
# are probably noise.
approx_rects = [p for p in approx_polys if len(p) == 4]
bounding_rects = [cv2.boundingRect(a) for a in approx_rects]

# Filter out rectangles that are too narrow or too short.
MIN_RECT_WIDTH = 20     
MIN_RECT_HEIGHT = 10
bounding_rects = [
    r for r in bounding_rects if MIN_RECT_WIDTH < r[2] and MIN_RECT_HEIGHT < r[3]
]

# The largest bounding rectangle is assumed to be the entire table.
# Remove it from the list. We don't want to accidentally try to OCR
# the entire table.
largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3])
bounding_rects = [b for b in bounding_rects if b is not largest_rect]

cells = [c for c in bounding_rects]
def cell_in_same_row(c1, c2):
    c1_center = c1[1] + c1[3] - c1[3] / 2
    c2_bottom = c2[1] + c2[3]
    c2_top = c2[1]
    return c2_top < c1_center < c2_bottom

orig_cells = [c for c in cells]
rows = []
while cells:
    first = cells[0]
    rest = cells[1:]
    cells_in_same_row = sorted(
        [
            c for c in rest
            if cell_in_same_row(c, first)
        ],
        key=lambda c: c[0]
    )

    row_cells = sorted([first] + cells_in_same_row, key=lambda c: c[0])
    rows.append(row_cells)
    cells = [
        c for c in rest
        if not cell_in_same_row(c, first)
    ]

# Sort rows by average height of their center.
def avg_height_of_center(row):
    centers = [y + h - h / 2 for x, y, w, h in row]
    return sum(centers) / len(centers)

rows.sort(key=avg_height_of_center)
cell_images_rows = []
for row in rows:
    cell_images_row = []
    for x, y, w, h in row:
        cell_images_row.append(image[y:y+h, x:x+w])
    cell_images_rows.append(cell_images_row)

In [239]:
[cv2.rectangle(image,(x,y),(x+w,y+h),(0,255,0),2) for x,y,h,w in bounding_rects]

[array([[[251, 251, 251],
         [251, 251, 251],
         [252, 252, 252],
         ...,
         [254, 254, 254],
         [254, 254, 254],
         [254, 254, 254]],
 
        [[250, 250, 250],
         [251, 251, 251],
         [254, 254, 254],
         ...,
         [254, 254, 254],
         [254, 254, 254],
         [254, 254, 254]],
 
        [[249, 249, 249],
         [251, 251, 251],
         [253, 253, 253],
         ...,
         [254, 254, 254],
         [254, 254, 254],
         [254, 254, 254]],
 
        ...,
 
        [[254, 254, 254],
         [254, 254, 254],
         [254, 254, 254],
         ...,
         [254, 254, 254],
         [254, 254, 254],
         [254, 254, 254]],
 
        [[253, 253, 253],
         [253, 253, 253],
         [253, 253, 253],
         ...,
         [252, 252, 252],
         [254, 254, 254],
         [253, 253, 253]],
 
        [[251, 251, 251],
         [251, 251, 251],
         [251, 251, 251],
         ...,
         [246, 246, 246],
  

In [14]:
cv2.drawContours(image, contours, -1, (57, 255, 20), 3)

array([[[255, 255, 255],
        [255, 255, 255],
        [255, 255, 255],
        ...,
        [255, 255, 255],
        [255, 255, 255],
        [255, 255, 255]],

       [[255, 255, 255],
        [255, 255, 255],
        [255, 255, 255],
        ...,
        [255, 255, 255],
        [255, 255, 255],
        [255, 255, 255]],

       [[255, 255, 255],
        [255, 255, 255],
        [255, 255, 255],
        ...,
        [255, 255, 255],
        [255, 255, 255],
        [255, 255, 255]],

       ...,

       [[255, 255, 255],
        [255, 255, 255],
        [255, 255, 255],
        ...,
        [255, 255, 255],
        [255, 255, 255],
        [255, 255, 255]],

       [[255, 255, 255],
        [255, 255, 255],
        [255, 255, 255],
        ...,
        [255, 255, 255],
        [255, 255, 255],
        [255, 255, 255]],

       [[255, 255, 255],
        [255, 255, 255],
        [255, 255, 255],
        ...,
        [255, 255, 255],
        [255, 255, 255],
        [255, 255, 255]]

In [15]:
cv2.imshow('image', cv2.resize(image, (1226, 921)))
k = cv2.waitKey(0) & 0xFF
if k == 27:         # wait for ESC key to exit
    cv2.destroyAllWindows()

## Trying out AC naming

In [3]:
def find_tables(image, AC, old, SCALE):
    BLUR_KERNEL_SIZE = (17, 17)
    STD_DEV_X_DIRECTION = 0
    STD_DEV_Y_DIRECTION = 0
    blurred = cv2.GaussianBlur(image, BLUR_KERNEL_SIZE, STD_DEV_X_DIRECTION, STD_DEV_Y_DIRECTION)
    MAX_COLOR_VAL = 255
    BLOCK_SIZE = 15
    SUBTRACT_FROM_MEAN = -2
    
    img_bin = cv2.adaptiveThreshold(
        ~blurred,
        MAX_COLOR_VAL,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY,
        BLOCK_SIZE,
        SUBTRACT_FROM_MEAN,
    )
    vertical = horizontal = img_bin.copy()
    image_width, image_height = horizontal.shape
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(image_width / SCALE), 1))
    horizontally_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(image_height / SCALE)))
    vertically_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
    
    horizontally_dilated = cv2.dilate(horizontally_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1)))
    vertically_dilated = cv2.dilate(vertically_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (1, 60)))
    
    mask = horizontally_dilated + vertically_dilated
    contours, heirarchy = cv2.findContours(
        mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE,
    )

    MIN_TABLE_AREA = 1e5
    contours = [c for c in contours if cv2.contourArea(c) > MIN_TABLE_AREA]
    perimeter_lengths = [cv2.arcLength(c, True) for c in contours]
    epsilons = [0.1 * p for p in perimeter_lengths]
    approx_polys = [cv2.approxPolyDP(c, e, True) for c, e in zip(contours, epsilons)]
    bounding_rects = [cv2.boundingRect(a) for a in approx_polys]

    if AC == True:
        try:
            nimg = image
            txt = pytesseract.image_to_string(nimg)
            print(len(re.findall(r'[Ss]egment', txt)))
            if len(re.findall(r'[Ss]egment', txt)) == 2:
                txt = txt.split('egment')[-1]
                txt = re.search(r'\d{1,3}', txt).group(0)
                print(f'number is {txt}')
            elif len(re.findall(r'[Ss]egment', txt)) == 1:
                txt = txt.split('onstituency')[-1]
                txt = re.search(r'\d{1,3}', txt).group(0)
                print(f'number is {txt}')
            elif len(re.findall(r'[Ss]egment', txt)) == 0:
                txt = old
                print(f'number is {txt}')
        except:
            print('Something went wrong with the extraction of the AC number.')
            print(old)
            txt = old
            print(txt)
            None
        print(f'number should still be {txt}')


    # The link where a lot of this code was borrowed from recommends an
    # additional step to check the number of "joints" inside this bounding rectangle.
    # A table should have a lot of intersections. We might have a rectangular image
    # here though which would only have 4 intersections, 1 at each corner.
    # Leaving that step as a future TODO if it is ever necessary.
    images = [image[y:5+y+h, x-10:x+w+10] for x, y, w, h in bounding_rects]
    if images[0].size == 0 or len(images[0]) < 50:
        images = [image[y:5+y+h, x-5:x+w+5] for x, y, w, h in bounding_rects]
        if images[0].size == 0 or len(images[0]) < 50:
            images = [image[y:5+y+h, x:x+w+5] for x, y, w, h in bounding_rects]
            if images[0].size == 0 or len(images[0]) < 50:
                images = [image[y:y+h, x:x+w] for x, y, w, h in bounding_rects]
    
    return images, txt

In [9]:
import pytesseract
AC = True

results = []
old = None
for f in imglist:
    directory, filename = os.path.split(f)
    image = cv2.imread(f, cv2.IMREAD_GRAYSCALE)
    tables, txt = find_tables(image, AC, old, SCALE=8)
    old = txt
    print(f'Extracted text: {txt}')
    if tables[0].size == 0 or len(tables[0]) < 50:
        tables = find_tables(image, SCALE=20)
        if tables[0].size == 0 or len(tables[0]) < 50:
            print(f'Extraction error: {files}.')
    files = []
    filename_sans_extension = os.path.splitext(filename)[0]
    if AC == True:
        txt = "{:03d}".format(int(txt))
        filename_sans_extension = f'AC{txt}-{filename_sans_extension.split("-")[-1]}'
    if tables:
        os.makedirs(os.path.join(directory, filename_sans_extension), exist_ok=True)
    for i, table in enumerate(tables):
        table_filename = "table-{:03d}.png".format(i)
        table_filepath = os.path.join(
            directory, filename_sans_extension, table_filename
        )
        files.append(table_filepath)
    print(f'done for {f}')
       

1
number is 57
number should still be 57
Extracted text: 57
done for /home/hennes/Internship/pdfs/PC009-01.png
0
number is 57
number should still be 57
Extracted text: 57
done for /home/hennes/Internship/pdfs/PC009-02.png
0
number is 57
number should still be 57
Extracted text: 57
done for /home/hennes/Internship/pdfs/PC009-03.png
0
number is 57
number should still be 57
Extracted text: 57
done for /home/hennes/Internship/pdfs/PC009-04.png
0
number is 57
number should still be 57
Extracted text: 57
done for /home/hennes/Internship/pdfs/PC009-05.png
0
number is 57
number should still be 57
Extracted text: 57
done for /home/hennes/Internship/pdfs/PC009-06.png
0
number is 57
number should still be 57
Extracted text: 57
done for /home/hennes/Internship/pdfs/PC009-07.png
0
number is 57
number should still be 57
Extracted text: 57
done for /home/hennes/Internship/pdfs/PC009-08.png
0
number is 57
number should still be 57
Extracted text: 57
done for /home/hennes/Internship/pdfs/PC009-09.png
0

0
number is 66
number should still be 66
Extracted text: 66
done for /home/hennes/Internship/pdfs/PC009-75.png


In [8]:
imglist = sorted(imglist)