In [4]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import cv2

import pathlib as pl
from tqdm import tqdm
import pytesseract as pt
import os

In [2]:
def sort_contours(cnts, method='left-to-right'):
    """Sort contours by the position of their bounding rectangles.

    Parameters
    ----------
    cnts : iterable
        Contours; each one is passed to ``cv2.boundingRect``.
    method : str
        ``'left-to-right'`` (default) or ``'right-to-left'`` sort on the x
        coordinate of the bounding box; ``'top-to-bottom'`` or
        ``'bottom-to-top'`` sort on its y coordinate. Any other value behaves
        like ``'left-to-right'``.

    Returns
    -------
    tuple
        ``(contours, bounding_boxes)`` — two tuples of equal length in the
        requested order. Both are empty when ``cnts`` is empty.
    """
    cnts = list(cnts)
    if not cnts:
        # zip(*[]) yields nothing to unpack, so handle "no contours" explicitly.
        return ((), ())

    descending = method in ('right-to-left', 'bottom-to-top')
    # Index into the (x, y, w, h) bounding box: 0 sorts on x, 1 sorts on y.
    axis = 1 if method in ('top-to-bottom', 'bottom-to-top') else 0

    boxes = [cv2.boundingRect(c) for c in cnts]
    ordered = sorted(zip(cnts, boxes),
                     key=lambda pair: pair[1][axis],
                     reverse=descending)
    sorted_cnts, sorted_boxes = zip(*ordered)
    return (sorted_cnts, sorted_boxes)

def _report_progress(loop, message):
    """Show ``message`` on the progress bar, if one was supplied."""
    if loop is not None:
        loop.set_postfix_str(message)


def _group_boxes_into_rows(boxes, row_tolerance):
    """Split boxes (already ordered by y) into rows.

    A box stays in the current row while its top edge is at most
    ``row_tolerance`` below the top edge of the previous box; otherwise it
    starts a new row. The final row is always kept, even if it holds a single
    box.
    """
    rows = []
    current = []
    previous = None
    for candidate in boxes:
        if previous is not None and candidate[1] > previous[1] + row_tolerance:
            rows.append(current)
            current = []
        current.append(candidate)
        previous = candidate
    if current:
        rows.append(current)
    return rows


def _ocr_cell(cell_img, lang, config):
    """Pad, upscale and clean one cell crop, then return its OCR text on one line."""
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 1))
    bordered = cv2.copyMakeBorder(cell_img, 2, 2, 2, 2, cv2.BORDER_CONSTANT, value=[255, 255])
    enlarged = cv2.resize(bordered, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)
    dilated = cv2.dilate(enlarged, kernel, iterations=1)
    eroded = cv2.erode(dilated, kernel, iterations=2)
    text = pt.image_to_string(eroded, config=config, lang=lang)
    return text.replace('\n', '').strip(' ')


def ocr_core(img, iterations=4, loop=None, lang='rus', config='--psm 3',
             max_box_size=(1000, 500)):
    """Detect the cell grid of a table image and OCR every cell.

    The ruling lines are extracted with morphological opening, the contours of
    the resulting line mask are taken as cell boxes, boxes are grouped into
    rows and columns, and each cell crop is passed to Tesseract.

    Parameters
    ----------
    img : numpy.ndarray
        Image of the table. Otsu thresholding is applied to it directly, so an
        8-bit single-channel (grayscale) image is expected. It is not modified.
    iterations : int
        Number of erode/dilate iterations used to isolate the ruling lines.
    loop : optional
        Progress-bar-like object with ``set_postfix_str`` (e.g. a ``tqdm``
        instance); status messages are written to it when given.
    lang : str
        Language passed to ``pytesseract.image_to_string``.
    config : str
        Tesseract configuration string passed to ``image_to_string``.
    max_box_size : tuple
        ``(width, height)``; contours whose bounding box is not strictly
        smaller in both dimensions are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per detected table row and one column per detected table
        column, holding the recognised text; rows whose cells are all empty
        strings are removed. An empty frame is returned when no cell is found.
    """
    _report_progress(loop, 'Preprocessing...')

    _, img_bin = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Line kernels are 1/100 of the image width; never let them collapse to 0,
    # which getStructuringElement rejects.
    kernel_len = max(1, np.shape(img)[1] // 100)
    ver_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, kernel_len))
    hor_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_len, 1))
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))

    # Erode-then-dilate with a thin kernel keeps only long vertical / horizontal strokes.
    vertical_lines = cv2.dilate(
        cv2.erode(img_bin, ver_kernel, iterations=iterations),
        ver_kernel, iterations=iterations)
    horizontal_lines = cv2.dilate(
        cv2.erode(img_bin, hor_kernel, iterations=iterations),
        hor_kernel, iterations=iterations)

    # Combine both line sets into one grid mask and binarise it again.
    img_vh = cv2.addWeighted(vertical_lines, 0.5, horizontal_lines, 0.5, 0.0)
    img_vh = cv2.erode(~img_vh, kernel, iterations=iterations // 2)
    _, img_vh = cv2.threshold(img_vh, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # XOR with the grid mask followed by NOT gives the image the cells are cropped from.
    bitnot = cv2.bitwise_not(cv2.bitwise_xor(img, img_vh))

    contours, _ = cv2.findContours(img_vh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    if len(contours) == 0:
        _report_progress(loop, 'Got 0 rows, 0 cols')
        return pd.DataFrame()

    _, bounding_boxes = sort_contours(contours, method='top-to-bottom')

    # Mean height over *all* contours sets how far apart two tops may be
    # while still counting as the same row.
    mean_height = np.mean([bb[3] for bb in bounding_boxes])

    max_w, max_h = max_box_size
    boxes = [list(bb) for bb in bounding_boxes if bb[2] < max_w and bb[3] < max_h]

    rows = _group_boxes_into_rows(boxes, mean_height / 2)
    if not rows:
        _report_progress(loop, 'Got 0 rows, 0 cols')
        return pd.DataFrame()

    # The row with the most boxes defines the number of columns and their centres.
    widest_row = max(rows, key=len)
    n_cols = len(widest_row)
    centers = np.sort(np.array([int(x + w / 2) for x, _, w, _ in widest_row]))

    # Put every box into the column whose centre is nearest.
    grid = []
    for row_boxes in rows:
        cells = [[] for _ in range(n_cols)]
        for cell_box in row_boxes:
            x, _, w, _ = cell_box
            # NOTE(review): the reference point is x + w/4, not the box centre;
            # presumably this biases wide (merged) cells toward their left-most
            # column. Kept as in the original; confirm it is intended.
            nearest = int(np.argmin(np.abs(centers - (x + w / 4))))
            cells[nearest].append(cell_box)
        grid.append(cells)

    table = []
    for row_number, cells in enumerate(grid, start=1):
        texts = []
        for cell_boxes in cells:
            # Several boxes may fall into one cell; their texts are joined by spaces.
            pieces = [
                _ocr_cell(bitnot[y:y + h, x:x + w], lang, config)
                for x, y, w, h in cell_boxes
            ]
            texts.append(' '.join(pieces).strip(' '))
        table.append(texts)
        _report_progress(loop, f'OCR {row_number / len(grid) * 100:.0f}% done...')

    df = pd.DataFrame(table)
    df = df[~(df == '').all(axis=1)]

    _report_progress(loop, f'Got {df.shape[0]} rows, {df.shape[1]} cols')

    return df

In [6]:
images_dir = pl.Path('./../gen_images_from_excel/images')
output_path = pl.Path('./output.csv')
target_header = ('inn_ucb', 'ogrn_ucb', 'spark_id_ucb', 'LegalAddressFTS')

# Start from a clean file: every image appends its rows to it below.
# Done with pathlib rather than a shell `rm` so the cell is portable.
if output_path.exists():
    output_path.unlink()

# glob() yields files in no particular order; sort so the CSV row order is
# the same on every run.
images_all = sorted(images_dir.glob('*.png'))
loop = tqdm(images_all, total=len(images_all), desc='Processing images')
for image_path in loop:
    image = cv2.imread(str(image_path))
    if image is None:
        # cv2.imread signals an unreadable file by returning None, not by raising.
        tqdm.write(f'Skipping unreadable image: {image_path}')
        continue
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    df = ocr_core(gray, loop=loop)
    if df.empty:
        continue

    # No header row is written: the number of detected columns can differ
    # between images, so target_header is not guaranteed to match.
    df.to_csv(output_path, mode='a', header=False, index=False)

Processing images: 100%|██████████| 4/4 [00:14<00:00,  3.63s/it, Got 25 rows, 4 cols]
