In [7]:
import numpy as np
import os
import cv2
import pytesseract
import matplotlib.pyplot as plt
import matplotlib
import networkx as nx
from glue_lines import GlueLines

In [8]:
# Collect the names of the files that sit directly inside ../images.
# os.walk is top-down by default, so the first (dirpath, dirnames, filenames)
# triple it yields describes ../images itself. Taking only that triple with
# next() avoids traversing every subdirectory, and the default value avoids an
# IndexError when the directory is missing (os.walk then yields nothing).
tree = os.walk('../images')
files = next(tree, ('../images', [], []))[2]
# Drop the '.DS_Store' entry so it is not handed to the image loader.
files = [name for name in files if name != '.DS_Store']

In [13]:
def is_to_union(rect1, rect2, threshold):
    """Decide whether ``rect2`` is close enough to ``rect1`` to be merged.

    Both rectangles are indexed as ``[(left, top), (right, bottom)]``.
    Three conditions must all hold, each relaxed by ``threshold``:

    * ``rect2`` starts no further right than ``rect1``'s right edge;
    * ``rect2`` starts no lower than ``rect1``'s bottom edge;
    * ``rect1`` starts no lower than ``rect2``'s bottom edge.

    There is no check that ``rect1``'s left edge is within reach of
    ``rect2``'s right edge, so the test is not symmetric in its arguments.
    ``union_rectangles`` calls it with rectangles sorted by left edge.

    Returns:
        True when all three conditions hold, False otherwise.
    """
    first_top_left, first_bottom_right = rect1[0], rect1[1]
    second_top_left, second_bottom_right = rect2[0], rect2[1]

    # Horizontal reach: rect2 must begin before rect1 ends (plus slack).
    if not (second_top_left[0] <= first_bottom_right[0] + threshold):
        return False
    # Vertical reach, one direction: rect2 must not begin below rect1.
    if not (second_top_left[1] <= first_bottom_right[1] + threshold):
        return False
    # Vertical reach, other direction: rect1 must not begin below rect2.
    return first_top_left[1] <= second_bottom_right[1] + threshold

def union_rectangles(rectangles, threshold=10):
    """Greedily merge nearby rectangles into common bounding boxes.

    Rectangles are ``[(left, top), (right, bottom)]`` pairs. They are sorted
    by left edge; then every rectangle that has not been absorbed yet seeds a
    group and, in one left-to-right scan over the later rectangles, absorbs
    each one that ``is_to_union`` accepts for the group's current (growing)
    bounding box.

    Every input rectangle contributes to exactly one output rectangle: a
    rectangle already absorbed by an earlier group is skipped by later groups,
    so the same component is never merged into two different boxes.

    The scan is a single pass: rectangles rejected before the group's box grew
    are not revisited for that group.

    Args:
        rectangles: iterable of ``[(left, top), (right, bottom)]`` pairs.
        threshold: slack, in the same units as the coordinates, forwarded to
            ``is_to_union``.

    Returns:
        A new list of rectangles. Merged groups are fresh
        ``[(left, top), (right, bottom)]`` lists; rectangles that absorbed
        nothing are returned as the original objects. An empty input yields
        an empty list.
    """
    rectangles = sorted(rectangles, key=lambda rec: rec[0][0])
    unioned = [False] * len(rectangles)
    new_rectangles = []
    for i, cur_rect in enumerate(rectangles):
        if unioned[i]:
            continue
        for j in range(i + 1, len(rectangles)):
            # A rectangle that already belongs to an earlier group must not
            # be merged a second time.
            if unioned[j]:
                continue
            candidate = rectangles[j]
            if is_to_union(cur_rect, candidate, threshold):
                # The sort guarantees cur_rect's left edge is the smallest,
                # so only the other three edges can move.
                cur_rect = [(cur_rect[0][0],
                             min(cur_rect[0][1], candidate[0][1])),
                            (max(cur_rect[1][0], candidate[1][0]),
                             max(cur_rect[1][1], candidate[1][1]))]
                unioned[j] = True
        unioned[i] = True
        new_rectangles.append(cur_rect)
    return new_rectangles

# Size passed to cv2.resize for every saved crop.
SAVING_SIZE = (32, 32)
# Directory the numbered PNG crops are written to.
OUTPUT_DIR = '../data'
# Bounding boxes covering fewer pixels than this are ignored.
threshold_area = 40

# cv2.imwrite reports failure by returning False rather than raising, so a
# missing output directory would silently produce no files. Create it first.
os.makedirs(OUTPUT_DIR, exist_ok=True)

img_num = 0
for file in files:
    img = cv2.imread(f'../images/{file}', cv2.IMREAD_GRAYSCALE)
    if img is None:
        # cv2.imread returns None for files it cannot decode; skip them
        # instead of crashing in the next OpenCV call.
        continue
    # Invert intensities (presumably so dark strokes become the bright
    # foreground — confirm against the input images).
    img = cv2.bitwise_not(img)

    # Otsu picks the binarisation threshold automatically; the 0 is ignored.
    dst = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
    # cdst is not used further in this cell; kept in case later cells read it.
    cdst = cv2.cvtColor(dst, cv2.COLOR_GRAY2BGR)
    cdst2 = cv2.cvtColor(dst, cv2.COLOR_GRAY2BGR)

    # Detect straight line segments in the binarised image.
    lines = cv2.HoughLinesP(image=dst,
                            rho=1,
                            theta=(np.pi / 180),
                            threshold=50,
                            lines=None,
                            minLineLength=45,
                            maxLineGap=5
    )
    if lines is not None:
        # HoughLinesP wraps each segment in an extra axis; unwrap to 4 values.
        lines = [line[0] for line in lines]

        # GlueLines is project code; presumably it merges near-collinear
        # segments — see glue_lines for the exact behaviour.
        gl = GlueLines(lines)
        needed_lines = gl.get_glued_lines()
        for l in needed_lines:
            # Paint the segment black (the background colour of cdst2) to
            # erase it before looking for contours.
            cv2.line(cdst2, (l[0], l[1]), (l[2], l[3]), (0, 0, 0), 3, cv2.LINE_AA)

    black_cdst2 = cv2.cvtColor(cdst2, cv2.COLOR_BGR2GRAY)
    # findContours returns (contours, hierarchy) on OpenCV 2.x/4.x and
    # (image, contours, hierarchy) on 3.x; [-2] is the contour list in both.
    letters = cv2.findContours(black_cdst2, cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE)[-2]
    rectangles = []
    for letter in letters:
        (left, top, width, height) = cv2.boundingRect(letter)
        if width * height >= threshold_area:
            tl = (left, top)
            br = (left + width, top + height)
            rectangles.append([tl, br])
    new_rectangles = union_rectangles(rectangles)
    for rect in new_rectangles:
        cur_img = cdst2[rect[0][1]:rect[1][1], rect[0][0]:rect[1][0]]
        width = rect[1][0] - rect[0][0]
        height = rect[1][1] - rect[0][1]
        # Pad the crop to a square (centred, black border) so the resize does
        # not distort its aspect ratio.
        img_size = max(width, height)
        # Use the crop's own dtype instead of the float64 default so the saved
        # image does not depend on an implicit conversion inside imwrite.
        sqared_img = np.zeros(shape=(img_size, img_size, 3), dtype=cur_img.dtype)
        w_indent = (img_size - width) // 2
        h_indent = (img_size - height) // 2
        sqared_img[h_indent:(h_indent + height), w_indent:(w_indent + width)] = cur_img
        saving_img = cv2.resize(sqared_img, SAVING_SIZE, interpolation=cv2.INTER_AREA)
        # Crops are numbered consecutively across all input files.
        img_num += 1
        cv2.imwrite(f'{OUTPUT_DIR}/{img_num}.png', saving_img)