In [1]:
from PIL import Image
import cv2
from pytesseract import pytesseract
import numpy as np

In [2]:
# Location of the Tesseract executable that pytesseract is configured to use;
# adjust this path to match the local installation.
TESSERACT_CMD = '/bin/tesseract'
pytesseract.tesseract_cmd = TESSERACT_CMD

### Working on the original image (tree + text)

In [3]:
# Source dendrogram and the destination for its binarised copy.
image_path, gray_image_path = (
    '../test_data/Dendrogram.png',
    '../test_data/Dendrogram_gray.png',
)

In [4]:
# Load the dendrogram, upscale it 2x, convert it to grayscale and binarise it
# with an inverted Otsu threshold.
img = cv2.imread(image_path)
if img is None:
    # cv2.imread reports an unreadable/missing file by returning None.
    raise FileNotFoundError(f'Could not read image: {image_path}')
# cv2.resize takes dsize as (width, height) while ndarray.shape is
# (height, width, ...); unpack explicitly so the aspect ratio is preserved.
src_height, src_width = img.shape[:2]
img = cv2.resize(img, (src_width * 2, src_height * 2))
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
ret, thresh1 = cv2.threshold(img, 0, 255, cv2.THRESH_OTSU |
                                        cv2.THRESH_BINARY_INV)
# If the top-left pixel is dark, flip black and white so that it becomes white.
if thresh1[0,0]<200:
    thresh1 = np.where(thresh1>200, 0, 255)
# To read the text from an image (e.g. cropped fragment containing the labels), we need to invert white and black --- see below
# cv2.imwrite(gray_image_path, thresh1)
# text = pytesseract.image_to_string(gray_image_path)
# text[:-2]

In [5]:
# If the top-left pixel is bright, flip black and white so that it becomes 0
# before the image is dilated.
if thresh1[0,0]>200:
    thresh1 = np.where(thresh1>200, 0, 255)

# Dilate three times with a 12x12 rectangular kernel and keep a copy of the
# dilated image on disk for inspection.
rect_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (12, 12))
dilation = cv2.dilate(thresh1.astype('uint8'), rect_kernel, iterations = 3)
cv2.imwrite('../test_data/dilation_image.jpg', dilation)

# One (x, y, w, h) bounding rectangle per external contour of the dilated image.
contours, hierarchy = cv2.findContours(dilation, cv2.RETR_EXTERNAL,
                                       cv2.CHAIN_APPROX_NONE)
boxes = list(map(cv2.boundingRect, contours))

In [6]:
def draw_countours(image, boxes, output_path='../test_data/rectanglebox.jpg'):
    """Draw every bounding box onto ``image`` and save the annotated result.

    Args:
        image: Image array accepted by ``cv2.rectangle``. The rectangles are
            drawn onto it, so pass a copy if the original must stay clean.
        boxes: Iterable of ``(x, y, w, h)`` bounding boxes.
        output_path: File the annotated image is written to.

    The file is written exactly once, after all boxes have been drawn
    (and therefore also when ``boxes`` is empty).
    """
    annotated = image
    for x, y, w, h in boxes:
        # Draw the bounding box on the text area
        annotated = cv2.rectangle(annotated, (x, y), (x + w, y + h), (0, 255, 0), 2)

    # Writing inside the loop would re-encode the same file once per box;
    # only the final state is needed.
    cv2.imwrite(output_path, annotated)


In [7]:
def save_boxes_as_images(image, boxes, char_height=35, output_dir='../test_data'):
    """Crop each bounding box out of ``image`` and save the crops as PNG files.

    The box whose bottom edge (``y + h``) is smallest is treated as the tree
    and written unscaled to ``only_tree.png``. The remaining boxes are treated
    as labels: they are ordered by their right edge (``x + w``), rescaled to
    ``char_height`` pixels tall with their aspect ratio preserved, and written
    to ``0.png``, ``1.png``, ... in that order.

    Args:
        image: Image array indexed as ``image[row, column]``.
        boxes: Iterable of ``(x, y, w, h)`` bounding boxes.
        char_height: Target height in pixels of every saved label crop.
        output_dir: Directory the PNG files are written to.

    Raises:
        ValueError: If ``boxes`` is empty.
    """
    # Descending by bottom edge, so the last element is the top-most box.
    boxes_sorted_y = sorted(boxes, key=lambda box: box[1] + box[3], reverse=True)
    if not boxes_sorted_y:
        raise ValueError('boxes is empty: there is nothing to crop')

    x, y, w, h = boxes_sorted_y.pop()
    cv2.imwrite(f'{output_dir}/only_tree.png', image[y:y + h, x:x + w])

    # Left to right, by right edge.
    boxes_labels_sorted_x = sorted(boxes_sorted_y, key=lambda box: box[0] + box[2])
    for b_idx, (x, y, w, h) in enumerate(boxes_labels_sorted_x):
        cropped = image[y:y + h, x:x + w]
        # ndarray.shape is (height, width, ...) but cv2.resize wants
        # (width, height): scale the width by the same factor as the height.
        crop_height, crop_width = cropped.shape[:2]
        new_width = max(1, int(crop_width * char_height / crop_height))
        cropped = cv2.resize(cropped, (new_width, char_height))
        cv2.imwrite(f'{output_dir}/{b_idx}.png', cropped)

In [8]:
# Work on a copy so the rectangles drawn below do not end up in ``img``.
im2 = img.copy()
# Crop first: draw_countours paints rectangles onto im2, and the saved crops
# should not contain them.
save_boxes_as_images(image=im2, boxes=boxes)
draw_countours(image=im2, boxes=boxes)

Let's try to find a box size that allows us to read the text

In [9]:
LABEL_CHARS = 'ABCDEFGHI'


def print_if_label_found(text, char_height):
    """Print ``text`` and ``char_height`` if ``text`` contains any character of ``LABEL_CHARS``."""
    if any(char in text for char in LABEL_CHARS):
        print(text)
        print(char_height)


for h in range(60, 80, 2):
    im2 = img.copy()
    save_boxes_as_images(im2, boxes, char_height=h)
    # One label file per box except the tree box, hence len(boxes) - 1.
    for b_idx in range(len(boxes)-1):
        box_path = f'../test_data/{b_idx}.png'
        gray_box_path = f'../test_data/{b_idx}_gray.png'

        # First attempt: OCR the rescaled crop exactly as it was saved.
        print_if_label_found(pytesseract.image_to_string(box_path), h)

        # let's try make it gray again and - if necessary - reverse the colors
        img_b = cv2.imread(box_path)
        img_b = cv2.cvtColor(img_b, cv2.COLOR_BGR2GRAY)
        # A separate name keeps the full-image ``thresh1`` from being overwritten.
        ret, thresh_box = cv2.threshold(img_b, 0, 255, cv2.THRESH_OTSU |
                                                    cv2.THRESH_BINARY_INV)
        # Flip when the top-left pixel is dark so it becomes white -- the same
        # check the manually cropped fragment uses before OCR (the ``>200``
        # variant is the one used before dilation).
        if thresh_box[0,0]<200:
            thresh_box = np.where(thresh_box>200, 0, 255)
        cv2.imwrite(gray_box_path, thresh_box)
        print_if_label_found(pytesseract.image_to_string(gray_box_path), h)
draw_countours(im2, boxes)


Manually cropped fragment

In [10]:
# Manually cropped label strip and the destination for its binarised copy.
image_path, gray_image_path = (
    '../test_data/Dendrogram_only_text.png',
    '../test_data/Dendrogram_only_text_gray.png',
)

In [11]:
# Load the manually cropped label strip, binarise it and run OCR on the result.
img = cv2.imread(image_path)
if img is None:
    # cv2.imread reports an unreadable/missing file by returning None.
    raise FileNotFoundError(f'Could not read image: {image_path}')
img = cv2.resize(img, (300, 75))  # dsize is (width, height)
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
ret, thresh1 = cv2.threshold(img, 0, 255, cv2.THRESH_OTSU |
                                        cv2.THRESH_BINARY_INV)
# If the top-left pixel is dark, flip black and white so that it becomes white.
if thresh1[0,0]<200:
    thresh1 = np.where(thresh1>200, 0, 255)
cv2.imwrite(gray_image_path, thresh1)
text = pytesseract.image_to_string(gray_image_path)
# Strip surrounding whitespace rather than blindly dropping the last two
# characters, which would eat real text if the trailer were ever shorter.
text.strip()

'ABC DEF GH I'