Testing OpenCV's EAST text detection

In [None]:
%matplotlib inline
import random
import time
from pathlib import Path
from shutil import copy

import cv2
import numpy as np
import pandas as pd
import pytesseract
from fuzzywuzzy import fuzz, process
from imutils.object_detection import non_max_suppression
from ipywidgets import IntProgress, Label, VBox
from IPython.display import display
from matplotlib import pyplot as plt
from PIL import Image, ImageFilter
from skimage.measure import compare_ssim as ssim

import img_qc.img_qc as img_qc

# Default every matplotlib figure in this notebook to a 20 x 10 inch canvas.
plt.rcParams['figure.figsize'] = (20.0, 10.0)

In [None]:
def get_top_percent_of_image(cv2_image, percentage):
    """Return the top portion of an image.

    Args:
        cv2_image: image array whose first two axes are (height, width).
        percentage: fraction of the height to keep, e.g. 0.3 for the top 30%.

    Returns:
        The rows from 0 up to ``int(height * percentage)`` at full width.
    """
    rows, columns = cv2_image.shape[:2]
    last_row = int(rows * percentage)
    return cv2_image[0:last_row, 0:columns]


def crop_image_for_ocr(image_path, percentage=0.4, top_and_sides_padding=100):
    """Load an image from disk and crop the region that is sent to OCR.

    ``top_and_sides_padding`` pixels are trimmed off the left, right and top
    edges. The bottom edge sits at ``int(height * percentage)`` moved down by
    the same padding, so the top trim does not shorten the crop.

    Args:
        image_path: path of the image to load (anything ``str()`` can convert).
        percentage: fraction of the image height used for the bottom edge.
        top_and_sides_padding: number of pixels trimmed from the top and sides.

    Returns:
        The cropped image array as loaded by ``cv2.imread``.

    Raises:
        FileNotFoundError: if OpenCV cannot read ``image_path``.
    """
    image = cv2.imread(str(image_path))
    # cv2.imread reports failure by returning None instead of raising
    if image is None:
        raise FileNotFoundError(f'OpenCV could not read image: {image_path}')

    height, width = image.shape[:2]
    left = top_and_sides_padding
    top = top_and_sides_padding
    right = width - top_and_sides_padding
    bottom = int(height * percentage) + top_and_sides_padding
    return image[top:bottom, left:right]


def if_rgb_convert_to_gray(np_image):
    """Return a single-channel version of ``np_image``.

    Arrays with at most two dimensions are returned unchanged; anything with
    more dimensions is converted with ``cv2.COLOR_RGB2GRAY``.
    """
    if len(np_image.shape) <= 2:
        return np_image
    return cv2.cvtColor(np_image, cv2.COLOR_RGB2GRAY)


def bgr_imshow(bgr_image):
    """Display a BGR image with matplotlib.

    The channels are reordered to RGB first because that is the order
    ``plt.imshow`` is given here.
    """
    plt.imshow(cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB))
    plt.show()
    
    
def decode_predictions(scores, geometry, min_confidence=None):
    """Turn EAST output volumes into bounding boxes and confidences.

    Args:
        scores: array indexed as ``scores[0, 0, row, column]`` holding the
            text probability of every feature-map cell.
        geometry: array indexed as ``geometry[0, k, row, column]``; channels
            0-3 are the distances used to size the box and channel 4 is the
            rotation angle.
        min_confidence: cells scoring below this value are ignored. When left
            as ``None`` the notebook-level ``minimum_confidence`` variable is
            used, which keeps existing two-argument calls working.

    Returns:
        A tuple ``(rectangles, confidences)`` where ``rectangles`` is a list of
        ``(start_x, start_y, end_x, end_y)`` integer tuples and ``confidences``
        holds the matching scores.
    """
    if min_confidence is None:
        # backward compatible: fall back to the global set elsewhere in the notebook
        min_confidence = minimum_confidence

    # grab the number of rows and columns from the scores volume, then
    # initialize our set of bounding box rectangles and corresponding
    # confidence scores
    (number_of_rows, number_of_columns) = scores.shape[2:4]
    rectangles = []
    confidences = []

    # loop over the number of rows
    for y in range(0, number_of_rows):
        # extract the scores (probabilities), followed by the geometrical
        # data used to derive potential bounding box coordinates that surround text
        scores_data = scores[0, 0, y]
        x_data_0 = geometry[0, 0, y]
        x_data_1 = geometry[0, 1, y]
        x_data_2 = geometry[0, 2, y]
        x_data_3 = geometry[0, 3, y]
        angles_data = geometry[0, 4, y]

        # loop over the number of columns
        for x in range(0, number_of_columns):
            # if our score does not have a sufficient probability, ignore it
            if scores_data[x] < min_confidence:
                continue

            # compute the offset factor as our resulting feature maps will
            # be 4x smaller than the input image
            (offset_x, offset_y) = (x * 4.0, y * 4.0)

            # extract the rotation angle for the prediction and then
            # compute sin and cosine
            angle = angles_data[x]
            cos = np.cos(angle)
            sin = np.sin(angle)

            # use the geometry volume to derive the width and height of the bounding box
            bounding_box_height = x_data_0[x] + x_data_2[x]
            bounding_box_width = x_data_1[x] + x_data_3[x]

            # compute both the starting and ending (x, y)-coordinates for the
            # text prediction bounding box
            end_x = int(offset_x + (cos * x_data_1[x]) + (sin * x_data_2[x]))
            end_y = int(offset_y - (sin * x_data_1[x]) + (cos * x_data_2[x]))
            start_x = int(end_x - bounding_box_width)
            start_y = int(end_y - bounding_box_height)

            # add bounding box coordinates and probability score to respective lists
            rectangles.append((start_x, start_y, end_x, end_y))
            confidences.append(scores_data[x])

    # return a tuple of the bounding boxes and associated confidences
    return (rectangles, confidences)

In [None]:
# Directory on the external volume that holds every scanned issue.
data_dir_path = Path('/Volumes/fluffy/ProjectCeres/00_for_CRL/agrtfn')

# Collect the first page of every issue, leaving out macOS '.' index files.
page_1_paths_list = sorted(
    path
    for path in data_dir_path.glob('**/*_0001.tif')
    if not str(path.stem).startswith('.')
)
len(page_1_paths_list)

In [None]:
# Destination folder: the loading cell below globs 'data/images/*.tif', so the
# copies have to land there (and the folder has to exist before copying).
images_dir_path = Path('data/images')
images_dir_path.mkdir(parents=True, exist_ok=True)

# progress bar
progress_label = Label('Images to copy')
progress_bar = IntProgress(min=0, max=len(page_1_paths_list))
progress_widget = VBox([progress_label, progress_bar])
display(progress_widget)

# download copies of all page 1 files
for index, image_path in enumerate(page_1_paths_list, start=1):
    progress_label.value = image_path.name
    copy_to_path = images_dir_path.joinpath(image_path.name)
    copy(image_path, copy_to_path)
    progress_bar.value = index

This notebook processes the first page of every Tennessee Farm News issue found under `data_dir_path`.

In [None]:
# From here on work with the local copies instead of the external volume.
data_dir_path = Path('data/')
page_1_paths_list = sorted(
    path
    for path in data_dir_path.glob('images/*.tif')
    if not str(path.stem).startswith('.')
)
print(f'{len(page_1_paths_list)} images in page 1 paths list')

In [None]:
# Split the page-1 images into grayscale ('L') and colour ('RGB') groups.
# Images in any other mode are left out of both lists and of the total.
gray_image_paths_list = []
rgb_image_paths_list = []
for image_path in page_1_paths_list:
    # the context manager closes each file handle; without it one handle per
    # image stays open for as long as the loop runs
    with Image.open(image_path) as image:
        image_mode = image.mode
    if image_mode == 'L':
        gray_image_paths_list.append(image_path)
    elif image_mode == 'RGB':
        rgb_image_paths_list.append(image_path)

number_of_grayscale = len(gray_image_paths_list)
number_of_rgb = len(rgb_image_paths_list)
print(f'# of grayscale: {number_of_grayscale}')
print(f'      # of rgb: {number_of_rgb}')
print(f'  total images: {number_of_grayscale + number_of_rgb}')

In [None]:
# Pick one sample page for the experiments below. The index looks arbitrary;
# it raises IndexError when fewer than 2501 images were found.
image_path = page_1_paths_list[2500]

In [None]:
# Lower-case English month names that OCR'd words are compared against.
months = (
    'january february march april may june '
    'july august september october november december'
).split()

def return_date(image):
    """Binarize an image and run Tesseract OCR on it.

    Despite the name, no date is extracted here: the raw OCR results are
    returned and the callers look for month names themselves.

    Args:
        image: image array (colour or grayscale), not a file path. Arrays with
            more than two dimensions are converted with ``cv2.COLOR_BGR2GRAY``.

    Returns:
        A 4-tuple ``(ocr_dataframe, ocr_text, binarized, test_boxes)``:
        the ``image_to_data`` result as a pandas DataFrame, the plain OCR text,
        the binarized image that was OCR'd, and the ``image_to_boxes`` output.
    """
    # blur image and convert to grayscale if necessary
    image = cv2.bilateralFilter(image, 9, 9, 9)
    if len(image.shape) > 2:
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # binarize
    binarized = cv2.adaptiveThreshold(image, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 55, 11)

    # OCR
    # '\\ ' keeps the literal backslash-space the string always contained,
    # now without relying on an invalid escape sequence; presumably it is
    # there so the whitelist includes the space character -- TODO confirm
    config = '-c tessedit_char_whitelist=0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ\\  --psm 6'
    ocr_text = pytesseract.image_to_string(binarized, lang='eng', config=config)
    # Get verbose data including boxes, confidences, line and page numbers
    ocr_dataframe = pytesseract.image_to_data(binarized, lang='eng', output_type=pytesseract.Output.DATAFRAME, config=config)
    test_boxes = pytesseract.image_to_boxes(binarized, lang='eng', config=config)

    return ocr_dataframe, ocr_text, binarized, test_boxes

In [None]:
roi_box_list = ['text', 'left', 'top', 'width', 'height']

# return_date() works on a decoded image (not a path) and returns four values
image_to_crop = cv2.imread(str(image_path))
ocr_dataframe, ocr_text, binarized, test_boxes = return_date(image_to_crop)

# keep the box of the last OCR'd word that is a month name
box = None
for index, ocr_result in enumerate(ocr_dataframe['text']):
    # rows without recognised text hold NaN rather than a string
    if isinstance(ocr_result, str) and ocr_result.lower() in months:
        box = [ocr_dataframe.iloc[index][edge] for edge in roi_box_list]

print(box)
if box is None:
    print('No month found')
else:
    text = box[0]
    x1, y1, box_width, box_height = (int(value) for value in box[1:])
    x2 = x1 + box_width
    y2 = y1 + box_height

    # the box coordinates are relative to the image that was OCR'd
    image_cropped = image_to_crop[y1:y2, x1:x2]
    bgr_imshow(image_cropped)

In [None]:
def deskew(image):
    """Rotate an image so that its content sits level.

    The skew angle is estimated from the minimum-area rectangle around all
    foreground pixels of an inverted, Otsu-thresholded copy of the image.

    Args:
        image: BGR or single-channel image array.

    Returns:
        The rotated image, same size as the input. An input without any
        foreground pixels is returned as an unrotated copy.
    """
    # convert the image to grayscale unless it already is single-channel
    if len(image.shape) > 2:
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    else:
        gray = image

    # denoise, then flip foreground and background so the foreground is
    # "white" and the background is "black", and threshold so that all
    # foreground pixels become 255 and all background pixels 0
    gray = cv2.medianBlur(gray, 5)
    gray = cv2.bitwise_not(gray)
    binarized = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]

    # grab the coordinates of all pixel values that are greater than zero,
    # then use these coordinates to compute a rotated bounding box that
    # contains all of them
    coords = np.column_stack(np.where(binarized > 0))
    if len(coords) == 0:
        # nothing to measure on a blank image
        return image.copy()
    angle = cv2.minAreaRect(coords)[-1]

    # OpenCV releases before 4.5.1 report the angle in [-90, 0); newer ones
    # report it in (0, 90]. Map either range onto the small correction angle.
    if angle < -45:
        angle = -(90 + angle)
    elif angle >= 45:
        angle = 90 - angle
    else:
        angle = -angle

    # rotate the image around its center to deskew it
    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)

    return rotated

In [None]:
percentage = 0.4
top_and_sides_padding = 100
# fuzzy-match scores must be above this value to count as a month
best_ratio = 90

for image_path in page_1_paths_list[1500:1503]:
    print(image_path.stem)

    # same crop as everywhere else in the notebook, then straighten it
    image = crop_image_for_ocr(image_path, percentage=percentage, top_and_sides_padding=top_and_sides_padding)
    rotated = deskew(image)

    # OCR and load results as Pandas dataframe
    ocr_dataframe, ocr_text, binarized, test_boxes = return_date(rotated)

    # for every month keep the closest OCR'd word, if it is close enough
    words = ocr_text.lower().split()
    best_word = {}
    for month in months:
        result = process.extractOne(month, words)
        # extractOne returns None when there are no words to compare against
        if result is None:
            continue
        word, ratio = result[:2]
        if ratio > best_ratio:
            best_word[month] = word
    print(best_word)
    print(100 * '*')
    print(test_boxes)

In [None]:
percentage = 0.4
top_and_sides_padding = 100
roi_box_list = ['text', 'left', 'top', 'width', 'height']

for image_path in page_1_paths_list[::100]:
    print(image_path.stem)

    image = crop_image_for_ocr(image_path, percentage=percentage, top_and_sides_padding=top_and_sides_padding)
    original = image.copy()

    # OCR and load results as Pandas dataframe (return_date returns four values)
    ocr_dataframe, ocr_text, binarized, test_boxes = return_date(image)

    # Look for the first OCR'd word that is a month name and collect the ROI
    # data of that row plus the rows right after it (month, day, year).
    find_date = True
    boxes = []  # reset so the boxes never belong to an earlier image
    number_of_ocr_rows = len(ocr_dataframe)
    for index, ocr_result in enumerate(ocr_dataframe['text']):
        # rows without recognised text hold NaN rather than a string
        if not isinstance(ocr_result, str) or ocr_result.lower() not in months:
            continue

        box_month, box_day, box_year = [], [], []
        boxes = [box_month, box_day, box_year]
        row = index
        for box in boxes:
            # the month may be the last row: never read past the table
            if row >= number_of_ocr_rows:
                break
            # a non-text row leaves this box (and the following ones) empty
            if isinstance(ocr_dataframe.iloc[row]['text'], str):
                for data in roi_box_list:
                    box.append(ocr_dataframe.iloc[row][data])
                row += 1
        find_date = False
        break

    if not find_date:
        if len(image.shape) < 3:
            image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
        bgr_imshow(image)
        plt.imshow(binarized, cmap='gray')
        plt.show()

        fig, axes = plt.subplots(1, 3, figsize=(30, 10))
        fig.suptitle(image_path.stem, fontsize=16)
        for index, box in enumerate(boxes):
            print(index, box)

            # boxes that were left empty above have nothing to show
            if len(box) != len(roi_box_list):
                continue

            text = box[0]
            x1, y1, box_width, box_height = (int(value) for value in box[1:])
            x2 = x1 + box_width
            y2 = y1 + box_height

            image_cropped = image[y1:y2, x1:x2]
            image_cropped_rgb = cv2.cvtColor(image_cropped, cv2.COLOR_BGR2RGB)
            axes[index].imshow(image_cropped_rgb)
            axes[index].set_title(text)
        plt.show()
    else:
        print('No date found')

        if len(original.shape) < 3:
            original = cv2.cvtColor(original, cv2.COLOR_GRAY2BGR)
        bgr_imshow(original)
        plt.imshow(binarized, cmap='gray')
        plt.show()
    print(ocr_text)
        

In [None]:
fig, axes = plt.subplots(1, 3, figsize=(30, 10))

# The box coordinates come from OCR on the cropped image, so the crops have to
# be taken from the same cropped image, not from the full page.
# NOTE(review): relies on `boxes`, `image_path`, `percentage` and
# `top_and_sides_padding` still holding the values left by the loop above.
image_to_crop = crop_image_for_ocr(image_path, percentage=percentage, top_and_sides_padding=top_and_sides_padding)

for index, box in enumerate(boxes):
    print(box)
    # a box without all five ROI values cannot be drawn
    if len(box) != len(roi_box_list):
        continue

    text = box[0]
    x1, y1, box_width, box_height = (int(value) for value in box[1:])
    x2 = x1 + box_width
    y2 = y1 + box_height

    image_cropped = image_to_crop[y1:y2, x1:x2]
    image_cropped_rgb = cv2.cvtColor(image_cropped, cv2.COLOR_BGR2RGB)
    axes[index].imshow(image_cropped_rgb)
    axes[index].set_title(text)
plt.show()

In [None]:
# Inspect the OCR table left over from the last image processed above.
ocr_dataframe

In [None]:
# Load the page and keep only its top 30%.
image = get_top_percent_of_image(cv2.imread(str(image_path)), 0.3)
bgr_imshow(image)

# Smooth, drop the colour channels, then binarize with a local-mean threshold.
# (Alternatives tried earlier: upscaling with cv2.resize(fx=7, fy=7) and
# cv2.medianBlur(image, 9) instead of the Gaussian blur.)
image = cv2.cvtColor(cv2.GaussianBlur(image, (9, 9), 0), cv2.COLOR_BGR2GRAY)
image = cv2.adaptiveThreshold(
    image, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 55, 11
)

plt.imshow(image, cmap='gray')
plt.show()

In [None]:
# Restrict Tesseract to digits, letters and the comma. '\\ ' keeps the literal
# backslash-space the string always contained, without relying on an invalid
# escape sequence.
config = '-c tessedit_char_whitelist=0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ,\\  --psm 6'
text = pytesseract.image_to_string(image, lang='eng', config=config)

In [None]:
# Lower-case English month names to look for in the OCR text.
months = [
    'january', 'february', 'march', 'april', 'may', 'june',
    'july', 'august', 'september', 'october', 'november', 'december',
]

# For every word that is a month name, print it together with the rest of its line.
for line in text.split('\n'):
    words = line.split()
    for index, word in enumerate(words):
        if word.lower() not in months:
            continue
        print(word)
        print(words[index:])

In [None]:
# Sample page and settings for the EAST text detection experiment.
image_path = page_1_paths_list[-100]
# both resize dimensions must be a multiple of 32
resize_width = resize_height = 1600
minimum_confidence = 0.5
padding = 25

In [None]:
# Load the input image, keep an untouched copy and remember its size.
image = cv2.imread(str(image_path))
original = image.copy()
original_height, original_width = image.shape[:2]

# Target size, plus the ratios needed to map coordinates from the resized
# image back onto the original one.
new_width, new_height = resize_width, resize_height
ratio_width = original_width / float(new_width)
ratio_height = original_height / float(new_height)

# Resize the image and read back its new dimensions.
image = cv2.resize(image, (new_width, new_height))
height, width = image.shape[:2]

In [None]:
# Output layers requested from the EAST detector model:
#   1st: output probabilities
#   2nd: used to derive the bounding box coordinates of text
layer_names = ["feature_fusion/Conv_7/Sigmoid", "feature_fusion/concat_3"]

# Load the pre-trained EAST detector from the local data folder.
print("[INFO] loading EAST text detector . . . ")
net = cv2.dnn.readNet('data/frozen_east_text_detection.pb')

In [None]:
# Build a blob from the resized image, then time one forward pass of the
# model that yields the two requested output layers.
blob = cv2.dnn.blobFromImage(
    image,
    scalefactor=1.0,
    size=(width, height),
    mean=(123.68, 116.78, 103.94),
    swapRB=True,
    crop=False,
)
start = time.time()
net.setInput(blob)
scores, geometry = net.forward(layer_names)
end = time.time()

# show timing information on text prediction
print(f'[INFO] text detection took {round((end - start), 6)} seconds')

In [None]:
# Decode the EAST output into candidate boxes and their confidences.
# decode_predictions() (defined near the top of the notebook) holds exactly
# the logic that used to be copy-pasted here; it filters with the notebook-level
# `minimum_confidence` set a few cells above.
(rectangles, confidences) = decode_predictions(scores, geometry)

In [None]:
# apply non-maxima suppression to suppress weak, overlapping bounding boxes
boxes = non_max_suppression(np.array(rectangles), probs=confidences)

# Draw on a copy: `original` is cropped and handed to Tesseract further down,
# so it must stay free of drawn rectangles. This also keeps re-running the
# cell from stacking more rectangles on the same image.
annotated = original.copy()

# loop over the bounding boxes
for (start_x, start_y, end_x, end_y) in boxes:
    # scale the bounding box coordinates based on the respective ratios
    start_x = int(start_x * ratio_width)
    start_y = int(start_y * ratio_height)
    end_x = int(end_x * ratio_width)
    end_y = int(end_y * ratio_height)

    # draw the bounding box on the image
    cv2.rectangle(annotated, (start_x, start_y), (end_x, end_y), (0, 255, 0), 8)

# show the output image
bgr_imshow(annotated)

In [None]:
minimum_confidence = 0.9
padding = 5  # percent of the box size added around each box
(rectangles, confidences) = decode_predictions(scores, geometry)

# apply non-maxima suppression to suppress weak, overlapping bounding boxes
boxes = non_max_suppression(np.array(rectangles), probs=confidences)

# `padding` is a percentage; used as a raw multiplier it would grow every
# box by several times its own size instead of by a small margin
padding_fraction = padding / 100.0

# set tesseract parameters (identical for every box, so built once)
# language: english, oem: 1, LSTM neural net model, psm: 7, single line of text
config = ("-l eng --oem 1 --psm 7")

# initialize the list of results
results = []

# loop over the bounding boxes
for (start_x, start_y, end_x, end_y) in boxes:
    # scale the bounding box coordinates based on the respective ratios
    start_x = int(start_x * ratio_width)
    start_y = int(start_y * ratio_height)
    end_x = int(end_x * ratio_width)
    end_y = int(end_y * ratio_height)

    # obtain a better OCR by adding padding
    delta_x = int((end_x - start_x) * padding_fraction)
    delta_y = int((end_y - start_y) * padding_fraction)

    # apply the padding to each side of the bounding box, clamped to the image
    start_x = max(0, start_x - delta_x)
    start_y = max(0, start_y - delta_y)
    end_x = min(original_width, end_x + (delta_x * 2))
    end_y = min(original_height, end_y + (delta_y * 2))

    # extract the actual padded ROI
    roi = original[start_y:end_y, start_x:end_x]

    text = pytesseract.image_to_string(roi, config=config)

    # add the bounding box coordinates and OCR'd text to the list of results
    results.append(((start_x, start_y, end_x, end_y), text))
print(len(results))

In [None]:
# Order the results from top to bottom using each box's start_y coordinate.
results = sorted(results, key=lambda result: result[0][1])

# Show the first ten results, one annotated image per result.
for (start_x, start_y, end_x, end_y), text in results[:10]:
    # display the text OCR'd by Tesseract
    print('OCR TEXT')
    print('========')
    print(f'{text}\n')

    # keep only ASCII characters before handing the text to cv2.putText
    text = ''.join(c for c in text if ord(c) < 128).strip()

    # draw the bounding box and its text on a fresh copy of the page
    output = original.copy()
    cv2.rectangle(output, (start_x, start_y), (end_x, end_y), (0, 0, 255), 6)
    cv2.putText(output, text, (start_x, start_y - 20),
                cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 0, 255), 9)

    # show the output image
    bgr_imshow(output)