# Automatic image filtering

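Set `EVALUATION = False` to run the filters interactively (each step then plots its result and prints the OCR text). Keep `EVALUATION = True` to run the evaluation section at the end of the notebook.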

In [None]:
# Mode switch read by the filter functions and by the evaluation cell.
# True: filters return their result silently and the evaluation cell is allowed to run.
# False: filters also plot a before/after comparison and print the OCR text of their output.
EVALUATION = True

In [None]:
import shutil
import subprocess

import cv2
import numpy as np
import pytesseract
from matplotlib import pyplot as plt

# Path to tesseract executable (in case it isn't in your PATH).
# shutil.which only looks the binary up on PATH: unlike launching `tesseract`,
# it starts no process and prints no usage text into the output.
if shutil.which("tesseract") is None:
    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

In [None]:
def comparison_plot(image1, image2, operation, title1="Source Image"):
    """Show two images side by side in grayscale with their axis ticks hidden.

    ``image1`` is drawn in the left panel under ``title1``; ``image2`` is
    drawn in the right panel under ``operation``.
    """
    panels = (
        (121, image1, title1),
        (122, image2, operation),
    )
    for position, picture, caption in panels:
        plt.subplot(position)
        plt.imshow(picture, 'gray')
        plt.title(caption)
        # Empty tick lists remove the pixel-coordinate axes.
        plt.xticks([])
        plt.yticks([])
    plt.show()

In [None]:
def tesseract_text(image, operation):
    """Run Tesseract OCR on ``image`` and print the outcome.

    The printed block is the ``operation`` label, a newline, then the
    recognised text, wrapped in ANSI colour escape codes (bright green).
    """
    recognised = pytesseract.image_to_string(image)
    labelled = operation + '\n' + recognised
    print("\033[92m{}\033[00m".format(labelled))

In [None]:
def canny_edge_detection(image):
    """Return the Canny edge map of ``image``.

    The detector runs with thresholds 80 and 100 and an aperture size of 3.
    When ``EVALUATION`` is false the edge map is also plotted next to the
    input and sent through OCR for printing.
    """
    edge_map = cv2.Canny(image, 80, 100, apertureSize=3)
    if EVALUATION:
        return edge_map

    comparison_plot(image, edge_map, "Canny edge detection", "Black and White image")
    tesseract_text(edge_map, "Canny edge detection")
    return edge_map

In [None]:
def denoising(image):
    """Return ``image`` after non-local-means denoising.

    When ``EVALUATION`` is false the result is also plotted next to the
    input and sent through OCR for printing.
    """
    # Positional arguments after the (unused) destination buffer, in order.
    filter_strength, template_window, search_window = 30.0, 7, 21
    cleaned = cv2.fastNlMeansDenoising(image, None, filter_strength, template_window, search_window)
    if EVALUATION:
        return cleaned

    comparison_plot(image, cleaned, "Denoised")
    tesseract_text(cleaned, "Denoising")
    return cleaned

In [None]:
def gaussian_blur(image):
    """Return ``image`` smoothed with a 7x7 Gaussian kernel.

    When ``EVALUATION`` is false the result is also plotted next to the
    input and sent through OCR for printing.
    """
    kernel_size = (7, 7)
    # A sigma of 0 asks OpenCV to derive the deviation from the kernel size.
    smoothed = cv2.GaussianBlur(image, kernel_size, 0)
    if EVALUATION:
        return smoothed

    comparison_plot(image, smoothed, "Blurred")
    tesseract_text(smoothed, "Blurred")
    return smoothed

In [None]:
def thresholding(image):
    """Return the binary (0/255) version of ``image`` using Otsu's threshold.

    Unlike the other filter steps in this file, this one never plots or
    prints anything, whatever the value of ``EVALUATION``.
    """
    mode = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    # The first returned value is the threshold that was applied; only the image is needed.
    _, binary = cv2.threshold(image, 0, 255, mode)
    return binary

In [None]:
def erosion(image, erosion_size=5):
    """Return ``image`` eroded with a square structuring element.

    The element is a rectangle of side ``2 * erosion_size + 1`` anchored at
    its centre. When ``EVALUATION`` is false the result is also plotted next
    to the input and sent through OCR for printing.
    """
    side = 2 * erosion_size + 1
    anchor = (erosion_size, erosion_size)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (side, side), anchor)

    eroded = cv2.erode(image, kernel)
    if EVALUATION:
        return eroded

    comparison_plot(image, eroded, "Eroded")
    tesseract_text(eroded, "Eroded")
    return eroded

In [None]:
def dilation(image, dilation_size=5):
    """Return ``image`` dilated with a square structuring element.

    The element is a rectangle of side ``2 * dilation_size + 1`` anchored at
    its centre. When ``EVALUATION`` is false the result is also plotted next
    to the input and sent through OCR for printing.
    """
    side = 2 * dilation_size + 1
    anchor = (dilation_size, dilation_size)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (side, side), anchor)

    dilated = cv2.dilate(image, kernel)
    if EVALUATION:
        return dilated

    comparison_plot(image, dilated, "Dilation")
    tesseract_text(dilated, "Dilation")
    return dilated

In [None]:
def opening(image, dilation_size=5, iteration=1):
    """Return the morphological opening (erosion, then dilation) of ``image``.

    The structuring element is a rectangle of side ``2 * dilation_size + 1``
    anchored at its centre, and the operation is repeated ``iteration``
    times. When ``EVALUATION`` is false the result is also plotted next to
    the input and sent through OCR for printing.
    """
    side = 2 * dilation_size + 1
    anchor = (dilation_size, dilation_size)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (side, side), anchor)

    opened = cv2.morphologyEx(image, cv2.MORPH_OPEN, kernel, iterations=iteration)
    if EVALUATION:
        return opened

    comparison_plot(image, opened, "Opening")
    tesseract_text(opened, "Opening")
    return opened

In [None]:
def closing(image, dilation_size=1, iteration=1):
    """Return the morphological closing (dilation, then erosion) of ``image``.

    The structuring element is a rectangle of side ``2 * dilation_size + 1``
    anchored at its centre, and the operation is repeated ``iteration``
    times. When ``EVALUATION`` is false the result is also plotted next to
    the input and sent through OCR for printing.
    """
    side = 2 * dilation_size + 1
    anchor = (dilation_size, dilation_size)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (side, side), anchor)

    closed = cv2.morphologyEx(image, cv2.MORPH_CLOSE, kernel, iterations=iteration)
    if EVALUATION:
        return closed

    comparison_plot(image, closed, "Closing")
    tesseract_text(closed, "Closing")
    return closed

In [None]:
if __name__ == "__main__":
    # Demo run of the whole filter chain on a single sample image.
    plt.rcParams['figure.figsize'] = [15, 10]
    image_path = '../images/005.jpg'
    img = cv2.imread(image_path)
    # cv2.imread returns None instead of raising when the file cannot be read.
    # Raise explicitly: an `assert` would be stripped when Python runs with -O.
    if img is None:
        raise FileNotFoundError(
            "file could not be read, check with os.path.exists(): {}".format(image_path)
        )
    if not EVALUATION:
        tesseract_text(img, "Original")
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Main chain: edges -> denoise -> blur -> binarise.
    canny_res = canny_edge_detection(img)
    denois_res = denoising(canny_res)
    gaussian_res = gaussian_blur(denois_res)
    thresh_res = thresholding(gaussian_res)

    # Two alternative clean-ups, both starting from the binarised image:
    # erosion followed by dilation, and closing followed by opening.
    eroded_res = erosion(thresh_res, 1)
    dilated_res = dilation(eroded_res, 1)
    closing_res = closing(thresh_res, 2, 1)
    opening(closing_res, 2, 1)

## Evaluation

In [None]:
import cv2
import numpy as np
import pytesseract
import os
import subprocess
import pandas as pd
import re
from Levenshtein import distance

# Evaluation: OCR every test image twice (raw image = baseline, filtered image
# = method), compare both texts with the ground truth using the Levenshtein
# distance, and store the outcome in results/automatic_filtering.tsv.

if not EVALUATION:
    # Explicit exception rather than `assert False`: asserts vanish under `python -O`.
    raise RuntimeError("Evaluation is disabled: set EVALUATION = True to run this cell")


def _normalise_ocr_text(text):
    """Collapse newlines, tabs, carriage returns and space runs into single spaces.

    An empty OCR result is first replaced by a single space.
    """
    if text == "":
        text = " "
    text = re.sub(r'[\n\t\r]+', ' ', text)
    return re.sub(r' +', ' ', text)


image_names = [str(i).zfill(3) + ".jpg" for i in range(1, 11)]

# Remove 004.jpg: this image is not meant to be preprocessed as it is already black text on a white background
image_names.remove("004.jpg")

PARENT_DIR = os.path.dirname(os.path.dirname(os.path.realpath("FILEPATH")))
RESULTS_PATH = os.path.join(PARENT_DIR, "results", "automatic_filtering.tsv")

# Path to tesseract executable (in case it isn't in your PATH).
# Checked once, before the loop: the answer cannot change between images.
if shutil.which("tesseract") is None:
    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# The ground truth is loop-invariant, so it is read a single time.
ground_truth = pd.read_csv(os.path.join(PARENT_DIR, "results", "ground_truth.tsv"), sep="\t")

# Start from a fresh table holding every ground-truth row, marked as "not OCR'd yet".
automatic_filtering = ground_truth[["input", "text"]].copy()
automatic_filtering["baseline_text"] = ""
automatic_filtering["method_text"] = ""
automatic_filtering["ocr"] = 0
# distance is the Levenshtein distance between the ground truth and the OCR result, defaulting to -1
automatic_filtering["baseline_dist"] = -1
automatic_filtering["method_dist"] = -1
# Writing now overwrites any file left by a previous run and guarantees the
# file exists even if every image below has to be skipped.
automatic_filtering.to_csv(RESULTS_PATH, sep="\t", index=False)

for image_name in image_names:
    image = cv2.imread(os.path.join(PARENT_DIR, "images", image_name), cv2.IMREAD_COLOR)

    if image is None:
        # cv2.imread signals failure by returning None; skip the image instead
        # of crashing in cvtColor. Its row keeps ocr == 0 and distances of -1.
        print('Error opening image')
        continue

    expected_rows = ground_truth.loc[ground_truth["input"] == image_name, "text"]
    if expected_rows.empty:
        # Without a reference text there is nothing to measure against.
        print("No ground truth entry for {}".format(image_name))
        continue
    expected_text = expected_rows.values[0]

    img = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    canny_res = canny_edge_detection(img)
    denois_res = denoising(canny_res)
    gaussian_res = gaussian_blur(denois_res)
    thresh_res = thresholding(gaussian_res)
    eroded_res = erosion(thresh_res, 1)
    dilated_res = dilation(eroded_res, 1)

    # Remove special characters from the text (\n, \t, \r, and multiple spaces all become a single space)
    method_text = _normalise_ocr_text(pytesseract.image_to_string(dilated_res))
    baseline_text = _normalise_ocr_text(pytesseract.image_to_string(image))

    # Insert/update the automatic_filtering dataframe
    row_mask = automatic_filtering["input"] == image_name
    automatic_filtering.loc[row_mask, "text"] = expected_text
    automatic_filtering.loc[row_mask, "baseline_text"] = baseline_text
    automatic_filtering.loc[row_mask, "method_text"] = method_text
    automatic_filtering.loc[row_mask, "ocr"] = 1
    automatic_filtering.loc[row_mask, "baseline_dist"] = distance(expected_text, baseline_text)
    automatic_filtering.loc[row_mask, "method_dist"] = distance(expected_text, method_text)

    # Persist after every image so partial results survive a crash later in the loop.
    automatic_filtering.to_csv(RESULTS_PATH, sep="\t", index=False)

# Reload from disk so the displayed table is exactly what was stored.
automatic_filtering = pd.read_csv(RESULTS_PATH, sep="\t")
automatic_filtering
