# OCR on squared paper with shadow removal

In [None]:
import cv2
import numpy as np
from matplotlib import pyplot as plt
import pytesseract
import os
import subprocess

# Fall back to the default Windows install location when the `tesseract`
# binary cannot be launched from PATH.
try:
    subprocess.call(["tesseract"])
except FileNotFoundError:
    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

PARENT_DIR = os.path.dirname(os.path.dirname(os.path.realpath("FILEPATH")))
image_name = "011.jpg"
image_path = os.path.join(PARENT_DIR, "images", image_name)
image = cv2.imread(image_path, cv2.IMREAD_COLOR)

# cv2.imread returns None instead of raising when the file cannot be read;
# stop here rather than letting the OCR call below fail on a None image.
if image is None:
    raise FileNotFoundError("Error opening image: {}".format(image_path))

# Tesseract OCR before processing (baseline for the later comparison)
text = pytesseract.image_to_string(image)
print("Before processing:\n" + "\033[92m{}\033[00m".format(text))
# OpenCV loads colour images in BGR order while matplotlib expects RGB,
# so swap the channels for display only.
plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

In [None]:
# Work on a copy so the image loaded earlier stays untouched
img = np.copy(image)

# Shadow removal of the image, handled one colour plane at a time
rgb_planes = cv2.split(img)

result_planes = []
result_norm_planes = []
for plane in rgb_planes:
    # Dilation followed by a large median blur yields a smooth background
    # estimate (presumably wiping out the thin dark strokes and grid lines)
    dilated_img = cv2.dilate(plane, np.ones((7, 7), np.uint8))
    bg_img = cv2.medianBlur(dilated_img, 21)
    # Pixels close to the background estimate end up near 255
    diff_img = 255 - cv2.absdiff(plane, bg_img)
    # Stretch the plane to the full 0..255 range
    norm_img = cv2.normalize(diff_img, None, alpha=0, beta=255, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_8UC1)
    result_planes.append(diff_img)
    result_norm_planes.append(norm_img)

result = cv2.merge(result_planes)
result_norm = cv2.merge(result_norm_planes)

# Show the shadow-removed image. It is still a 3-channel BGR array, so a
# colormap would have no effect; convert to RGB for matplotlib instead.
plt.imshow(cv2.cvtColor(result_norm, cv2.COLOR_BGR2RGB))

In [None]:
# Collapse the shadow-removed image to a single channel. result_norm is
# overwritten with its grayscale version, so the guard keeps this cell
# re-runnable: converting an already single-channel image would raise.
if result_norm.ndim == 3:
    result_norm = cv2.cvtColor(result_norm, cv2.COLOR_BGR2GRAY)

# The adaptive threshold is computed only for the visual comparison below;
# the rest of the pipeline continues from the Otsu result.
adaptive = cv2.adaptiveThreshold(result_norm, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 11, 2)
otsu = cv2.threshold(result_norm, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

# Small opening on the Otsu image, then invert it (presumably so that the
# dark strokes become the white foreground for the morphology that follows)
bw = cv2.morphologyEx(otsu, cv2.MORPH_OPEN, np.ones((3, 3), np.uint8))
bw = cv2.bitwise_not(bw)

# Show difference between adaptive and otsu thresholding
plt.subplot(121), plt.imshow(adaptive, 'gray'), plt.title('adaptive')
plt.xticks([]), plt.yticks([])
plt.subplot(122), plt.imshow(otsu, 'gray'), plt.title('otsu')
plt.xticks([]), plt.yticks([])
plt.show()

In [None]:
# Line detectors are sized relative to the page: 1/30 of its width for
# horizontal lines and 1/30 of its height for vertical lines.
rows, cols = bw.shape[:2]
horizontal_size, verticalsize = cols // 30, rows // 30

# One-pixel-thick rectangular structuring elements for each direction
horizontalStructure = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontal_size, 1))
verticalStructure = cv2.getStructuringElement(cv2.MORPH_RECT, (1, verticalsize))

# Opening keeps only the runs that are at least as long as the element;
# the vertical result is additionally dilated with its own element.
vertical = cv2.dilate(
    cv2.morphologyEx(bw, cv2.MORPH_OPEN, verticalStructure, iterations=3),
    verticalStructure,
)
horizontal = cv2.morphologyEx(bw, cv2.MORPH_OPEN, horizontalStructure, iterations=2)

# From here on `horizontal` holds the combined grid: both directions are
# added together, thickened with a 3x3 kernel and inverted, which leaves
# the detected grid lines black.
horizontal = cv2.bitwise_not(
    cv2.dilate(cv2.add(vertical, horizontal), np.ones((3, 3)))
)

# Show extracted grid
plt.imshow(horizontal, cmap='gray')

In [None]:
# Keep only the pixels of the binarized page that lie outside the grid
# mask, then invert the result before handing it to Tesseract
remove_grid = cv2.bitwise_not(cv2.bitwise_and(bw, horizontal))
text = pytesseract.image_to_string(remove_grid)
print("Without opening:\n" + "\033[92m{}\033[00m".format(text))

# Opening followed by closing to get rid of some of the larger noise
opening = cv2.morphologyEx(remove_grid, cv2.MORPH_OPEN, np.ones((5, 5)), iterations=2)
closing = cv2.morphologyEx(opening, cv2.MORPH_CLOSE, np.ones((3, 3)))
text = pytesseract.image_to_string(closing)
print("With opening and closing:\n" + "\033[92m{}\033[00m".format(text))

# Side-by-side comparison of both variants on an enlarged figure
plt.rcParams['figure.figsize'] = [16, 10]
panels = (
    (121, remove_grid, 'Only removed grid'),
    (122, closing, 'With opening/closing added'),
)
for position, picture, caption in panels:
    plt.subplot(position)
    plt.imshow(picture, cmap='gray')
    plt.title(caption)
    plt.xticks([])
    plt.yticks([])
plt.show()

# Evaluation
Evaluate the method on all required images.
Compute the Levenshtein distance between the OCR output and the ground truth.

In [None]:
import cv2
import numpy as np
import pytesseract
import os
import subprocess
import pandas as pd
import re
from Levenshtein import distance

# Images for squared paper OCR are from 011.jpg to 020.jpg
image_names = [str(i).zfill(3) + ".jpg" for i in range(11, 21)]

PARENT_DIR = os.path.dirname(os.path.dirname(os.path.realpath("FILEPATH")))
GROUND_TRUTH_PATH = os.path.join(PARENT_DIR, "results", "ground_truth.tsv")
RESULTS_PATH = os.path.join(PARENT_DIR, "results", "squared_paper_ocr.tsv")


def _clean_squared_paper(image):
    """Return a binarized version of *image* with shadows and grid removed.

    *image* is a 3-channel array as returned by ``cv2.imread`` with
    ``cv2.IMREAD_COLOR``. The steps are: per-plane shadow removal, Otsu
    thresholding, detection of long horizontal/vertical runs (the grid),
    masking those runs out, and a final opening/closing against noise.
    The input array is not modified.
    """
    # Shadow removal: estimate the background of every plane with a
    # dilation plus median blur and take the inverted difference to it.
    norm_planes = []
    for plane in cv2.split(image):
        dilated = cv2.dilate(plane, np.ones((7, 7), np.uint8))
        background = cv2.medianBlur(dilated, 21)
        diff = 255 - cv2.absdiff(plane, background)
        norm_planes.append(
            cv2.normalize(diff, None, alpha=0, beta=255, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_8UC1)
        )
    gray = cv2.cvtColor(cv2.merge(norm_planes), cv2.COLOR_BGR2GRAY)

    # Otsu binarization, light opening, then inversion
    bw = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
    bw = cv2.morphologyEx(bw, cv2.MORPH_OPEN, np.ones((3, 3), np.uint8))
    bw = cv2.bitwise_not(bw)

    # Line detectors span 1/30 of the image width/height; clamp to 1 so
    # very small images do not produce an empty structuring element.
    rows, cols = bw.shape[:2]
    horizontal_structure = cv2.getStructuringElement(cv2.MORPH_RECT, (max(1, cols // 30), 1))
    vertical_structure = cv2.getStructuringElement(cv2.MORPH_RECT, (1, max(1, rows // 30)))
    horizontal = cv2.morphologyEx(bw, cv2.MORPH_OPEN, horizontal_structure, iterations=2)
    vertical = cv2.morphologyEx(bw, cv2.MORPH_OPEN, vertical_structure, iterations=3)
    vertical = cv2.dilate(vertical, vertical_structure)

    # Combine both directions, thicken slightly and invert so that the
    # grid is black and can be used as a mask.
    grid_mask = cv2.bitwise_not(cv2.dilate(cv2.add(vertical, horizontal), np.ones((3, 3))))
    remove_grid = cv2.bitwise_not(cv2.bitwise_and(bw, grid_mask))

    # Doing some opening/closing to remove some major noise
    opening = cv2.morphologyEx(remove_grid, cv2.MORPH_OPEN, np.ones((5, 5)), iterations=2)
    return cv2.morphologyEx(opening, cv2.MORPH_CLOSE, np.ones((3, 3)))


def _normalize_ocr_text(text):
    """Collapse newlines, tabs, carriage returns and repeated spaces.

    An empty OCR result is replaced by a single space, presumably so the
    cell is not parsed as a missing value when the TSV is read back.
    """
    text = re.sub(r'[\n\t\r]+', ' ', text)
    text = re.sub(r' +', ' ', text)
    return text or " "


# Path to tesseract executable (in case it isn't in your PATH).
# Checked once up front instead of once per image.
try:
    subprocess.call(["tesseract"])
except FileNotFoundError:
    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Load the ground truth (results/ground_truth.tsv) once for all images
ground_truth = pd.read_csv(GROUND_TRUTH_PATH, sep="\t")

# Start from a fresh results table with one row per ground-truth entry.
# "ocr" is 0 until an image has been processed; the distances default to -1.
# Writing it here replaces any results file left over from a previous run.
squared_paper_ocr = ground_truth[["input", "text"]].copy()
squared_paper_ocr["baseline_text"] = ""
squared_paper_ocr["method_text"] = ""
squared_paper_ocr["ocr"] = 0
squared_paper_ocr["baseline_dist"] = -1
squared_paper_ocr["method_dist"] = -1
squared_paper_ocr.to_csv(RESULTS_PATH, sep="\t", index=False)

for image_name in image_names:
    image = cv2.imread(os.path.join(PARENT_DIR, "images", image_name), cv2.IMREAD_COLOR)

    # cv2.imread returns None on failure; leave the row at its defaults
    if image is None:
        print('Error opening image {}'.format(image_name))
        continue

    row = squared_paper_ocr["input"] == image_name
    if not row.any():
        print('No ground truth for image {}'.format(image_name))
        continue
    truth_text = squared_paper_ocr.loc[row, "text"].values[0]

    method_text = _normalize_ocr_text(pytesseract.image_to_string(_clean_squared_paper(image)))
    baseline_text = _normalize_ocr_text(pytesseract.image_to_string(image))

    # Insert/update the squared_paper_ocr dataframe
    squared_paper_ocr.loc[row, "baseline_text"] = baseline_text
    squared_paper_ocr.loc[row, "method_text"] = method_text
    squared_paper_ocr.loc[row, "ocr"] = 1
    # Levenshtein distance between the ground truth and each OCR result
    squared_paper_ocr.loc[row, "baseline_dist"] = distance(truth_text, baseline_text)
    squared_paper_ocr.loc[row, "method_dist"] = distance(truth_text, method_text)

    # Save after every image so partial results survive an interrupted run
    squared_paper_ocr.to_csv(RESULTS_PATH, sep="\t", index=False)

# Load the squared_paper_ocr.tsv
squared_paper_ocr = pd.read_csv(RESULTS_PATH, sep="\t")
squared_paper_ocr