In [61]:
import cv2 as cv
import numpy
from PIL import Image
import scipy.ndimage
import scipy.signal
import pytesseract
import difflib
import os
import numpy as np

In [3]:
# Directories holding the scanned images and their ground-truth transcripts.
image_folder = "./images"
text_folder = "./source"

# Parallel lists: texts[i] is the reference transcript for images[i].
images = [
    "sample01.png",
    "sample02.png",
]
texts = [
    "sample01.txt",
    "sample02.txt",
]

In [4]:
def evaluate(actual, expected, print_score=True):
    """Score how closely OCR output matches the reference text.

    Parameters
    ----------
    actual : sequence (a string in this notebook)
        Text produced by the OCR run.
    expected : sequence (a string in this notebook)
        Ground-truth text to compare against.
    print_score : bool, default True
        When true, print the score formatted to five decimal places.

    Returns
    -------
    float
        The ``difflib.SequenceMatcher`` similarity ratio of the two inputs.
    """
    score = difflib.SequenceMatcher(None, actual, expected).ratio()
    if print_score:
        print(f"{score:.5f}")
    return score

# Base Image with OCR

In [5]:
# Baseline: run Tesseract on every unprocessed image and score it against its transcript.
for idx, image_name in enumerate(images):
    image_path = os.path.join(image_folder, image_name)
    image = Image.open(image_path)
    print(image.format, image.mode)
    image = image.convert("RGB")
    result = pytesseract.image_to_string(image)

    # Reference transcript paired with this image by position in `texts`.
    text_path = os.path.join(text_folder, texts[idx])
    with open(text_path, 'r') as f:
        base_text = f.read()

    print(result)
    evaluate(result, base_text)

PNG RGBA
Parking: You may park anywhere on the ce
king. Keep in mind the carpool hours and park
afternoon

Under School Age Children:While we love
inappropriate to have them on campus @ )
that they may be invited or can accompany :
you adhere to our _ policy for the benefit of

 

0.42293
PNG LA
Sonnet for Lena

 

0.05207


In [6]:
def otsu_thresholding(image_pil):
    """Binarise an RGB PIL image using OpenCV's Otsu threshold.

    The image is converted to grayscale, thresholded with
    ``cv.THRESH_BINARY + cv.THRESH_OTSU`` (output values 0 and 255), and
    converted back to a three-channel RGB PIL image.

    Parameters
    ----------
    image_pil : PIL.Image.Image
        Input image; it is converted with ``cv.COLOR_RGB2GRAY``, so it is
        expected to be in RGB mode.

    Returns
    -------
    PIL.Image.Image
        The binarised image as RGB.
    """
    gray = cv.cvtColor(numpy.array(image_pil), cv.COLOR_RGB2GRAY)
    # The threshold chosen by OpenCV is returned first; only the image is needed here.
    _, binary = cv.threshold(gray, 0, 255, cv.THRESH_BINARY + cv.THRESH_OTSU)
    binary_rgb = cv.cvtColor(binary, cv.COLOR_GRAY2RGB)
    return Image.fromarray(binary_rgb)

In [57]:
def threshold_image(image_np, threshold=0, max_value=255):
    """Binarise an array against a fixed threshold and wrap it in a PIL image.

    Parameters
    ----------
    image_np : numpy.ndarray
        Values to binarise (pixel intensities or pixel differences).
    threshold : float, default 0
        Values strictly below this become 0; all other values become
        ``max_value``.
    max_value : int, default 255
        Output value for elements at or above the threshold. The result is
        cast to ``uint8``, so this must lie in 0..255.

    Returns
    -------
    PIL.Image.Image
        Image built from the ``uint8`` array containing only 0 and
        ``max_value``.
    """
    image_result_np = np.where(image_np < threshold, 0, max_value).astype(np.uint8)
    return Image.fromarray(image_result_np)


def otsu_thresholding_in(image, max_value=255, is_normalized=False):
    """Binarise an image with a NumPy implementation of Otsu's method.

    The threshold is the histogram bin centre that maximises the
    between-class variance ``w1 * w2 * (mu1 - mu2) ** 2``.

    Parameters
    ----------
    image : PIL.Image.Image or array-like
        Image to threshold; per the original author's note it must be
        grayscale.
    max_value : int, default 255
        Value written to pixels at or above the threshold (previously this
        argument was accepted but ignored and 255 was always used).
    is_normalized : bool, default False
        When true, the histogram is divided by its largest count before the
        statistics are computed. This is a uniform rescaling, so the selected
        bin should not change.

    Returns
    -------
    tuple
        ``(image_result, threshold)`` where ``image_result`` is the binarised
        PIL image and ``threshold`` is the chosen bin centre.
    """
    image_np = np.array(image)

    # 256 bins for 8-bit data. No explicit range is given, so np.histogram
    # spans the image's own min..max and the bin centres (and therefore the
    # returned threshold) are generally not integers.
    bins_num = 256
    hist, bin_edges = np.histogram(image_np, bins=bins_num)
    if is_normalized:
        hist = np.divide(hist.ravel(), hist.max())

    # Centre of each bin, used as the representative intensity of that bin.
    bin_mids = (bin_edges[:-1] + bin_edges[1:]) / 2.

    # Class weights for every candidate split: w1(t) accumulates from the
    # dark end, w2(t) from the bright end.
    weight1 = np.cumsum(hist)
    weight2 = np.cumsum(hist[::-1])[::-1]

    # Class means mu1(t) (dark side) and mu2(t) (bright side).
    mean1 = np.cumsum(hist * bin_mids) / weight1
    mean2 = (np.cumsum((hist * bin_mids)[::-1]) / weight2[::-1])[::-1]

    # Between-class variance for a split after bin t: bins 0..t vs bins t+1..end.
    inter_class_variance = weight1[:-1] * weight2[1:] * (mean1[:-1] - mean2[1:]) ** 2

    index_of_max_val = np.argmax(inter_class_variance)
    threshold = bin_mids[:-1][index_of_max_val]

    image_result = threshold_image(image_np, threshold, max_value)
    return image_result, threshold

# Otsu thresholding

In [38]:
# OCR after OpenCV Otsu binarisation; prints one similarity score per image.
for idx, image_name in enumerate(images):
    image_path = os.path.join(image_folder, image_name)
    image = Image.open(image_path).convert("RGB")

    text_path = os.path.join(text_folder, texts[idx])
    with open(text_path, 'r') as f:
        base_text = f.read()

    image_th = otsu_thresholding(image)
    result_th = pytesseract.image_to_string(image_th)
    image_th.show()

    evaluate(result_th, base_text)

0.41019
0.03374


### Custom (NumPy) implementation of Otsu thresholding

In [58]:
# OCR after the NumPy Otsu implementation; images are loaded as 8-bit grayscale ("L").
for idx, image_name in enumerate(images):
    image_path = os.path.join(image_folder, image_name)
    image = Image.open(image_path).convert("L")

    text_path = os.path.join(text_folder, texts[idx])
    with open(text_path, 'r') as f:
        base_text = f.read()

    image_th, thresh = otsu_thresholding_in(image)
    print(f"Threshold pixel value={thresh}")
    image_th.show()
    result_th = pytesseract.image_to_string(image_th)

    evaluate(result_th, base_text)

Threshold pixel value=125.314453125
0.41019
Threshold pixel value=141.1875
0.03374


In [136]:
def adaptive_gaussian_tresholding(image_pil, block_size=11, C=8):
    """Binarise an RGB PIL image with OpenCV's adaptive Gaussian threshold.

    Parameters
    ----------
    image_pil : PIL.Image.Image
        Input image; it is converted with ``cv.COLOR_RGB2GRAY``, so it is
        expected to be in RGB mode.
    block_size : int, default 11
        Neighbourhood size passed to ``cv.adaptiveThreshold`` as
        ``blockSize``.
    C : float, default 8
        Constant passed to ``cv.adaptiveThreshold`` as ``C``.

    Returns
    -------
    PIL.Image.Image
        The binarised image (values 0 and 255) converted back to RGB.
    """
    img_cv = cv.cvtColor(numpy.array(image_pil), cv.COLOR_RGB2GRAY)
    th = cv.adaptiveThreshold(
        img_cv,
        255,
        cv.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv.THRESH_BINARY,
        block_size,
        C,
    )
    img_th_rgb = cv.cvtColor(th, cv.COLOR_GRAY2RGB)
    return Image.fromarray(img_th_rgb)

In [130]:
# https://stackoverflow.com/questions/29731726/how-to-calculate-a-gaussian-kernel-matrix-efficiently-in-numpy
def gaussian_kernel(kernel_size=7, std=1):
    """Build a 2-D Gaussian kernel whose entries sum to 1.

    Parameters
    ----------
    kernel_size : int, default 7
        Side length of the square kernel.
    std : float, default 1
        Standard deviation of the 1-D Gaussian window, in samples.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(kernel_size, kernel_size)``: the outer product of
        the 1-D window with itself, divided by its total.
    """
    # scipy.signal.gaussian was removed in SciPy 1.13; the window function
    # lives in scipy.signal.windows.
    gaussian_kernel_1d = scipy.signal.windows.gaussian(kernel_size, std=std)
    gaussian_kernel_2d = np.outer(gaussian_kernel_1d, gaussian_kernel_1d)
    return gaussian_kernel_2d / gaussian_kernel_2d.sum()


# https://www.mathworks.com/matlabcentral/fileexchange/8647-local-adaptive-thresholding
# https://homepages.inf.ed.ac.uk/rbf/HIPR2/adpthrsh.htm
def adaptive_gaussian_thresholding_in(image, max_value=255, block_size=7, C=0, std=1):
    """Binarise a grayscale image against a Gaussian-weighted local mean.

    For each pixel the Gaussian-weighted neighbourhood mean is computed by
    2-D convolution (symmetric border handling). A pixel becomes 0 when
    ``local_mean - pixel - C < 0`` and ``max_value`` otherwise.

    NOTE(review): with this rule, pixels brighter than ``local_mean - C``
    come out as 0, which is the opposite polarity of the
    ``cv.THRESH_BINARY`` mode used by ``adaptive_gaussian_tresholding`` --
    confirm this inversion is intended.

    Parameters
    ----------
    image : PIL.Image.Image or array-like
        Image to threshold; per the original author's note it must be
        grayscale (``convolve2d`` requires a 2-D array).
    max_value : int, default 255
        Value written to non-zero output pixels (previously accepted but
        ignored; 255 was always used). The result is cast to ``uint8``.
    block_size : int, default 7
        Side length of the Gaussian kernel.
    C : float, default 0
        Constant subtracted from the local mean before comparison.
    std : float, default 1
        Standard deviation of the Gaussian kernel.

    Returns
    -------
    PIL.Image.Image
        8-bit image containing only 0 and ``max_value``.
    """
    image_np = np.array(image)

    kernel = gaussian_kernel(block_size, std=std)

    # The kernel is floating point, so the convolution result and the
    # difference below are floating point as well.
    image_convolved_np = scipy.signal.convolve2d(image_np, kernel, mode='same', boundary='symm')
    image_result_np = image_convolved_np - image_np - C

    binary_np = np.where(image_result_np < 0, 0, max_value).astype(np.uint8)
    return Image.fromarray(binary_np)


# https://www.mathworks.com/matlabcentral/fileexchange/8647-local-adaptive-thresholding
def adaptive_mean_thresholding_in(image, max_value=255, block_size=7, C=0):
    """Binarise a grayscale image against an unweighted local mean.

    For each pixel the mean of its ``block_size`` x ``block_size``
    neighbourhood is computed by 2-D convolution with a uniform kernel
    (symmetric border handling). A pixel becomes 0 when
    ``local_mean - pixel - C < 0`` and ``max_value`` otherwise.

    NOTE(review): with this rule, pixels brighter than ``local_mean - C``
    come out as 0, which is the opposite polarity of OpenCV's
    ``THRESH_BINARY`` -- confirm this inversion is intended.

    Parameters
    ----------
    image : PIL.Image.Image or array-like
        Image to threshold; per the original author's note it must be
        grayscale (``convolve2d`` requires a 2-D array).
    max_value : int, default 255
        Value written to non-zero output pixels (previously accepted but
        ignored; 255 was always used). The result is cast to ``uint8``.
    block_size : int, default 7
        Side length of the averaging window.
    C : float, default 0
        Constant subtracted from the local mean before comparison.

    Returns
    -------
    PIL.Image.Image
        8-bit image containing only 0 and ``max_value``.
    """
    image_np = np.array(image)

    # Uniform kernel whose entries sum to 1, i.e. a box (mean) filter.
    kernel = np.ones((block_size, block_size)) / (block_size ** 2)
    image_convolved_np = scipy.signal.convolve2d(image_np, kernel, mode='same', boundary='symm')
    image_result_np = image_convolved_np - image_np - C

    binary_np = np.where(image_result_np < 0, 0, max_value).astype(np.uint8)
    return Image.fromarray(binary_np)

In [131]:
# Sanity check: show the 3x3 kernel for std=1.
print(gaussian_kernel(kernel_size=3, std=1))

[[0.07511361 0.1238414  0.07511361]
 [0.1238414  0.20417996 0.1238414 ]
 [0.07511361 0.1238414  0.07511361]]


# Adaptive Gaussian

In [137]:
# OCR after OpenCV adaptive Gaussian thresholding; prints the recognised text and its score.
for idx, image_name in enumerate(images):
    image_path = os.path.join(image_folder, image_name)
    image = Image.open(image_path).convert("RGB")

    text_path = os.path.join(text_folder, texts[idx])
    with open(text_path, 'r') as f:
        base_text = f.read()

    image_adaptive_gaussian = adaptive_gaussian_tresholding(image)
    result_adaptive_gaussian = pytesseract.image_to_string(image_adaptive_gaussian)
    print(result_adaptive_gaussian)

    print("Adaptive gaussian:")
    evaluate(result_adaptive_gaussian, base_text)

Parking: You may park anywhere on the campus where there are no signs prohibiting par-
king. Keep in mind the carpool hours and park accordingly so you do.not get blocked in the
afternoon

Under School Age Children:While we love the younger children, it can be disruptive and
inappropriate to have them on campus during school hours.. There may be special times
that they may be invited or can accompany a parent volunteer, but otherwise we ask that
you adhere to our _ policy for the benefit of the students and staff.

Adaptive gaussian:
0.98935
Sonnet for Lena

 

Ordear Lena, your benuiy ia so vas
be ib

  
   

 

Te is bared senued ines bee db
L thought the eitiee workl Tweed bi

last Fiest whon [trical tao
T funtisd it ser checks bel

  
    
 

    
 
   

iu Ctrays fend aad the ye
hile thes sethacks ge
rd tea With liaeks fore or there

‘Thir
Juul

Tani

 

Anite seven:

 

 

Adaptive gaussian:
0.12692


### Custom (NumPy/SciPy) implementation of adaptive Gaussian thresholding

In [135]:
# OCR after the custom adaptive Gaussian threshold (7x7 kernel, std=3, C=8) on grayscale input.
for idx, image_name in enumerate(images):
    image_path = os.path.join(image_folder, image_name)
    image = Image.open(image_path).convert("L")

    text_path = os.path.join(text_folder, texts[idx])
    with open(text_path, 'r') as f:
        base_text = f.read()

    image_th = adaptive_gaussian_thresholding_in(image, block_size=7, std=3, C=8)
    image_th.show()
    result_th = pytesseract.image_to_string(image_th)

    evaluate(result_th, base_text)

0.99029
0.05313


In [143]:
# Parameters fine-tuning
# Exhaustive grid search over the custom adaptive Gaussian threshold:
# C in 0..9, block_size in {3, 5, 7, 9, 11}, std in {1, 2}.
# The best score and its parameters are tracked per image, so every result
# list is sized from `images` instead of being hard-coded to two entries.
accuracy = [0] * len(images)
block_size_optimum = [0] * len(images)
std_optimum = [0] * len(images)
C_optimum = [0] * len(images)
for idx, image_name in enumerate(images):
    image = Image.open(os.path.join(image_folder, image_name))
    image = image.convert("L")

    with open(os.path.join(text_folder, texts[idx]), 'r') as f:
        base_text = f.read()

    for C in range(0,10):
        for block_size in range(3,13,2):
            for std in range(1,3):
                image_th = adaptive_gaussian_thresholding_in(image, block_size=block_size,std=std,C=C)
                result_th = pytesseract.image_to_string(image_th)
                score = evaluate(result_th, base_text,False)
                # Strict comparison: on ties the first parameter set found is kept.
                if accuracy[idx] < score:
                    print(f"Found better accuracy of {score} for image {image_name} with parameters {block_size} {std} {C}")
                    accuracy[idx] = score
                    block_size_optimum[idx] = block_size
                    std_optimum[idx] = std
                    C_optimum[idx] = C
print(accuracy)
print(block_size_optimum)
print(std_optimum)
print(C_optimum)

Found better accuracy of 0.0037593984962406013 for image sample01.png with parameters 3 1 0
Found better accuracy of 0.007366482504604052 for image sample01.png with parameters 7 2 0
Found better accuracy of 0.017152658662092625 for image sample01.png with parameters 9 1 0
Found better accuracy of 0.3133004926108374 for image sample01.png with parameters 3 1 2
Found better accuracy of 0.3192389006342495 for image sample01.png with parameters 3 2 2
Found better accuracy of 0.6883365200764818 for image sample01.png with parameters 3 1 3
Found better accuracy of 0.8460076045627376 for image sample01.png with parameters 7 1 3
Found better accuracy of 0.8525214081826832 for image sample01.png with parameters 9 1 3
Found better accuracy of 0.9198473282442748 for image sample01.png with parameters 5 2 4
Found better accuracy of 0.9244186046511628 for image sample01.png with parameters 5 1 5
Found better accuracy of 0.9282945736434108 for image sample01.png with parameters 7 1 5
Found better a

In [10]:
def apply_gaussian_blur(image_pil, kernel_size=(5, 5)):
    """Blur an RGB PIL image with OpenCV's Gaussian filter.

    Parameters
    ----------
    image_pil : PIL.Image.Image
        Input image; it is converted with ``cv.COLOR_RGB2BGR``, so it is
        expected to be in RGB mode.
    kernel_size : tuple of int, default (5, 5)
        Kernel size passed to ``cv.GaussianBlur``. Sigma is passed as 0.

    Returns
    -------
    PIL.Image.Image
        The blurred image, converted back to RGB channel order.
    """
    bgr = cv.cvtColor(numpy.array(image_pil), cv.COLOR_RGB2BGR)
    blurred_bgr = cv.GaussianBlur(bgr, kernel_size, 0)
    blurred_rgb = cv.cvtColor(blurred_bgr, cv.COLOR_BGR2RGB)
    return Image.fromarray(blurred_rgb)


# Gaussian Blur + Adaptive Gaussian Thresholding

In [11]:
# Sweep odd blur kernel sizes 3..9, then apply adaptive Gaussian thresholding and score the OCR.
for kernel_size in range(3,11,2):
    for idx, image_name in enumerate(images):
        image_path = os.path.join(image_folder, image_name)
        image = Image.open(image_path).convert("RGB")

        text_path = os.path.join(text_folder, texts[idx])
        with open(text_path, 'r') as f:
            base_text = f.read()

        image_gaussian_blur = apply_gaussian_blur(image,(kernel_size, kernel_size))
        image_adaptive_gaussian = adaptive_gaussian_tresholding(image_gaussian_blur)
        result_adaptive_gaussian = pytesseract.image_to_string(image_adaptive_gaussian)
        score = evaluate(result_adaptive_gaussian, base_text, print_score=False)
        print(f"Gaussian blur ({kernel_size},{kernel_size}) + Adaptive gaussian for {image_name} score: {score:.5f}")


Gaussian blur (3,3) + Adaptive gaussian for sample01.png score: 0.00000
Gaussian blur (3,3) + Adaptive gaussian for sample02.png score: 0.35653
Gaussian blur (5,5) + Adaptive gaussian for sample01.png score: 0.87978
Gaussian blur (5,5) + Adaptive gaussian for sample02.png score: 0.46845
Gaussian blur (7,7) + Adaptive gaussian for sample01.png score: 0.92692
Gaussian blur (7,7) + Adaptive gaussian for sample02.png score: 0.20173
Gaussian blur (9,9) + Adaptive gaussian for sample01.png score: 0.46169
Gaussian blur (9,9) + Adaptive gaussian for sample02.png score: 0.14229


In [63]:
# Re-run kernel sizes 5 and 7, this time showing the binarised image and the recognised text.
for kernel_size in range(5,8,2):
    for idx, image_name in enumerate(images):
        image_path = os.path.join(image_folder, image_name)
        image = Image.open(image_path).convert("RGB")

        text_path = os.path.join(text_folder, texts[idx])
        with open(text_path, 'r') as f:
            base_text = f.read()

        image_gaussian_blur = apply_gaussian_blur(image,(kernel_size, kernel_size))
        image_adaptive_gaussian = adaptive_gaussian_tresholding(image_gaussian_blur)
        image_adaptive_gaussian.show()
        result_adaptive_gaussian = pytesseract.image_to_string(image_adaptive_gaussian)
        score = evaluate(result_adaptive_gaussian, base_text, print_score=False)
        print(f"Gaussian blur ({kernel_size},{kernel_size}) + Adaptive gaussian for {image_name} score: {score:.5f}")
        print(result_adaptive_gaussian)

Gaussian blur (5,5) + Adaptive gaussian for sample01.png score: 0.87978
 

Parking: You may park anywhere on the campus where there are no ‘signs prohibiting par
king. Keep in mind the carpool hours and park accordingly so you do not get blocked In the ©
. afternoon te tet get eae ULL ese Soy

 

    

Under School Age Children:While we love the younger children, it can be disruptive and
inappropriate to have them on campus during ‘school hours.. There may be special times .:.
that they may be invited or can accompany a parent volunteer, but otherwise we ask that -
you adhere to our policy for the benefit of the students and staff; 0. - eet

 

   

Gaussian blur (5,5) + Adaptive gaussian for sample02.png score: 0.46845
Sonnet for Lena

O dear Lena, your beatty fa sa vast

Tels hard sometiines to describe [t fant.

I thought the entire world [ would impress
If only your portrait [ could compress.

Alas! First when I tried to tise VQ

1 found that your checks belong te only you.
Your s

In [67]:
def bilateral_filter(image_pil, d=9, sigma_color=100, sigma_space=100):
    """Smooth an RGB PIL image with OpenCV's bilateral filter.

    Parameters
    ----------
    image_pil : PIL.Image.Image
        Input image; it is converted with ``cv.COLOR_RGB2BGR``, so it is
        expected to be in RGB mode.
    d : int, default 9
        Passed to ``cv.bilateralFilter`` as ``d``.
    sigma_color : float, default 100
        Passed to ``cv.bilateralFilter`` as ``sigmaColor``.
    sigma_space : float, default 100
        Passed to ``cv.bilateralFilter`` as ``sigmaSpace``.

    Returns
    -------
    PIL.Image.Image
        The filtered image, converted back to RGB channel order.
    """
    bgr = cv.cvtColor(numpy.array(image_pil), cv.COLOR_RGB2BGR)
    filtered_bgr = cv.bilateralFilter(bgr, d=d, sigmaColor=sigma_color, sigmaSpace=sigma_space)
    filtered_rgb = cv.cvtColor(filtered_bgr, cv.COLOR_BGR2RGB)
    return Image.fromarray(filtered_rgb)

# Bilateral Filter + Adaptive Gaussian Thresholding


In [74]:
# Coarse grid: d in {5, 7, 9, 11}, sigma in {50, 100, 150, 200} (same value for colour and space).
for d in range(5,12,2):
    for sigma in range(50,210,50):
        for idx, image_name in enumerate(images):
            image_path = os.path.join(image_folder, image_name)
            image = Image.open(image_path).convert("RGB")

            text_path = os.path.join(text_folder, texts[idx])
            with open(text_path, 'r') as f:
                base_text = f.read()

            image_bilateral_filter = bilateral_filter(image,d=d, sigma_color=sigma, sigma_space=sigma)
            image_adaptive_gaussian = adaptive_gaussian_tresholding(image_bilateral_filter)
            result_adaptive_gaussian = pytesseract.image_to_string(image_adaptive_gaussian)
            score = evaluate(result_adaptive_gaussian, base_text, print_score=False)
            print(f"Bilateral Filter d={d} sigma={sigma} with AGT for {image_name} score: {score:.5f}")

Bilateral Filter d=5 sigma=50 with ADT for sample01.png score: 0.58845
Bilateral Filter d=5 sigma=50 with ADT for sample02.png score: 0.08819
Bilateral Filter d=5 sigma=100 with ADT for sample01.png score: 0.56747
Bilateral Filter d=5 sigma=100 with ADT for sample02.png score: 0.34597
Bilateral Filter d=5 sigma=150 with ADT for sample01.png score: 0.56825
Bilateral Filter d=5 sigma=150 with ADT for sample02.png score: 0.26224
Bilateral Filter d=5 sigma=200 with ADT for sample01.png score: 0.51197
Bilateral Filter d=5 sigma=200 with ADT for sample02.png score: 0.27738
Bilateral Filter d=7 sigma=50 with ADT for sample01.png score: 0.98651
Bilateral Filter d=7 sigma=50 with ADT for sample02.png score: 0.31445
Bilateral Filter d=7 sigma=100 with ADT for sample01.png score: 0.98554
Bilateral Filter d=7 sigma=100 with ADT for sample02.png score: 0.27208
Bilateral Filter d=7 sigma=150 with ADT for sample01.png score: 0.78454
Bilateral Filter d=7 sigma=150 with ADT for sample02.png score: 0.21

In [79]:
# Finer grid (d in 3..6, sigma in 10..150 step 10); sample01.png is skipped.
for d in range(3,7,1):
    for sigma in range(10,160,10):
        for idx, image_name in enumerate(images):
            if image_name=="sample01.png":
                continue

            image_path = os.path.join(image_folder, image_name)
            image = Image.open(image_path).convert("RGB")

            text_path = os.path.join(text_folder, texts[idx])
            with open(text_path, 'r') as f:
                base_text = f.read()

            image_bilateral_filter = bilateral_filter(image,d=d, sigma_color=sigma, sigma_space=sigma)
            image_adaptive_gaussian = adaptive_gaussian_tresholding(image_bilateral_filter)
            result_adaptive_gaussian = pytesseract.image_to_string(image_adaptive_gaussian)
            score = evaluate(result_adaptive_gaussian, base_text, print_score=False)
            print(f"Bilateral Filter d={d} sigma={sigma} with AGT for {image_name} score: {score:.5f}")

Bilateral Filter d=3 sigma=10 with AGT for sample02.png score: 0.03360
Bilateral Filter d=3 sigma=20 with AGT for sample02.png score: 0.01860
Bilateral Filter d=3 sigma=30 with AGT for sample02.png score: 0.09091
Bilateral Filter d=3 sigma=40 with AGT for sample02.png score: 0.05786
Bilateral Filter d=3 sigma=50 with AGT for sample02.png score: 0.05901
Bilateral Filter d=3 sigma=60 with AGT for sample02.png score: 0.05919
Bilateral Filter d=3 sigma=70 with AGT for sample02.png score: 0.18182
Bilateral Filter d=3 sigma=80 with AGT for sample02.png score: 0.04219
Bilateral Filter d=3 sigma=90 with AGT for sample02.png score: 0.42155
Bilateral Filter d=3 sigma=100 with AGT for sample02.png score: 0.03897
Bilateral Filter d=3 sigma=110 with AGT for sample02.png score: 0.04514
Bilateral Filter d=3 sigma=120 with AGT for sample02.png score: 0.32682
Bilateral Filter d=3 sigma=130 with AGT for sample02.png score: 0.33437
Bilateral Filter d=3 sigma=140 with AGT for sample02.png score: 0.32631
B

In [81]:
# Final run with hand-picked bilateral parameters per image: (d, sigma) = (9, 100) for
# sample01.png and (3, 90) for every other image.
for idx, image_name in enumerate(images):
    d, sigma = (9, 100) if image_name=="sample01.png" else (3, 90)

    image_path = os.path.join(image_folder, image_name)
    image = Image.open(image_path).convert("RGB")

    text_path = os.path.join(text_folder, texts[idx])
    with open(text_path, 'r') as f:
        base_text = f.read()

    image_bilateral_filter = bilateral_filter(image,d=d, sigma_color=sigma, sigma_space=sigma)
    image_adaptive_gaussian = adaptive_gaussian_tresholding(image_bilateral_filter)
    image_adaptive_gaussian.show()
    result_adaptive_gaussian = pytesseract.image_to_string(image_adaptive_gaussian)
    score = evaluate(result_adaptive_gaussian, base_text, print_score=False)
    print(f"Bilateral Filter d={d} sigma={sigma} with AGT for {image_name} score: {score:.5f}")
    print(result_adaptive_gaussian)

Bilateral Filter d=9 sigma=100 with AGT for sample01.png score: 0.98643
Parking: You may park anywhere on the campus where there are no signs prohibiting par-
king. Keep in mind the carpool hours and park accordingly so you do not get blocked in the
afternoon

Under School Age Children:While we love the younger children, it can be disruptve and
inappropriate to have them on campus during school hours. There may be special tenes
that they may be invited or can accompany a parent volunteer, but otherwise we ask that
you adhere to our —_ policy for the benefit of the students and staff.

Bilateral Filter d=3 sigma=90 with AGT for sample02.png score: 0.42155
é ‘Sonnet for Lena,

° dear Lena, your  brcraily inao vot
1 Is hard sometitnea to describe It fast.
[thought the entire world | would impress

~-L Ionly your portrait I could compress.

_ Alns! Firat when T tried to use VQ

_ Lfound that your cheeks belong to only you.
Your silky hair contains m thonsand fines
Hard ta match with sums 