# Engines to test:
### 1. Tesseract
### 2. PaddleOCR
### 3. EasyOCR

## Dataset:

https://guillaumejaume.github.io/FUNSD/

In [11]:
import os
import json
import pandas as pd
import cv2
import pytesseract
import numpy as np
import time
from pdf2image import convert_from_path
from skimage.filters import threshold_niblack, threshold_sauvola
from Levenshtein import distance as levenshtein_distance
from multiprocessing import Pool, cpu_count


In [6]:
def get_path(default_path, prompt):
    path = input(prompt + f" (default: {default_path}): ")
    return path if path else default_path

annotations_folder = get_path('annotations', "Enter the path to the annotations folder")
images_folder = get_path('images', "Enter the path to the images folder")

if not os.path.exists(annotations_folder):
    raise FileNotFoundError(f"The specified annotations folder does not exist: {annotations_folder}")
if not os.path.exists(images_folder):
    raise FileNotFoundError(f"The specified images folder does not exist: {images_folder}")

data = []

for filename in os.listdir(annotations_folder):
    if filename.endswith('.json'):
        with open(os.path.join(annotations_folder, filename), 'r') as file:
            annotation_data = json.load(file)
            image_filename = filename.split('.')[0] + '.png'
            text = ""
            for item in annotation_data['form']:
                text += item['text'] + " "

            data.append({'filename': image_filename, 'text': text.strip()})

df = pd.DataFrame(data)

print(df.head())

            filename                                               text
0       82092117.png  TO: DATE: 3 Fax: NOTE: 82092117 614 -466 -5087...
1  82200067_0069.png  TO: FROM: x SUBJECT:   DIVISION: DIVISION: DIV...
2  82250337_0338.png  TO: FROM: DATE: MANUFACTURER: BRAND: Oct. Dec....
3       82251504.png  17 cc: : From: Area: Region: 5 X Chains: Indep...
4  82252956_2958.png  AUG 4 SEP 15 JUN 23 MAY 12 REGION: DIVISION: 7...


#### Utility functions for preprocessing

In [18]:
def apply_otsu_binarization(image):
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, bin_image = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return bin_image

def apply_niblack_binarization(image, window_size, k=-0.2):
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    bin_image = threshold_niblack(gray, window_size=window_size, k=k)
    return (gray > bin_image).astype(np.uint8) * 255

def apply_sauvola_binarization(image, window_size):
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    bin_image = threshold_sauvola(gray, window_size=window_size)
    return (gray > bin_image).astype(np.uint8) * 255

def straighten_image(image):
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 50, 150, apertureSize=3)
    
    lines = cv2.HoughLines(edges, 1, np.pi / 180, 200)
    if lines is not None:
        angles = []
        for rho, theta in lines[:, 0]:
            angle = (theta * 180 / np.pi) - 90
            angles.append(angle)
        median_angle = np.median(angles)
        (h, w) = image.shape[:2]
        M = cv2.getRotationMatrix2D((w // 2, h // 2), median_angle, 1.0)
        straightened = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
        return straightened
    return image  

def scale_image(image):
    return cv2.resize(image, None, fx=1.5, fy=1.5, interpolation=cv2.INTER_LINEAR)

def apply_sauvola_binarization(image, window_size):
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    bin_image = threshold_sauvola(gray, window_size=window_size)
    return (gray > bin_image).astype(np.uint8) * 255

def clean_borders(image):
    # Ensure the image is grayscale
    if len(image.shape) == 3:
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    else:
        gray = image
    # Apply threshold
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    # Find contours
    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # Draw filled contours to remove border
    for contour in contours:
        cv2.drawContours(binary, [contour], 0, 0, -1)
    return cv2.bitwise_not(binary)

def remove_noise(image):
    return cv2.medianBlur(image, 5)

### Tesseract


In [8]:
%pip install pytesseract


Note: you may need to restart the kernel to use updated packages.


In [16]:
def perform_ocr_tesseract(image):
    text = pytesseract.image_to_string(image, config='--psm 6')
    return text.strip()

In [9]:
window_sizes = [11, 21, 31, 41, 51, 61, 71, 81]
results = []

def perform_ocr_tesseract(image):
    text = pytesseract.image_to_string(image, config='--psm 6')
    return text.strip()

def evaluate_ocr(df, preprocessing_func, window_size):
    df['ocr_text'] = df['filename'].apply(lambda x: perform_ocr_tesseract(preprocessing_func(cv2.imread(os.path.join(images_folder, x)), window_size)))
    df['levenshtein'] = df.apply(lambda row: levenshtein_distance(row['text'], row['ocr_text']), axis=1)
    avg_distance = df['levenshtein'].mean()
    return avg_distance

# Evaluate each preprocessing method with various window sizes
for size in window_sizes:
    print(f"Evaluating with window size: {size}")
    
    niblack_distance = evaluate_ocr(df.copy(), apply_niblack_binarization, size)
    results.append(('niblack', size, niblack_distance))
    
    sauvola_distance = evaluate_ocr(df.copy(), apply_sauvola_binarization, size)
    results.append(('sauvola', size, sauvola_distance))

# Identify the best method and window size
best_result = min(results, key=lambda x: x[2])
print(f"Best method: {best_result[0]}, Best window size: {best_result[1]}, Levenshtein distance: {best_result[2]}")

# Optional: save results to a CSV file for further analysis
results_df = pd.DataFrame(results, columns=['method', 'window_size', 'levenshtein_distance'])
results_df.to_csv('ocr_preprocessing_tesseract_results.csv', index=False)


Evaluating with window size: 11
Evaluating with window size: 21
Evaluating with window size: 31
Evaluating with window size: 41
Evaluating with window size: 51
Evaluating with window size: 61
Evaluating with window size: 71
Evaluating with window size: 81
Best method: sauvola, Best window size: 31, Levenshtein distance: 608.5


In [19]:
def perform_ocr_with_pipeline(image, steps, window_size):
    if 'straightening' in steps:
        image = straighten_image(image)
    if 'scaling' in steps:
        image = scale_image(image)
    if 'binarization' in steps:
        image = apply_sauvola_binarization(image, window_size)
    if 'border_cleaning' in steps:
        image = clean_borders(image)
    if 'noise_removal' in steps:
        image = remove_noise(image)
    return perform_ocr_tesseract(image)

def evaluate_ablation(args):
    filename, text, steps, window_size = args
    image = cv2.imread(os.path.join(images_folder, filename))
    ocr_text = perform_ocr_with_pipeline(image, steps, window_size)
    return levenshtein_distance(text, ocr_text)

full_pipeline_steps = ['straightening', 'scaling', 'binarization', 'border_cleaning', 'noise_removal']
results_ablation = []

window_size = 31  # Use the best window size from Phase 1
args_list = [(row['filename'], row['text'], full_pipeline_steps, window_size) for _, row in df.iterrows()]

with Pool(cpu_count()) as p:
    distances_full_pipeline = p.map(evaluate_ablation, args_list)

avg_distance_full_pipeline = np.mean(distances_full_pipeline)
results_ablation.append(('full_pipeline', avg_distance_full_pipeline))

for step in full_pipeline_steps:
    steps_without_step = [s for s in full_pipeline_steps if s != step]
    args_list = [(row['filename'], row['text'], steps_without_step, window_size) for _, row in df.iterrows()]
    with Pool(cpu_count()) as p:
        distances_ablation = p.map(evaluate_ablation, args_list)
    avg_distance_ablation = np.mean(distances_ablation)
    results_ablation.append((f'without_{step}', avg_distance_ablation))

results_ablation_df = pd.DataFrame(results_ablation, columns=['ablation_scenario', 'levenshtein_distance'])

print("Ablation Study Results:")
for scenario, distance in results_ablation:
    print(f"{scenario}: {distance}")


Ablation Study Results:
full_pipeline: 1056.32
without_straightening: 1056.32
without_scaling: 1056.32
without_binarization: 1056.32
without_border_cleaning: 713.88
without_noise_removal: 1056.32
