In [58]:
import os
from PIL import Image, ImageFilter
import numpy as np
import random

# Binarization
def binarize_image(image):
    img = image.convert('L')  # Convert image to grayscale
    # Binarize using a threshold
    threshold = 128
    binary_img = img.point(lambda p: 255 if p > threshold else 0)
    return binary_img

# Deskewing
def deskew_image(image):
    # Convert to NumPy array
    img_array = np.array(image)

    # Find the coordinates of the black (non-background) pixels
    coords = np.column_stack(np.where(img_array == 0))
    if len(coords) == 0:
        return image  # If there are no black pixels, return the original image

    # Calculate the angle of skew
    angle = np.rad2deg(np.arctan2(coords[:, 1].mean() - img_array.shape[0] / 2, coords[:, 0].mean() - img_array.shape[1] / 2))

    if angle < -45:
        angle = -(90 + angle)
    else:
        angle = -angle

    # Rotate the image
    pil_image = image.rotate(angle, expand=True)  # Use PIL to rotate
    return pil_image

# Noise removal
def remove_noise(image):
    # Apply a Gaussian blur to reduce noise
    denoised_image = image.filter(ImageFilter.GaussianBlur(radius=1))
    return denoised_image

# Preprocess an image
def preprocess_image(img):
    if isinstance(img, str):  # If the input is a file path, open the image
        img = Image.open(img)

    # Binarize the image (assuming this function is defined)
    binary_img = binarize_image(img)

    # Deskew the image (assuming this function is defined)
    deskewed_img = deskew_image(binary_img)

    # Resize the image to a fixed size (for example 1024x1024)
    resized_img = deskewed_img.resize((1024, 1024))

    return resized_img


In [59]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [60]:
import os
from PIL import Image, ImageEnhance, ImageFilter
import numpy as np
import random

# Load and preprocess data
def load_washington_data(images_folder, transcription_file):
    images = []
    transcriptions = []

    # Load images as PIL Image objects
    for filename in os.listdir(images_folder):
        if filename.endswith('.png'):
            img_path = os.path.join(images_folder, filename)
            img = Image.open(img_path)  # Open image using PIL
            images.append(img)

    # Load transcriptions
    with open(transcription_file, 'r') as f:
        transcriptions = [line.strip() for line in f.readlines()]

    return images, transcriptions

import random
from PIL import Image, ImageEnhance
from torchvision import transforms

# Define the AdvancedAugmentation class
class AdvancedAugmentation:
    def __init__(self):
        self.transforms = transforms.Compose([
            transforms.RandomRotation(15),  # Rotate images randomly within a 15-degree range
            transforms.RandomHorizontalFlip(),
            transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.2),  # Randomly change brightness, contrast, saturation, and hue
            transforms.RandomAffine(translate=(0.1, 0.1), degrees=10, shear=10)  # Random affine transformations
        ])

    def __call__(self, img):
        return self.transforms(img)

# Function to augment images
def augment_images_with_labels(images, transcriptions):
    augmented_images = []
    augmented_transcriptions = []

    # Initialize the advanced augmentation
    augmentation = AdvancedAugmentation()

    for img, transcription in zip(images, transcriptions):
        augmented = augmentation(img)  # Apply the advanced augmentation to the image
        augmented_images.append(augmented)
        augmented_transcriptions.append(transcription)  # Keep the original transcription for augmented image

    return augmented_images, augmented_transcriptions



# Split the data into train, validation, and test sets
def split_data(images, transcriptions, train_ratio=0.7, val_ratio=0.15):
    total_size = len(images)
    indices = list(range(total_size))
    random.shuffle(indices)

    train_size = int(total_size * train_ratio)
    val_size = int(total_size * val_ratio)

    train_indices = indices[:train_size]
    val_indices = indices[train_size:train_size + val_size]
    test_indices = indices[train_size + val_size:]

    train_images = [images[i] for i in train_indices]
    train_transcriptions = [transcriptions[i] for i in train_indices]

    val_images = [images[i] for i in val_indices]
    val_transcriptions = [transcriptions[i] for i in val_indices]

    test_images = [images[i] for i in test_indices]
    test_transcriptions = [transcriptions[i] for i in test_indices]

    return (train_images, train_transcriptions), (val_images, val_transcriptions), (test_images, test_transcriptions)


In [61]:
import os
from PIL import Image
import numpy as np
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from torchvision import models
from sklearn.model_selection import train_test_split

# Main workflow
images_folder = '/content/drive/My Drive/washingtondb-v1.0/data/line_images_normalized'
transcription_file = '/content/drive/My Drive/washingtondb-v1.0/ground_truth/transcription.txt'

# Load data
images, transcriptions = load_washington_data(images_folder, transcription_file)

# Preprocess images
preprocessed_images = [preprocess_image(img) for img in images]

# Augment data
augmented_images, augmented_transcriptions = augment_images_with_labels(preprocessed_images, transcriptions)

# Combine original and augmented data
all_images = preprocessed_images + augmented_images
all_transcriptions = transcriptions + augmented_transcriptions  # Keep the same length of transcriptions

# Split data
(train_images, train_transcriptions), (val_images, val_transcriptions), (test_images, test_transcriptions) = split_data(all_images, all_transcriptions)

In [62]:
print(len(all_images))

1312


In [63]:
!pip install pytesseract




In [64]:

!apt-get update
!apt-get install tesseract-ocr

Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Get:7 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Fetched 129 kB in 1s (114 kB/s)
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done

In [65]:
import pytesseract
custom_config = r'--oem 3 --psm 6'
pytesseract.pytesseract.tesseract_cmd = r'tesseract'


In [66]:
!pip install jiwer



In [87]:
# Function to extract text from the image line by line

# Function to clean the extracted text
def preprocess_extracted_text(extracted_text):
    extracted_text = extracted_text.replace('|', ' ').replace('-', '')
    return extracted_text.strip()

In [91]:
import pytesseract
from PIL import Image
from jiwer import wer

# Assuming all_images contains PIL.Image objects, not file names
# and ground_truth_lines contains the corresponding ground truth text

# Function to clean and preprocess ground truth lines
def preprocess_ground_truth(line):
    # Remove line numbers (assuming they are at the start and followed by a space)
    line = line.split(' ', 1)[-1] if ' ' in line else line
    # Replace "|" with spaces
    line = line.replace('|', ' ')
    # Remove dashes
    line = line.replace('-', '')
    return line.strip()

# Process each image in memory and calculate WER
for image, ground_truth in zip(all_images, all_transcriptions):
    # Ensure `image` is a PIL.Image object
    # Perform OCR on the image in memory using pytesseract
    extracted_text = pytesseract.image_to_string(image, config=custom_config)

    # Preprocess the extracted text to ensure it's on one line
    extracted_text = ' '.join(extracted_text.splitlines()).strip()

    # Preprocess the ground truth
    ground_truth = preprocess_ground_truth(ground_truth)

    # Calculate and print WER
    error_rate = wer(ground_truth, extracted_text)
    print(f"WER: {error_rate:.2%}")
    print(f"Extracted: {extracted_text}")
    print(f"Ground Truth: {ground_truth}\n")




[1;30;43mStreaming output truncated to the last 5000 lines.[0m
WER: 211.11%
Extracted: a 8 . ye ‘ gd  et 5 a | AS ay oa a AS S a ‘ A
Ground Truth: is sufficient to Break the best Officer that ever

WER: 100.00%
Extracted: cv ag re
Ground Truth: bore a Commis_ssions_pt

WER: 316.67%
Extracted: AS t AY cs ‘Sy ay nl e a eS 9 a ™~ ry S AS AP) \ at
Ground Truth: s_2s_7s_2s_pt Letterss_cm Orders and Instructionss_pt October

WER: 200.00%
Extracted: Ss a a \ e ) & 4 NK Co of ¥ “ ' ~ ’ ee
Ground Truth: s_2s_8ths_pt To Captain Peter Hoggs_cm of the Virginia

WER: 400.00%
Extracted: {/ 3 ry ra
Ground Truth: Regiments_pt

WER: 100.00%
Extracted: 
Ground Truth: Sirs_cm

WER: 162.50%
Extracted: 4 | a be y y 3 \ ' ' y roy 3
Ground Truth: I received yours of the s_6ths_pt of Octos_mi

WER: 114.29%
Extracted: re kg Ng NJ . S re yS
Ground Truth: bers_cm inclosing the Returns of your Companys_cm

WER: 114.29%
Extracted: r Y Cd rs aX) ee "4 eo
Ground Truth: only this days_pt Ferguson was sent after

WE