# OCR for Handwritten Forms

In [1]:
# Select image to extract text from
original_image = 'images/image_2.png'

### Image Preprocessing

In [None]:
# Import OpenCV library
import cv2

# Load the image
image = cv2.imread(original_image)

# Convert to grayscale
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# Apply adaptive thresholding
thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                               cv2.THRESH_BINARY, 11, 2)

# Save the processed image (optional)
processed_image = 'processed_images/processed_image_1.jpg'
cv2.imwrite(processed_image, thresh)

### Extract Text using EasyOCR library

In [None]:
# Import EasyOCR library
import easyocr

# Initialize reader object
reader = easyocr.Reader(['en'])

# Extract text from original image
result = reader.readtext(original_image, detail=0)

# Join the extracted text into a single string
extracted_text = '\n'.join(result)

# Check output
print(extracted_text)

# Save the extracted text to a file
output_text = 'output_text/easyocr_output_text_1.txt'
with open(output_text, 'w') as file:
    file.write(extracted_text)

### Tokenize output

In [None]:
# Import SpaCy library
import spacy

# Load the English model
nlp = spacy.load('en_core_web_sm')

# Read the saved text file
with open(output_text, 'r') as file:
    text = file.read()

# Tokenize using spaCy
doc = nlp(text)
tokens = [token.text for token in doc]

print(tokens)

### Extract text using Tesseract library

In [None]:
import pytesseract
from PIL import Image

# Open the image
img = Image.open(original_image)

custom_config = r'--psm 11'

# Extract text with the specified page segmentation mode
extracted_text = pytesseract.image_to_string(img, config=custom_config)

# Print the extracted text
print(extracted_text)

# Save the extracted text to a file
output_text = 'output_text/tesseract_output_text_1.txt'

with open(output_text, 'w') as file:
    file.write(extracted_text)

### Region-based Extraction using Contour Detection

In [None]:
# Load the image
image = cv2.imread(original_image)

# Convert to grayscale
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# Apply adaptive thresholding to binarize the image
thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)

# Use OpenCV to detect contours (boxes/fields in the form)
contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

# Loop through contours to extract text from specific regions
for contour in contours:
    x, y, w, h = cv2.boundingRect(contour)
    
    # Crop the region of interest (ROI)
    roi = image[y:y+h, x:x+w]
    
    # Convert the ROI to a PIL image for Tesseract
    roi_pil = Image.fromarray(roi)

    # Use Tesseract with appropriate PSM
    custom_config = r'--psm 11'
    text = pytesseract.image_to_string(roi_pil, config=custom_config)

    # Print or save extracted text from each ROI
    print(text)

### Region-based Extraction + True Label Mapping

In [None]:
import matplotlib.pyplot as plt

# Load the form image
image = cv2.imread(original_image)

# Convert to grayscale for better thresholding
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# Apply thresholding to enhance the detection of black boxes
thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 11, 2)

# Detect edges to find the contours of the form boxes
edges = cv2.Canny(thresh, 30, 150)

# Find contours in the edges image
contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

# Define a list of labels corresponding to the form fields (manually)
field_labels = ["Date", "City", "State", "ZIP"]

# Initialize a dictionary to hold the extracted text and the associated labels
extracted_data = {}

# Sort contours by their position (optional, depends on the form structure)
contours = sorted(contours, key=lambda x: cv2.boundingRect(x)[1])  # Sort by the y-coordinate of each contour

# Loop through each contour and filter by size (exclude very large and very small contours)
for i, contour in enumerate(contours):
    x, y, w, h = cv2.boundingRect(contour)
    
    # Filter out contours that are too large or too small to be boxes
    if w < 50 or h < 20 or w > 500 or h > 200:  # Adjust these values based on your form's structure
        continue

    # Crop the region of interest (ROI) from the image
    roi = image[y:y+h, x:x+w]
    
    # Convert the ROI to a PIL image for Tesseract
    roi_pil = Image.fromarray(roi)
    
    # Use Tesseract to extract text from the ROI
    custom_config = r'--psm 11'
    extracted_text = pytesseract.image_to_string(roi_pil, config=custom_config)
    
    # Clean up the extracted text (optional)
    extracted_text = extracted_text.strip()

    # Map the extracted text to the corresponding label (assuming the contours align with the labels)
    label = field_labels[i % len(field_labels)]  # Avoid out-of-range errors
    extracted_data[label] = extracted_text

    # Optional: Draw the bounding box on the image for debugging
    cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2)

# Function to display the image using Matplotlib
def show_image(img, title="Image"):
    # Convert BGR (used by OpenCV) to RGB (used by Matplotlib)
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    plt.figure(figsize=(10, 10))
    plt.imshow(img_rgb)
    plt.title(title)
    plt.axis('off')  # Hide axis
    plt.show()

# Show the final image with detected regions (boxes)
show_image(image, "Detected Regions")

# Print the extracted data (you can also save this to a text file)
print(extracted_data)

### Region-based Text Extraction + Provided Co-ordinates

In [None]:
import json
import os

# Function to load true labels for a specific image
def load_true_labels(image_id, labels_file='true_labels.json'):
    with open(labels_file) as f:
        true_labels_data = json.load(f)
    return true_labels_data.get(image_id, {})

# Load the form image
image = cv2.imread(original_image)
image_id = os.path.splitext(os.path.basename(original_image))[0]

# Convert to grayscale for better thresholding
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# Load the coordinates and labels from the JSON file
with open('box_coordinates.json') as f:
    box_data = json.load(f)

# Load the true labels for this specific image
true_labels = load_true_labels(image_id)

# Initialize a dictionary to hold the extracted text
extracted_data = {}

# Loop through each box and extract the text
for box in box_data:
    label = box['label']
    x, y, w, h = box['x'], box['y'], box['w'], box['h']
    true_label = true_labels.get(label, "N/A")
    
    # Crop the region of interest (ROI) from the image
    roi = image[y:y+h, x:x+w]
    
    # Convert the ROI to a PIL image for Tesseract
    roi_pil = Image.fromarray(roi)
    
    # Use Tesseract to extract text from the ROI
    custom_config = r'--psm 6'
    extracted_text = pytesseract.image_to_string(roi_pil, config=custom_config)
    
    # Clean up the extracted text (optional)
    extracted_text = extracted_text.strip()
    
    # Map the extracted text to the corresponding label and true label
    extracted_data[label] = {
        'extracted_text': extracted_text,
        'true_label': true_label
    }

    # Optional: Draw the bounding box on the image for visualization
    cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2)

# Show the final image with bounding boxes
show_image(image, "Detected Regions")

# Print the extracted data
print(extracted_data)

# Save the extracted data as a JSON file
output_file = f"output_text/{image_id}_extracted.json"
with open(output_file, 'w') as f:
    json.dump(extracted_data, f, indent=4)

### Text Extraction with pre-trained TrOCR model

In [2]:
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image
import torch

In [3]:
# Load the pre-trained model for handwritten text
processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten')

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

VisionEncoderDecoderModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=False)
              (key): Linear(in_features=768, out_features=768, bias=False)
              (value): Linear(in_features=768, out_features=768, bias=False)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_fea

In [4]:
# Function to perform inference on an image
def predict_text(image_path):
    # Load and preprocess the image
    image = Image.open(image_path).convert("L")  # Convert to grayscale
    image = image.resize((image.width // 2, image.height // 2))

    # Add a new dimension to convert from (H, W) to (1, H, W)
    image = image.convert("RGB")  # Convert back to RGB for the model
    inputs = processor(images=image, return_tensors="pt").pixel_values.to(device)

    # Perform inference without tracking gradients
    with torch.no_grad():
        # Generate the text using max_new_tokens for better control over output length
        generated_ids = model.generate(inputs, max_new_tokens=100)  # Adjust max_new_tokens as needed

    # Decode the generated text, skipping special tokens
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    return generated_text

# Example usage
predicted_text = predict_text(original_image)
print("Predicted Text:", predicted_text)

Predicted Text: 0 1
