# OCR for Handwritten Forms

In [1]:
# Path of the input form image; the cells below read this name to locate
# the file they process.
original_image = "images/image_1.jpg"

### Image Preprocessing

In [None]:
# Import os (output-directory handling) and the OpenCV library
import os

import cv2

# Load the image. cv2.imread does not raise when the file is missing or
# unreadable -- it returns None -- so fail here with a clear message instead
# of letting cvtColor raise an opaque error on the next step.
image = cv2.imread(original_image)
if image is None:
    raise FileNotFoundError(f"Could not read image: {original_image!r}")

# Convert to grayscale (adaptiveThreshold expects a single-channel 8-bit image)
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# Apply adaptive thresholding: each pixel is compared against a
# Gaussian-weighted mean of its 11x11 neighbourhood minus the constant 2
thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                               cv2.THRESH_BINARY, 11, 2)

# Save the processed image (optional)
# NOTE(review): the OCR cells visible in this notebook read `original_image`,
# not this thresholded file -- confirm whether that is intended.
processed_image = 'processed_images/processed_image_1.jpg'

# Make sure the target directory exists; cv2.imwrite reports failure through
# its return value rather than an exception, so check it explicitly.
os.makedirs(os.path.dirname(processed_image), exist_ok=True)
if not cv2.imwrite(processed_image, thresh):
    raise OSError(f"Could not write processed image: {processed_image!r}")

### Extract text using EasyOCR library

In [None]:
# Import os (output-directory handling) and the EasyOCR library
import os

import easyocr

# Initialize reader object for English text
reader = easyocr.Reader(['en'])

# Extract text from original image; detail=0 is passed so that `result`
# can be joined directly as a list of strings
result = reader.readtext(original_image, detail=0)

# Join the extracted text into a single string, one detected fragment per line
extracted_text = '\n'.join(result)

# Check output
print(extracted_text)

# Save the extracted text to a file
output_text = 'output_text/easyocr_output_text_1.txt'

# Create the output directory first so the write cannot fail with
# FileNotFoundError after the OCR work has already been done
os.makedirs(os.path.dirname(output_text), exist_ok=True)
with open(output_text, 'w') as file:
    file.write(extracted_text)

### Tokenize output

In [None]:
# spaCy provides the tokenizer used in this cell
import spacy

# Small English pipeline
nlp = spacy.load('en_core_web_sm')

# Pull the previously saved OCR text back in from disk.
# `output_text` is whatever path the most recently executed OCR cell assigned.
with open(output_text, 'r') as file:
    text = file.read()

# Run the pipeline over the text and keep each token's surface string
doc = nlp(text)
tokens = [tok.text for tok in doc]

# Show the resulting token list
print(tokens)

### Extract text using Tesseract library

In [None]:
# Import os (output-directory handling), pytesseract and Pillow
import os

import pytesseract
from PIL import Image

# Open the image
img = Image.open(original_image)

# Tesseract page segmentation mode 11
custom_config = r'--psm 11'

# Extract text with the specified page segmentation mode
extracted_text = pytesseract.image_to_string(img, config=custom_config)

# Print the extracted text
print(extracted_text)

# Save the extracted text to a file
# NOTE: this rebinds `extracted_text` and `output_text`, which the EasyOCR
# cell also assigns; cells that read those names see the most recent value.
output_text = 'output_text/tesseract_output_text_1.txt'

# Create the output directory if needed, and write as UTF-8 so characters
# outside the platform's default codec cannot raise UnicodeEncodeError.
os.makedirs(os.path.dirname(output_text), exist_ok=True)
with open(output_text, 'w', encoding='utf-8') as file:
    file.write(extracted_text)