# Tesseract OCR Demo

This notebook demonstrates how to use Tesseract OCR for text extraction from images. We'll cover installation, basic usage, configuration options, and preprocessing techniques to improve OCR accuracy.

## 1. Install and Import Dependencies

Install pytesseract, then import the libraries used throughout this notebook: pytesseract for OCR, PIL/Pillow and cv2 (OpenCV) for image handling, NumPy for array manipulation, and matplotlib for display.

In [None]:
# Install required packages
# Note: Tesseract OCR engine must be installed on your system
# Ubuntu: sudo apt-get install tesseract-ocr
# macOS: brew install tesseract

# %pip install pytesseract pillow opencv-python matplotlib pandas

In [None]:
# Import required libraries
import pytesseract
from PIL import Image
import cv2
import numpy as np
import matplotlib.pyplot as plt

# Confirm the Tesseract engine is reachable. On failure, tell the reader what
# to install instead of aborting the notebook.
try:
    tesseract_version = pytesseract.get_tesseract_version()
except Exception as err:
    print(f"Tesseract not found. Please install it first: {err}")
else:
    print(f"Tesseract version: {tesseract_version}")

## 2. Load and Display an Image

Generate a small sample image containing known text, then load it using PIL or OpenCV and display it using matplotlib.

In [None]:
# Build a small synthetic image containing known text so the notebook is
# self-contained. In practice, you would load an existing image file instead.

import os
import tempfile

from PIL import Image, ImageDraw, ImageFont

# Blank white canvas to draw the text on
img_width, img_height = 600, 200
sample_image = Image.new('RGB', (img_width, img_height), color='white')
draw = ImageDraw.Draw(sample_image)

# Two lines of text; the embedded newline makes Pillow render a second line
sample_text = "Hello, Tesseract OCR!\nThis is a demo image."
try:
    # Prefer a scalable TrueType font at 32 pt (this path is a common Linux
    # location for DejaVu Sans; it may not exist on other systems)
    font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 32)
except OSError:
    # Font file missing: fall back to Pillow's built-in default font
    font = ImageFont.load_default()

draw.text((50, 50), sample_text, fill='black', font=font)

# Save into the platform's temporary directory rather than a hard-coded
# "/tmp", which does not exist on every operating system.
sample_image_path = os.path.join(tempfile.gettempdir(), "sample_ocr_image.png")
sample_image.save(sample_image_path)
print(f"Sample image saved to: {sample_image_path}")

In [None]:
# Open the saved sample with Pillow and report its basic properties
pil_image = Image.open(sample_image_path)
print(f"Image size: {pil_image.size}", f"Image mode: {pil_image.mode}", sep="\n")

# Render it through an explicit Figure/Axes pair
fig, ax = plt.subplots(figsize=(10, 4))
ax.imshow(pil_image)
ax.set_title("Sample Image for OCR")
ax.axis('off')
plt.show()

In [None]:
# Alternative: load the same file with OpenCV
cv_image = cv2.imread(sample_image_path)

# cv2.imread reports an unreadable or missing file by returning None rather
# than raising, so fail here with a clear message instead of letting the
# colour conversion below raise an opaque error.
if cv_image is None:
    raise FileNotFoundError(f"OpenCV could not read image file: {sample_image_path}")

# Convert from OpenCV's BGR channel order to RGB for matplotlib and for the
# OCR cell that consumes cv_image_rgb.
cv_image_rgb = cv2.cvtColor(cv_image, cv2.COLOR_BGR2RGB)

print(f"OpenCV image shape: {cv_image.shape}")

plt.figure(figsize=(10, 4))
plt.imshow(cv_image_rgb)
plt.title("Image loaded with OpenCV")
plt.axis('off')
plt.show()

## 3. Basic Text Extraction with Tesseract

Use `pytesseract.image_to_string()` to extract text from an image.

In [None]:
# Simplest call: hand the PIL Image straight to Tesseract
extracted_text = pytesseract.image_to_string(pil_image)

rule = "-" * 40
print("Extracted text from PIL Image:", rule, extracted_text, rule, sep="\n")

In [None]:
# pytesseract also accepts a numpy array; pass the RGB-ordered OpenCV image
extracted_text_cv = pytesseract.image_to_string(cv_image_rgb)

rule = "-" * 40
print("Extracted text from OpenCV Image:", rule, extracted_text_cv, rule, sep="\n")

In [None]:
# A file path works too; pytesseract reads the image from disk itself
extracted_text_path = pytesseract.image_to_string(sample_image_path)

rule = "-" * 40
print("Extracted text from file path:", rule, extracted_text_path, rule, sep="\n")

## 4. Configure Tesseract Options

Set custom configuration options like PSM (Page Segmentation Mode) and OEM (OCR Engine Mode) for different use cases.

In [None]:
# Reference table: Tesseract Page Segmentation Modes (passed as `--psm N`)
#    0  Orientation and script detection (OSD) only
#    1  Automatic page segmentation with OSD
#    2  Automatic page segmentation, but no OSD, or OCR
#    3  Fully automatic page segmentation, but no OSD (default)
#    4  Single column of text of variable sizes
#    5  Single uniform block of vertically aligned text
#    6  Single uniform block of text
#    7  Single text line
#    8  Single word
#    9  Single word in a circle
#   10  Single character
#   11  Sparse text: find as much text as possible in no particular order
#   12  Sparse text with OSD
#   13  Raw line: single text line, bypassing Tesseract-specific hacks

# Subset of the modes above that is printed for the reader
psm_modes = {
    0: "OSD only",
    1: "Automatic page segmentation with OSD",
    3: "Fully automatic page segmentation (Default)",
    4: "Single column of variable text sizes",
    6: "Single uniform block of text",
    7: "Single text line",
    8: "Single word",
    10: "Single character",
    11: "Sparse text",
}

print("Available PSM modes and their descriptions:")
print("\n".join(f"  PSM {mode}: {desc}" for mode, desc in psm_modes.items()))

In [None]:
# Run the same image through two page segmentation modes and compare

# PSM 6 treats the image as one uniform block of text
config_psm6 = '--psm 6'
text_psm6 = pytesseract.image_to_string(pil_image, config=config_psm6)
print("PSM 6 (Single block):", text_psm6, sep="\n")

# PSM 7 treats the whole image as a single text line
config_psm7 = '--psm 7'
text_psm7 = pytesseract.image_to_string(pil_image, config=config_psm7)
print("PSM 7 (Single line):", text_psm7, sep="\n")

In [None]:
# OCR Engine Modes (passed as `--oem N`):
#   0 legacy engine only, 1 LSTM neural-net engine only,
#   2 legacy + LSTM, 3 default (whatever is available)
oem_descriptions = (
    "  OEM 0: Legacy engine only",
    "  OEM 1: LSTM neural network engine only",
    "  OEM 2: Legacy + LSTM engines",
    "  OEM 3: Default (based on availability)",
)

print("OCR Engine Modes:")
for description in oem_descriptions:
    print(description)

# Combine an engine mode with a segmentation mode in one config string
config_oem = '--oem 1 --psm 6'
text_oem = pytesseract.image_to_string(pil_image, config=config_oem)
print("\nWith OEM 1 + PSM 6:")
print(text_oem)

In [None]:
# Character filtering is configured through `-c` variables.
# These two config strings are only displayed here, not applied.
config_digits = '--psm 6 -c tessedit_char_whitelist=0123456789'   # allow digits only
config_no_special = '--psm 6 -c tessedit_char_blacklist=@#$%^&*()'  # forbid these symbols

print("Digits only whitelist config:", config_digits)
print("Blacklist special chars config:", config_no_special)

# Baseline run without any character filtering
text_default = pytesseract.image_to_string(pil_image, config='--psm 6')
print("\nDefault extraction:", text_default, sep="\n")

## 5. Extract Text from Specific Image Regions

Crop specific regions of an image and extract text from those areas only.

In [None]:
# Synthetic page with three separately framed lines of text
multi_region_img = Image.new('RGB', (800, 400), color='white')
draw = ImageDraw.Draw(multi_region_img)

try:
    font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 24)
except OSError:
    font = ImageFont.load_default()

# (top-left anchor, label, colour) for each text line
labelled_lines = [
    ((50, 50), "Region 1: Header Text", 'black'),
    ((50, 150), "Region 2: Main Content", 'blue'),
    ((50, 250), "Region 3: Footer Info", 'red'),
]
for anchor, label, colour in labelled_lines:
    draw.text(anchor, label, fill=colour, font=font)

# Green frames mark each region for the reader
frames = ([40, 40, 350, 90], [40, 140, 380, 190], [40, 240, 350, 290])
for frame in frames:
    draw.rectangle(frame, outline='green', width=2)

fig, ax = plt.subplots(figsize=(12, 6))
ax.imshow(multi_region_img)
ax.set_title("Image with Multiple Text Regions")
ax.axis('off')
plt.show()

In [None]:
# Regions of interest as Pillow crop boxes: (left, upper, right, lower)
regions = {
    "header": (40, 40, 350, 90),
    "content": (40, 140, 380, 190),
    "footer": (40, 240, 350, 290),
}

print("Text extracted from specific regions:")
print("=" * 50)

# Crop each box and OCR it as a single text line (PSM 7)
for region_name, box in regions.items():
    text = pytesseract.image_to_string(
        multi_region_img.crop(box), config='--psm 7'
    ).strip()
    print(
        f"\n{region_name.upper()}:",
        f"  Coordinates: {box}",
        f"  Text: '{text}'",
        sep="\n",
    )

In [None]:
# Same idea with numpy slicing: rows (y) come first, then columns (x)
cv_multi = np.array(multi_region_img)
roi_header = cv_multi[40:90, 40:350]
roi_content = cv_multi[140:190, 40:380]

# Show both crops side by side
fig, axes = plt.subplots(1, 2, figsize=(12, 3))
panels = zip(axes, (roi_header, roi_content), ("Header Region", "Content Region"))
for ax, roi, title in panels:
    ax.imshow(roi)
    ax.set_title(title)
    ax.axis('off')

plt.tight_layout()
plt.show()

# OCR each crop as a single text line
for label, roi in (("Header:", roi_header), ("Content:", roi_content)):
    print(label, pytesseract.image_to_string(roi, config='--psm 7').strip())

## 6. Get Bounding Boxes for Detected Text

Use `image_to_boxes()` and `image_to_data()` to get character and word-level bounding boxes with confidence scores.

In [None]:
# image_to_boxes returns one text line per recognised character;
# only the first 500 characters of that text are previewed here
boxes = pytesseract.image_to_boxes(pil_image)
print("Character-level bounding boxes (first 500 chars):", boxes[:500], sep="\n")

In [None]:
# Draw every character box onto a numpy copy of the sample image
img_with_boxes = np.array(pil_image)
h, w = img_with_boxes.shape[:2]

for record in boxes.splitlines():
    fields = record.split()
    if len(fields) < 5:
        continue  # not a complete "<char> x1 y1 x2 y2 ..." record
    left, bottom, right, top = (int(value) for value in fields[1:5])
    # Box coordinates are measured from the bottom-left corner, while OpenCV
    # draws from the top-left, so flip the y values against the image height.
    cv2.rectangle(img_with_boxes, (left, h - top), (right, h - bottom), (0, 255, 0), 1)

fig, ax = plt.subplots(figsize=(12, 4))
ax.imshow(img_with_boxes)
ax.set_title("Character Bounding Boxes")
ax.axis('off')
plt.show()

In [None]:
# Word-level results (position, size, confidence, text) as a DataFrame
import pandas as pd

data = pytesseract.image_to_data(pil_image, output_type=pytesseract.Output.DICT)
df = pd.DataFrame(data)

# Keep only rows whose text is not blank
has_text = df['text'].str.strip().ne('')
df_text = df.loc[has_text].copy()

shown_columns = ['level', 'left', 'top', 'width', 'height', 'conf', 'text']
print("Word-level OCR data:")
print(df_text[shown_columns].to_string())

In [None]:
# Outline each recognised word, colour-coded by Tesseract's confidence
img_words = np.array(pil_image)

for _, row in df_text.iterrows():
    left, top = row['left'], row['top']
    right, bottom = left + row['width'], top + row['height']
    conf = row['conf']

    # green above 80, orange above 50, red otherwise
    color = (0, 255, 0) if conf > 80 else (255, 165, 0) if conf > 50 else (255, 0, 0)

    cv2.rectangle(img_words, (left, top), (right, bottom), color, 2)

fig, ax = plt.subplots(figsize=(12, 4))
ax.imshow(img_words)
ax.set_title("Word Bounding Boxes (Green=High Conf, Orange=Medium, Red=Low)")
ax.axis('off')
plt.show()

In [None]:
# Summarise the confidence column of the word-level results
conf_scores = df_text['conf']

print("\nConfidence Score Statistics:")
print(f"  Mean: {conf_scores.mean():.2f}")
print(f"  Min: {conf_scores.min()}")
print(f"  Max: {conf_scores.max()}")
print(f"  Words with conf > 80: {(conf_scores > 80).sum()}")

## 7. Handle Different Languages

Configure Tesseract to recognize text in different languages using the `lang` parameter.

In [None]:
# Ask Tesseract which language packs are installed; if the query fails,
# print a short list of common language codes instead.
try:
    langs = pytesseract.get_languages()
except Exception as err:
    print(f"Could not get languages: {err}")
    print("Common languages: eng, fra, deu, spa, ita, por, chi_sim, jpn, kor, ara")
else:
    print("Available Tesseract languages:")
    print(langs)

In [None]:
# Select the recognition language with `lang`.
# Non-English languages need their language pack installed first, e.g.
# Ubuntu: sudo apt-get install tesseract-ocr-<lang>

# Single language: English
eng_text = pytesseract.image_to_string(pil_image, lang='eng')
print("English (eng):", eng_text, sep="\n")

# Several languages at once: join the codes with '+'.
# A failure here (for example, a missing pack) is reported rather than raised.
try:
    multi_lang_text = pytesseract.image_to_string(pil_image, lang='eng+fra')
except Exception as err:
    print(f"Multi-language not available: {err}")
else:
    print("\nEnglish + French (eng+fra):")
    print(multi_lang_text)

In [None]:
# Render a line containing digits, currency and punctuation
special_img = Image.new('RGB', (500, 150), color='white')
draw = ImageDraw.Draw(special_img)

try:
    font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 28)
except OSError:
    font = ImageFont.load_default()

special_text = "Price: $99.99 (50% OFF!)"
draw.text((30, 50), special_text, fill='black', font=font)

fig, ax = plt.subplots(figsize=(10, 3))
ax.imshow(special_img)
ax.set_title("Text with Numbers and Symbols")
ax.axis('off')
plt.show()

# OCR it as a single English text line
extracted = pytesseract.image_to_string(special_img, lang='eng', config='--psm 7')
print(f"Extracted: {extracted.strip()}")

## 8. Preprocess Images for Better OCR Results

Apply image preprocessing techniques like grayscale conversion, thresholding, and noise removal to improve OCR accuracy.

In [None]:
# Create a noisy/degraded image for demonstration: low-contrast grey text on
# a light-grey background with additive pixel noise.
noisy_img = Image.new('RGB', (600, 150), color=(240, 240, 240))
draw = ImageDraw.Draw(noisy_img)

try:
    font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 28)
except OSError:
    font = ImageFont.load_default()

# Add text with low contrast
draw.text((50, 50), "Noisy Text Sample 123", fill=(100, 100, 100), font=font)

# Add noise from a seeded generator so Restart & Run All produces the same
# image, and therefore comparable OCR results, on every run.
rng = np.random.default_rng(42)
np_noisy = np.array(noisy_img)
noise = rng.integers(-30, 30, size=np_noisy.shape, dtype=np.int16)
# Add in int16 so negative and >255 sums do not wrap, then clip back to uint8
np_noisy = np.clip(np_noisy.astype(np.int16) + noise, 0, 255).astype(np.uint8)
noisy_img = Image.fromarray(np_noisy)

plt.figure(figsize=(10, 3))
plt.imshow(noisy_img)
plt.title("Original Noisy Image")
plt.axis('off')
plt.show()

print("OCR on noisy image:")
print(pytesseract.image_to_string(noisy_img, config='--psm 7').strip())

In [None]:
# Preprocessing function
def preprocess_image(image, method='threshold'):
    """
    Return a single-channel version of `image` prepared for OCR.

    The input is converted to a numpy array. A 3-dimensional array is
    converted from RGB to grayscale; any other array is copied unchanged.
    `method` then selects what is applied on top of that grayscale image:

    - 'grayscale': nothing further
    - 'threshold': Otsu binary thresholding
    - 'adaptive': Gaussian adaptive thresholding (block size 11, C = 2)
    - 'denoise': non-local-means denoising
    - 'full': denoising followed by Otsu thresholding

    Any other `method` value returns the plain grayscale image.
    """
    pixels = np.array(image)
    gray = cv2.cvtColor(pixels, cv2.COLOR_RGB2GRAY) if pixels.ndim == 3 else pixels.copy()

    if method == 'threshold':
        return cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

    if method == 'adaptive':
        return cv2.adaptiveThreshold(
            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
        )

    if method in ('denoise', 'full'):
        # Both variants start from the same denoised image
        smoothed = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)
        if method == 'denoise':
            return smoothed
        return cv2.threshold(smoothed, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

    # 'grayscale' and unrecognised method names both end up here
    return gray

In [None]:
# Show the noisy original next to each preprocessing variant, with the OCR
# output of every image in its panel title
methods = ['grayscale', 'threshold', 'adaptive', 'denoise', 'full']

fig, axes = plt.subplots(2, 3, figsize=(15, 8))
axes = axes.flatten()

# Panel 0: the unprocessed image
baseline_text = pytesseract.image_to_string(noisy_img, config='--psm 7').strip()
axes[0].imshow(noisy_img)
axes[0].set_title(f"Original\nOCR: '{baseline_text}'")
axes[0].axis('off')

# Panels 1..5: one per preprocessing method
for ax, method in zip(axes[1:], methods):
    processed = preprocess_image(noisy_img, method)
    ocr_result = pytesseract.image_to_string(processed, config='--psm 7').strip()

    ax.imshow(processed, cmap='gray')
    ax.set_title(f"{method.capitalize()}\nOCR: '{ocr_result}'")
    ax.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Additional preprocessing: Deskewing and scaling
def deskew(image):
    """Rotate an image so that its foreground content is axis-aligned.

    The skew is estimated from the minimum-area rectangle enclosing every
    pixel with a value greater than 0, and the image is rotated about its
    centre to cancel it.

    Args:
        image: numpy image array. Foreground is taken to be the non-zero
            pixels, so dark text on a light background should be inverted
            before calling. NOTE(review): the estimate assumes a 2-D
            single-channel array — confirm against callers.

    Returns:
        The rotated image, with the same size as the input. If the image has
        no non-zero pixels, the input object is returned unchanged.
    """
    coords = np.column_stack(np.where(image > 0))
    if len(coords) == 0:
        return image

    # np.where yields (row, col) pairs; OpenCV expects 32-bit point data
    angle = cv2.minAreaRect(coords.astype(np.float32))[-1]

    # minAreaRect reports the rectangle angle in [-90, 0) in older OpenCV
    # releases and in (0, 90] in newer ones. A rectangle is unchanged by a
    # 90-degree turn, so fold the value into [-45, 45) to get the actual
    # skew under either convention. An unskewed image reported as 90 or -90
    # then yields 0 instead of a quarter turn.
    skew = ((angle + 45) % 90) - 45

    # The points were passed as (row, col), i.e. with x and y swapped, which
    # mirrors the measured angle; rotating by its negative cancels the skew.
    rotation = -skew

    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, rotation, 1.0)
    rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC,
                             borderMode=cv2.BORDER_REPLICATE)
    return rotated

def scale_image(image, scale_factor=2):
    """Resize an image by `scale_factor` using bicubic interpolation.

    Upscaling small text can help OCR (Tesseract works best with DPI >= 300).

    Args:
        image: numpy image array; its first two dimensions are (height, width).
        scale_factor: positive multiplier applied to both dimensions.

    Returns:
        The resized image. Each target dimension is the truncated product of
        the source dimension and `scale_factor`, but never less than 1 pixel.

    Raises:
        ValueError: if `scale_factor` is zero or negative.
    """
    # Reject invalid factors up front instead of failing inside cv2.resize
    # with a zero or negative target size.
    if scale_factor <= 0:
        raise ValueError(f"scale_factor must be positive, got {scale_factor!r}")

    src_height, src_width = image.shape[:2]
    # Clamp so a very small factor cannot truncate a dimension to 0 pixels
    width = max(1, int(src_width * scale_factor))
    height = max(1, int(src_height * scale_factor))
    return cv2.resize(image, (width, height), interpolation=cv2.INTER_CUBIC)

# Demo: does doubling the resolution of the fully preprocessed image change
# what Tesseract reads?
processed = preprocess_image(noisy_img, 'full')
scaled = scale_image(processed, 2)

print(f"Original size: {processed.shape}", f"Scaled size: {scaled.shape}", sep="\n")

# OCR both versions as a single text line
ocr_original = pytesseract.image_to_string(processed, config='--psm 7').strip()
ocr_scaled = pytesseract.image_to_string(scaled, config='--psm 7').strip()

print(f"\nOCR (original): '{ocr_original}'", f"OCR (scaled 2x): '{ocr_scaled}'", sep="\n")