# OCR with Google Tesseract
Reading legal PDFs with OCR

## Import libraries
Install dependencies

In [2]:
# Uncomment to install the Python dependencies into the running kernel's environment.
# %pip install pytesseract pdf2image

Import libraries

In [3]:
import requests
from pdf2image import convert_from_bytes
import pytesseract
import pandas as pd

## Data
Load the caselaw dataset, which contains the URL of each PDF.

In [4]:
# Relative path to the caselaw CSV.
data_path = "./data/caselaw_data.csv"
# Load the table; its "pdf_url" column is iterated by the extraction loop further down.
df = pd.read_csv(data_path)

A function for downloading a PDF from its URL

In [5]:
def get_pdf(url: str, timeout: float = 30.0) -> bytes:
    """Download the document at ``url`` and return its raw bytes.

    The response body is returned as-is; it is not checked to be a valid PDF.

    Args:
        url: Address of the PDF to fetch.
        timeout: Seconds to wait for the server before giving up. The value is
            passed straight to ``requests.get``, so a ``(connect, read)`` tuple
            is accepted as well.

    Returns:
        The response body as bytes.

    Raises:
        requests.exceptions.HTTPError: If the server answers with an error
            status (raised by ``raise_for_status``).
        requests.exceptions.Timeout: If the server does not respond within
            ``timeout``.
    """
    # requests has no default timeout: without one, a single unresponsive
    # server would block the caller indefinitely.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    return response.content

## Perform OCR
A function for converting a PDF into a list of page images

In [6]:
def pdf2image(pdf_path: bytes, dpi: int = 150) -> list:
    """Render an in-memory PDF into a list of images.

    Args:
        pdf_path: The PDF's raw bytes (despite the parameter name, this is
            the file content, not a filesystem path); it is handed to
            ``convert_from_bytes``.
        dpi: Rendering resolution forwarded to ``convert_from_bytes``.

    Returns:
        The list produced by ``convert_from_bytes``.
    """
    # Pass dpi by keyword so the call does not depend on the positional
    # order of convert_from_bytes' optional parameters.
    return convert_from_bytes(pdf_path, dpi=dpi)

A function for converting page images into text

In [7]:
def images2text(images: list, verbose=False) -> str:
    """Run Tesseract OCR over every image and return the combined text.

    Args:
        images: Images to recognise, in the order their text should appear;
            each one is passed to ``pytesseract.image_to_string``.
        verbose: When true, print a progress line after each image.

    Returns:
        The recognised text of all images, joined with newline characters.
    """
    ocr_results = []
    page_number = 0
    for page_image in images:
        page_number += 1
        ocr_results.append(pytesseract.image_to_string(page_image))
        if verbose:
            print(f"Processed page {page_number}")

    separator = "\n"
    return separator.join(ocr_results)

Extract text from PDFs

In [8]:
# Collects one OCR text string per entry of df["pdf_url"], in iteration order.
document_texts = []
for pdf_url in df["pdf_url"]:
    pdf = get_pdf(pdf_url)  # download the PDF as raw bytes
    images = pdf2image(pdf)  # convert the PDF to images at the function's default dpi
    text = images2text(images)  # OCR the images and join the results into one string
    document_texts.append(text)
    # Progress line; printed only once the PDF has been downloaded and OCR'd.
    print(f"Processed pdf: {pdf_url}")

Processed pdf: https://static.case.law/ala/295.pdf


KeyboardInterrupt: 