# OCR with Google Tesseract
Reading legal PDFs with OCR

## Import libraries
Install dependencies

In [1]:
# Uncomment to install the Python packages used below.
# NOTE: pytesseract and pdf2image are wrappers; the Tesseract OCR engine and
# Poppler must also be installed on the system for them to work.
#!pip install pytesseract pdf2image

Import libraries

In [13]:
import requests
from pdf2image import convert_from_bytes
import pytesseract
import pandas as pd

## Data
Get caselaw data containing pdf URLs

In [3]:
# Load the case law metadata; the extraction loop below reads the PDF links
# from its "pdf_url" column.
data_path = "./data/caselaw_data.csv"
df = pd.read_csv(data_path)

A function for downloading a PDF from a URL

In [4]:
def get_pdf(url: str, timeout: float = 30.0) -> bytes:
    """Download a PDF and return its raw bytes.

    Args:
        url: Address of the PDF to fetch.
        timeout: Seconds to wait for the server (to connect, and between
            chunks of the response) before giving up. Without a timeout,
            `requests.get` can block indefinitely on an unresponsive server,
            which would stall the whole extraction loop.

    Returns:
        The response body as bytes.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within `timeout`.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    return response.content

## Perform OCR
A function for converting a PDF into page images

In [5]:
def pdf2image(pdf_path: bytes, dpi: int = 150) -> list:
    """Render the pages of an in-memory PDF as images.

    Args:
        pdf_path: Raw PDF bytes (despite the name, not a filesystem path);
            handed straight to `convert_from_bytes`.
        dpi: Rendering resolution passed to `convert_from_bytes`.

    Returns:
        The list of page images produced by `convert_from_bytes`.
    """
    # Pass dpi by keyword so the call does not depend on positional order.
    return convert_from_bytes(pdf_path, dpi=dpi)

A function for converting page images into text

In [6]:
def images2text(images: list, verbose=False) -> str:
    """Run Google's Tesseract OCR on each image and join the page texts.

    Args:
        images: Page images, in reading order.
        verbose: When true, print a progress line after each page.

    Returns:
        The recognised text of all pages, separated by newlines.
    """
    ocr_results = []
    page_number = 0
    for page_image in images:
        page_number += 1
        ocr_results.append(pytesseract.image_to_string(page_image))
        if verbose:
            print(f"Processed page {page_number}")

    separator = "\n"
    return separator.join(ocr_results)

Extract text from PDFs

In [7]:
# Download, rasterize and OCR each PDF in turn, collecting one text string per
# dataframe row (same order as df["pdf_url"]).
# NOTE(review): any exception raised by get_pdf (e.g. a failed download) stops
# the loop, and re-running this cell starts over from an empty list.
document_texts = []
for idx, pdf_url in enumerate(df["pdf_url"], start=1):
    pdf = get_pdf(pdf_url)  # raw PDF bytes
    images = pdf2image(pdf)  # page images at the function's default dpi
    text = images2text(images)  # OCR text of all pages, newline-joined
    document_texts.append(text)
    print(f"({idx}/{len(df)}) Processed pdf: {pdf_url}")

(1/50) Processed pdf: https://static.case.law/ala/295.pdf
(2/50) Processed pdf: https://static.case.law/alaska/17.pdf
(3/50) Processed pdf: https://static.case.law/ariz/242.pdf
(4/50) Processed pdf: https://static.case.law/ark/375.pdf
(5/50) Processed pdf: https://static.case.law/cal/220.pdf
(6/50) Processed pdf: https://static.case.law/colo/200.pdf
(7/50) Processed pdf: https://static.case.law/conn/303.pdf
(8/50) Processed pdf: https://static.case.law/del/59.pdf
(9/50) Processed pdf: https://static.case.law/fla/160.pdf
(10/50) Processed pdf: https://static.case.law/ga/302.pdf
(11/50) Processed pdf: https://static.case.law/haw/140.pdf
(12/50) Processed pdf: https://static.case.law/idaho/162.pdf
(13/50) Processed pdf: https://static.case.law/ill/415.pdf
(14/50) Processed pdf: https://static.case.law/ind/275.pdf
(15/50) Processed pdf: https://static.case.law/iowa/261.pdf
(16/50) Processed pdf: https://static.case.law/kan/304.pdf
(17/50) Processed pdf: https://static.case.law/ky/314.pdf
(

Store the text in the dataframe

In [8]:
# Adds the OCR output as a new "text" column. This mutates df in place, and the
# list length must equal the number of rows or pandas raises a ValueError.
df["text"] = document_texts

Save the dataframe locally

In [9]:
# data_path is rebound here to the output file (it previously held the input
# CSV path).
data_path = "./data/caselaw_data_text.csv"
df.to_csv(data_path, index=False)  # index=False: do not write the row index as a column

## Save the dataset into Hugging Face
Log in to Hugging Face first: run `huggingface-cli login` in a terminal (the CLI ships with the Hugging Face client) before executing the cells below.

Convert `pd.DataFrame` into HuggingFace `Dataset`

In [14]:
# NOTE(review): this import lives outside the notebook's main import cell.
from datasets import Dataset

# Build a Hugging Face Dataset from the dataframe.
dataset = Dataset.from_pandas(df)

Push to hub

In [15]:
username = "Juh6973" # Your username here
dataset_name = "caselaw_latest_volumes_by_state"
# The target repository id has the form "<username>/<dataset_name>".
dataset.push_to_hub(f"{username}/{dataset_name}")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Juh6973/caselaw_latest_volumes_by_state/commit/0a345201f1a14c82b2d233263eca50815bc988cf', commit_message='Upload dataset', commit_description='', oid='0a345201f1a14c82b2d233263eca50815bc988cf', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Juh6973/caselaw_latest_volumes_by_state', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Juh6973/caselaw_latest_volumes_by_state'), pr_revision=None, pr_num=None)