# OCR Preprocessing Pipeline for Renaissance Documents

This notebook randomly samples pages from a set of PDF documents, converts them to images, and applies a standard OCR preprocessing pipeline:
- Grayscale conversion
- Noise removal (median filter)
- Binarization (Otsu’s threshold)
- Deskewing

You'll end up with clean, black-and-white PNG files ready for OCR.



### 1. Install Dependencies

Run the following cell to install necessary packages (requires Poppler installed on your system):


In [None]:

%pip install pdf2image opencv-python numpy gdown ipywidgets matplotlib opencv-python-headless


### 2. Imports and Configuration

Adjust the paths below to point to your PDF directory and desired output folder.

In [None]:
import os
import random
from pdf2image import convert_from_path, pdfinfo_from_path
import cv2
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from PIL import Image

# Disable PIL's decompression-bomb check (no upper limit on image pixel count).
# NOTE(review): this removes a safety limit; only open files from trusted sources.
Image.MAX_IMAGE_PIXELS = None

# Configuration
pdf_dir = '/data/pdfs'
pdf_samples_dir = '/data/pdf_samples'
png_dir = '/data/pngs'
processed_dir = '/data/processed_data'
processed_resized_dir = '/data/processed_resized'

# Folder containing the Poppler binaries handed to pdf2image.
# Set the POPPLER_PATH environment variable to point at your own installation;
# the literal below is only the fallback used when the variable is not set.
POPPLER_PATH = os.environ.get(
    'POPPLER_PATH',
    'C:/Users/katej/OneDrive/Documents/Downloads/Release-24.08.0-0/poppler-24.08.0/Library/bin/',
)

os.makedirs(processed_dir, exist_ok=True)

# Set DPI for conversion
DPI = 300

#### Download Datasets from Google Drive Folder

In [4]:
# Google Drive folder ID to download; the files are written into pdf_dir
# (set in the configuration cell) by the gdown command-line tool.
folder_id = "1B_dM138pLrQGxRqUn1wcFO2MEotGWr7h"
gdown_command = f"gdown --folder {folder_id} -O {pdf_dir}"
# Echo the exact command so the cell output records what was executed.
print('Running:', gdown_command)
# IPython shell escape: {gdown_command} expands to the string built above.
!{gdown_command}


Running: gdown --folder 1B_dM138pLrQGxRqUn1wcFO2MEotGWr7h -O /data/pdfs
^C


Retrieving folder contents
Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=1E7bnev2iGtg6AyGc1s8VNhLUUntdv3er
To: c:\data\pdfs\Ayala, Lorenzo de. Valladolid 1603.pdf

  0%|          | 0.00/6.42M [00:00<?, ?B/s]
 16%|█▋        | 1.05M/6.42M [00:00<00:00, 10.4MB/s]
 41%|████      | 2.62M/6.42M [00:00<00:00, 11.0MB/s]
 65%|██████▌   | 4.19M/6.42M [00:00<00:00, 11.5MB/s]
 90%|████████▉ | 5.77M/6.42M [00:00<00:00, 11.5MB/s]
100%|██████████| 6.42M/6.42M [00:00<00:00, 11.5MB/s]
Downloading...
From: https://drive.google.com/uc?id=1L-hYw-dH52EQm5Di2ZhgPWEuscL3cF7f
To: c:\data\pdfs\Burgos, Juan de. Valladolid 1500.pdf

  0%|          | 0.00/52.7M [00:00<?, ?B/s]
  2%|▏         | 1.05M/52.7M [00:00<00:05, 9.35MB/s]
  4%|▍         | 2.10M/52.7M [00:00<00:07, 6.95MB/s]
  6%|▌         | 3.15M/52.7M [00:00<00:07, 6.42MB/s]
  8%|▊         | 4.19M/52.7M [00:00<00:07, 6.19MB/s]
 10%|▉         | 5.

Processing file 1E7bnev2iGtg6AyGc1s8VNhLUUntdv3er Ayala, Lorenzo de. Valladolid 1603.pdf
Processing file 1L-hYw-dH52EQm5Di2ZhgPWEuscL3cF7f Burgos, Juan de. Valladolid 1500.pdf
Processing file 17uxQuvlHBHSAWtb0UCKmGKIhg0WZPXfm Cansoles, Fernando. Mallorca 1541.pdf
Processing file 1oqOE-13uE6w2u6u-r-_9K6VRCRrrRGst Cerda - Estados mujeres OG BW.pdf
Processing file 1YkN5ZWdr6veG1JPNRrvKfK0ijFyNlkST Guevara - Reloj de Principes OG.pdf


In [None]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from pdf2image import convert_from_path, pdfinfo_from_path
from ipywidgets import (
    widgets, interact, interactive_output, VBox, HBox,
    IntSlider, FloatSlider, Checkbox, Button, Output
)
import os

# --- Global state ---
# Folder with the Poppler binaries passed to pdf2image. Override it with the
# POPPLER_PATH environment variable instead of editing this cell; the literal
# below is only the fallback used when the variable is not set.
POPPLER_PATH = os.environ.get(
    'POPPLER_PATH',
    'C:/Users/katej/OneDrive/Documents/Downloads/Release-24.08.0-0/poppler-24.08.0/Library/bin/',
)
# Most recent result produced by update(); read by the save-button handler.
# "dpi" is the rendering resolution used for every page load.
latest_data = {"image": None, "page": None, "pdf_path": None, "dpi": 300}

# --- PDF & processing ---
def load_pdf_page(pdf_path, page_num=1, dpi=300):
    """Render a single PDF page and return it as a BGR image array.

    Args:
        pdf_path: Path of the PDF file to read.
        page_num: Page to render; passed to pdf2image as both ``first_page``
            and ``last_page``.
        dpi: Rendering resolution passed to pdf2image.

    Returns:
        A C-contiguous ``numpy.ndarray`` of shape (height, width, 3) with the
        channels in BGR order, as expected by OpenCV colour conversions.

    Raises:
        IndexError: If pdf2image returned no image for ``page_num``.
    """
    pages = convert_from_path(
        pdf_path,
        dpi=dpi,
        first_page=page_num,
        last_page=page_num,
        poppler_path=POPPLER_PATH
    )
    if not pages:
        # Same exception type as the bare ``[0]`` lookup, but with a useful message.
        raise IndexError(f"Page {page_num} could not be rendered from {pdf_path}")

    # Force three channels so the channel reversal below is always valid.
    rgb = np.array(pages[0].convert("RGB"))
    # PIL RGB -> OpenCV BGR. The reversed slice is a negative-stride view, so
    # copy it into a contiguous buffer before handing it to OpenCV.
    return np.ascontiguousarray(rgb[:, :, ::-1])

def apply_pipeline(
    img_bgr,
    min_area=50,
    clahe_clip=3.0,
    blur_kernel=55,
    denoise_h=10,
    do_denoise=True,
    do_bg_removal=True,
    do_contrast=True,
    do_binarize=True,
    do_close=True,
    do_filter=True
):
    """Clean up a scanned page for OCR and return a single-channel uint8 image.

    Stages (each can be switched off with its ``do_*`` flag):
      1. Non-local-means denoising with strength ``denoise_h``.
      2. Background removal: divide the page by a median-blurred copy of itself.
      3. Contrast: CLAHE followed by a 2nd-98th percentile stretch.
      4. Otsu binarisation.
      5. Morphological closing with a 2x2 kernel.
      6. Removal of connected components smaller than ``min_area`` pixels.

    Stages 5 and 6 run on the *ink mask* (dark pixels represented as non-zero)
    so that closing bridges gaps in strokes and the area filter removes small
    dark specks. The mask is inverted back before returning.

    Args:
        img_bgr: BGR colour image, or an already single-channel image.
        min_area: Minimum pixel area for a dark component to be kept.
        clahe_clip: ``clipLimit`` for CLAHE.
        blur_kernel: Median-blur kernel size for the background estimate;
            forced to an odd value of at least 3.
        denoise_h: Filter strength for ``cv2.fastNlMeansDenoising``; values
            of 0 or less skip the denoising call.
        do_denoise, do_bg_removal, do_contrast, do_binarize, do_close,
        do_filter: Enable/disable the corresponding stage.

    Returns:
        A 2-D ``uint8`` array. When ``do_binarize`` is true, pixels darker
        than the Otsu threshold are 0 and all others 255; otherwise the
        grey levels of the preprocessed page are kept.
    """
    if img_bgr.ndim == 2:
        # Already single-channel; cvtColor(BGR2GRAY) would reject it.
        gray = img_bgr.copy()
    else:
        gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)

    # A strength of 0 is the slider's "off" position; skip the call entirely.
    if do_denoise and denoise_h > 0:
        gray = cv2.fastNlMeansDenoising(gray, None, h=denoise_h, templateWindowSize=7, searchWindowSize=21)

    if do_bg_removal:
        blur_kernel = max(3, blur_kernel | 1)  # force odd
        bg = cv2.medianBlur(gray, blur_kernel)
        # Dividing by the blurred page flattens uneven illumination and stains.
        gray = cv2.divide(gray, bg, scale=255)

    if do_contrast:
        clahe = cv2.createCLAHE(clipLimit=clahe_clip, tileGridSize=(8, 8))
        enhanced = clahe.apply(gray)
        lo, hi = np.percentile(enhanced, [2, 98])
        if hi > lo:
            gray = np.clip((enhanced - lo) * (255.0 / (hi - lo)), 0, 255).astype(np.uint8)
        else:
            # Flat image: nothing to stretch, and hi - lo would divide by zero.
            gray = enhanced

    # Work on the ink mask (dark pixels -> non-zero) from here on.
    if do_binarize:
        _, ink = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    else:
        ink = cv2.bitwise_not(gray)

    if do_close:
        ink = cv2.morphologyEx(ink, cv2.MORPH_CLOSE, np.ones((2, 2), np.uint8))

    if do_filter:
        num_labels, labels, stats, _ = cv2.connectedComponentsWithStats(ink, connectivity=8)
        keep = stats[:, cv2.CC_STAT_AREA] >= min_area
        keep[0] = False  # label 0 is the background (zero pixels), never ink
        # One vectorised lookup instead of a full-image mask per component.
        ink = np.where(keep[labels], ink, 0).astype(np.uint8)

    # Back to the page convention: dark ink on a light background.
    return cv2.bitwise_not(ink)

# --- Widgets ---
# Forward slashes work on every platform (including Windows) and avoid the
# invalid "\p" / "\P" escape sequences of a backslash-separated literal.
pdf_path_widget = widgets.Text(
    value='data/pdfs/Padilla - 2 Noble perfecto_Extract.pdf',
    description='PDF Path:',
    layout=widgets.Layout(width='600px')
)
# NOTE(review): the page range is fixed at 1-10; pages beyond 10 cannot be
# selected with this slider.
page_widget = IntSlider(min=1, max=10, step=1, value=1, description='Page')

# Tunable parameters forwarded to apply_pipeline().
min_area_widget = IntSlider(min=10, max=500, step=10, value=50, description='Min Area')
clahe_clip_widget = FloatSlider(min=1.0, max=10.0, step=0.5, value=3.0, description='CLAHE Clip')
blur_kernel_widget = IntSlider(min=3, max=101, step=2, value=55, description='Blur Kernel')
denoise_h_widget = IntSlider(min=0, max=30, step=1, value=10, description='Denoise H')

# One checkbox per pipeline stage; the keys match the keyword names of
# update() / apply_pipeline() so the dict can be unpacked into them.
toggles = {
    "do_denoise": Checkbox(value=True, description="Denoise"),
    "do_bg_removal": Checkbox(value=True, description="Background Removal"),
    "do_contrast": Checkbox(value=True, description="CLAHE + Stretch"),
    "do_binarize": Checkbox(value=True, description="Binarize"),
    "do_close": Checkbox(value=True, description="Close Gaps"),
    "do_filter": Checkbox(value=True, description="Filter Components"),
}

# --- Update function ---
def update(pdf_path, page, min_area, clahe_clip, blur_kernel, denoise_h,
           do_denoise, do_bg_removal, do_contrast,
           do_binarize, do_close, do_filter):
    """Re-render the selected page, run the pipeline and show before/after.

    Called by ``interactive_output`` whenever a bound widget changes. On
    success the processed image, page number and PDF path are stored in
    ``latest_data`` so the save handler can write them out. If the page
    cannot be loaded, the error is printed and nothing else happens.
    """
    try:
        original_bgr = load_pdf_page(pdf_path, page, dpi=latest_data["dpi"])
    except Exception as e:
        print(f"Error loading PDF page: {e}")
        return

    processed = apply_pipeline(
        original_bgr,
        min_area=min_area, clahe_clip=clahe_clip,
        blur_kernel=blur_kernel, denoise_h=denoise_h,
        do_denoise=do_denoise, do_bg_removal=do_bg_removal,
        do_contrast=do_contrast, do_binarize=do_binarize,
        do_close=do_close, do_filter=do_filter,
    )

    # Remember what is on screen so "Save current page" writes exactly this.
    latest_data.update(image=processed, page=page, pdf_path=pdf_path)

    # Side-by-side comparison: source page on the left, result on the right.
    fig, (ax_original, ax_processed) = plt.subplots(1, 2, figsize=(14, 6))

    ax_original.imshow(cv2.cvtColor(original_bgr, cv2.COLOR_BGR2RGB))
    ax_original.set_title("Original")
    ax_original.axis("off")

    ax_processed.imshow(processed, cmap='gray')
    ax_processed.set_title("Processed")
    ax_processed.axis("off")

    fig.tight_layout()
    plt.show()

# --- Save logic ---
# Button wired to on_save_clicked further down in this cell.
save_button = Button(description="Save current page", button_style='success')
# When ticked, the save handler re-processes and writes every page of the PDF
# instead of only the page currently displayed.
save_all_checkbox = Checkbox(value=False, description="Save ALL pages")
# Output area that receives the save handler's status messages.
save_output = Output()

def save_processed_image(img, out_path):
    """Write ``img`` to ``out_path``, creating missing parent folders.

    Args:
        img: Image array accepted by ``cv2.imwrite``.
        out_path: Destination file path; the extension selects the format.

    Raises:
        OSError: If OpenCV reports that the file could not be written.
    """
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    # cv2.imwrite signals failure through its return value, not an exception.
    if not cv2.imwrite(str(out_path), img):
        raise OSError(f"Could not write image to {out_path}")

def on_save_clicked(b):
    """Handle a click on the save button.

    Writes either the currently displayed processed page or, when the
    "Save ALL pages" checkbox is ticked, every page of the current PDF
    (re-processed with the current widget settings) into the relative
    ``output`` folder. Status and error messages go to ``save_output``.

    Args:
        b: The button instance passed by ipywidgets (unused).
    """
    with save_output:
        save_output.clear_output()

        pdf_path = latest_data["pdf_path"]
        if pdf_path is None:
            # update() has not produced a result yet, so there is nothing to save.
            print("No processed image to save yet.")
            return

        out_dir = "output"
        stem = Path(pdf_path).stem

        if save_all_checkbox.value:
            print("Saving all pages...")
            try:
                info = pdfinfo_from_path(pdf_path, poppler_path=POPPLER_PATH)
                total_pages = info["Pages"]
                # Read the widgets once so every page uses the same settings.
                settings = dict(
                    min_area=min_area_widget.value,
                    clahe_clip=clahe_clip_widget.value,
                    blur_kernel=blur_kernel_widget.value,
                    denoise_h=denoise_h_widget.value,
                    **{key: box.value for key, box in toggles.items()},
                )
                for pg in range(1, total_pages + 1):
                    img_bgr = load_pdf_page(pdf_path, page_num=pg, dpi=latest_data["dpi"])
                    result = apply_pipeline(img_bgr, **settings)
                    out_path = os.path.join(out_dir, f"{stem}_p{pg:03d}.png")
                    save_processed_image(result, out_path)
                print(f"Saved {total_pages} pages to {out_dir}/")
            except Exception as e:
                print(f"Error saving pages: {e}")
        else:
            img = latest_data["image"]
            page_num = latest_data["page"]
            if img is None:
                print("No processed image to save yet.")
                return
            out_path = os.path.join(out_dir, f"{stem}_p{page_num:03d}.png")
            try:
                save_processed_image(img, out_path)
            except Exception as e:
                print(f"Error saving page: {e}")
                return
            print(f"Saved current page to {out_path}")

# Run on_save_clicked whenever the save button is pressed.
save_button.on_click(on_save_clicked)

# --- UI display ---
# Bind every control to update(); the toggle keys already match its keyword names.
out = interactive_output(
    update,
    dict(
        pdf_path=pdf_path_widget,
        page=page_widget,
        min_area=min_area_widget,
        clahe_clip=clahe_clip_widget,
        blur_kernel=blur_kernel_widget,
        denoise_h=denoise_h_widget,
        **toggles,
    ),
)

# Controls stacked vertically: path, sliders, stage toggles, then the save row.
ui = VBox(children=[
    pdf_path_widget,
    page_widget,
    min_area_widget,
    clahe_clip_widget,
    blur_kernel_widget,
    denoise_h_widget,
    VBox(children=list(toggles.values())),
    HBox(children=[save_button, save_all_checkbox]),
    save_output,
])

display(ui, out)


VBox(children=(Text(value='data\\pdfs\\Padilla - 2 Noble perfecto_Extract.pdf', description='PDF Path:', layou…

Output()

Example of Data Extraction. Ensure Nobleza Virtuosa and Noble Perfecto are preprocessed and saved in their respective folders.

In [10]:
# Per-dataset locations of inputs and outputs, keyed by dataset name.
# Keys read by the cells below:
#   "processed"             - folder whose .png pages are split in place and fed to later steps
#   "bounding_boxes"        - input folder for sort_bounding_boxes
#   "bounding_boxes_sorted" - output folder of sort_bounding_boxes
#   "gt_docx"               - .docx passed to extract_ground_truth_for_dataset
#   "text_file"             - text file written by extract_ground_truth_for_dataset
#   "split_text_output"     - output folder of process_textfiles
image_folders = {
    "Nobleza Virtuosa": {
        "pdf": Path("data/pdfs/Padilla - Nobleza virtuosa_testExtract.pdf"),
        "processed": Path("data/processed_data_virtuosa"),
        "bounding_boxes": Path("output/craft/boxes_virtuosa"),
        "bounding_boxes_sorted": Path("output/craft/boxes_sorted_virtuosa"),
        "gt_docx": Path("data/transcriptions/Padilla - 1 Nobleza virtuosa_testTranscription.docx"),
        "text_file": Path("data/transcriptions/NoblezaVirtuosa_all_text.txt"),
        "split_text_output": Path("data/split_data_nobleza/textSplitted_NoblezaVirtuosa"),
    },

    "Noble Perfecto": {
        "pdf": Path("data/pdfs/Padilla - 2 Noble perfecto_Extract.pdf"),
        "processed": Path("data/processed_data_perfecto"),
        # NOTE(review): "output" duplicates "bounding_boxes" and has no
        # counterpart in "Nobleza Virtuosa"; no cell shown here reads it.
        "output": Path("output/craft/boxes_perfecto"),
        "bounding_boxes": Path("output/craft/boxes_perfecto"),
        "bounding_boxes_sorted": Path("output/craft/boxes_sorted_perfecto"),
        "gt_docx": Path("data/transcriptions/Padilla - 2 Noble perfecto_Transcription.docx"),
        "text_file": Path("data/transcriptions/NoblePerfecto_all_text.txt"),
        "split_text_output": Path("data/split_data_perfecto/textSplitted_NoblePerfecto")
    }
}

Download the PDFs

In [None]:
# Local destination for the download; this is the folder the "pdf" entries
# of image_folders point into.
PDF_DIR = Path("data/pdfs")
# Google Drive folder ID to fetch with the gdown command-line tool.
FOLDER_ID ='1wMSs07lgET5sjA-alaoZ3CcNF_cEpRE2'

gdown_command = f"gdown --folder https://drive.google.com/drive/folders/{FOLDER_ID} -O {PDF_DIR}"
# Echo the exact command so the cell output records what was executed.
print('Running:', gdown_command)
# IPython shell escape: {gdown_command} expands to the string built above.
!{gdown_command}

Once they are preprocessed and saved, split them into individual pages.

In [8]:
from utils import split_and_save_image
import os

# Split every preprocessed PNG of each dataset, one dataset after the other.
# The running image number restarts at 1 for each dataset.
for dataset_name in ("Nobleza Virtuosa", "Noble Perfecto"):
    processed_dir = image_folders[dataset_name]["processed"]

    # Take the listing before splitting starts: the split images are written
    # into this same folder.
    # NOTE(review): because input and output folder are identical, re-running
    # this cell will also list previously written PNGs - confirm that
    # split_and_save_image tolerates that.
    image_files = sorted(
        entry for entry in os.listdir(processed_dir)
        if entry.lower().endswith('.png')
    )

    last_image_number = 1
    for fname in image_files:
        image_path = os.path.join(processed_dir, fname)
        last_image_number = split_and_save_image(
            image_path=image_path,
            output_folder=processed_dir,
            last_image_number=last_image_number,
        )

Text Detection with CRAFT (https://github.com/clovaai/CRAFT-pytorch)

In [None]:
#It generally takes about ~3-4 mins
!python CRAFT_Model/CRAFT/BoundBoxFunc/test.py --result_folder=image_folders["Noble Perfecto"]["bounding_boxes"]  --test_folder=image_folders["Noble Perfecto"]["processed"]  --trained_model="CRAFT_Model/CRAFT/BoundBoxFunc/weights/craft_mlt_25k.pth"
#It generally takes about ~3-4 mins
!python CRAFT_Model/CRAFT/BoundBoxFunc/test.py --result_folder=image_folders["Nobleza Virtuosa"]["bounding_boxes"]  --test_folder=image_folders["Nobleza Virtuosa"]["processed"]  --trained_model="CRAFT_Model/CRAFT/BoundBoxFunc/weights/craft_mlt_25k.pth"

Sort Bounding Boxes

In [None]:
from utils import sort_bounding_boxes

# Sort the detected boxes of each dataset into its "bounding_boxes_sorted" folder.
for dataset_name in ("Nobleza Virtuosa", "Noble Perfecto"):
    dataset_paths = image_folders[dataset_name]
    sort_bounding_boxes(
        input_dir=dataset_paths["bounding_boxes"],
        output_dir=dataset_paths["bounding_boxes_sorted"],
    )

Extract Transcriptions

In [11]:
from utils import extract_ground_truth_for_dataset

# Extract the transcription text of each dataset from its .docx into a text file.
for dataset_name in ("Nobleza Virtuosa", "Noble Perfecto"):
    dataset_paths = image_folders[dataset_name]
    extract_ground_truth_for_dataset(
        docx_file=dataset_paths["gt_docx"],
        output_path=dataset_paths["text_file"],
    )

Ground truth text saved to data\transcriptions\NoblezaVirtuosa_all_text.txt
Ground truth text saved to data\transcriptions\NoblePerfecto_all_text.txt


Split Transcription By Page

In [None]:
from utils import process_textfiles
import os

# Split each dataset's transcription file using its sorted bounding boxes.
for dataset_name, dataset_paths in image_folders.items():
    split_dir = dataset_paths["split_text_output"]
    # Create the destination (and any missing parents) before writing into it.
    split_dir.mkdir(parents=True, exist_ok=True)

    print(f"Splitting text for {dataset_name}...")
    process_textfiles(
        dataset_paths["text_file"],
        dataset_paths["bounding_boxes_sorted"],
        split_dir,
    )
    print(f"Text splitting complete for {dataset_name}! Output: {split_dir}")


Extract and Save Labelled Boxes

In [None]:
from utils import apply_extraction_to_folder_for_train

# Build the training set: every dataset writes into the same train_data folder.
for dataset_paths in image_folders.values():
    train_inputs = {
        "image_folder": dataset_paths["processed"],
        "bounding_box_folder": dataset_paths["bounding_boxes_sorted"],
        "text_folder": dataset_paths["split_text_output"],
    }
    apply_extraction_to_folder_for_train(output_folder="content/train_data/", **train_inputs)

In [None]:
from utils import apply_extraction_to_folder_for_test

# Build the test set: every dataset writes into the same test_data folder.
for dataset_paths in image_folders.values():
    test_inputs = {
        "image_folder": dataset_paths["processed"],
        "bounding_box_folder": dataset_paths["bounding_boxes_sorted"],
    }
    apply_extraction_to_folder_for_test(output_folder="content/test_data/", **test_inputs)

Pad and Resize Images

In [None]:
from utils import pad_and_resize_images

# Pad and resize the extracted images, training folder first, then test folder.
for extracted_dir in ("content/train_data", "content/test_data"):
    pad_and_resize_images(extracted_dir)

Download Manually Corrected Dataset

In [None]:
import gdown
import zipfile
import os

# Google Drive file ID of the ZIP file and where to store it locally
file_id = '1AmciGFL8YFFgqdvhL7e5T4GPkoGhMnUq'
zip_output_path = 'data/downloaded_folder.zip'

# Make sure the target folder exists before gdown writes into it
os.makedirs(os.path.dirname(zip_output_path), exist_ok=True)

# Download by file ID. A ".../file/d/<id>" sharing URL is not a direct
# download link, so the ID is passed explicitly instead.
downloaded_path = gdown.download(id=file_id, output=zip_output_path, quiet=False)
if downloaded_path is None:
    raise RuntimeError(f"Download of Google Drive file {file_id} failed")

# Unzip the downloaded file
with zipfile.ZipFile(zip_output_path, 'r') as zip_ref:
    zip_ref.extractall('unzipped_folder')

# Optional: remove the zip file after extraction
os.remove(zip_output_path)
