# OCR Preprocessing Pipeline for Renaissance Documents

This notebook randomly samples pages from a set of PDF documents, converts them to images, and applies a standard OCR preprocessing pipeline:
- Grayscale conversion
- Noise removal (median filter)
- Binarization (Otsu’s threshold)
- Deskewing

You'll end up with clean, black-and-white PNG files ready for OCR.



### 1. Install Dependencies

Run the following cell to install necessary packages (requires Poppler installed on your system):


In [None]:

%pip install pdf2image opencv-python numpy gdown ipywidgets matplotlib opencv-python-headless


### 2. Imports and Configuration

Adjust the paths below to point to your PDF directory and desired output folder.

In [2]:
import os
import random
from pdf2image import convert_from_path, pdfinfo_from_path
import cv2
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from PIL import Image

# Disable PIL bomb check
Image.MAX_IMAGE_PIXELS = None

# Configuration
pdf_dir = '/data/pdfs'
pdf_samples_dir = '/data/pdf_samples'
png_dir = '/data/pngs'
processed_dir = '/data/processed_data'
processed_resized_dir = '/data/processed_resized'
POPPLER_PATH = 'C:/Users/katej/OneDrive/Documents/Downloads/Release-24.08.0-0/poppler-24.08.0/Library/bin/'
# Update to your own path

os.makedirs(processed_dir, exist_ok=True)

# Set DPI for conversion
DPI = 300

#### Download DataSets from Google Drive Folder

In [4]:
folder_id = "1B_dM138pLrQGxRqUn1wcFO2MEotGWr7h"
gdown_command = f"gdown --folder {folder_id} -O {pdf_dir}"
print('Running:', gdown_command)
!{gdown_command}


Running: gdown --folder 1B_dM138pLrQGxRqUn1wcFO2MEotGWr7h -O /data/pdfs
^C


Retrieving folder contents
Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=1E7bnev2iGtg6AyGc1s8VNhLUUntdv3er
To: c:\data\pdfs\Ayala, Lorenzo de. Valladolid 1603.pdf

  0%|          | 0.00/6.42M [00:00<?, ?B/s]
 16%|█▋        | 1.05M/6.42M [00:00<00:00, 10.4MB/s]
 41%|████      | 2.62M/6.42M [00:00<00:00, 11.0MB/s]
 65%|██████▌   | 4.19M/6.42M [00:00<00:00, 11.5MB/s]
 90%|████████▉ | 5.77M/6.42M [00:00<00:00, 11.5MB/s]
100%|██████████| 6.42M/6.42M [00:00<00:00, 11.5MB/s]
Downloading...
From: https://drive.google.com/uc?id=1L-hYw-dH52EQm5Di2ZhgPWEuscL3cF7f
To: c:\data\pdfs\Burgos, Juan de. Valladolid 1500.pdf

  0%|          | 0.00/52.7M [00:00<?, ?B/s]
  2%|▏         | 1.05M/52.7M [00:00<00:05, 9.35MB/s]
  4%|▍         | 2.10M/52.7M [00:00<00:07, 6.95MB/s]
  6%|▌         | 3.15M/52.7M [00:00<00:07, 6.42MB/s]
  8%|▊         | 4.19M/52.7M [00:00<00:07, 6.19MB/s]
 10%|▉         | 5.

Processing file 1E7bnev2iGtg6AyGc1s8VNhLUUntdv3er Ayala, Lorenzo de. Valladolid 1603.pdf
Processing file 1L-hYw-dH52EQm5Di2ZhgPWEuscL3cF7f Burgos, Juan de. Valladolid 1500.pdf
Processing file 17uxQuvlHBHSAWtb0UCKmGKIhg0WZPXfm Cansoles, Fernando. Mallorca 1541.pdf
Processing file 1oqOE-13uE6w2u6u-r-_9K6VRCRrrRGst Cerda - Estados mujeres OG BW.pdf
Processing file 1YkN5ZWdr6veG1JPNRrvKfK0ijFyNlkST Guevara - Reloj de Principes OG.pdf


In [None]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from pdf2image import convert_from_path, pdfinfo_from_path
from ipywidgets import (
    widgets, interact, interactive_output, VBox, HBox,
    IntSlider, FloatSlider, Checkbox, Button, Output
)
import os

# --- Global state ---
POPPLER_PATH = 'C:/Users/katej/OneDrive/Documents/Downloads/Release-24.08.0-0/poppler-24.08.0/Library/bin/'
latest_data = {"image": None, "page": None, "pdf_path": None, "dpi": 300}

# --- PDF & processing ---
def load_pdf_page(pdf_path, page_num=1, dpi=300):
    img = convert_from_path(
        pdf_path,
        dpi=dpi,
        first_page=page_num,
        last_page=page_num,
        poppler_path=POPPLER_PATH
    )[0]
    return np.array(img)[:, :, ::-1]  # PIL RGB to OpenCV BGR

def apply_pipeline(
    img_bgr,
    min_area=50,
    clahe_clip=3.0,
    blur_kernel=55,
    denoise_h=10,
    do_denoise=True,
    do_bg_removal=True,
    do_contrast=True,
    do_binarize=True,
    do_close=True,
    do_filter=True
):
    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)

    if do_denoise:
        gray = cv2.fastNlMeansDenoising(gray, None, h=denoise_h, templateWindowSize=7, searchWindowSize=21)

    if do_bg_removal:
        blur_kernel = max(3, blur_kernel | 1)  # force odd
        bg = cv2.medianBlur(gray, blur_kernel)
        gray = cv2.divide(gray, bg, scale=255)

    if do_contrast:
        clahe = cv2.createCLAHE(clipLimit=clahe_clip, tileGridSize=(8, 8))
        enhanced = clahe.apply(gray)
        lo, hi = np.percentile(enhanced, [2, 98])
        gray = np.clip((enhanced - lo) * (255.0 / (hi - lo)), 0, 255).astype(np.uint8)

    if do_binarize:
        _, bw_inv = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
        bw = cv2.bitwise_not(bw_inv)
    else:
        bw = gray.copy()

    if do_close:
        bw = cv2.morphologyEx(bw, cv2.MORPH_CLOSE, np.ones((2, 2), np.uint8))

    if do_filter:
        num_labels, labels, stats, _ = cv2.connectedComponentsWithStats(bw, connectivity=8)
        cleaned = np.zeros_like(bw)
        for i in range(1, num_labels):
            if stats[i, cv2.CC_STAT_AREA] >= min_area:
                cleaned[labels == i] = 255
        bw = cleaned

    return bw

# --- Widgets ---
pdf_path_widget = widgets.Text(
    value='data\pdfs\Padilla - 2 Noble perfecto_Extract.pdf',
    description='PDF Path:',
    layout=widgets.Layout(width='600px')
)
page_widget = IntSlider(min=1, max=10, step=1, value=1, description='Page')

min_area_widget = IntSlider(min=10, max=500, step=10, value=50, description='Min Area')
clahe_clip_widget = FloatSlider(min=1.0, max=10.0, step=0.5, value=3.0, description='CLAHE Clip')
blur_kernel_widget = IntSlider(min=3, max=101, step=2, value=55, description='Blur Kernel')
denoise_h_widget = IntSlider(min=0, max=30, step=1, value=10, description='Denoise H')

toggles = {
    "do_denoise": Checkbox(value=True, description="Denoise"),
    "do_bg_removal": Checkbox(value=True, description="Background Removal"),
    "do_contrast": Checkbox(value=True, description="CLAHE + Stretch"),
    "do_binarize": Checkbox(value=True, description="Binarize"),
    "do_close": Checkbox(value=True, description="Close Gaps"),
    "do_filter": Checkbox(value=True, description="Filter Components"),
}

# --- Update function ---
def update(pdf_path, page, min_area, clahe_clip, blur_kernel, denoise_h,
           do_denoise, do_bg_removal, do_contrast,
           do_binarize, do_close, do_filter):

    try:
        img_bgr = load_pdf_page(pdf_path, page, dpi=latest_data["dpi"])
    except Exception as e:
        print(f"Error loading PDF page: {e}")
        return

    result = apply_pipeline(
        img_bgr,
        min_area=min_area,
        clahe_clip=clahe_clip,
        blur_kernel=blur_kernel,
        denoise_h=denoise_h,
        do_denoise=do_denoise,
        do_bg_removal=do_bg_removal,
        do_contrast=do_contrast,
        do_binarize=do_binarize,
        do_close=do_close,
        do_filter=do_filter
    )

    # Store for saving
    latest_data["image"] = result
    latest_data["page"] = page
    latest_data["pdf_path"] = pdf_path

    # Display
    plt.figure(figsize=(14, 6))
    plt.subplot(1, 2, 1)
    plt.imshow(cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB))
    plt.title("Original")
    plt.axis("off")

    plt.subplot(1, 2, 2)
    plt.imshow(result, cmap='gray')
    plt.title("Processed")
    plt.axis("off")

    plt.tight_layout()
    plt.show()

# --- Save logic ---
save_button = Button(description="Save current page", button_style='success')
save_all_checkbox = Checkbox(value=False, description="Save ALL pages")
save_output = Output()

def save_processed_image(img, out_path):
    Path(os.path.dirname(out_path)).mkdir(exist_ok=True)
    cv2.imwrite(out_path, img)

def on_save_clicked(b):
    with save_output:
        save_output.clear_output()
        if save_all_checkbox.value:
            print("Saving all pages...")
            try:
                info = pdfinfo_from_path(latest_data["pdf_path"], poppler_path=POPPLER_PATH)
                total_pages = info["Pages"]
                for pg in range(1, total_pages + 1):
                    img_bgr = load_pdf_page(latest_data["pdf_path"], page_num=pg, dpi=latest_data["dpi"])
                    result = apply_pipeline(
                        img_bgr,
                        min_area=min_area_widget.value,
                        clahe_clip=clahe_clip_widget.value,
                        blur_kernel=blur_kernel_widget.value,
                        denoise_h=denoise_h_widget.value,
                        do_denoise=toggles["do_denoise"].value,
                        do_bg_removal=toggles["do_bg_removal"].value,
                        do_contrast=toggles["do_contrast"].value,
                        do_binarize=toggles["do_binarize"].value,
                        do_close=toggles["do_close"].value,
                        do_filter=toggles["do_filter"].value,
                    )
                    out_path = os.path.join("output", f"{Path(latest_data['pdf_path']).stem}_p{pg:03d}.png")
                    save_processed_image(result, out_path)
                print(f"Saved {total_pages} pages to /output/")
            except Exception as e:
                print(f"Error saving pages: {e}")
        else:
            img = latest_data["image"]
            page_num = latest_data["page"]
            if img is None:
                print("No processed image to save yet.")
                return
            out_path = os.path.join("output", f"{Path(latest_data['pdf_path']).stem}_p{page_num:03d}.png")
            save_processed_image(img, out_path)
            print(f"Saved current page to {out_path}")

save_button.on_click(on_save_clicked)

# --- UI display ---
ui = VBox([
    pdf_path_widget,
    page_widget,
    min_area_widget,
    clahe_clip_widget,
    blur_kernel_widget,
    denoise_h_widget,
    VBox(list(toggles.values())),
    HBox([save_button, save_all_checkbox]),
    save_output
])

out = interactive_output(update, {
    "pdf_path": pdf_path_widget,
    "page": page_widget,
    "min_area": min_area_widget,
    "clahe_clip": clahe_clip_widget,
    "blur_kernel": blur_kernel_widget,
    "denoise_h": denoise_h_widget,
    **toggles
})

display(ui, out)


VBox(children=(Text(value='data\\pdfs\\Padilla - 2 Noble perfecto_Extract.pdf', description='PDF Path:', layou…

Output()

Example of Data Extraction. Ensure Nobleza Virtuosa and Noble Perfecto are preprocessed and saved in their respective folders.

In [3]:
# Define paths for image folders and outputs
image_folders = {
    "Nobleza Virtuosa": {
        "pdf": Path("data/pdfs/Padilla - Nobleza virtuosa_testExtract.pdf"),
        "processed": Path("data/processed_data_virtuosa"),
        "bounding_boxes": Path("output/craft/boxes_virtuosa"),
        "bounding_boxes_sorted": Path("output/craft/boxes_sorted_virtuosa"),
        "gt_docx": Path("data/transcriptions/Padilla - 1 Nobleza virtuosa_testTranscription.docx"),
        "text_file": Path("data/transcriptions/NoblezaVirtuosa_all_text.txt"),
        "split_text_output": Path("data/split_data_nobleza/textSplitted_NoblezaVirtuosa"),
    },

    "Noble Perfecto": {
        "pdf": Path("data/pdfs/Padilla - 2 Noble perfecto_Extract.pdf"),
        "processed": Path("data/processed_data_perfecto"),
        "output": Path("output/craft/boxes_perfecto"),
        "bounding_boxes": Path("output/craft/boxes_perfecto"),
        "bounding_boxes_sorted": Path("output/craft/boxes_sorted_perfecto"),
        "gt_docx": Path("data/transcriptions/Padilla - 2 Noble perfecto_Transcription.docx"),
        "text_file": Path("data/transcriptions/NoblePerfecto_all_text.txt"),
        "split_text_output": Path("data/split_data_perfecto/textSplitted_NoblePerfecto")
    }
}

Download the PDFs

In [4]:
PDF_DIR = Path("data/pdfs")
FOLDER_ID ='1wMSs07lgET5sjA-alaoZ3CcNF_cEpRE2'

gdown_command = f"gdown --folder https://drive.google.com/drive/folders/{FOLDER_ID} -O {PDF_DIR}"
print('Running:', gdown_command)
!{gdown_command}

Running: gdown --folder https://drive.google.com/drive/folders/1wMSs07lgET5sjA-alaoZ3CcNF_cEpRE2 -O data\pdfs
Processing file 133Qubpcdwqa-dCZ0luuT31kT89FxOkHP Padilla - 2 Noble perfecto_Extract.pdf
Processing file 1BtDs1-3xZNi6aB-V3dXmHSiZIVCaeUuu Padilla - Nobleza virtuosa_testExtract.pdf


Retrieving folder contents
Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=133Qubpcdwqa-dCZ0luuT31kT89FxOkHP
To: c:\Users\katej\OneDrive\Documents\GitHub\RenAIssance_fork\RenAIssance_CRNN_OCR_Kate_OReilly_orig\data\pdfs\Padilla - 2 Noble perfecto_Extract.pdf

  0%|          | 0.00/3.95M [00:00<?, ?B/s]
 40%|███▉      | 1.57M/3.95M [00:00<00:00, 11.2MB/s]
 80%|███████▉  | 3.15M/3.95M [00:00<00:00, 11.6MB/s]
100%|██████████| 3.95M/3.95M [00:00<00:00, 11.8MB/s]
Downloading...
From: https://drive.google.com/uc?id=1BtDs1-3xZNi6aB-V3dXmHSiZIVCaeUuu
To: c:\Users\katej\OneDrive\Documents\GitHub\RenAIssance_fork\RenAIssance_CRNN_OCR_Kate_OReilly_orig\data\pdfs\Padilla - Nobleza virtuosa_testExtract.pdf

  0%|          | 0.00/2.58M [00:00<?, ?B/s]
 41%|████      | 1.05M/2.58M [00:00<00:00, 8.73MB/s]
100%|██████████| 2.58M/2.58M [00:00<00:00, 10.5MB/s]
100%|██████████| 2.58M/2.58M [00:00<0

Once they are preprocessed and saved. split them into individual pages.

In [5]:
from utils import split_and_save_image
import os

processed_dir = image_folders["Nobleza Virtuosa"]["processed"]
image_files = sorted([f for f in os.listdir(processed_dir) if f.lower().endswith('.png')])

last_image_number = 1

for fname in image_files:
    image_path = os.path.join(processed_dir, fname)
    last_image_number = split_and_save_image(
        image_path=image_path,
        output_folder=processed_dir,
        last_image_number=last_image_number
    )

processed_dir = image_folders["Noble Perfecto"]["processed"]
image_files = sorted([f for f in os.listdir(processed_dir) if f.lower().endswith('.png')])

last_image_number = 1

for fname in image_files:
    image_path = os.path.join(processed_dir, fname)
    last_image_number = split_and_save_image(
        image_path=image_path,
        output_folder=processed_dir,
        last_image_number=last_image_number
    )

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'data\\processed_data_virtuosa'

Text Detection with CRAFT (https://github.com/clovaai/CRAFT-pytorch)

In [None]:
!git clone https://github.com/kaoreill/CRAFT_Model.git

In [None]:
#It generally takes about ~3-4 mins
!python CRAFT_Model/CRAFT/BoundBoxFunc/test.py --result_folder=image_folders["Noble Perfecto"]["bounding_boxes"]  --test_folder=image_folders["Noble Perfecto"]["processed"]  --trained_model="CRAFT_Model/CRAFT/BoundBoxFunc/weights/craft_mlt_25k.pth"
#It generally takes about ~3-4 mins
!python CRAFT_Model/CRAFT/BoundBoxFunc/test.py --result_folder=image_folders["Nobleza Virtuosa"]["bounding_boxes"]  --test_folder=image_folders["Nobleza Virtuosa"]["processed"]  --trained_model="CRAFT_Model/CRAFT/BoundBoxFunc/weights/craft_mlt_25k.pth"

In [1]:
!python CRAFT_Model/CRAFT/BoundBoxFunc/test.py --result_folder="output/craft/boxes"  --test_folder="data/processed_pages"  --trained_model="CRAFT_Model/CRAFT/BoundBoxFunc/weights/craft_mlt_25k.pth"


Loading weights from checkpoint (CRAFT_Model/CRAFT/BoundBoxFunc/weights/craft_mlt_25k.pth)
Test image 1/32: data/processed_pages\page_001.png
Test image 2/32: data/processed_pages\page_002.png
Test image 3/32: data/processed_pages\page_003.png
Test image 4/32: data/processed_pages\page_004.png
Test image 5/32: data/processed_pages\page_005.png
Test image 6/32: data/processed_pages\page_006.png
Test image 7/32: data/processed_pages\page_007.png
Test image 8/32: data/processed_pages\page_008.png
Test image 9/32: data/processed_pages\page_009.png
Test image 10/32: data/processed_pages\page_010.png
Test image 11/32: data/processed_pages\page_011.png
Test image 12/32: data/processed_pages\page_012.png
Test image 13/32: data/processed_pages\page_013.png
Test image 14/32: data/processed_pages\page_014.png
Test image 15/32: data/processed_pages\page_015.png
Test image 16/32: data/processed_pages\page_016.png
Test image 17/32: data/processed_pages\page_017.png


Traceback (most recent call last):
  File "c:\Users\katej\OneDrive\Documents\GitHub\RenAIssance_fork\RenAIssance_CRNN_OCR_Kate_OReilly_orig\CRAFT_Model\CRAFT\BoundBoxFunc\test.py", line 146, in <module>
    bboxes, polys, score_text = test_net(net, image, args.text_threshold, args.link_threshold, args.low_text, args.cuda, args.poly, refine_net)
  File "c:\Users\katej\OneDrive\Documents\GitHub\RenAIssance_fork\RenAIssance_CRNN_OCR_Kate_OReilly_orig\CRAFT_Model\CRAFT\BoundBoxFunc\test.py", line 84, in test_net
    y, feature = net(x)
  File "c:\Users\katej\AppData\Local\Programs\Python\Python39\lib\site-packages\torch\nn\modules\module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "c:\Users\katej\OneDrive\Documents\GitHub\RenAIssance_fork\RenAIssance_CRNN_OCR_Kate_OReilly_orig\CRAFT_Model\CRAFT\BoundBoxFunc\craft.py", line 60, in forward
    sources = self.basenet(x)
  File "c:\Users\katej\AppData\Local\Programs\Python\Python39\lib\site-packages\torch\nn\

Sort Bounding Boxes

In [None]:
from utils import sort_bounding_boxes

sort_bounding_boxes(input_dir=image_folders["Nobleza Virtuosa"]["bounding_boxes"], output_dir=image_folders["Nobleza Virtuosa"]["bounding_boxes_sorted"],)
sort_bounding_boxes(input_dir=image_folders["Noble Perfecto"]["bounding_boxes"], output_dir=image_folders["Noble Perfecto"]["bounding_boxes_sorted"],)

In [15]:
from utils import sort_bounding_boxes

sort_bounding_boxes(input_dir="output/craft/boxes", output_dir="output/craft/boxes_sorted")

Processing bounding boxes in: output\craft\boxes
Sorted boxes written: res_BorbÃ³n_ROTUNDA_p001_sorted.txt
Sorted boxes written: res_BorbÃ³n_ROTUNDA_p002_sorted.txt
Sorted boxes written: res_BorbÃ³n_ROTUNDA_p003_sorted.txt
Sorted boxes written: res_BorbÃ³n_ROTUNDA_p004_sorted.txt
Sorted boxes written: res_BorbÃ³n_ROTUNDA_p005_sorted.txt
Sorted boxes written: res_BorbÃ³n_ROTUNDA_p006_sorted.txt
Sorted boxes written: res_BorbÃ³n_ROTUNDA_p007_sorted.txt
Sorted boxes written: res_BorbÃ³n_ROTUNDA_p008_sorted.txt
Sorted boxes written: res_BorbÃ³n_ROTUNDA_p009_sorted.txt
Sorted boxes written: res_BorbÃ³n_ROTUNDA_p010_sorted.txt
Sorted boxes written: res_BorbÃ³n_ROTUNDA_p011_sorted.txt
Sorted boxes written: res_BorbÃ³n_ROTUNDA_p012_sorted.txt
Sorted boxes written: res_BorbÃ³n_ROTUNDA_p013_sorted.txt
Sorted boxes written: res_BorbÃ³n_ROTUNDA_p014_sorted.txt
Sorted boxes written: res_BorbÃ³n_ROTUNDA_p015_sorted.txt
Sorted boxes written: res_BorbÃ³n_ROTUNDA_p016_sorted.txt
Sorted boxes written: r

Extract Transcriptions

In [2]:
from utils import extract_ground_truth_for_dataset

extract_ground_truth_for_dataset(
    docx_file="data/transcriptions/Feijoo_ROMAN.docx",
    output_path="data/transcriptions/Feijoo_ROMAN_all_text.txt",
)

Ground truth text saved to data\transcriptions\Feijoo_ROMAN_all_text.txt


NameError: name 'output_folder' is not defined

In [9]:
from utils import extract_ground_truth_for_dataset

extract_ground_truth_for_dataset(
    docx_file="data\transcriptions\Padilla_Nobleza_virtuosa_testTranscription.docx",
    output_path=image_folders["Nobleza Virtuosa"]["text_file"]
)
extract_ground_truth_for_dataset(
    docx_file=image_folders["Noble Perfecto"]["gt_docx"],
    output_path=image_folders["Noble Perfecto"]["text_file"]
)

.docx not found: data	ranscriptions\Padilla_Nobleza_virtuosa_testTranscription.docx
.docx not found: data\transcriptions\Padilla - 2 Noble perfecto_Transcription.docx


In [10]:
from utils import extract_ground_truth_for_dataset

extract_ground_truth_for_dataset(
    docx_file="data/transcriptions/Padilla_Nobleza_virtuosa_testTranscription.docx",
    output_path="data/transcriptions/NoblezaVirtuosa_all_text.txt"
)

Ground truth text saved to data\transcriptions\NoblezaVirtuosa_all_text.txt


NameError: name 'output_folder' is not defined

Split Transcription By Page

In [None]:
from utils import process_textfiles
import os

for name, paths in image_folders.items():
    textfile = paths["text_file"]
    sorted_BoundBox_folder = paths["bounding_boxes_sorted"]
    output_folder = paths["split_text_output"]
    os.makedirs(output_folder, exist_ok=True)
    print(f"Splitting text for {name}...")
    process_textfiles(textfile, sorted_BoundBox_folder, output_folder)
    print(f"Text splitting complete for {name}! Output: {output_folder}")


In [26]:
from utils import process_textfiles
import os

textfile = "data/data_3/borbon.txt"
sorted_BoundBox_folder = "output/craft/boxes_sorted"
output_folder = "data/data_3/split_text_output"
os.makedirs(output_folder, exist_ok=True)
print(f"Splitting text for...")
process_textfiles(textfile, sorted_BoundBox_folder, output_folder)
print(f"Text splitting complete for! Output: {output_folder}")


Splitting text for...
Found 0 text pages and 0 bbox files.
Text splitting complete. Output written to: data\data_3\split_text_output
Text splitting complete for! Output: data/data_3/split_text_output


Extract and Save Labelled Boxes

In [None]:
from utils import apply_extraction_to_folder_for_train

for name, paths in image_folders.items():
    apply_extraction_to_folder_for_train(
        image_folder=paths["processed"],
        bounding_box_folder=paths["bounding_boxes_sorted"],
        text_folder=paths["split_text_output"],
        output_folder="content/train_data/",
    )

In [None]:
from utils import apply_extraction_to_folder_for_test

for name, paths in image_folders.items():
    apply_extraction_to_folder_for_test(
        image_folder=paths["processed"],
        bounding_box_folder=paths["bounding_boxes_sorted"],
        output_folder="content/test_data/",
    )

Pad and Resize Images

In [None]:
from utils import pad_and_resize_images

pad_and_resize_images("content/train_data")
pad_and_resize_images("content/test_data")

In [2]:
from utils import pad_and_resize_images
pad_and_resize_images("data/data_3/")

Processed: data/data_3/&c..png
Processed: data/data_3/1.png
Processed: data/data_3/10°.png
Processed: data/data_3/10Â°.png
Processed: data/data_3/12.png
Processed: data/data_3/12Â°.png
Processed: data/data_3/13°.png
Processed: data/data_3/13Â°.png
Processed: data/data_3/14.png
Processed: data/data_3/14Â°.png
Processed: data/data_3/15°.png
Processed: data/data_3/15Â°.png
Processed: data/data_3/1786. (2).png
Processed: data/data_3/1786..png
Processed: data/data_3/17°.png
Processed: data/data_3/17Â°.png
Processed: data/data_3/18°.png
Processed: data/data_3/18Â°.png
Processed: data/data_3/19°.png
Processed: data/data_3/19Â°.png
Processed: data/data_3/1as.png
Processed: data/data_3/1Â°.png
Processed: data/data_3/2.png
Processed: data/data_3/20°.png
Processed: data/data_3/20Â°.png
Processed: data/data_3/21°.png
Processed: data/data_3/21Â°.png
Processed: data/data_3/22°.png
Processed: data/data_3/22Â°.png
Processed: data/data_3/23°.png
Processed: data/data_3/23Â°.png
Processed: data/data_3/24

Download Manually Corrected Dataset

In [None]:
import gdown
import zipfile
import os

# Google Drive file ID of the ZIP file
zip_output_path = 'data/downloaded_folder.zip'

# Construct download URL
gdown.download(f'https://drive.google.com/file/d/1AmciGFL8YFFgqdvhL7e5T4GPkoGhMnUq', zip_output_path, quiet=False)

# Unzip the downloaded file
with zipfile.ZipFile(zip_output_path, 'r') as zip_ref:
    zip_ref.extractall('unzipped_folder')

# Optional: remove the zip file after extraction
os.remove(zip_output_path)


In [None]:
import cv2
import numpy as np
import os
from glob import glob

import cv2, os
from glob import glob

import numpy as np
import cv2

def imread_unicode(path, flags=cv2.IMREAD_COLOR):
    # read raw bytes, then decode in-memory
    data = open(path, "rb").read()
    arr  = np.frombuffer(data, dtype=np.uint8)
    return cv2.imdecode(arr, flags)



In [10]:
from utils import pad_and_resize_images

pad_and_resize_images("data/data_9")

Processed: data/data_9\A.png
Processed: data/data_9\Análisis.png
Processed: data/data_9\Anárquico.png
Processed: data/data_9\Aquí.png
Processed: data/data_9\Auténtico.png
Processed: data/data_9\Bambú.png
Processed: data/data_9\Bilingüísimo.png
Processed: data/data_9\Botón.png
Processed: data/data_9\Brújula.png
Processed: data/data_9\Bélgica.png
Processed: data/data_9\Bénito.png
Processed: data/data_9\Camióncillo.png
Processed: data/data_9\Canción.png
Processed: data/data_9\Clásico (2).png
Processed: data/data_9\Clásico.png
Processed: data/data_9\Colón.png
Processed: data/data_9\Corazón.png
Processed: data/data_9\Creéis.png
Processed: data/data_9\Cáculo.png
Processed: data/data_9\Cáliz (2).png
Processed: data/data_9\Cáliz.png
Processed: data/data_9\Cámara.png
Processed: data/data_9\Cántaro.png
Processed: data/data_9\Cántico.png
Processed: data/data_9\Cárcel.png
Processed: data/data_9\Céfiro.png
Processed: data/data_9\Césped.png
Processed: data/data_9\Códice.png
Processed: data/data_9\Cú

In [None]:
from pathlib import Path
import re
import cv2
from collections import defaultdict



In [111]:
# ─── RUN: Full-Folder Word Extraction ───
from pathlib import Path
# import your function (adjust the import path if needed)

# 1) Define folders
boxes_folder          = "output/craft/boxes_sorted"
transcriptions_folder = "data/transcriptions"
images_folder         = "data/pages"
output_folder         = "data/training_words/test"

# 2) Ensure output parent exists
Path(output_folder).mkdir(parents=True, exist_ok=True)

# 3) Run the alignment + extraction
align_and_extract_words(
    boxes_folder=boxes_folder,
    transcriptions_folder=transcriptions_folder,
    images_folder=images_folder,
    output_folder=output_folder
)


✅ Saved 7127 word crops to data\training_words\test\cropped
✅ Labels saved to data\training_words\test\labels.csv


In [3]:
import os

folder = "data/data_4"
for fname in os.listdir(folder):
    new_name = fname.replace(" - Copy.png", ".png")
    os.rename(os.path.join(folder, fname), os.path.join(folder, new_name))
    print(f"Renamed: {fname} → {new_name}")


Renamed: a (28).png → a (28).png
Renamed: a (34).png → a (34).png
Renamed: a (37).png → a (37).png
Renamed: a (38).png → a (38).png
Renamed: a (40).png → a (40).png
Renamed: a (41).png → a (41).png
Renamed: a (46).png → a (46).png
Renamed: a (47).png → a (47).png
Renamed: a (48).png → a (48).png
Renamed: a (7).png → a (7).png
Renamed: A (8).png → A (8).png
Renamed: acusado.png → acusado.png
Renamed: ageno.png → ageno.png
Renamed: al rostro,diximos.jpg → al rostro,diximos.jpg
Renamed: alcanzase delas.png → alcanzase delas.png
Renamed: algunos.png → algunos.png
Renamed: allanar.png → allanar.png
Renamed: alli.png → alli.png
Renamed: Alléde.png → Alléde.png
Renamed: alma.png → alma.png
Renamed: alos (2).png → alos (2).png
Renamed: altas.png → altas.png
Renamed: amistad (2).png → amistad (2).png
Renamed: amistad.png → amistad.png
Renamed: amores (2).png → amores (2).png
Renamed: animo.png → animo.png
Renamed: antonio.png → antonio.png
Renamed: apresuraua.png → apresuraua.png
Renamed: aqlla

FileExistsError: [WinError 183] Cannot create a file when that file already exists: 'data/data_4\\que (91) - Copy.png' -> 'data/data_4\\que (91).png'

In [56]:
!pip install docx2pdf PyPDF2


Collecting docx2pdf
  Downloading docx2pdf-0.1.8-py3-none-any.whl.metadata (3.3 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading docx2pdf-0.1.8-py3-none-any.whl (6.7 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2, docx2pdf

   ---------------------------------------- 0/2 [PyPDF2]
   ---------------------------------------- 0/2 [PyPDF2]
   ---------------------------------------- 0/2 [PyPDF2]
   ---------------------------------------- 0/2 [PyPDF2]
   ---------------------------------------- 0/2 [PyPDF2]
   ---------------------------------------- 0/2 [PyPDF2]
   -------------------- ------------------- 1/2 [docx2pdf]
   ---------------------------------------- 2/2 [docx2pdf]

Successfully installed PyPDF2-3.0.1 docx2pdf-0.1.8



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [58]:
import os
from docx2pdf import convert
from PyPDF2 import PdfReader

def docx_to_txt_pages(docx_path, output_dir):
    """
    Converts a .docx to PDF, then extracts each PDF page's text
    and writes it out as page_001.txt, page_002.txt, …
    """
    os.makedirs(output_dir, exist_ok=True)

    # 1) Convert .docx → .pdf (creates same-name .pdf next to .docx)
    pdf_path = os.path.splitext(docx_path)[0] + ".pdf"
    convert(docx_path, pdf_path)

    # 2) Read PDF and dump one .txt per page
    reader = PdfReader(pdf_path)
    for i, page in enumerate(reader.pages, start=1):
        text = page.extract_text() or ""
        txt_name = f"page_{i:03d}.txt"
        with open(os.path.join(output_dir, txt_name), "w", encoding="utf-8") as f:
            f.write(text)

    print(f"Saved {len(reader.pages)} pages → {output_dir}")

if __name__ == "__main__":
    docx_to_txt_pages("data/transcriptions/Borbón_ROTUNDA.docx", "data")


  0%|          | 0/1 [00:00<?, ?it/s]

Saved 41 pages → data


In [1]:
import os
import shutil

def copy_as_foldername(src_root, dst_folder):
    """
    Copy all files from subfolders of src_root into dst_folder.
    Each file is renamed to the folder name it came from.
    Duplicates are numbered as: name.png, name (2).png, name (3).png, ...
    """
    os.makedirs(dst_folder, exist_ok=True)

    for root, dirs, files in os.walk(src_root):
        if root == src_root:
            continue  # skip the root folder itself

        folder_name = os.path.basename(root)
        counter = 1
        for file in files:
            ext = os.path.splitext(file)[1]  # keep original extension
            if counter == 1:
                new_name = f"{folder_name}{ext}"
            else:
                new_name = f"{folder_name} ({counter}){ext}"

            src_path = os.path.join(root, file)
            dst_path = os.path.join(dst_folder, new_name)

            # Ensure no accidental overwrite
            while os.path.exists(dst_path):
                counter += 1
                new_name = f"{folder_name} ({counter}){ext}"
                dst_path = os.path.join(dst_folder, new_name)

            shutil.copy2(src_path, dst_path)
            counter += 1

    print(f"All files copied from '{src_root}' to '{dst_folder}' using folder names only.")

# Example usage:
copy_as_foldername("data/data_6", "data/data_9")


All files copied from 'data/data_6' to 'data/data_9' using folder names only.


In [9]:
# --- Extract word crops from boxes (no OCR, no labels) ---
# Dependencies: pip install opencv-python numpy pillow (Pillow only if you want PNG compression tweaks)
import re, os
from pathlib import Path
import cv2
import numpy as np

# ==== CONFIG ====
images_dir = Path("data/processed_pages")   # <-- change this
boxes_dir  = Path("data/bounding_boxes")   # <-- change this
out_dir    = Path("data/data_6")  # <-- change this

# Save crops to: out/<page_stem>/0000.png, 0001.png, ...
# Optionally also create a contact sheet per page:
make_contact_sheet = True
contact_sheet_max_width = 2000   # pixels
contact_sheet_gap = 6            # pixels between crops
# Resize each crop so height≈target (helps uniform contact sheets). Set None to keep original sizes.
contact_crop_target_h = 48

# ==== UTILS ====
def parse_numbers(line: str):
    return [float(n) for n in re.findall(r"-?\d+(?:\.\d+)?", line)]

def read_boxes_file(txt_path: Path):
    with open(txt_path, "r", encoding="utf-8", errors="ignore") as f:
        lines = [ln.strip() for ln in f if ln.strip() and not ln.strip().startswith("#")]
    ref_size = None
    boxes = []
    for ln in lines:
        if "img_size" in ln:
            nums = parse_numbers(ln)
            if len(nums) >= 2:
                ref_size = (int(nums[0]), int(nums[1]))  # (W,H)
            continue
        nums = parse_numbers(ln)
        if len(nums) >= 8:
            boxes.append(nums[:8])  # polygon (4 points)
        elif len(nums) >= 4:
            boxes.append(nums[:4])  # rectangle
    return boxes, ref_size

def scale_points(vals, ref_size, img_w, img_h):
    if not ref_size: return vals
    ref_w, ref_h = ref_size
    if ref_w <= 0 or ref_h <= 0: return vals
    sx, sy = img_w / float(ref_w), img_h / float(ref_h)
    return [v * (sx if i % 2 == 0 else sy) for i, v in enumerate(vals)]

def order_polygon(pts: np.ndarray) -> np.ndarray:
    # Order 4 points as tl, tr, br, bl
    rect = np.zeros((4, 2), dtype="float32")
    s = pts.sum(axis=1)
    rect[0] = pts[np.argmin(s)]  # tl
    rect[2] = pts[np.argmax(s)]  # br
    diff = np.diff(pts, axis=1).reshape(-1)
    rect[1] = pts[np.argmin(diff)]  # tr
    rect[3] = pts[np.argmax(diff)]  # bl
    return rect

def warp_polygon_roi(img: np.ndarray, poly4: np.ndarray) -> np.ndarray:
    rect = order_polygon(poly4.astype("float32"))
    (tl, tr, br, bl) = rect

    # Destination size from edge lengths
    widthA = np.linalg.norm(br - bl)
    widthB = np.linalg.norm(tr - tl)
    maxW = max(1, int(round(max(widthA, widthB))))

    heightA = np.linalg.norm(tr - br)
    heightB = np.linalg.norm(tl - bl)
    maxH = max(1, int(round(max(heightA, heightB))))

    dst = np.array([[0,0],[maxW-1,0],[maxW-1,maxH-1],[0,maxH-1]], dtype="float32")
    M = cv2.getPerspectiveTransform(rect, dst)
    return cv2.warpPerspective(img, M, (maxW, maxH), flags=cv2.INTER_CUBIC)

def crop_xyxy(img: np.ndarray, x1: float, y1: float, x2: float, y2: float) -> np.ndarray:
    h, w = img.shape[:2]
    xa, ya = int(max(0, min(x1, x2))), int(max(0, min(y1, y2)))
    xb, yb = int(min(w, max(x1, x2))), int(min(h, max(y1, y2)))
    if xb <= xa or yb <= ya:
        return np.zeros((1,1), dtype=img.dtype)
    return img[ya:yb, xa:xb]

def infer_xywh_vs_xyxy(vals):
    x1, y1, x2, y2 = vals[:4]
    if x2 >= x1 and y2 >= y1:
        return x1, y1, x2, y2, False  # x1,y1,x2,y2
    return x1, y1, x1 + max(0, x2), y1 + max(0, y2), True  # x,y,w,h

def resize_keep_aspect(img, target_h=None):
    if target_h is None: return img
    h, w = img.shape[:2]
    if h == 0 or w == 0: return img
    scale = target_h / float(h)
    new_w = max(1, int(round(w * scale)))
    return cv2.resize(img, (new_w, target_h), interpolation=cv2.INTER_CUBIC)

def make_contact_sheet_from_crops(crops, max_width=1800, gap=6, bg_color=(255,255,255)):
    # Place variable-width crops row-by-row into a single image
    if not crops:
        return None
    rows = []
    current_row, row_w, row_h = [], 0, 0
    for c in crops:
        h, w = c.shape[:2]
        if row_w + (gap if current_row else 0) + w > max_width and current_row:
            rows.append((current_row, row_w, row_h))
            current_row, row_w, row_h = [], 0, 0
        if current_row:
            row_w += gap
        current_row.append(c)
        row_w += w
        row_h = max(row_h, h)
    if current_row:
        rows.append((current_row, row_w, row_h))

    total_h = sum(rh for _, _, rh in rows) + gap * (len(rows) + 1)
    total_w = max((rw for _, rw, _ in rows), default=0) + gap * 2
    sheet = np.full((total_h, total_w, 3), bg_color, dtype=np.uint8)

    y = gap
    for row_crops, row_w, row_h in rows:
        x = gap
        for c in row_crops:
            h, w = c.shape[:2]
            sheet[y:y+h, x:x+w] = c if c.ndim==3 else cv2.cvtColor(c, cv2.COLOR_GRAY2BGR)
            x += w + gap
        y += row_h + gap
    return sheet

# ==== MAIN ====
out_dir.mkdir(parents=True, exist_ok=True)
image_exts = {".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp"}

for img_path in sorted([p for p in images_dir.iterdir() if p.suffix.lower() in image_exts]):
    page = cv2.imread(str(img_path))
    if page is None:
        print(f"[WARN] Cannot read {img_path}")
        continue

    txt_path = boxes_dir / f"res_{img_path.stem}.txt"
    if not txt_path.exists():
        print(f"[WARN] Missing boxes for {img_path.name} -> {txt_path.name}")
        continue

    boxes, ref_size = read_boxes_file(txt_path)
    H, W = page.shape[:2]
    crops = []
    save_dir = out_dir / img_path.stem
    save_dir.mkdir(parents=True, exist_ok=True)

    for i, vals in enumerate(boxes):
        vals = scale_points(vals, ref_size, W, H)
        if len(vals) >= 8:
            pts = np.array(vals[:8], dtype=np.float32).reshape(4, 2)
            roi = warp_polygon_roi(page, pts)
        else:
            x1, y1, x2, y2, _ = infer_xywh_vs_xyxy(vals[:4])
            roi = crop_xyxy(page, x1, y1, x2, y2)

        # Skip empty/invalid
        if roi.size == 0 or roi.shape[0] < 1 or roi.shape[1] < 1:
            continue

        # Optional normalize: force 3-channel for saving/tiling
        if roi.ndim == 2:
            roi = cv2.cvtColor(roi, cv2.COLOR_GRAY2BGR)

        # Save crop (no labels, simple index)
        out_crop = save_dir / f"{i:04d}.png"
        cv2.imwrite(str(out_crop), roi)

        # For contact sheet (standardize height for nicer packing)
        crops.append(resize_keep_aspect(roi, contact_crop_target_h))

    print(f"[OK] Saved {len(crops)} crops to {save_dir}")

    if make_contact_sheet and crops:
        sheet = make_contact_sheet_from_crops(crops, max_width=contact_sheet_max_width, gap=contact_sheet_gap)
        if sheet is not None:
            cv2.imwrite(str(out_dir / f"{img_path.stem}_contact_sheet.png"), sheet)
            print(f"[OK] Contact sheet: {img_path.stem}_contact_sheet.png")


[OK] Saved 33 crops to data\data_6\page_001
[OK] Contact sheet: page_001_contact_sheet.png
[OK] Saved 34 crops to data\data_6\page_002
[OK] Contact sheet: page_002_contact_sheet.png
[OK] Saved 33 crops to data\data_6\page_003
[OK] Contact sheet: page_003_contact_sheet.png
[OK] Saved 34 crops to data\data_6\page_004
[OK] Contact sheet: page_004_contact_sheet.png
[OK] Saved 4 crops to data\data_6\page_005
[OK] Contact sheet: page_005_contact_sheet.png
