dataset/generate_coco.py: 157 additions, 0 deletions
@@ -0,0 +1,157 @@
import json
import sys
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, as_completed

import logging

import formalpdf


logging.getLogger("pypdfium2").setLevel(logging.ERROR)

def process_pdf(pdf_path, output_dir):
"""Process all pages of a PDF and generate JSON annotation files"""
json_dir = output_dir / "json"
images_dir = output_dir / "images"
pdf_name = pdf_path.stem

# Check if first page JSON exists - if so, skip entire PDF
first_page_json = json_dir / f"{pdf_name}-0.json"

if first_page_json.exists():
return f"Skipped {pdf_name} (already processed)"

try:
document = formalpdf.open(str(pdf_path))
num_pages = len(document)
total_widgets = 0

for page_idx in range(num_pages):
page = document[page_idx]
pdfium_page = document.document[page_idx]

width_pt, height_pt = pdfium_page.get_size()
target_px = 1680
# Scale based on the smaller dimension
scale = target_px / min(width_pt, height_pt)

image = pdfium_page.render(scale=scale, may_draw_forms=False).to_pil()
widgets = page.widgets()

image_filename = f"{pdf_name}-{page_idx}.png"

# Create image info
image_info = {
"file_name": image_filename,
"width": image.width,
"height": image.height,
}

# Save image
image.save(images_dir / image_filename)

# Process annotations
annotations = []
for widget in widgets:
# convert bounding box in pt to pixels
top = widget.rect.top * scale
left = widget.rect.left * scale
bottom = widget.rect.bottom * scale
right = widget.rect.right * scale

                # flip the y-axis: PDF coordinates have their origin at the
                # bottom-left, image coordinates at the top-left
                y0 = image.height - top
                y1 = image.height - bottom

                # Map widget field types to category IDs; unmapped types fall
                # back to 3 and are skipped below
                categories = {
                    "Text": 0,
                    "ComboBox": 0,
                    "CheckBox": 1,
                    "RadioButton": 1,
                    "Signature": 2,
                    "PushButton": 3,
                    "ListBox": 3,
                    "Unknown": 3,
                }

                category_id = categories.get(widget.field_type_string, 3)

if category_id > 2:
continue

bbox_width = right - left
bbox_height = y1 - y0

annotations.append({
"category_id": category_id,
"bbox": [left, y0, bbox_width, bbox_height],
"area": bbox_width * bbox_height,
"iscrowd": 0,
"segmentation": [],
})

# Create per-page JSON
page_data = {
"image": image_info,
"annotations": annotations,
}

# Save JSON
json_path = json_dir / f"{pdf_name}-{page_idx}.json"

with json_path.open("w") as fp:
json.dump(page_data, fp, indent=2)

total_widgets += len(widgets)

document.document.close()
return f"Processed {pdf_name}: {num_pages} pages, {total_widgets} widgets"

except Exception as e:
return f"Error processing {pdf_name}: {str(e)}"


def main():
pdfs_dir = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("pdfs")
output_dir = Path(sys.argv[2]) if len(sys.argv) > 2 else Path("coco")
json_dir = output_dir / "json"
images_dir = output_dir / "images"

# Create directories
output_dir.mkdir(exist_ok=True)
json_dir.mkdir(exist_ok=True)
images_dir.mkdir(exist_ok=True)

# Find all PDF files
pdf_files = list(pdfs_dir.rglob("*.pdf"))
total_pdfs = len(pdf_files)
print(f"Found {total_pdfs} PDF files")

# Check which PDFs are already processed
skipped_count = 0
tasks = []

for pdf_path in pdf_files:
pdf_name = pdf_path.stem
first_page_json = json_dir / f"{pdf_name}-0.json"

if first_page_json.exists():
skipped_count += 1
else:
tasks.append(pdf_path)

print(f"Already processed (skipped): {skipped_count} PDFs")
print(f"New PDFs to process: {len(tasks)}")

if tasks:
# Process PDFs in parallel
with ProcessPoolExecutor() as executor:
futures = {executor.submit(process_pdf, pdf_path, output_dir): pdf_path for pdf_path in tasks}

completed = 0
for future in as_completed(futures):
completed += 1
result = future.result()
print(f"[{completed}/{len(tasks)}] {result}")


if __name__ == "__main__":
main()
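Reviewer note: a minimal sketch of the coordinate convention the script above produces. Widget rects come back bottom-up in PDF points; the script flips them to top-down pixel coordinates and stores a COCO-style [x, y, width, height] box. The page size and widget rect below are illustrative values, not taken from a real PDF, and the rounded image height only approximates what the renderer actually returns.

# Illustrative walk-through of the pt -> px conversion used in process_pdf()
# (all numbers are made up; the renderer may round dimensions slightly differently).
width_pt, height_pt = 612.0, 792.0        # US Letter page in points
scale = 1680 / min(width_pt, height_pt)   # same scaling rule as in the script
image_height = round(height_pt * scale)   # approximate rendered height in px

# Hypothetical widget rect in PDF coordinates (origin at the bottom-left corner).
left_pt, bottom_pt, right_pt, top_pt = 100.0, 600.0, 300.0, 620.0

x = left_pt * scale
y0 = image_height - top_pt * scale        # top edge measured from the image top
y1 = image_height - bottom_pt * scale     # bottom edge measured from the image top
bbox = [x, y0, (right_pt - left_pt) * scale, y1 - y0]
print(bbox)                               # COCO-style [x, y, width, height]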
dataset/merge_coco.py: 115 additions, 0 deletions
@@ -0,0 +1,115 @@
import json
import os
import sys
from pathlib import Path


def merge_coco_annotations():
"""Merge individual JSON files into a single COCO format annotations file"""
coco_dir = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("coco")
json_dir = coco_dir / "json"
output_file = coco_dir / "annotations.json"

# COCO format structure
coco_data = {
"info": {
"year": 2025,
"version": "1.0",
"description": "Form field detection dataset",
"contributor": "",
"url": "",
"date_created": "2025-10-16"
},
"licenses": [
{
"id": 1,
"name": "Unknown",
"url": ""
}
],
"images": [],
"annotations": [],
"categories": [
{"id": 0, "name": "Text", "supercategory": "none"},
{"id": 1, "name": "CheckBox", "supercategory": "none"}
]
}

# Get all JSON files sorted by name
json_files = sorted(json_dir.glob("*.json"))

image_id = 0
annotation_id = 0

for json_file in json_files:
with json_file.open("r") as fp:
page_data = json.load(fp)

# Add image with sequential ID
image_info = page_data["image"].copy()
image_info["id"] = image_id
coco_data["images"].append(image_info)

# Track seen bounding boxes for this page to skip duplicates
seen_bboxes = set()

# Add annotations with sequential IDs and image_id reference
for annotation in page_data["annotations"]:
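            # Drop annotations from pages of PDFs whose name starts with
            # "2908641"; the page image itself is still added above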
if json_file.name.startswith("2908641"):
continue

# Round bounding box to integers
bbox = annotation["bbox"]
bbox_int = [round(bbox[0]), round(bbox[1]), round(bbox[2]), round(bbox[3])]

# Skip if any x or y coordinate is negative
if bbox_int[0] < 0 or bbox_int[1] < 0:
continue

# Skip if bbox extends beyond image boundaries
if (bbox_int[0] + bbox_int[2] > image_info["width"] or
bbox_int[1] + bbox_int[3] > image_info["height"]):
continue

# Calculate area from rounded bounding box
area_int = bbox_int[2] * bbox_int[3]

bbox_tuple = tuple(bbox_int)

# Skip if this bounding box was already added for this page
if bbox_tuple in seen_bboxes:
continue

seen_bboxes.add(bbox_tuple)
annotation_copy = annotation.copy()
annotation_copy["id"] = annotation_id
annotation_copy["image_id"] = image_id
annotation_copy["bbox"] = bbox_int
annotation_copy["area"] = area_int
coco_data["annotations"].append(annotation_copy)
annotation_id += 1

image_id += 1

# Save merged COCO format file
with output_file.open("w") as fp:
json.dump(coco_data, fp, indent=2)

print(f"Merged {len(coco_data['images'])} images with {len(coco_data['annotations'])} annotations")
print(f"Saved to {output_file}")

# Create symlink in images folder
images_dir = coco_dir / "images"
symlink_path = images_dir / "_annotations.coco.json"

# Remove existing symlink if it exists
if symlink_path.exists() or symlink_path.is_symlink():
symlink_path.unlink()

# Create relative symlink
os.symlink(os.path.relpath(output_file, images_dir), symlink_path)
print(f"Created symlink at {symlink_path}")


if __name__ == "__main__":
merge_coco_annotations()
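Reviewer note: a quick validation sketch for the merged file, standard library only. The path matches the script's default output and would need adjusting if a different coco directory is passed. One thing it will surface: generate_coco.py keeps Signature widgets as category_id 2, which is not declared in the categories list of merge_coco.py, so the category check below would flag any such annotations.

import json
from collections import Counter
from pathlib import Path

coco = json.loads(Path("coco/annotations.json").read_text())

declared_cats = {c["id"] for c in coco["categories"]}
images = {img["id"]: img for img in coco["images"]}
per_cat = Counter(a["category_id"] for a in coco["annotations"])

for ann in coco["annotations"]:
    img = images[ann["image_id"]]
    x, y, w, h = ann["bbox"]
    # merge_coco.py already filters these cases, so the bounds check should pass
    assert x >= 0 and y >= 0 and x + w <= img["width"] and y + h <= img["height"]
    # category_id 2 (Signature) would fail here because it is not declared above
    assert ann["category_id"] in declared_cats, ann["category_id"]

print(f"{len(images)} images, {sum(per_cat.values())} annotations by category: {dict(per_cat)}")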