dataset/generate_coco.py: 157 additions, 0 deletions
@@ -0,0 +1,157 @@
import json
import sys
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, as_completed

import logging

import formalpdf


logging.getLogger("pypdfium2").setLevel(logging.ERROR)

def process_pdf(pdf_path, output_dir):
"""Process all pages of a PDF and generate JSON annotation files"""
json_dir = output_dir / "json"
images_dir = output_dir / "images"
pdf_name = pdf_path.stem

# Check if first page JSON exists - if so, skip entire PDF
first_page_json = json_dir / f"{pdf_name}-0.json"

if first_page_json.exists():
return f"Skipped {pdf_name} (already processed)"

try:
document = formalpdf.open(str(pdf_path))
num_pages = len(document)
total_widgets = 0

for page_idx in range(num_pages):
page = document[page_idx]
pdfium_page = document.document[page_idx]

width_pt, height_pt = pdfium_page.get_size()
target_px = 1680
# Scale based on the smaller dimension
scale = target_px / min(width_pt, height_pt)

image = pdfium_page.render(scale=scale, may_draw_forms=False).to_pil()
widgets = page.widgets()

image_filename = f"{pdf_name}-{page_idx}.png"

# Create image info
image_info = {
"file_name": image_filename,
"width": image.width,
"height": image.height,
}

# Save image
image.save(images_dir / image_filename)

# Process annotations
annotations = []
for widget in widgets:
# convert bounding box in pt to pixels
top = widget.rect.top * scale
left = widget.rect.left * scale
bottom = widget.rect.bottom * scale
right = widget.rect.right * scale

                # flip the y-axis: PDF coordinates have their origin at the
                # bottom-left, image coordinates at the top-left
                y0 = image.height - top
                y1 = image.height - bottom

                # Map widget field types to category IDs; unmapped types fall
                # back to 3 and are skipped below
                categories = {
                    "Text": 0,
                    "ComboBox": 0,
                    "CheckBox": 1,
                    "RadioButton": 1,
                    "Signature": 2,
                    "PushButton": 3,
                    "ListBox": 3,
                    "Unknown": 3,
                }

                category_id = categories.get(widget.field_type_string, 3)

if category_id > 2:
continue

bbox_width = right - left
bbox_height = y1 - y0

annotations.append({
"category_id": category_id,
"bbox": [left, y0, bbox_width, bbox_height],
"area": bbox_width * bbox_height,
"iscrowd": 0,
"segmentation": [],
})

# Create per-page JSON
page_data = {
"image": image_info,
"annotations": annotations,
}

# Save JSON
json_path = json_dir / f"{pdf_name}-{page_idx}.json"

with json_path.open("w") as fp:
json.dump(page_data, fp, indent=2)

total_widgets += len(widgets)

document.document.close()
return f"Processed {pdf_name}: {num_pages} pages, {total_widgets} widgets"

except Exception as e:
return f"Error processing {pdf_name}: {str(e)}"


def main():
pdfs_dir = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("pdfs")
output_dir = Path(sys.argv[2]) if len(sys.argv) > 2 else Path("coco")
json_dir = output_dir / "json"
images_dir = output_dir / "images"

# Create directories
output_dir.mkdir(exist_ok=True)
json_dir.mkdir(exist_ok=True)
images_dir.mkdir(exist_ok=True)

# Find all PDF files
pdf_files = list(pdfs_dir.rglob("*.pdf"))
total_pdfs = len(pdf_files)
print(f"Found {total_pdfs} PDF files")

# Check which PDFs are already processed
skipped_count = 0
tasks = []

for pdf_path in pdf_files:
pdf_name = pdf_path.stem
first_page_json = json_dir / f"{pdf_name}-0.json"

if first_page_json.exists():
skipped_count += 1
else:
tasks.append(pdf_path)

print(f"Already processed (skipped): {skipped_count} PDFs")
print(f"New PDFs to process: {len(tasks)}")

if tasks:
# Process PDFs in parallel
with ProcessPoolExecutor() as executor:
futures = {executor.submit(process_pdf, pdf_path, output_dir): pdf_path for pdf_path in tasks}

completed = 0
for future in as_completed(futures):
completed += 1
result = future.result()
print(f"[{completed}/{len(tasks)}] {result}")


if __name__ == "__main__":
main()
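Reviewer note: a minimal sketch of the coordinate convention the script above produces. Widget rects come back bottom-up in PDF points; the script flips them to top-down pixel coordinates and stores a COCO-style [x, y, width, height] box. The page size and widget rect below are illustrative values, not taken from a real PDF, and the rounded image height only approximates what the renderer actually returns.

# Illustrative walk-through of the pt -> px conversion used in process_pdf()
# (all numbers are made up; the renderer may round dimensions slightly differently).
width_pt, height_pt = 612.0, 792.0        # US Letter page in points
scale = 1680 / min(width_pt, height_pt)   # same scaling rule as in the script
image_height = round(height_pt * scale)   # approximate rendered height in px

# Hypothetical widget rect in PDF coordinates (origin at the bottom-left corner).
left_pt, bottom_pt, right_pt, top_pt = 100.0, 600.0, 300.0, 620.0

x = left_pt * scale
y0 = image_height - top_pt * scale        # top edge measured from the image top
y1 = image_height - bottom_pt * scale     # bottom edge measured from the image top
bbox = [x, y0, (right_pt - left_pt) * scale, y1 - y0]
print(bbox)                               # COCO-style [x, y, width, height]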
dataset/merge_coco.py: 115 additions, 0 deletions
@@ -0,0 +1,115 @@
import json
import os
import sys
from pathlib import Path


def merge_coco_annotations():
"""Merge individual JSON files into a single COCO format annotations file"""
coco_dir = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("coco")
json_dir = coco_dir / "json"
output_file = coco_dir / "annotations.json"

# COCO format structure
coco_data = {
"info": {
"year": 2025,
"version": "1.0",
"description": "Form field detection dataset",
"contributor": "",
"url": "",
"date_created": "2025-10-16"
},
"licenses": [
{
"id": 1,
"name": "Unknown",
"url": ""
}
],
"images": [],
"annotations": [],
"categories": [
{"id": 0, "name": "Text", "supercategory": "none"},
{"id": 1, "name": "CheckBox", "supercategory": "none"}
]
}

# Get all JSON files sorted by name
json_files = sorted(json_dir.glob("*.json"))

image_id = 0
annotation_id = 0

for json_file in json_files:
with json_file.open("r") as fp:
page_data = json.load(fp)

# Add image with sequential ID
image_info = page_data["image"].copy()
image_info["id"] = image_id
coco_data["images"].append(image_info)

# Track seen bounding boxes for this page to skip duplicates
seen_bboxes = set()

# Add annotations with sequential IDs and image_id reference
for annotation in page_data["annotations"]:
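            # Drop annotations from pages of PDFs whose name starts with
            # "2908641"; the page image itself is still added above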
if json_file.name.startswith("2908641"):
continue

# Round bounding box to integers
bbox = annotation["bbox"]
bbox_int = [round(bbox[0]), round(bbox[1]), round(bbox[2]), round(bbox[3])]

# Skip if any x or y coordinate is negative
if bbox_int[0] < 0 or bbox_int[1] < 0:
continue

# Skip if bbox extends beyond image boundaries
if (bbox_int[0] + bbox_int[2] > image_info["width"] or
bbox_int[1] + bbox_int[3] > image_info["height"]):
continue

# Calculate area from rounded bounding box
area_int = bbox_int[2] * bbox_int[3]

bbox_tuple = tuple(bbox_int)

# Skip if this bounding box was already added for this page
if bbox_tuple in seen_bboxes:
continue

seen_bboxes.add(bbox_tuple)
annotation_copy = annotation.copy()
annotation_copy["id"] = annotation_id
annotation_copy["image_id"] = image_id
annotation_copy["bbox"] = bbox_int
annotation_copy["area"] = area_int
coco_data["annotations"].append(annotation_copy)
annotation_id += 1

image_id += 1

# Save merged COCO format file
with output_file.open("w") as fp:
json.dump(coco_data, fp, indent=2)

print(f"Merged {len(coco_data['images'])} images with {len(coco_data['annotations'])} annotations")
print(f"Saved to {output_file}")

# Create symlink in images folder
images_dir = coco_dir / "images"
symlink_path = images_dir / "_annotations.coco.json"

# Remove existing symlink if it exists
if symlink_path.exists() or symlink_path.is_symlink():
symlink_path.unlink()

# Create relative symlink
os.symlink(os.path.relpath(output_file, images_dir), symlink_path)
print(f"Created symlink at {symlink_path}")


if __name__ == "__main__":
merge_coco_annotations()
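Reviewer note: a quick validation sketch for the merged file, standard library only. The path matches the script's default output and would need adjusting if a different coco directory is passed. One thing it will surface: generate_coco.py keeps Signature widgets as category_id 2, which is not declared in the categories list of merge_coco.py, so the category check below would flag any such annotations.

import json
from collections import Counter
from pathlib import Path

coco = json.loads(Path("coco/annotations.json").read_text())

declared_cats = {c["id"] for c in coco["categories"]}
images = {img["id"]: img for img in coco["images"]}
per_cat = Counter(a["category_id"] for a in coco["annotations"])

for ann in coco["annotations"]:
    img = images[ann["image_id"]]
    x, y, w, h = ann["bbox"]
    # merge_coco.py already filters these cases, so the bounds check should pass
    assert x >= 0 and y >= 0 and x + w <= img["width"] and y + h <= img["height"]
    # category_id 2 (Signature) would fail here because it is not declared above
    assert ann["category_id"] in declared_cats, ann["category_id"]

print(f"{len(images)} images, {sum(per_cat.values())} annotations by category: {dict(per_cat)}")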