# Docling PDF Parser (GPU Accelerated)

This notebook runs on Google Colab (T4 GPU runtime) and uses Docling to convert the ALIO and BAI PDF files to Markdown, using the GPU to speed up parsing.

## 1. Setup Environment

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install --quiet docling
# Optional fallback
%pip install --quiet pdf2image

## 2. Mount Google Drive (Optional) or Upload Data
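If your data is in Google Drive, mount the drive and point `INPUT_ROOT` at the folder that holds it. Otherwise, upload a zip file and extract it so that the PDFs end up under `INPUT_ROOT`.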

In [None]:
from google.colab import drive
import os

# Uncomment the next line if the data lives on Google Drive
# drive.mount('/content/drive')

# Root directories for the batch run. The "Run Batch Processing" cell joins
# the per-dataset subfolders onto these two paths.
# Set INPUT_ROOT to wherever the raw PDFs live, for example the Drive folder
# /content/drive/MyDrive/aiffelthon/00_data/raw_data
INPUT_ROOT = "/content/00_data/raw_data"
OUTPUT_ROOT = "/content/00_data/parsed_data"

## 3. Define Parsing Logic

In [None]:
import os
import glob
import time
from tqdm import tqdm
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode
from docling.datamodel.base_models import InputFormat

def run_docling_batch(input_dir, output_dir, dataset_name="dataset", device="cuda"):
    """Convert every PDF below ``input_dir`` to Markdown with Docling.

    The directory tree below ``input_dir`` is mirrored under ``output_dir``:
    ``sub/report.pdf`` is written to ``sub/report.md``. A PDF whose Markdown
    file already exists and is non-empty is skipped, so an interrupted run can
    simply be started again.

    Args:
        input_dir: Directory searched recursively for ``*.pdf`` files.
        output_dir: Directory that receives the ``.md`` files; created if missing.
        dataset_name: Label used in the progress bar and in the printed messages.
        device: Value assigned to the pipeline's ``accelerator_options.device``.
            Defaults to ``"cuda"``, the GPU runtime this notebook is written for.

    Returns:
        None. Progress, a final summary and the list of failed files are printed.
    """
    if not os.path.isdir(input_dir):
        print(f"Input directory not found: {input_dir}")
        return

    os.makedirs(output_dir, exist_ok=True)

    # Sorted so the processing order is the same on every run.
    pdf_files = sorted(glob.glob(os.path.join(input_dir, "**/*.pdf"), recursive=True))
    print(f"[{dataset_name}] Found {len(pdf_files)} PDF files.")

    if not pdf_files:
        # Nothing to convert: do not configure the pipeline at all.
        print(f"[{dataset_name}] Done. Success: 0, Skipped: 0, Errors: 0")
        return

    # Configure the pipeline (OCR + accurate table structure) for the chosen device.
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = True
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE
    pipeline_options.accelerator_options.device = device

    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

    success = 0
    skipped = 0
    failures = []  # (pdf_path, "ExceptionType: message") for every failed file
    max_reported = 20  # cap on failure lines printed, to keep the output readable

    for pdf_path in tqdm(pdf_files, desc=f"Processing {dataset_name}"):
        try:
            # Mirror the input sub-folder structure below output_dir.
            rel_path = os.path.relpath(pdf_path, input_dir)
            out_subdir = os.path.dirname(rel_path)
            out_name = os.path.splitext(os.path.basename(pdf_path))[0] + ".md"

            full_out_dir = os.path.join(output_dir, out_subdir)
            os.makedirs(full_out_dir, exist_ok=True)
            out_path = os.path.join(full_out_dir, out_name)

            # Resume support: a non-empty output means this file is already done.
            if os.path.exists(out_path) and os.path.getsize(out_path) > 0:
                skipped += 1
                continue

            result = converter.convert(pdf_path)
            md = result.document.export_to_markdown()

            # Write to a temporary file and rename it afterwards, so that an
            # interrupted write never leaves a partial file that the skip
            # check above would later mistake for a finished conversion.
            tmp_path = out_path + ".tmp"
            with open(tmp_path, "w", encoding="utf-8") as f:
                f.write(md)
            os.replace(tmp_path, out_path)

            success += 1

        except Exception as e:
            # Best effort: one bad PDF must not stop the batch, but remember
            # what went wrong so it can be reported after the loop.
            failures.append((pdf_path, f"{type(e).__name__}: {e}"))

    print(f"[{dataset_name}] Done. Success: {success}, Skipped: {skipped}, Errors: {len(failures)}")

    for failed_path, reason in failures[:max_reported]:
        print(f"  FAILED {failed_path}: {reason}")
    if len(failures) > max_reported:
        print(f"  ... and {len(failures) - max_reported} more failed files")

## 4. Run Batch Processing

In [None]:
# Input/output folders of each dataset, rooted at INPUT_ROOT / OUTPUT_ROOT.
ALIO_INPUT = os.path.join(INPUT_ROOT, "1_alio_raw_files")
ALIO_OUTPUT = os.path.join(OUTPUT_ROOT, "alio_docling")
BAI_INPUT = os.path.join(INPUT_ROOT, "1_bai_raw_files")
BAI_OUTPUT = os.path.join(OUTPUT_ROOT, "bai_docling")

# Process the datasets one after the other: ALIO first, then BAI.
for batch_label, batch_input, batch_output in (
    ("ALIO", ALIO_INPUT, ALIO_OUTPUT),
    ("BAI", BAI_INPUT, BAI_OUTPUT),
):
    run_docling_batch(batch_input, batch_output, batch_label)

## 5. Zip Results for Download

In [None]:
# The path is quoted so an OUTPUT_ROOT containing spaces still works;
# -q keeps zip from printing one line per archived file.
!zip -r -q parsed_data_docling.zip "{OUTPUT_ROOT}"