<a href="https://colab.research.google.com/github/jpsangare/DoclingBatch/blob/main/DoclingBatch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# First cell - Install required packages
%%capture
!apt-get update
!apt-get install -y poppler-utils
!pip install docling
!pip install --upgrade google-colab # Update google-colab

In [3]:
# Second cell - Import packages and define processing functions
import json
import logging
import time
from pathlib import Path
from typing import List
import os
from google.colab import files
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

def setup_directories():
    """Make sure the working folders used by the notebook exist.

    Creates ``tests/data`` (where uploaded inputs are moved) and ``scratch``
    (where exports are written). Folders that already exist are left as-is.
    """
    for directory in ("tests/data", "scratch"):
        os.makedirs(directory, exist_ok=True)

def upload_and_move_files() -> List[Path]:
    """Prompt for an upload and relocate every received file into ``tests/data``.

    Returns:
        The destination paths of the moved files, in the order the upload
        result yields them. The list is empty when nothing was uploaded.
    """
    print("Please select one or more PDF files to upload...")
    # Opens the Colab upload prompt; the result is keyed by file name.
    received = files.upload()

    data_dir = Path("tests/data")
    moved = []
    for name in received:
        target = data_dir / name
        # The uploaded file is addressed by its bare name, i.e. relative to
        # the current working directory.
        os.rename(name, target)
        moved.append(target)

    return moved

def process_pdf(input_path: Path, doc_converter: DocumentConverter) -> float:
    """Convert a single document and write its exports to ``scratch/``.

    The file is converted with ``doc_converter`` and the resulting document
    is exported as JSON, plain text, Markdown and doctags. Each export is
    written to ``scratch/<stem>.<ext>``, where ``<stem>`` is the stem of the
    converted input file.

    Args:
        input_path: Path of the file to convert.
        doc_converter: Converter used to run the conversion.

    Returns:
        Time spent in the conversion call, in seconds. Writing the export
        files is not included.
    """
    _log = logging.getLogger(__name__)

    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    conv_result = doc_converter.convert(input_path)
    elapsed = time.perf_counter() - start_time

    _log.info(f"Document {input_path.name} converted in {elapsed:.2f} seconds.")

    # Export results; create the folder so the function also works when it
    # is called without a prior setup step.
    output_dir = Path("scratch")
    output_dir.mkdir(parents=True, exist_ok=True)
    doc_filename = conv_result.input.file.stem
    document = conv_result.document

    # File extension -> exporter producing the content for that format.
    output_formats = {
        "json": document.export_to_dict,
        "txt": document.export_to_text,
        "md": document.export_to_markdown,
        "doctags": document.export_to_document_tokens,
    }

    for ext, export_func in output_formats.items():
        output_path = output_dir / f"{doc_filename}.{ext}"
        content = export_func()

        with output_path.open("w", encoding="utf-8") as fp:
            if ext == "json":
                # The JSON export is a dict and needs serialising; the other
                # exports are written as returned.
                json.dump(content, fp, ensure_ascii=False, indent=2)
            else:
                fp.write(content)

    return elapsed

def display_results(input_files: List[Path]):
    """Show the text/Markdown exports and download all results as a zip.

    Lists the ``scratch`` directory, prints the ``.txt`` and ``.md`` export of
    every input file that has one, then packs ``scratch/`` into
    ``output_files.zip`` and hands it to the Colab download helper.

    Args:
        input_files: Paths of the processed inputs; only their stems are used
            to locate the matching exports in ``scratch``.
    """
    # Local import: only this function needs it to build the archive.
    import shutil

    print("\nProcessed files in scratch directory:")
    os.system("ls -l scratch/")

    # Display content of text and markdown files
    for input_file in input_files:
        filename = input_file.stem
        print(f"\nResults for {filename}:")

        for ext in ["txt", "md"]:
            output_path = Path("scratch") / f"{filename}.{ext}"
            if output_path.exists():
                print(f"\n{ext.upper()} output:")
                # The exports are written as UTF-8, so read them back with
                # the same encoding instead of the locale default.
                print(output_path.read_text(encoding="utf-8"))

    # Create and download zip of results. make_archive is plain Python (the
    # previous `!zip` shell escape only parses inside IPython) and rebuilds
    # the archive from scratch, so entries from an earlier run do not linger.
    shutil.make_archive("output_files", "zip", root_dir=".", base_dir="scratch")
    files.download('output_files.zip')

In [4]:
# Third cell - Main execution
def main():
    """Run the whole batch: upload, convert every file, then show the results.

    Logs an error and returns early when no file was uploaded. Otherwise a
    single converter (PDF pipeline with ``do_ocr`` off, ``do_table_structure``
    on and table cell matching on) is built once and reused for every file.
    """
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    # Prepare the working folders, then collect the inputs.
    setup_directories()
    uploaded_paths = upload_and_move_files()

    if not uploaded_paths:
        logger.error("No files were uploaded")
        return

    # PDF pipeline settings shared by all conversions.
    pdf_options = PdfPipelineOptions()
    pdf_options.do_ocr = False
    pdf_options.do_table_structure = True
    pdf_options.table_structure_options.do_cell_matching = True

    per_format_options = {
        InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_options)
    }
    converter = DocumentConverter(format_options=per_format_options)

    # Convert the files one by one, accumulating the reported durations.
    elapsed_total = 0
    for path in uploaded_paths:
        logger.info(f"Processing {path.name}")
        elapsed_total += process_pdf(path, converter)

    logger.info(f"All documents processed in {elapsed_total:.2f} seconds.")

    # Print the exports and trigger the download.
    display_results(uploaded_paths)

if __name__ == "__main__":
    main()

Please select one or more PDF files to upload...


Saving ISTQB_CTFL_v4.0_Sample-Exam-A-Answers_v1.6.pdf to ISTQB_CTFL_v4.0_Sample-Exam-A-Answers_v1.6.pdf
Saving ISTQB_CTFL_v4.0_Sample-Exam-A-Questions_v1.6.pdf to ISTQB_CTFL_v4.0_Sample-Exam-A-Questions_v1.6.pdf
Saving ISTQB_CTFL_v4.0_Sample-Exam-B-Answers_v1.6.pdf to ISTQB_CTFL_v4.0_Sample-Exam-B-Answers_v1.6.pdf
Saving ISTQB_CTFL_v4.0_Sample-Exam-B-Questions_v1.6.pdf to ISTQB_CTFL_v4.0_Sample-Exam-B-Questions_v1.6.pdf
Saving ISTQB_CTFL_v4.0_Sample-Exam-C-Answers_v1.5.pdf to ISTQB_CTFL_v4.0_Sample-Exam-C-Answers_v1.5.pdf
Saving ISTQB_CTFL_v4.0_Sample-Exam-C-Questions_v1.5.pdf to ISTQB_CTFL_v4.0_Sample-Exam-C-Questions_v1.5.pdf
Saving ISTQB_CTFL_v4.0_Sample-Exam-D-Answers_v1.4.pdf to ISTQB_CTFL_v4.0_Sample-Exam-D-Answers_v1.4.pdf
Saving ISTQB_CTFL_v4.0_Sample-Exam-D-Questions_v1.4.pdf to ISTQB_CTFL_v4.0_Sample-Exam-D-Questions_v1.4.pdf


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

.gitignore:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/41.0 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.49k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.71k [00:00<?, ?B/s]

(…)artifacts/tableformer/fat/tm_config.json:   0%|          | 0.00/7.09k [00:00<?, ?B/s]

model.pt:   0%|          | 0.00/202M [00:00<?, ?B/s]

otslp_all_standard_094_clean.check:   0%|          | 0.00/213M [00:00<?, ?B/s]

otslp_all_fast.check:   0%|          | 0.00/146M [00:00<?, ?B/s]

(…)del_artifacts/tableformer/tm_config.json:   0%|          | 0.00/7.09k [00:00<?, ?B/s]

Streaming output truncated to the last 5000 lines.

<missing-text>

Certified Tester, Foundation Level Sample Exam set C Sample Exam - Answers

<missing-text>

<missing-text>

Certified Tester, Foundation Level Sample Exam set C Sample Exam - Answers

<missing-text>

<missing-text>

Certified Tester, Foundation Level Sample Exam set C Sample Exam - Answers

<missing-text>

<missing-text>

Certified Tester, Foundation Level Sample Exam set C Sample Exam - Answers

<missing-text>

<missing-text>

Certified Tester, Foundation Level Sample Exam set C Sample Exam - Answers

<missing-text>

<missing-text>

Certified Tester, Foundation Level Sample Exam set C Sample Exam - Answers

<missing-text>

<missing-text>

Certified Tester, Foundation Level Sample Exam set C Sample Exam - Answers

<missing-text>

<missing-text>

Certified Tester, Foundation Level Sample Exam set C Sample Exam - Answers

<missing-text>

<missing-text>

Certified Tester, Foundation Level Sample Exam set C S

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>