## Transform the raw data into Markdown files with Docling

In [2]:
import os
import signal
import fitz  # PyMuPDF
from docling.document_converter import DocumentConverter
from docling.datamodel.document import ConversionResult
import pymupdf4llm


# Source folder holding the PDFs and destination folder for the Markdown
input_dir = '../data/raw/'
output_dir = '../data/markdown/'

# A single Docling converter instance, reused for every document
converter = DocumentConverter()

# Ensure the destination folder is present (no error if it already is)
os.makedirs(output_dir, exist_ok=True)

# Timeout callback: converts an expired alarm into a Python exception
def handler(signum, frame):
    """Raise ``TimeoutError`` when called as a signal handler.

    Args:
        signum: Signal number passed in by the ``signal`` machinery (unused).
        frame: Interrupted stack frame passed in by ``signal`` (unused).

    Raises:
        TimeoutError: Always, with the message "Conversion timed out".
    """
    timeout_error = TimeoutError("Conversion timed out")
    raise timeout_error

# Route SIGALRM to the timeout callback above
signal.signal(signal.SIGALRM, handler)

# Fallback converter: PDF -> Markdown through pymupdf4llm
def convert_with_pymupdf(input_path, output_path):
    """Convert a PDF to Markdown with pymupdf4llm and write it to disk.

    This is a best-effort helper: any ``Exception`` raised while converting
    or writing is caught and reported on stdout rather than propagated.

    Args:
        input_path: Path of the PDF to convert.
        output_path: Path of the Markdown file to create (UTF-8, overwritten).

    Returns:
        None. The outcome is reported only through the printed message.
    """
    try:
        # Render the whole document first, so the output file is only
        # opened once there is content to put in it
        rendered = pymupdf4llm.to_markdown(input_path)

        with open(output_path, 'w', encoding='utf-8') as sink:
            sink.write(rendered)

        pdf_name = os.path.basename(input_path)
        print(f"Fallback conversion of '{pdf_name}' using pymupdf4llm succeeded.")
    except Exception as err:
        pdf_name = os.path.basename(input_path)
        print(f"Fallback conversion of '{pdf_name}' failed: {err}")

# Convert every PDF in the input directory to Markdown.
#
# Docling is tried first under a 5-minute SIGALRM timeout; if it times out or
# fails for any other reason, pymupdf4llm is used as a fallback. Files whose
# Markdown output already exists are skipped, so the cell can be re-run.
# Entries are visited in sorted order so that runs are reproducible.
for filename in sorted(os.listdir(input_dir)):
    # Only PDFs are converted; the extension check is case-insensitive
    if not filename.lower().endswith('.pdf'):
        continue

    input_path = os.path.join(input_dir, filename)
    output_filename = f"{os.path.splitext(filename)[0]}.md"
    output_path = os.path.join(output_dir, output_filename)

    # Check if the .md file already exists
    if os.path.exists(output_path):
        print(f"'{output_filename}' already exists. Skipping conversion.")
        continue

    try:
        # Set an alarm for 5 minutes (300 seconds); the SIGALRM handler
        # raises TimeoutError when it expires
        signal.alarm(300)
        try:
            # Convert the PDF to a Docling document
            conv_result: ConversionResult = converter.convert(input_path)

            # Export the document to Markdown
            markdown_content = conv_result.document.export_to_markdown()
        finally:
            # Cancel the alarm before anything else runs. The fallback
            # catches Exception, so a still-pending SIGALRM firing inside
            # it would be misreported as a fallback failure (or, firing
            # inside an except clause, would abort the whole loop).
            signal.alarm(0)

        # Save the Markdown content to a file
        with open(output_path, 'w', encoding='utf-8') as md_file:
            md_file.write(markdown_content)

        print(f"Successfully converted '{filename}' to '{output_filename}'")

    except TimeoutError:
        print(f"Conversion of '{filename}' timed out. Using PyMuPDF as fallback.")
        convert_with_pymupdf(input_path, output_path)

    except Exception as e:
        print(f"Failed to convert '{filename}' using Docling: {e}. Attempting fallback with PyMuPDF.")
        convert_with_pymupdf(input_path, output_path)


Successfully converted 'Bofors 1966.pdf' to 'Bofors 1966.md'
Successfully converted 'REDERI AB TRANSATLANTIC 1947.pdf' to 'REDERI AB TRANSATLANTIC 1947.md'


Encountered an error during conversion of document 69976dd4b0bdbefb1307b2e33105d2e359f301cc2d0f4208cf8f6d0b67b99dae:
Traceback (most recent call last):

  File "/Users/jonathanjayes/Documents/PhD/Swedish-annual-reports-archive/.venv/lib/python3.11/site-packages/docling/pipeline/base_pipeline.py", line 163, in _build_document
    for p in pipeline_pages:  # Must exhaust!

  File "/Users/jonathanjayes/Documents/PhD/Swedish-annual-reports-archive/.venv/lib/python3.11/site-packages/docling/pipeline/base_pipeline.py", line 127, in _apply_on_pages
    yield from page_batch

  File "/Users/jonathanjayes/Documents/PhD/Swedish-annual-reports-archive/.venv/lib/python3.11/site-packages/docling/models/page_assemble_model.py", line 60, in __call__
    for page in page_batch:

  File "/Users/jonathanjayes/Documents/PhD/Swedish-annual-reports-archive/.venv/lib/python3.11/site-packages/docling/models/table_structure_model.py", line 178, in __call__
    for page in page_batch:

  File "/Users/jonathanj

Conversion of 'Ericsson 1924.pdf' timed out. Using PyMuPDF as fallback.
Processing ../data/raw/Ericsson 1924.pdf...
Fallback conversion of 'Ericsson 1924.pdf' using pymupdf4llm succeeded.


KeyboardInterrupt: 