# Simple PDF to Markdown Converter

# Type of PDF's
1. Simple (Digital Text):-
    Text-only documents with standard layouts
    Digital PDFs with selectable text
    Examples: Reports, articles, plain books, documentation
   
2. Medium (Tables, Scanned):-
    Documents with tables and basic formatting
    PDFs with occasional images
    Multi-column layouts
    Examples: Academic papers, business reports, scanned books


3. Complex (Images Heavy):-
    PDFs with heavy images and minimal text
    Complex charts, diagrams, and infographics
    Examples: Scientific papers with diagrams, medical reports, technical manuals, presentations

In [None]:
# Category 1: Simple PDFs - Fast Text Extraction
# Tool using :- PyMuPDF4LLM
!pip install pymupdf4llm

Collecting pymupdf4llm
  Downloading pymupdf4llm-0.2.7-py3-none-any.whl.metadata (7.5 kB)
Collecting pymupdf>=1.26.6 (from pymupdf4llm)
  Downloading pymupdf-1.26.7-cp310-abi3-macosx_11_0_arm64.whl.metadata (3.4 kB)
Collecting tabulate (from pymupdf4llm)
  Using cached tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Downloading pymupdf4llm-0.2.7-py3-none-any.whl (66 kB)
Downloading pymupdf-1.26.7-cp310-abi3-macosx_11_0_arm64.whl (22.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m22.5/22.5 MB[0m [31m39.5 MB/s[0m  [33m0:00:00[0m eta [36m0:00:01[0m
[?25hUsing cached tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: tabulate, pymupdf, pymupdf4llm
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [pymupdf4llm][0m [pymupdf]
[1A[2KSuccessfully installed pymupdf-1.26.7 pymupdf4llm-0.2.7 tabulate-0.9.0


In [4]:
import pymupdf4llm
import pathlib

def convert_simple_pdf_to_markdown(pdf_folder:str,output_folder:str):
    """
    Convert simple text-based PDFs to Markdown using PyMuPDF4LLM.

    Args:
        pdf_folder: Path to folder containing PDF files
        output_folder: Path to output folder for Markdown files
    """
    pdf_path = pathlib.Path(pdf_folder)
    output_path = pathlib.Path(output_folder)
    output_path.mkdir(exist_ok=True,parents=True)
    

    for pdf_file in pdf_path.glob("*.pdf"):
        try:
            # Extract text as Markdown
            md_text = pymupdf4llm.to_markdown(pdf_file)
            
            # Save to file
            output_file = output_path / f"{pdf_file.stem}.md"
            output_file.write_text(md_text, encoding='utf-8')

            print(f"✓ Converted: {pdf_file.name}")

        except Exception as e:
            print(f"Error processing {pdf_file}: {str(e)}")

    print(f"\nConversion complete! Output in '{output_folder}'")     



convert_simple_pdf_to_markdown(pdf_folder="./simple_pdfs",output_folder="./markdown_output")

✓ Converted: sample.pdf

Conversion complete! Output in './markdown_output'


In [1]:
# Medium (Tables, Scanned) PDF :-

!pip install docling

Collecting docling
  Downloading docling-2.65.0-py3-none-any.whl.metadata (11 kB)
Collecting docling-core<3.0.0,>=2.50.1 (from docling-core[chunking]<3.0.0,>=2.50.1->docling)
  Downloading docling_core-2.57.0-py3-none-any.whl.metadata (7.8 kB)
Collecting docling-parse<5.0.0,>=4.7.0 (from docling)
  Downloading docling_parse-4.7.2-cp310-cp310-macosx_14_0_arm64.whl.metadata (10 kB)
Collecting docling-ibm-models<4,>=3.9.1 (from docling)
  Downloading docling_ibm_models-3.10.3-py3-none-any.whl.metadata (7.3 kB)
Collecting pypdfium2!=4.30.1,<5.0.0,>=4.30.0 (from docling)
  Downloading pypdfium2-4.30.0-py3-none-macosx_11_0_arm64.whl.metadata (48 kB)
Collecting ocrmac<2.0.0,>=1.0.0 (from docling)
  Downloading ocrmac-1.0.0-py2.py3-none-any.whl.metadata (6.1 kB)
Collecting rapidocr<4.0.0,>=3.3 (from docling)
  Downloading rapidocr-3.4.5-py3-none-any.whl.metadata (1.5 kB)
Collecting rtree<2.0.0,>=1.3.0 (from docling)
  Downloading rtree-1.4.1-py3-none-macosx_11_0_arm64.whl.metadata (2.1 kB)
Col

In [10]:
from pathlib import Path
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions

def convert_medium_pdfs_docling(pdf_folder: str, output_folder: str):
    """
    Convert medium-complexity PDFs using Docling with OCR and table extraction.

    Args:
        pdf_folder: Path to folder containing PDF files
        output_folder: Path to output folder for Markdown files
    """
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_table_structure = True  # Extract table structures
    pipeline_options.do_ocr = True  # Enable OCR for scanned content
    pipeline_options.images_scale = 2.0  # Higher quality image extraction
    pipeline_options.generate_picture_images = True

    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

    output_path = Path(output_folder)
    output_path.mkdir(parents=True, exist_ok=True)

    pdf_files = list(Path(pdf_folder).glob("*.pdf"))

    for pdf_file in pdf_files:
        try:
            result = converter.convert(str(pdf_file))
            markdown_content = result.document.export_to_markdown()

            output_file = output_path / f"{pdf_file.stem}.md"
            output_file.write_text(markdown_content, encoding='utf-8')

            print(f"✓ Converted: {pdf_file.name}")

        except Exception as e:
            print(f"✗ Error processing {pdf_file.name}: {e}")

    print(f"\nConversion complete! Output in '{output_folder}'")



convert_medium_pdfs_docling("./medium_pdfs", "./md_output")


Conversion complete! Output in './md_output'
