# Simple PDF to Markdown Converter

# Types of PDFs
1. Simple (Digital Text):
    Text-only documents with standard layouts
    Digital PDFs with selectable text
    Examples: Reports, articles, plain books, documentation
   
2. Medium (Tables, Scanned):
    Documents with tables and basic formatting
    PDFs with occasional images
    Multi-column layouts
    Examples: Academic papers, business reports, scanned books


3. Complex (Image-Heavy):
    PDFs with heavy images and minimal text
    Complex charts, diagrams, and infographics
    Examples: Scientific papers with diagrams, medical reports, technical manuals, presentations

In [None]:
# Category 1: Simple PDFs - Fast Text Extraction
# Tool using :- PyMuPDF4LLM
!pip install pymupdf4llm

Collecting pymupdf4llm
  Downloading pymupdf4llm-0.2.7-py3-none-any.whl.metadata (7.5 kB)
Collecting pymupdf>=1.26.6 (from pymupdf4llm)
  Downloading pymupdf-1.26.7-cp310-abi3-macosx_11_0_arm64.whl.metadata (3.4 kB)
Collecting tabulate (from pymupdf4llm)
  Using cached tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Downloading pymupdf4llm-0.2.7-py3-none-any.whl (66 kB)
Downloading pymupdf-1.26.7-cp310-abi3-macosx_11_0_arm64.whl (22.5 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 22.5/22.5 MB 39.5 MB/s 0:00:00
Using cached tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: tabulate, pymupdf, pymupdf4llm
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3/3 [pymupdf4llm]
Successfully installed pymupdf-1.26.7 pymupdf4llm-0.2.7 tabulate-0.9.0


In [4]:
import pymupdf4llm
import pathlib

def convert_simple_pdf_to_markdown(pdf_folder: str, output_folder: str) -> None:
    """
    Convert simple text-based PDFs to Markdown using PyMuPDF4LLM.

    Every file directly inside ``pdf_folder`` whose name ends in ``.pdf``
    (matched case-insensitively, so ``REPORT.PDF`` is included) is converted
    and written to ``output_folder`` as ``<original stem>.md``, encoded as
    UTF-8. Sub-folders are not searched. Files are processed in sorted order.
    A failure on one file is printed and does not stop the rest of the batch.

    Args:
        pdf_folder: Path to folder containing PDF files
        output_folder: Path to output folder for Markdown files; created
            (including missing parents) if it does not exist

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    pdf_path = pathlib.Path(pdf_folder)
    output_path = pathlib.Path(output_folder)

    # A missing or mistyped input folder would otherwise look like a
    # successful run with nothing to do, so report it and stop before
    # creating any output folder.
    if not pdf_path.is_dir():
        print(f"Error: input folder '{pdf_folder}' does not exist or is not a directory")
        return

    output_path.mkdir(exist_ok=True, parents=True)

    # Path.glob("*.pdf") is case-sensitive on POSIX, so compare the lowered
    # file name instead; sorting makes the processing order reproducible.
    pdf_files = sorted(
        entry
        for entry in pdf_path.iterdir()
        if entry.is_file() and entry.name.lower().endswith(".pdf")
    )

    failed = 0
    for pdf_file in pdf_files:
        try:
            # Extract text as Markdown
            md_text = pymupdf4llm.to_markdown(pdf_file)

            # Save to file
            output_file = output_path / f"{pdf_file.stem}.md"
            output_file.write_text(md_text, encoding="utf-8")
        except Exception as e:
            # Deliberately broad: one unreadable PDF must not abort the batch.
            failed += 1
            print(f"Error processing {pdf_file}: {e}")
        else:
            print(f"✓ Converted: {pdf_file.name}")

    print(f"\nConversion complete! Output in '{output_folder}'")
    if failed:
        print(f"{failed} of {len(pdf_files)} file(s) could not be converted")



# Convert every PDF found in ./simple_pdfs and write the resulting Markdown
# files to ./markdown_output.
convert_simple_pdf_to_markdown(
    pdf_folder="./simple_pdfs",
    output_folder="./markdown_output",
)

✓ Converted: sample.pdf

Conversion complete! Output in './markdown_output'
