In [2]:
from unstructured.partition.pdf import partition_pdf

# Partition the PDF into elements using the "hi_res" strategy.
elements = partition_pdf(filename="2018_UserManual.pdf", strategy="hi_res")

# Serialise each element to a plain dict so its fields can be read by key.
element_dict = [el.to_dict() for el in elements]

# Collect the distinct values of the 'type' field across all elements.
unique_types = {item['type'] for item in element_dict}

print(f"Unique element types in the PDF: {unique_types}")


Unique element types in the PDF: {'Table', 'Image', 'Address', 'Title', 'ListItem', 'UncategorizedText', 'FigureCaption', 'NarrativeText'}


Unique element types found: {'Title', 'Address', 'UncategorizedText', 'NarrativeText', 'FigureCaption', 'Image', 'Table', 'ListItem'}
Conversion complete. Output saved to output6\output.md


In [1]:
import os
from unstructured.partition.pdf import partition_pdf
import base64
import uuid

# Markdown template per element type; "{text}" is replaced by the element text.
# Types that are not listed here (and are not Table/Image) use _PLAIN_TEMPLATE.
_MARKDOWN_TEMPLATES = {
    "Formula": "\n`{text}`\n",            # inline code; LaTeX could be used if supported
    "FigureCaption": "\n*Figure: {text}*\n",
    "ListItem": "- {text}\n",
    "Title": "\n# {text}\n",
    "PageBreak": "\n---\n",               # horizontal rule stands in for the page break
    "Header": "\n*Header: {text}*\n",
    "Footer": "\n*Footer: {text}*\n",
    "CodeSnippet": "\n```\n{text}\n```\n",
    "PageNumber": "\n**Page {text}**\n",
}
_PLAIN_TEMPLATE = "\n{text}\n"


def _image_block_to_markdown(element, images_dir, output_dir):
    """Write a Table/Image element's base64 payload to disk and return its Markdown link."""
    element_type = element['type']
    encoded = (element.get('metadata') or {}).get('image_base64')
    if not encoded:
        # No (or empty) payload: emit a visible placeholder instead of a dead link.
        return f"\n[{element_type} data not available]\n"

    # NOTE(review): the payload is assumed to be JPEG, hence the fixed ".jpg"
    # extension; confirm against metadata['image_mime_type'] if it is present.
    image_filename = f"{element_type.lower()}_{uuid.uuid4()}.jpg"
    image_path = os.path.join(images_dir, image_filename)
    with open(image_path, 'wb') as f:
        f.write(base64.b64decode(encoded))

    # os.path.relpath uses the platform separator (a backslash on Windows),
    # which Markdown renderers do not treat as a path separator. Always emit
    # forward slashes so the link resolves on every platform.
    link_target = os.path.relpath(image_path, output_dir).replace(os.sep, "/")
    return f"\n![{element_type}]({link_target})\n"


def _element_to_markdown(element, images_dir, output_dir):
    """Return the Markdown fragment for one element dictionary."""
    element_type = element['type']
    if element_type in ("Table", "Image"):
        # Tables and images are both rendered as embedded pictures.
        return _image_block_to_markdown(element, images_dir, output_dir)

    template = _MARKDOWN_TEMPLATES.get(element_type, _PLAIN_TEMPLATE)
    # str.format only parses the template, so braces inside the text are safe.
    return template.format(text=element.get('text', ''))


def convert_pdf_to_markdown(pdf_path, output_dir):
    """Convert a PDF into a Markdown file plus a folder of extracted images.

    The PDF is partitioned with ``partition_pdf``; each resulting element is
    turned into a Markdown fragment according to its ``type`` and the fragments
    are concatenated into ``<output_dir>/output.md``. Table and Image elements
    that carry a base64 payload are written to ``<output_dir>/images`` and
    linked from the Markdown with a relative, forward-slash path.

    Args:
        pdf_path: Path of the PDF file to convert.
        output_dir: Directory that receives ``output.md`` and the ``images``
            sub-directory. Both directories are created if missing.

    Returns:
        The path of the Markdown file that was written.
    """
    # Create output directories
    os.makedirs(output_dir, exist_ok=True)
    images_dir = os.path.join(output_dir, "images")
    os.makedirs(images_dir, exist_ok=True)

    # Extract elements from PDF with image and table extraction.
    # NOTE(review): with extract_image_block_to_payload=True the images arrive
    # as base64 in the element metadata, so extract_image_block_output_dir is
    # presumably ignored, and extract_images_in_pdf looks superseded by
    # extract_image_block_types -- confirm against the installed unstructured
    # version before removing either argument.
    elements = partition_pdf(
        filename=pdf_path,
        strategy="hi_res",
        extract_images_in_pdf=True,
        extract_image_block_types=["Image", "Table"],
        extract_image_block_to_payload=True,
        extract_image_block_output_dir=images_dir
    )

    # Convert elements to dictionaries
    element_dict = [el.to_dict() for el in elements]

    # Identify unique element types
    unique_types = set(item['type'] for item in element_dict)
    print(f"Unique element types found: {unique_types}")

    markdown_content = [
        _element_to_markdown(element, images_dir, output_dir)
        for element in element_dict
    ]

    # Join all content and write to markdown file
    markdown_text = "".join(markdown_content)
    output_file = os.path.join(output_dir, "output.md")
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(markdown_text)

    print(f"Conversion complete. Output saved to {output_file}")
    return output_file

# Usage: convert the user manual, writing output.md and images/ under output7.
pdf_path = "2018_UserManual.pdf"
output_dir = "output7"
# The trailing semicolon keeps the notebook from echoing the call's result.
convert_pdf_to_markdown(pdf_path=pdf_path, output_dir=output_dir);

  from .autonotebook import tqdm as notebook_tqdm


Unique element types found: {'UncategorizedText', 'Table', 'FigureCaption', 'Address', 'NarrativeText', 'Title', 'ListItem', 'Image'}
Conversion complete. Output saved to output7\output.md
