In [None]:
# Notebook to convert a document to Markdown format
# Important: before running, install the packages in requirements.txt and install LibreOffice (via install-libreoffice.ipynb)

In [2]:
import os
import re  
import requests
import concurrent.futures  
from functools import partial  
import pathlib

# Image processing via GPT-4o  
from IPython.display import Markdown, display  

# Utils
import doc2md_utils

# Base folder for extracted page images; later cells join a per-document subfolder (doc_id) onto it
image_path = "images"
# Base folder for generated markdown; later cells join the same doc_id subfolder onto it
markdown_path = "markdown"

In [None]:
# Reset the output directories
# NOTE(review): the helper is called with no arguments, so which folders it clears or
# recreates is decided inside doc2md_utils - presumably the image and markdown output
# folders; confirm there before running against a folder that holds earlier results.
doc2md_utils.reset_local_dirs()


In [None]:
# Convert file to PDF
# file_to_process names the source document; edit it to convert a different file.
# NOTE(review): the conversion presumably relies on LibreOffice (see the install note in
# the first cell) - confirm in doc2md_utils.
file_to_process = 'Transforming-Content-with-GPT4o.pptx'
# pdf_path is handed to the page-extraction cell that follows.
pdf_path = doc2md_utils.convert_to_pdf(file_to_process)

In [None]:
# Split the PDF into one image per page.
# The helper returns an identifier (doc_id) that is used as the per-document subfolder name.
doc_id = doc2md_utils.extract_pdf_pages_to_images(pdf_path, image_path)
pdf_images_dir = os.path.join(image_path, doc_id)
print('Images saved to:', pdf_images_dir)
print('Doc ID:', doc_id)

# Collect the page images that the conversion cell will process.
files = doc2md_utils.get_all_files(pdf_images_dir)
total_files = len(files)
print('Total Image Files to Process:', total_files)


In [None]:
# Convert the images to markdown using GPT-4o
# Pages are processed in parallel - adjust the worker count as needed
max_workers = 10

# Markdown for this document goes into its own subfolder, named after doc_id
markdown_out_dir = os.path.join(markdown_path, doc_id)
doc2md_utils.ensure_directory_exists(markdown_out_dir)

# Bind the output folder so each task only needs the image file as its argument
partial_process_image = partial(doc2md_utils.process_image, markdown_out_dir=markdown_out_dir)

results = []
failures = []  # (image file, exception) pairs for pages that could not be converted

# Using ThreadPoolExecutor with a limit of max_workers threads
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
    # Submit every page up front and collect in submission order, so `results` follows
    # the order of `files`. Unlike list(executor.map(...)), a failing page does not
    # stop the remaining pages from being processed or hide which file failed.
    submitted = [(image_file, executor.submit(partial_process_image, image_file))
                 for image_file in files]
    for image_file, future in submitted:
        try:
            results.append(future.result())
        except Exception as exc:
            failures.append((image_file, exc))

print('Total Processed:', len(results))

if failures:
    # Report every failed page, then fail the cell with the first error (in file order)
    # so later cells do not silently run on an incomplete set of pages.
    for image_file, exc in failures:
        print('Failed:', image_file, '-', repr(exc))
    raise failures[0][1]
