In [None]:
# Sanity check: make sure you have ImageMagick 7+ installed.
# Prints the version reported by the `magick` command, which the conversion cells
# in this notebook invoke through the shell.
!magick --version

In [None]:
# Ghostscript for Windows must also be installed, because ImageMagick uses Ghostscript
# (Ghostscript 9.26 produced errors; Ghostscript 9.25 works)
# https://www.ghostscript.com/

In [None]:
# imports and options
# All imports live in this one block (stdlib first, then third-party) so that a fresh
# kernel only needs this cell to bring every dependency into scope.
import logging
import shutil
from pathlib import Path

from ipywidgets import IntProgress, Label, VBox
# HTML and display are both taken from the public IPython.display module; importing
# display from IPython.core.display is deprecated and used to shadow the import above
from IPython.display import HTML, display
from PIL import Image
# NOTE(review): PdfFileReader is the legacy PyPDF2 class name - confirm it is still
# provided by the PyPDF2 version installed in this environment
from PyPDF2 import PdfFileReader

# set Logging Configuration with current level at INFO
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)

# set display at 95% width
display(HTML('<style>.container { width:95% !important; }</style>'))

In [None]:
# set root PDF directory and create PDF paths list
pdf_directory = r'Z:\ContinuingPublications\Phoenix\work'  # use Raw string for Windows paths
pdf_directory_path = Path(pdf_directory)

# glob() on a missing (e.g. unmounted) directory simply yields nothing, which would make
# the processing cell do nothing without any message - so say so explicitly
if not pdf_directory_path.is_dir():
    logging.warning(f'PDF directory not found: {pdf_directory_path}')

# sorted so PDFs are processed in a stable, name-based order
pdf_paths_list = sorted(pdf_directory_path.glob('*.pdf'))  # NOT recursive
logging.info(f'{len(pdf_paths_list)} PDF(s) found in {pdf_directory_path}')

In [None]:
# process PDF paths list

# set PDF output dpi
dpi = 600  # use 600 for high-quality OCR, then can shrink to 300
# set temporary JPEG quality settings
jpeg_quality = 100
jpeg_dpi = dpi

# process all pdfs in paths list
for pdf_path in pdf_paths_list:
    
    print(f'Processing {pdf_path.name} . . .')
    
    # create a local data directory and temporaryorary PDF path
    temporary_directory_path = Path('00_temporary_data_directory')
    temporary_directory_path.mkdir(exist_ok=True)
    temporary_pdf_path = temporary_directory_path.joinpath(pdf_path.name)
    
    # copy PDF to local directory
    shutil.copy(pdf_path, temporary_pdf_path)
        
    # try to create final output directory
    try:
        final_output_directory_path = pdf_path.parents[0].joinpath(pdf_path.stem)
        final_output_directory_path.mkdir()
    except FileExistsError:  # breaks if directory already exists so we don't overwrite anything
        print('********************************************')
        print(f'Output directory already exists for {pdf_path.name}; file will be skipped, delete directory and re-run to process')
        print('')
        
        # delete temporary PDF file and continue with next pdf_path
        temporary_pdf_path.unlink()
        continue    
    
    # open PDF with PyPDF2 and get the number of pages
    f = open(temporary_pdf_path, 'rb')
    pdf = PdfFileReader(f)
    number_of_pages = pdf.getNumPages()
    f.close()
    print(f'# of pages: {number_of_pages}')
    
    # instantiate progress bar
    label = f'Processing {temporary_pdf_path.name} . . .'
    progress_label = Label(label)
    progress_bar = IntProgress(min=0, max=number_of_pages)
    progress_widget = VBox([progress_label, progress_bar])
    display(progress_widget)
    
    # loop through PDF pages
    for i in range(number_of_pages):
        
        # set output name and temporary output paths
        output_stem = f'{str(pdf_path.stem)}_{str(i+1).zfill(4)}'
        print(f'output_stem: {output_stem}')
        output_jpeg_name = f'{output_stem}.jpeg'
        print(f'output_jpeg_name: {output_jpeg_name}')
        output_name = f'{output_stem}.tif'
        print(f'output_name: {output_name}')
        temporary_image_path = temporary_directory_path.joinpath(output_jpeg_name)
        
        # update progress bar label
        label = f'Processing {temporary_pdf_path.name} page {i+1} . . .'
        progress_label.value = label
        
        # convert PDF page to sRGB, 8-bit, with {dpi} settings
        !magick {str(temporary_pdf_path)}[{i}] -colorspace sRGB -depth 8 -density {jpeg_dpi}x{jpeg_dpi} -units pixelsperinch -quality {jpeg_quality} {str(temporary_image_path)}
        
        # open temporary file with Pillow, reduce size to {dpi}, and save as TIFF
        image = Image.open(temporary_image_path)
        
        # set final output path and save the flattened (pasted) image with {dpi} settings
        final_output_path = final_output_directory_path.joinpath(output_name)
        image.save(final_output_path, dpi=(dpi, dpi))
        
        # close temporary image and delete the temporary file
        image.close()  # Windows doesn't seem to gracefully open/close images with Pillow like macOS
        temporary_image_path.unlink()
        
        # update progress bar value
        progress_bar.value = i + 1
    
    # 
    image_paths_list = list(final_output_directory_path.glob('*.tif'))
    number_of_images = len(image_paths_list)
    if number_of_images == number_of_pages:
        print(f'{number_of_images} TIFFs created in {final_output_directory_path}')
        print('')
    else:
        print('********************************************')
        print(f'# of pages DOES NOT EQUAL # of final TIFFs: {number_of_pages} != {number_of_images}')
        print('')
        
    # delete temporary pdf
    temporary_pdf_path.unlink()
    # delete temporary directory
    temporary_directory_path.unlink()

In [None]:
# manual processing for Debug
pdf_path = pdf_paths_list[2]
logging.info(f'{pdf_path.name}')
pdf = PdfFileReader(open(str(pdf_path), 'rb'))
number_of_pages = pdf.getNumPages()
logging.info(f'# of pages: {number_of_pages}')

pdf_output_directory_path = pdf_path.parents[0].joinpath(pdf_path.stem)
pdf_output_directory_path.mkdir(exist_ok=True)  # create output directory with same name as PDF

for i in range(number_of_pages):
    print(i)
    output_name = f'{str(pdf_path.stem)}_{str(i+1).zfill(4)}.tif'
    print(output_name)
    temp_path = Path('data').joinpath(output_name)
    output_path = pdf_output_directory_path.joinpath(output_name)
    !magick -colorspace sRGB -density 300x300 -depth 8 -units pixelsperinch +compress -verbose {str(pdf_path)}[{i}] {str(temp_path)}
    image = Image.open(temp_path)
    size = image.size
    image_new = Image.new(mode='RGB', size=image.size, color='white')
    image_new.paste(image, box=(0,0))
    
    image_new.save(output_path, dpi=(300, 300))
    image.close()
    temp_path.unlink()