# Convert one book directory into a PDF

Jupyter Notebook that uses ImageMagick, Ghostscript, and PDFtk to convert a directory of \*.tif files into a single PDF.

This PDF will then be OCR'd & optimized with Adobe Acrobat DC, but first we can create a high-quality PDF whose images are indistinguishable from the TIFFs (**TEST THIS TO SEE IF IT'S TRUE!**)

In [14]:
# imports
import shutil
import subprocess
from pathlib import Path

from IPython.display import display
from ipywidgets import IntProgress, Label, VBox

In [15]:
# network share directory holding the book directories that still need a PDF
data_directory = Path('/Volumes/fluffy/ThesesDissertations_PatronRequests/3.toPDF')

# keep only sub-directories; sorted so the processing order is stable between runs
directories_to_convert_to_pdf_paths_list = sorted(
    child_path for child_path in data_directory.iterdir() if child_path.is_dir()
)

# bare last expression so the notebook displays how many directories were found
len(directories_to_convert_to_pdf_paths_list)

1

In [16]:
# process the first directory found in data_directory; fail with a clear message
# (instead of a bare IndexError) when there is nothing left to convert
if not directories_to_convert_to_pdf_paths_list:
    raise FileNotFoundError(f'No directories to convert found in {data_directory}')
directory_path_to_process = directories_to_convert_to_pdf_paths_list[0]

# processed with ScanTailor so look for its "out" directory directly inside the book directory;
# some books (like DuncanRuby_1939) use a <cropped_and_expanded> directory for processing
# images instead, so fall back to the "out" directory nested inside of it
scantailor_output_path = directory_path_to_process.joinpath('out')
if not scantailor_output_path.exists():
    scantailor_output_path = directory_path_to_process.joinpath('cropped_and_expanded', 'out', 'cropped')

# stop here rather than carrying a non-existent directory into the cells that follow
if not scantailor_output_path.is_dir():
    raise FileNotFoundError(f'No ScanTailor output directory found in {directory_path_to_process}')

In [17]:
# collect the *.tif paths in the directory to process, in sorted order, leaving out dot-files
tif_paths_list = sorted(
    tif_path
    for tif_path in scantailor_output_path.glob('*.tif')
    if not tif_path.name.startswith('.')
)

# report how many pages will be converted and where they come from
number_of_tifs = len(tif_paths_list)
print(f'{number_of_tifs} *.tif to process in {scantailor_output_path}')

398 *.tif to process in /Volumes/fluffy/ThesesDissertations_PatronRequests/3.toPDF/DuncanRuby_1939/cropped_and_expanded/out/cropped


In [18]:
# convert each *.tif into its own *.pdf and save it in a local temp directory

# set temp save directory path
temp_pdf_save_directory_path = Path.home().joinpath('Desktop', '_temp_pdf_save_directory')

# create temp save directory (a directory left over from an earlier run is reused)
temp_pdf_save_directory_path.mkdir(exist_ok=True)

# progress bar
progress_label = Label('Convert *.tif to PDF')
progress_bar = IntProgress(min=0, max=number_of_tifs)
progress_widget = VBox([progress_label, progress_bar])
display(progress_widget)

# (tif name, error text) for every page that magick could not convert
failed_conversions_list = []

for index, tif_path in enumerate(tif_paths_list, start=1):

    pdf_name = f'{tif_path.stem}.pdf'
    pdf_temp_save_path = temp_pdf_save_directory_path.joinpath(pdf_name)

    progress_label.value = f'Converting {tif_path.name} into {pdf_name}'

    # pass the paths as an argument list (no shell) so spaces or shell
    # metacharacters in a file name cannot split or alter the command
    magick_result = subprocess.run(
        ['magick', str(tif_path), str(pdf_temp_save_path)],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
    )
    if magick_result.returncode != 0:
        failed_conversions_list.append((tif_path.name, magick_result.stderr.strip()))

    progress_bar.value = index

temp_pdf_paths_list = sorted(temp_pdf_save_directory_path.glob('*.pdf'))
number_of_temp_pdfs = len(temp_pdf_paths_list)

# compare by name, not only by count: a stale PDF left in the reused temp
# directory could otherwise hide a page that failed to convert
expected_pdf_names = {f'{tif_path.stem}.pdf' for tif_path in tif_paths_list}
found_pdf_names = {temp_pdf_path.name for temp_pdf_path in temp_pdf_paths_list}
missing_pdf_names_list = sorted(expected_pdf_names - found_pdf_names)
unexpected_pdf_names_list = sorted(found_pdf_names - expected_pdf_names)

if failed_conversions_list or missing_pdf_names_list or unexpected_pdf_names_list:
    print('ERROR - ERROR - ERROR')
    print(f'# of *.tif: {number_of_tifs}')
    print(f'# of temporary PDFs: {number_of_temp_pdfs}')
    for failed_tif_name, error_text in failed_conversions_list:
        print(f'magick failed on {failed_tif_name}: {error_text}')
    if missing_pdf_names_list:
        print(f'Missing PDFs: {missing_pdf_names_list}')
    if unexpected_pdf_names_list:
        print(f'Unexpected PDFs in {temp_pdf_save_directory_path}: {unexpected_pdf_names_list}')
else:
    print(f'Created {number_of_temp_pdfs} temporary *.pdf in {temp_pdf_save_directory_path}')

VBox(children=(Label(value='Convert *.tif to PDF'), IntProgress(value=0, max=398)))

Created 398 temporary *.pdf in /Users/dlisla/Desktop/_temp_pdf_save_directory


In [22]:
# concatenate all *.pdf in local temp directory into single PDF

print('Combining *.pdf into single PDF')

# the combined PDF is named after the book directory and saved in the sibling 4.toOCR directory
combined_pdf_name = f'{directory_path_to_process.name}.pdf'
combined_pdf_save_path = directory_path_to_process.parents[1].joinpath('4.toOCR', combined_pdf_name)

# expand the *.pdf pattern here instead of in a shell: sorted to keep the pages in
# name order, and dot-files left out the way a shell glob would leave them out
pdf_paths_to_combine_list = sorted(
    pdf_path
    for pdf_path in temp_pdf_save_directory_path.glob('*.pdf')
    if not pdf_path.name.startswith('.')
)

if not pdf_paths_to_combine_list:
    print('ERROR - ERROR - ERROR')
    print(f'No *.pdf to combine in {temp_pdf_save_directory_path}')
else:
    # argument list (no shell) so paths containing spaces or shell metacharacters stay intact
    pdftk_result = subprocess.run(
        ['pdftk', *(str(pdf_path) for pdf_path in pdf_paths_to_combine_list),
         'cat', 'output', str(combined_pdf_save_path)],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
    )

    # require a clean exit as well as a non-empty file, so a combined PDF left over
    # from an earlier run is not mistaken for a successful result
    combined_pdf_created = (
        pdftk_result.returncode == 0
        and combined_pdf_save_path.is_file()
        and combined_pdf_save_path.stat().st_size > 0
    )

    if combined_pdf_created:
        print(f'{combined_pdf_save_path} created with size {round((combined_pdf_save_path.stat().st_size/1024/1024/1024), 2)} GB')
    else:
        print('ERROR - ERROR - ERROR')
        print(f'pdftk did not create {combined_pdf_save_path}')
        print(pdftk_result.stderr.strip())

Combining *.pdf into single PDF
/Volumes/fluffy/ThesesDissertations_PatronRequests/4.toOCR/DuncanRuby_1939.pdf created with size 7235.71 GB


In [12]:
# delete the local temp directory together with every temporary PDF inside it;
# shutil.rmtree deletes recursively, so the files need no separate unlink step.
# Skip when the directory is already gone so this cell can safely be re-run.
if temp_pdf_save_directory_path.exists():
    shutil.rmtree(temp_pdf_save_directory_path)

In [19]:
# relocate the book directory on fluffy: out of 3.toPDF and into the sibling 4.toOCR directory
move_to_this_path = directory_path_to_process.parents[1] / '4.toOCR' / directory_path_to_process.name
directory_path_to_process.rename(move_to_this_path)

In [20]:
# open output directory; argument list (no shell) so a path containing spaces stays
# a single argument, and check=True raises if `open` reports a failure
subprocess.run(['open', str(move_to_this_path)], check=True);

In [23]:
# open PDF in Adobe Acrobat to use OCR/optimize action; argument list (no shell) so a
# path containing spaces stays a single argument, and check=True raises if `open` fails
subprocess.run(['open', str(combined_pdf_save_path)], check=True);