In [2]:
# imports
import shutil
import subprocess
from pathlib import Path

from ipywidgets import IntProgress, Label, VBox
from IPython.display import display

In [3]:
# locate the network share, then derive the input and output directories from it
data_directory = Path('/Volumes/fluffy/ThesesDissertations_MassDigitization/')
input_directory = data_directory / '3.toPDF'
output_directory = data_directory / '3a.toCheckPDFs'

# each immediate sub-directory of the input directory is one item to convert
directories_to_convert_to_pdf_paths_list = sorted(
    child for child in input_directory.iterdir() if child.is_dir()
)

print(f'{len(directories_to_convert_to_pdf_paths_list)} directories to process')

15 directories to process


In [4]:
# functions

def get_tif_paths_list(directory_path, scantailor=False):
    """Return the sorted, non-hidden *.tif paths directly inside a directory.

    Parameters
    ----------
    directory_path : str or Path
        Directory to search.
    scantailor : bool, default False
        When True, search the 'out' sub-directory of ``directory_path``
        instead of ``directory_path`` itself.

    Returns
    -------
    list of Path
        Sorted paths of the matching files; names starting with '.' are
        dropped to exclude macOS index files.
    """
    # Path() accepts both strings and existing Path objects
    search_root = Path(directory_path)

    # ScanTailor output lives one level down, in 'out'
    if scantailor:
        search_root = search_root / 'out'

    visible_tif_paths = (
        candidate
        for candidate in search_root.glob('*.tif')
        if not candidate.name.startswith('.')
    )

    return sorted(visible_tif_paths)


def create_temporary_pdf_directory(directory_path):
    """Create an empty '_temporary_pdfs_<name>' directory on the user's Desktop.

    Parameters
    ----------
    directory_path : str or Path
        Directory being processed; only its final component (``.name``) is
        used, to name the temporary directory.

    Returns
    -------
    Path
        ``~/Desktop/_temporary_pdfs_<directory_path.name>``, freshly created
        and empty. If a directory with that name already exists it is deleted
        together with all of its contents and then recreated.
    """
    # if directory_path not a Path-like object then make it one
    if not isinstance(directory_path, Path):
        directory_path = Path(directory_path)

    # name the temporary directory after the *argument*, not after a notebook global
    temporary_pdf_directory_path = Path.home().joinpath(
        'Desktop', f'_temporary_pdfs_{directory_path.name}'
    )

    try:  # creating the temporary directory
        temporary_pdf_directory_path.mkdir()
    except FileExistsError:  # delete the directory & all contents then create it
        shutil.rmtree(temporary_pdf_directory_path)
        temporary_pdf_directory_path.mkdir()

    return temporary_pdf_directory_path


def batch_ocr_tifs_into_pdfs(tif_paths_list, temporary_pdf_directory_path):
    """OCR every *.tif with tesseract, writing one PDF per image.

    A progress widget (label + bar) is displayed and advanced each time a PDF
    is found on disk after its tesseract run. A summary is printed at the end:
    either the number of PDFs created, or an ERROR block with both counts and
    the names of the images that produced no PDF.

    Parameters
    ----------
    tif_paths_list : sequence of str or Path
        Images to OCR, in the order they should be processed.
    temporary_pdf_directory_path : str or Path
        Existing directory that receives '<tif stem>.pdf' for each image.

    Returns
    -------
    None
    """
    temporary_pdf_directory_path = Path(temporary_pdf_directory_path)

    number_of_tifs = len(tif_paths_list)
    number_of_pdfs = 0
    failed_tif_names = []

    # progress bar
    progress_label = Label('OCR *.tif')
    progress_bar = IntProgress(min=0, max=number_of_tifs)
    progress_widget = VBox([progress_label, progress_bar])
    display(progress_widget)

    for tif_path in tif_paths_list:
        tif_path = Path(tif_path)

        progress_label.value = f'OCRing {tif_path.name}'

        # NOTE: the output base passed to tesseract SHOULD NOT have an extension;
        # tesseract adds '.pdf' itself
        temporary_pdf_output_base = temporary_pdf_directory_path.joinpath(tif_path.stem)
        temporary_pdf_output_path = temporary_pdf_directory_path.joinpath(f'{tif_path.stem}.pdf')

        # ocr *.tif with tesseract
        # an argument list (no shell) keeps paths containing spaces or shell
        # metacharacters intact; stderr is discarded as before
        try:
            subprocess.run(
                ['tesseract', str(tif_path), str(temporary_pdf_output_base), 'pdf'],
                stderr=subprocess.DEVNULL,
            )
        except FileNotFoundError:
            # no point trying the remaining images if the executable is missing
            print('ERROR - tesseract executable not found on PATH')
            break

        if temporary_pdf_output_path.is_file():
            number_of_pdfs += 1
            progress_bar.value = number_of_pdfs
        else:
            failed_tif_names.append(tif_path.name)

    if number_of_pdfs != number_of_tifs:
        print(f'ERROR - ERROR - ERROR')
        print(f'# of *.tif: {number_of_tifs}')
        print(f'# of PDFs: {number_of_pdfs}')
        for failed_tif_name in failed_tif_names:
            print(f'no PDF created for: {failed_tif_name}')
    else:
        print(f'{number_of_pdfs} PDFs created and saved in {temporary_pdf_directory_path}')
    
    
def concatenate_pdfs(temporary_pdf_directory_path):
    """Concatenate every PDF in the temporary directory into one PDF with pdftk.

    The combined file is named after the directory with the leading
    '_temporary_pdfs_' marker removed and is written into that same directory.
    Input PDFs are passed to pdftk in sorted order; names starting with '.'
    are skipped.

    Parameters
    ----------
    temporary_pdf_directory_path : str or Path
        Directory holding the per-page PDFs.

    Returns
    -------
    Path
        Path of the combined PDF. The path is returned even when nothing was
        written (no input PDFs, pdftk missing, or pdftk failed); an ERROR line
        is printed in those cases.
    """
    temporary_pdf_directory_path = Path(temporary_pdf_directory_path)

    # derive the combined PDF name from the directory's own name only, so a
    # parent directory containing the marker cannot confuse the result
    marker = '_temporary_pdfs_'
    directory_name = temporary_pdf_directory_path.name
    if directory_name.startswith(marker):
        combined_pdf_stem = directory_name[len(marker):]
    else:
        combined_pdf_stem = directory_name
    combined_pdf_name = f'{combined_pdf_stem}.pdf'

    combined_pdf_output_path = temporary_pdf_directory_path.joinpath(combined_pdf_name)

    # pathlib's '*' also matches dot-files (a shell glob does not), so drop them
    page_pdf_paths = sorted(
        x for x in temporary_pdf_directory_path.glob('*.pdf')
        if not x.name.startswith('.')
    )

    if not page_pdf_paths:
        print(f'ERROR - no PDFs to concatenate in {temporary_pdf_directory_path}')
        return combined_pdf_output_path

    # concatenate with pdftk
    # an argument list (no shell) keeps paths containing spaces intact
    pdftk_command = ['pdftk']
    pdftk_command.extend(str(x) for x in page_pdf_paths)
    pdftk_command.extend(['cat', 'output', str(combined_pdf_output_path)])

    try:
        completed = subprocess.run(
            pdftk_command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            universal_newlines=True,
        )
    except FileNotFoundError:
        print('ERROR - pdftk executable not found on PATH')
        return combined_pdf_output_path

    # surface whatever pdftk reported so failures are visible in the notebook
    if completed.stdout and completed.stdout.strip():
        print(completed.stdout.rstrip())

    if combined_pdf_output_path.is_file() and combined_pdf_output_path.stat().st_size > 0:
        print(f'{combined_pdf_output_path} created with size {round((combined_pdf_output_path.stat().st_size/1024/1024/1024), 2)} GB')
    else:
        print(f'ERROR - {combined_pdf_output_path} was not created')

    return combined_pdf_output_path

In [6]:
# Convert each input directory: OCR its *.tif into per-page PDFs, combine them,
# copy the combined PDF to output_directory, then move the processed directory
# into output_directory as well.
for dir_path in directories_to_convert_to_pdf_paths_list:

    # get list of *.tif to process
    tif_paths_list = get_tif_paths_list(dir_path, scantailor=True)

    number_of_tifs = len(tif_paths_list)
    print(f'{number_of_tifs} *.tif to process in {dir_path}')

    # nothing to OCR means there is no PDF to build; leave the directory where
    # it is and carry on with the rest of the batch
    if number_of_tifs == 0:
        print(f'ERROR - no *.tif found, skipping {dir_path.name}')
        print('')
        continue

    # create temp directory
    temp_pdf_dir_path = create_temporary_pdf_directory(dir_path)

    # OCR tifs into PDFs with progress bar
    batch_ocr_tifs_into_pdfs(tif_paths_list, temp_pdf_dir_path)

    # concatenate PDFs into single file
    combined_pdf_path = concatenate_pdfs(temp_pdf_dir_path)

    # copy combined PDF to copy directory
    output_pdf_path = output_directory.joinpath(combined_pdf_path.name)
    shutil.copy(combined_pdf_path, output_pdf_path)

    print('')
    print('*****')
    if output_pdf_path.is_file():  # move processed dir_path into output_directory
        renamed_dir_path = output_directory.joinpath(dir_path.name)
        dir_path.rename(renamed_dir_path)
        print(f'{dir_path.name} is now {renamed_dir_path}')
    else:
        # only report a move that actually happened
        print(f'ERROR - {output_pdf_path} is missing; {dir_path.name} was NOT moved')
    print('*****')
    print('')

130 *.tif to process in /Volumes/fluffy/ThesesDissertations_MassDigitization/3.toPDF/SmithDavid_2002


VBox(children=(Label(value='OCR *.tif'), IntProgress(value=0, max=130)))

130 PDFs created and saved in /Users/dlisla/Desktop/_temporary_pdfs_SmithDavid_2002
/Users/dlisla/Desktop/_temporary_pdfs_SmithDavid_2002/SmithDavid_2002.pdf created with size 0.01 GB

*****
SmithDavid_2002 is now /Volumes/fluffy/ThesesDissertations_MassDigitization/3a.toCheckPDFs/SmithDavid_2002
*****

116 *.tif to process in /Volumes/fluffy/ThesesDissertations_MassDigitization/3.toPDF/SmithKenneth_2002


VBox(children=(Label(value='OCR *.tif'), IntProgress(value=0, max=116)))

116 PDFs created and saved in /Users/dlisla/Desktop/_temporary_pdfs_SmithKenneth_2002
/Users/dlisla/Desktop/_temporary_pdfs_SmithKenneth_2002/SmithKenneth_2002.pdf created with size 0.0 GB

*****
SmithKenneth_2002 is now /Volumes/fluffy/ThesesDissertations_MassDigitization/3a.toCheckPDFs/SmithKenneth_2002
*****

124 *.tif to process in /Volumes/fluffy/ThesesDissertations_MassDigitization/3.toPDF/SouthwardLeigh_2002


VBox(children=(Label(value='OCR *.tif'), IntProgress(value=0, max=124)))

124 PDFs created and saved in /Users/dlisla/Desktop/_temporary_pdfs_SouthwardLeigh_2002
/Users/dlisla/Desktop/_temporary_pdfs_SouthwardLeigh_2002/SouthwardLeigh_2002.pdf created with size 0.01 GB

*****
SouthwardLeigh_2002 is now /Volumes/fluffy/ThesesDissertations_MassDigitization/3a.toCheckPDFs/SouthwardLeigh_2002
*****

102 *.tif to process in /Volumes/fluffy/ThesesDissertations_MassDigitization/3.toPDF/SprohgeErik_2002


VBox(children=(Label(value='OCR *.tif'), IntProgress(value=0, max=102)))

102 PDFs created and saved in /Users/dlisla/Desktop/_temporary_pdfs_SprohgeErik_2002
/Users/dlisla/Desktop/_temporary_pdfs_SprohgeErik_2002/SprohgeErik_2002.pdf created with size 0.01 GB

*****
SprohgeErik_2002 is now /Volumes/fluffy/ThesesDissertations_MassDigitization/3a.toCheckPDFs/SprohgeErik_2002
*****

172 *.tif to process in /Volumes/fluffy/ThesesDissertations_MassDigitization/3.toPDF/StanfieldTodd_2002


VBox(children=(Label(value='OCR *.tif'), IntProgress(value=0, max=172)))

172 PDFs created and saved in /Users/dlisla/Desktop/_temporary_pdfs_StanfieldTodd_2002
/Users/dlisla/Desktop/_temporary_pdfs_StanfieldTodd_2002/StanfieldTodd_2002.pdf created with size 0.01 GB

*****
StanfieldTodd_2002 is now /Volumes/fluffy/ThesesDissertations_MassDigitization/3a.toCheckPDFs/StanfieldTodd_2002
*****

126 *.tif to process in /Volumes/fluffy/ThesesDissertations_MassDigitization/3.toPDF/StephensDaniel_2002


VBox(children=(Label(value='OCR *.tif'), IntProgress(value=0, max=126)))

126 PDFs created and saved in /Users/dlisla/Desktop/_temporary_pdfs_StephensDaniel_2002
/Users/dlisla/Desktop/_temporary_pdfs_StephensDaniel_2002/StephensDaniel_2002.pdf created with size 0.06 GB

*****
StephensDaniel_2002 is now /Volumes/fluffy/ThesesDissertations_MassDigitization/3a.toCheckPDFs/StephensDaniel_2002
*****

260 *.tif to process in /Volumes/fluffy/ThesesDissertations_MassDigitization/3.toPDF/WalkerBrian_2001


VBox(children=(Label(value='OCR *.tif'), IntProgress(value=0, max=260)))

260 PDFs created and saved in /Users/dlisla/Desktop/_temporary_pdfs_WalkerBrian_2001
/Users/dlisla/Desktop/_temporary_pdfs_WalkerBrian_2001/WalkerBrian_2001.pdf created with size 0.01 GB

*****
WalkerBrian_2001 is now /Volumes/fluffy/ThesesDissertations_MassDigitization/3a.toCheckPDFs/WalkerBrian_2001
*****

207 *.tif to process in /Volumes/fluffy/ThesesDissertations_MassDigitization/3.toPDF/WalkerKathleen_2001


VBox(children=(Label(value='OCR *.tif'), IntProgress(value=0, max=207)))

207 PDFs created and saved in /Users/dlisla/Desktop/_temporary_pdfs_WalkerKathleen_2001
/Users/dlisla/Desktop/_temporary_pdfs_WalkerKathleen_2001/WalkerKathleen_2001.pdf created with size 0.01 GB

*****
WalkerKathleen_2001 is now /Volumes/fluffy/ThesesDissertations_MassDigitization/3a.toCheckPDFs/WalkerKathleen_2001
*****

204 *.tif to process in /Volumes/fluffy/ThesesDissertations_MassDigitization/3.toPDF/Ware-HargisMixon_2001


VBox(children=(Label(value='OCR *.tif'), IntProgress(value=0, max=204)))

204 PDFs created and saved in /Users/dlisla/Desktop/_temporary_pdfs_Ware-HargisMixon_2001
/Users/dlisla/Desktop/_temporary_pdfs_Ware-HargisMixon_2001/Ware-HargisMixon_2001.pdf created with size 0.01 GB

*****
Ware-HargisMixon_2001 is now /Volumes/fluffy/ThesesDissertations_MassDigitization/3a.toCheckPDFs/Ware-HargisMixon_2001
*****

194 *.tif to process in /Volumes/fluffy/ThesesDissertations_MassDigitization/3.toPDF/WashingtonKadesha_2001


VBox(children=(Label(value='OCR *.tif'), IntProgress(value=0, max=194)))

194 PDFs created and saved in /Users/dlisla/Desktop/_temporary_pdfs_WashingtonKadesha_2001
/Users/dlisla/Desktop/_temporary_pdfs_WashingtonKadesha_2001/WashingtonKadesha_2001.pdf created with size 0.01 GB

*****
WashingtonKadesha_2001 is now /Volumes/fluffy/ThesesDissertations_MassDigitization/3a.toCheckPDFs/WashingtonKadesha_2001
*****

201 *.tif to process in /Volumes/fluffy/ThesesDissertations_MassDigitization/3.toPDF/WebbLisa_2001


VBox(children=(Label(value='OCR *.tif'), IntProgress(value=0, max=201)))

201 PDFs created and saved in /Users/dlisla/Desktop/_temporary_pdfs_WebbLisa_2001
/Users/dlisla/Desktop/_temporary_pdfs_WebbLisa_2001/WebbLisa_2001.pdf created with size 0.02 GB

*****
WebbLisa_2001 is now /Volumes/fluffy/ThesesDissertations_MassDigitization/3a.toCheckPDFs/WebbLisa_2001
*****

146 *.tif to process in /Volumes/fluffy/ThesesDissertations_MassDigitization/3.toPDF/WellsGayle_2001


VBox(children=(Label(value='OCR *.tif'), IntProgress(value=0, max=146)))

146 PDFs created and saved in /Users/dlisla/Desktop/_temporary_pdfs_WellsGayle_2001
/Users/dlisla/Desktop/_temporary_pdfs_WellsGayle_2001/WellsGayle_2001.pdf created with size 0.01 GB

*****
WellsGayle_2001 is now /Volumes/fluffy/ThesesDissertations_MassDigitization/3a.toCheckPDFs/WellsGayle_2001
*****

213 *.tif to process in /Volumes/fluffy/ThesesDissertations_MassDigitization/3.toPDF/WhiteDavid_2001


VBox(children=(Label(value='OCR *.tif'), IntProgress(value=0, max=213)))

213 PDFs created and saved in /Users/dlisla/Desktop/_temporary_pdfs_WhiteDavid_2001
/Users/dlisla/Desktop/_temporary_pdfs_WhiteDavid_2001/WhiteDavid_2001.pdf created with size 0.01 GB

*****
WhiteDavid_2001 is now /Volumes/fluffy/ThesesDissertations_MassDigitization/3a.toCheckPDFs/WhiteDavid_2001
*****

107 *.tif to process in /Volumes/fluffy/ThesesDissertations_MassDigitization/3.toPDF/WilliamsonNancy_2001


VBox(children=(Label(value='OCR *.tif'), IntProgress(value=0, max=107)))

107 PDFs created and saved in /Users/dlisla/Desktop/_temporary_pdfs_WilliamsonNancy_2001
/Users/dlisla/Desktop/_temporary_pdfs_WilliamsonNancy_2001/WilliamsonNancy_2001.pdf created with size 0.01 GB

*****
WilliamsonNancy_2001 is now /Volumes/fluffy/ThesesDissertations_MassDigitization/3a.toCheckPDFs/WilliamsonNancy_2001
*****

228 *.tif to process in /Volumes/fluffy/ThesesDissertations_MassDigitization/3.toPDF/ZhaoRongguo_2001


VBox(children=(Label(value='OCR *.tif'), IntProgress(value=0, max=228)))

228 PDFs created and saved in /Users/dlisla/Desktop/_temporary_pdfs_ZhaoRongguo_2001
/Users/dlisla/Desktop/_temporary_pdfs_ZhaoRongguo_2001/ZhaoRongguo_2001.pdf created with size 0.08 GB

*****
ZhaoRongguo_2001 is now /Volumes/fluffy/ThesesDissertations_MassDigitization/3a.toCheckPDFs/ZhaoRongguo_2001
*****



In [7]:
# open output_directory_path for visual Quality Assurance
# NOTE: the leading `!` makes IPython pass this line to the system shell after
# substituting the {...} expression; `open` is presumably the macOS launcher
# command (the paths in this notebook are macOS-style) - verify on other platforms
!open {str(output_directory)}

The file /Volumes/fluffy/ThesesDissertations_MassDigitization/3a.toCheckPDFs does not exist.


In [8]:
# delete all '_temporary_pdfs_*' directories on the desktop
# look in Path.home()/Desktop, the same place create_temporary_pdf_directory()
# creates them, instead of one specific user's hard-coded home directory
desktop_path = Path.home().joinpath('Desktop')

# only directories are removed; shutil.rmtree() would fail on a plain file
all_temp_dir_paths = sorted(x for x in desktop_path.glob('_temporary_pdfs_*') if x.is_dir())
number_of_dir_paths = len(all_temp_dir_paths)
for temp_dir_path in all_temp_dir_paths:
    shutil.rmtree(temp_dir_path)

# re-scan to count how many are really gone
all_temp_dir_paths = sorted(x for x in desktop_path.glob('_temporary_pdfs_*') if x.is_dir())
number_of_deleted_dir_paths = number_of_dir_paths - len(all_temp_dir_paths)

print(f'{number_of_deleted_dir_paths} _temporary_pdf_* directories deleted')

15 _temporary_pdf_* directories deleted
