# Process a directory of TIFFs for batch upload

This notebook copies a book's TIFF files into the directory structure needed for Islandora ingest (Continuing Publications meeting, 2019-02-19).

Structure for directory is:

/book/

     1/
       page 1.tif
     2/
       page 2.tif
     n/
       page n.tif
       
Keep the "book" directory, with each individual page in its own sub-directory, in a final 'forIngest'.zip file.

In [23]:
# importing & options
import datetime
import logging
import shutil
import sys
from pathlib import Path

import pandas as pd
from ipywidgets import IntProgress, Label, VBox
from IPython.display import display

# set Logging Configuration with current level at INFO
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)

# set display at 95% width
# HTML comes from the public IPython.display module; `display` is already
# imported from there above (importing it from IPython.core.display is deprecated)
from IPython.display import HTML
display(HTML('<style>.container { width:95% !important; }</style>'))

In [4]:
def create_subdirectories_for_ingest(book_directory, file_extension='.tif'):
    """Copy every page image of a book into its own numbered sub-directory.

    A sibling directory named
    ``{book name}_CreatedForIslandoraIngest_{YYYY-MM-DD}`` is created next to
    ``book_directory``.  The matching images are sorted by path, numbered from
    1, and copied to ``{ingest directory}/{n}/page {n}{file_extension}``.
    The source images are only read, never moved or renamed.

    Parameters
    ----------
    book_directory : str or pathlib.Path
        Directory holding the page images of one book.  Only its direct
        children are considered (no recursion).
    file_extension : str, default '.tif'
        Extension, including the leading dot, of the images to process.
        Matching ignores case; the copies are always named with
        ``file_extension`` exactly as given.

    Returns
    -------
    None

    Raises
    ------
    OSError
        If ``book_directory`` cannot be listed (e.g. ``FileNotFoundError``)
        or an image cannot be copied.

    Notes
    -----
    * Page numbers follow the sort order of the paths, so file names with
      unpadded numbers sort as text ("page10" before "page2").
    * If no matching image is found, a warning is logged and nothing is
      created.
    * If the ingest directory (or a page directory) already exists, an error
      is logged and processing continues; existing page copies are
      overwritten.
    * Progress is shown with an ipywidgets label and progress bar.
    """
    book_directory_path = Path(book_directory)
    logging.info(f'Processing book at {book_directory_path}')

    # get sorted list of all image files with file_extension
    # - only regular files: a directory whose name ends with the extension cannot be copied
    # - case-insensitive: ".TIF" pages must not be silently skipped
    wanted_suffix = file_extension.lower()
    image_paths_list = sorted(
        x for x in book_directory_path.iterdir()
        if x.is_file() and x.name.lower().endswith(wanted_suffix)
    )
    logging.info(f'There are {len(image_paths_list)} "{file_extension}"s in "{book_directory_path}"')

    # nothing to copy: do not leave an empty ingest directory behind
    if not image_paths_list:
        logging.warning(f'No "{file_extension}" images found in {book_directory_path}; nothing to process')
        return

    # set ingest stub to add to directory name
    ingest_stub = 'CreatedForIslandoraIngest'
    # get today's date in YYYY-MM-DD format
    todays_date = datetime.datetime.now().strftime('%Y-%m-%d')
    # add today's date to ingest stub
    ingest_stub = f'{ingest_stub}_{todays_date}'

    # set ingest directory name
    ingest_directory_name = f'{book_directory_path.name}_{ingest_stub}'
    logging.info(f'ingest directory name: {ingest_directory_name}')

    # create ingest directory next to the book directory
    ingest_directory_path = book_directory_path.parent.joinpath(ingest_directory_name)
    try:
        ingest_directory_path.mkdir()  # existing directory will throw error
        logging.info(f'ingest directory path: {ingest_directory_path}')
    except FileExistsError:  # directory already exists
        logging.error(f'********** ingest directory already exists at {ingest_directory_path} **********')

    logging.info(f'To Process: {len(image_paths_list)} images in {book_directory_path}')

    # progress bar
    progress_label = Label('Image Being Processed')
    progress_bar = IntProgress(min=0, max=len(image_paths_list))
    progress_widget = VBox([progress_label, progress_bar])
    display(progress_widget)

    # create a directory for an image then copy image into it
    for index, image_path in enumerate(image_paths_list, start=1):

        progress_label.value = image_path.name

        # create sub-directory for image
        image_directory_path = ingest_directory_path.joinpath(str(index))
        try:
            image_directory_path.mkdir()  # existing directory will throw error
        except FileExistsError:
            logging.error(f'********** page directory already exists at {image_directory_path} **********')

        # copy straight to the final name "page {index}{file_extension}";
        # a separate copy-then-rename fails on Windows when the target
        # already exists and would leave a stray copy under the original name
        new_image_path = image_directory_path.joinpath(f'page {index}{file_extension}')
        shutil.copy(image_path, new_image_path)

        progress_bar.value = index

    # count what actually ended up in the ingest directory
    glob_string = f'**/*{file_extension}'
    images_processed_paths_list = list(ingest_directory_path.glob(glob_string))
    logging.info(f'Processed images: {len(images_processed_paths_list)} images in {ingest_directory_path}')

In [14]:
# directory that contains the book directories
parent_directory = r'Z:\ContinuingPublications\Alumnus\delivery'
# names of the book directories to process
book_name_list = [
    'alum_2014fall',
    'alum_2014summer',
    'alum_2015spring',
]

In [15]:
# preview the full path of every book directory that will be processed
for book_name in book_name_list:
    book_directory = Path(parent_directory).joinpath(book_name)
    print(book_directory)

Z:\ContinuingPublications\Alumnus\delivery\alum_2014fall
Z:\ContinuingPublications\Alumnus\delivery\alum_2014summer
Z:\ContinuingPublications\Alumnus\delivery\alum_2015spring


In [16]:
# build the ingest directory structure for each listed book in turn
for book_name in book_name_list:
    book_directory = Path(parent_directory, book_name)
    create_subdirectories_for_ingest(book_directory)


INFO:Processing book at Z:\ContinuingPublications\Alumnus\delivery\alum_2014fall
INFO:There are 52 ".tif"s in "Z:\ContinuingPublications\Alumnus\delivery\alum_2014fall"
INFO:ingest directory name: alum_2014fall_CreatedForIslandoraIngest_2019-02-19
INFO:ingest directory path: Z:\ContinuingPublications\Alumnus\delivery\alum_2014fall_CreatedForIslandoraIngest_2019-02-19
INFO:To Process: 52 images in Z:\ContinuingPublications\Alumnus\delivery\alum_2014fall


VBox(children=(Label(value='Image Being Processed'), IntProgress(value=0, max=52)))

INFO:Processed images: 52 images in Z:\ContinuingPublications\Alumnus\delivery\alum_2014fall_CreatedForIslandoraIngest_2019-02-19
INFO:Processing book at Z:\ContinuingPublications\Alumnus\delivery\alum_2014summer
INFO:There are 52 ".tif"s in "Z:\ContinuingPublications\Alumnus\delivery\alum_2014summer"
INFO:ingest directory name: alum_2014summer_CreatedForIslandoraIngest_2019-02-19
INFO:ingest directory path: Z:\ContinuingPublications\Alumnus\delivery\alum_2014summer_CreatedForIslandoraIngest_2019-02-19
INFO:To Process: 52 images in Z:\ContinuingPublications\Alumnus\delivery\alum_2014summer


VBox(children=(Label(value='Image Being Processed'), IntProgress(value=0, max=52)))

INFO:Processed images: 52 images in Z:\ContinuingPublications\Alumnus\delivery\alum_2014summer_CreatedForIslandoraIngest_2019-02-19
INFO:Processing book at Z:\ContinuingPublications\Alumnus\delivery\alum_2015spring
INFO:There are 52 ".tif"s in "Z:\ContinuingPublications\Alumnus\delivery\alum_2015spring"
INFO:ingest directory name: alum_2015spring_CreatedForIslandoraIngest_2019-02-19
INFO:ingest directory path: Z:\ContinuingPublications\Alumnus\delivery\alum_2015spring_CreatedForIslandoraIngest_2019-02-19
INFO:To Process: 52 images in Z:\ContinuingPublications\Alumnus\delivery\alum_2015spring


VBox(children=(Label(value='Image Being Processed'), IntProgress(value=0, max=52)))

INFO:Processed images: 52 images in Z:\ContinuingPublications\Alumnus\delivery\alum_2015spring_CreatedForIslandoraIngest_2019-02-19


In [42]:
# collect every entry (files and directories) directly inside the delivery folder
root = r'Z:\ContinuingPublications\basketball-men\delivery'
root_path = Path(root)
items_list = [entry for entry in root_path.iterdir()]

In [43]:
# put the entries of items_list on the clipboard, one per row (tab-separated,
# no index or header), ready to paste into Google Sheets
df = pd.DataFrame(items_list)
df.to_clipboard(
    index=False,
    header=False,
    excel=True,
    sep='\t',
)

In [22]:
# show just the name of each entry in the delivery folder
for item in root_path.iterdir():
    print(item.name)

1970april_backup_KEEP_to_replace
2009spring_test-PDF-to_TIFF
2012summer_originals
2012winter_orginals
2013winter_originals
2014fall_originals
2014spring_originals
2014summer_originals
2014winter_originals
2015spring
Alumnus Fall 2015.pdf
Alumnus Fall 2016.pdf
Alumnus Fall 2017.pdf
Alumnus Spring 2016.pdf
Alumnus Spring 2017.pdf
Alumnus Winter 2016.pdf
Alumnus Winter 2017.pdf
Alumnus Winter 2018.pdf
Alumnus_missingpages.pdf
corrections_2010Nov19_stitched_copy
corrections_2010Nov19_unstitched
jpg-batching.sh
PDFS-fromMediaRelationsCD
tesseract-batching.sh
tesseract-ocr-stitched-tiffs
unstitched_originals_1926-1929_3TiffsAreIncorrectHere
UT Alumnus Centennial Book 2017.pdf


In [None]:
# copy years list to clipboard for Google Sheets
# NOTE(review): `years_list` is not defined in any cell visible here, so this
# cell raises NameError on a fresh kernel — presumably it should use
# `items_list` (as the otherwise identical cell above does) or be removed; confirm.
df = pd.DataFrame(years_list)
df.to_clipboard(index=False, header=False, excel=True, sep='\t')