# Process Continuing Publications: Playbills

date: 2019-09-24

## Playbills YAML files

https://github.com/utkdigitalinitiatives/Automated-Ingest-for-Continuing-Publications/blob/master/collection_templates/collections__playbills.yml

1. adminDB: "0012_003049_XXXXXX" # Replace "X" with appropriate adminDB values.
1. title: "Title" # Replace with title in title case.
1. date_Issued: "Month Day, YYYY" # Replace with date values (e.g. September 21, 2019)
1. date_Issued_edtf: "YYYY-MM-DD" # Replace letters with a four-digit year and a two-digit number for month and day (e.g. 2019-09-21)

In [1]:
from datetime import datetime
from dateutil.parser import parse
from hashlib import md5
from pathlib import Path
from shutil import copy2, rmtree

In [2]:
def get_formatted_extension(from_extension, remediate=False):
    '''
    -- Purpose --
    Returns an extension that:
    1. has a period in the front
    2. Optional: is lower-case
    3. Optional: return jpeg as jpg and tiff as tif

    -- Arguments --
    from_extension: type=string; file extension with or without a '.'

    -- Returns --
    formatted_extension: type=string; formatted extension
    '''
    # make sure there's a period at the front of the extension
    if from_extension.startswith('.'):  # do nothing
        formatted_extension = from_extension
    else:  # add a period
        formatted_extension = f'.{from_extension}'

    # make it lower-case
    if remediate:
        formatted_extension = formatted_extension.lower()
        # hard-coded alterations for jpeg and tiff
        if formatted_extension == '.jpeg':
            formatted_extension = '.jpg'
        elif formatted_extension == '.tiff':
            formatted_extension = '.tif'

    return formatted_extension


def md5_update_from_dir(directory, hash):
    assert Path(directory).is_dir()
    for path in sorted(Path(directory).iterdir()):
        hash.update(path.name.encode())
        if path.is_file():
            with open(path, "rb") as f:
                for chunk in iter(lambda: f.read(4096), b""):
                    hash.update(chunk)
        elif path.is_dir():
            hash = md5_update_from_dir(path, hash)
    return hash


def md5_dir(directory):
    return md5_update_from_dir(directory, md5()).hexdigest()


def batch_process_playbills(root_dir, adminDB_collection, adminDB_next_item):
    
    directory_paths_list = sorted([x for x in root_dir.iterdir() if x.is_dir()])
    
    print(f'Processing {len(directory_paths_list)} directories')
    
    for directory_path in directory_paths_list:
        
        volume = Playbills(directory_path, adminDB_collection, adminDB_next_item)
        print(volume.directory_path)
        volume.process_publication()
        
        # increase adminDB_next_item number
        adminDB_next_item += 1

In [7]:
class ContinuingPublications_Volume:
    '''Common base class for Continuing Publications'''

    def __init__(self, directory, adminDB_collection, adminDB_item):
        self.directory_path = Path(directory).resolve()
        self.input_directory_name = self.directory_path.name
        # set yaml path, rows, and rows list
        self.yaml_path = self.directory_path.parents[0].joinpath(f'{self.directory_path.name}.yml')  # yaml lives next to directory

    def backup_volume(self):
        '''
        -- Purpose --
        Copy all files in directory to backup directory with name: <directory>_backup

        -- Arguments --
        None

        -- Returns --
        backup_directory_path: type=Path-like object; returns absolute path to backup directory
        '''
        backup_directory_path = self.directory_path.parent.joinpath(f'{self.directory_path.name}_backup')

        if backup_directory_path.exists():  # copytree requires directory to NOT exist
            # rmtree(backup_directory_path)
            print(f'Backup already exists at {backup_directory_path}')
        else:
            print(f'Backing up {self.directory_path.name} . . .')
            backup_directory_path.mkdir()
            
            # roll my own copytree since I'm having issues with permissions
            everything_paths_list = list(self.directory_path.glob('**/*'))
            
            dirs_list = [x for x in everything_paths_list if x.is_dir()]
            files_list = [x for x in everything_paths_list if x.is_file()]
            
            for dir_path in dirs_list:
                # get dir_path without volume.directory_path at the beginning
                local_path = str(dir_path).replace(str(self.directory_path), '')
                # get a list of the directories by stripping the forward-slashes
                local_path = local_path.strip('/')
                # create output_dir and create it
                output_dir = backup_directory_path.joinpath(local_path)
                output_dir.mkdir(parents=True)
                
            for file_path in files_list:
                # get local path for file (everything after volume.directory_path) as a list
                local_path = str(file_path).replace(str(self.directory_path), '').strip('/')
                # set output_path and copy
                output_path = backup_directory_path.joinpath(local_path)
                copy2(file_path, output_path)

            # copytree(self.directory_path, backup_directory_path)

            if backup_directory_path.exists():
                self.input_hash = md5_dir(self.directory_path)
                self.backup_hash = md5_dir(backup_directory_path)
                
                # check md5 hashes of backup against original
                if self.input_hash != self.backup_hash:
                    print(f'input hash: {self.input_hash}')
                    print(f'backup hash: {self.backup_hash}')
                    raise ValueError
                    return
                print('Backup created')
        return backup_directory_path.resolve()

    
    def rename_PDFs_for_ingest(self):

        pdf_paths_list = self.get_file_paths('.pdf')

        number_of_pdfs = len(pdf_paths_list)
        if number_of_pdfs == 0:
            print(f'{number_of_pdfs} PDFs to process')
        else:  # process PDFs
            for pdf_path in pdf_paths_list:
                # expect PDF stems ending in original or processed
                if pdf_path.stem.lower().endswith('original'):
                    new_pdf_path = pdf_path.parents[0].joinpath('ORIGINAL.pdf')
                elif pdf_path.stem.lower().endswith('edited'):
                    new_pdf_path = pdf_path.parents[0].joinpath('ORIGINAL_EDITED.pdf')
                else:  # don't rename
                    print(f'{pdf_path} is not original or original_edited, manually remediate')
                    print('')
                    continue
                # rename PDF
                print(f'Renaming {pdf_path.name} to {new_pdf_path}')
                print('')
                pdf_path.replace(new_pdf_path)

        self.pdf_paths_list = self.get_file_paths('.pdf')
        return self.pdf_paths_list

    
    def create_islandora_ingest_directory(self):
        '''
        -- Purpose --
        Create Islandora ingest directory with TIFF in nested structure

        -- Arguments --
        None

        -- Returns --
        ingest_directory_path: type=Path-like object; Path to the directory for ingest
        '''

        # create ingest directory
        ingest_directory_name = self.directory_path.name
        ingest_directory_path = self.directory_path.parents[0].joinpath(ingest_directory_name)
        # try:
        #     ingest_directory_path.mkdir()
        # except FileExistsError:  # directory already exists
        #     print(f'WARNING: ingest directory already exists at {ingest_directory_path}')

        self.directory_path.replace(ingest_directory_path)

        image_paths_list = [x for x in ingest_directory_path.glob('*.tif')]
        number_of_images = len(image_paths_list)

        print(f'Processing {number_of_images} images in {self.directory_path.name}')

        # for each image
        for index, image_path in enumerate(image_paths_list, start=1):

            # create a sub-directory with a simple index number
            image_subdirectory_path = ingest_directory_path.joinpath(str(index).zfill(6))
            try:
                image_subdirectory_path.mkdir()
            except FileExistsError:
                print(f'Sub-directory already exists at {image_subdirectory_path}')

            # set new image name and copy path, then copy image
            #copy_image_path = image_subdirectory_path.joinpath(image_path.name)
            #copyfile(image_path, copy_image_path)
            image_path.replace(image_subdirectory_path.joinpath(image_path.name))

        self.rename_PDFs_for_ingest()
        print(f'Ingest directory created at {ingest_directory_path}')
        print('')

        return ingest_directory_path

    
    def get_file_paths(self, with_extension):
        '''
        -- Purpose --
        Get all file Paths with_extension in self.directory_path

        -- Arguments --
        with_extension: type=string; extension to use for globbing

        -- Returns --
        file_paths_list: type:list; list of Path-like objects, 1 Path-like object
        per file_path in self.directory_path
        '''
        formatted_extension = get_formatted_extension(with_extension)
        file_paths_list = sorted(self.directory_path.glob(f'*{formatted_extension}'))
        return file_paths_list

    
    def rename_files_to_directory_name(self, with_extension, zerofill=4):
        '''
        -- Purpose --
        Rename all files {with_extension} to {self.directory_path.name}_{str(index).zfill(zerofill)}
        *Note: will currently remediate extensions to lower-case and change tiff/jpeg to tif/jpg

        -- Arguments --
        with_extension: type=string; extension to rename
        zerofill: type=integer; how many digits to zeropad

        -- Returns --
        None
        '''
        formatted_extension = get_formatted_extension(with_extension)

        # extension will be lower-case and tif/jpg instead of tiff/jpeg
        remediated_extension = get_formatted_extension(with_extension, remediate=True)

        # get total number of files and the paths for files to rename
        file_paths_list = self.get_file_paths(formatted_extension)
        number_of_files = len(file_paths_list)

        print(f'{number_of_files} with {formatted_extension}')

        if number_of_files == 0:
            print('0 files to process')
            pass

        else:  # rename files
            backup_directory_path = self.backup_volume()

            print(f'Renaming {number_of_files} "{formatted_extension}"s in {self.directory_path.name} . . .')

            count = 0
            try:
                for index, file_path in enumerate(file_paths_list, start=1):
                    # rename TIFF files from Adobe Acrobat for Islandora ingest, i.e. FILENAME.extension
                    new_file_name = f'{self.directory_path.name.upper()}_{str(index).zfill(zerofill)}{remediated_extension}'
                    new_file_path = file_path.parents[0].joinpath(new_file_name)
                    file_path.replace(new_file_path)
                    count = index
            except IndexError:
                pass

            print(f' Renamed {count} "{formatted_extension}"s')
            print('') 

In [8]:
class Playbills(ContinuingPublications_Volume):
    def __init__(self, directory, adminDB_collection, adminDB_item):
        # load ContinuingPublications_Volume class
        super().__init__(directory, adminDB_collection, adminDB_item)
        
        # get metadata from filename
        self.date, self.title = self.directory_path.name.split('_', maxsplit=1)
        self.title_replace_underscores = self.title.replace('_', ' ')
        self.yyyy, self.mm, self.dd = self.date.split('-')
        self.parsed_date = parse(self.date)
        self.month = self.parsed_date.strftime("%B")
        
        # cast self.dd as int to remove a possible leading zero
        self.date_issued = f'{self.month} {int(self.dd)}, {self.yyyy}'
        self.date_issued_edtf = self.date
        self.adminDB = f'0012_{str(adminDB_collection).zfill(6)}_{str(adminDB_item).zfill(6)}'
        self.yaml_row_0 = f'''adminDB: "{self.adminDB}"'''
        self.yaml_row_1 = f'''Title: "{self.title_replace_underscores}"'''
        self.yaml_row_2 = f'''date_Issued: "{self.date_issued}"'''
        self.yaml_row_3 = f'''date_Issued_edtf: "{self.date_issued_edtf}"'''
        self.yaml_rows_list = [self.yaml_row_0, self.yaml_row_1, self.yaml_row_2, self.yaml_row_3]
        
    def create_yaml(self):
        
        if self.yaml_path.is_file():
            print(f'{self.yaml_path} already exists')
            raise FileExistsError
            return
        else:  # create it
            print(f'Creating {self.yaml_path}')
            with open(self.yaml_path, 'a+') as yml_file:
                for yaml_row in self.yaml_rows_list:
                    yml_file.write(f'{yaml_row}\n')  # add line break
            # !touch "{self.yaml_path}"
            # for yaml_row in self.yaml_rows_list:
            #     print(f'Adding {yaml_row}')
            #     !echo "{yaml_row}" >> "{self.yaml_path}"
            # print(f'YAML data in {self.yaml_path}')
            !cat "{self.yaml_path}"
            return
        
    def process_publication(self):
        
        # rename files
        self.rename_files_to_directory_name('.tiff')  # process .tiff first just in case
        self.rename_files_to_directory_name('.tif')  # then make sure it's all .tif
        
        # create YAML file, ingest directory
        self.create_yaml()
        ingest_directory_path = self.create_islandora_ingest_directory()
        
        # create Islandora-required book directory then move ingest directory into it
        self.book_directory_path = self.directory_path.parents[0].joinpath('book')
        self.book_directory_path.mkdir(exist_ok=True)
        self.final_path = self.book_directory_path.joinpath(ingest_directory_path.name)
        ingest_directory_path.replace(self.final_path)
        
        # move YAML file into book directory
        new_yaml_path = self.book_directory_path.joinpath(self.yaml_path.name)
        self.yaml_path.replace(new_yaml_path) 

        number_of_books = len([x for x in self.book_directory_path.iterdir() if x.is_dir()])
        print(f'{number_of_books} books in {self.book_directory_path} for ingest')
        print('')

In [9]:
root_directory_path = Path('/Volumes/fluffy/ContinuingPublications/BacklogApril2019/0.toProcessForUpload/Playbills')
adminDB_collection = 3049
adminDB_next_item = 873

In [10]:
batch_process_playbills(root_directory_path, adminDB_collection, adminDB_next_item)

Processing 7 directories
/Volumes/fluffy/ContinuingPublications/BacklogApril2019/0.toProcessForUpload/Playbills/2013-10-03_Our_Country's_Good
0 with .tiff
0 files to process
6 with .tif
Backing up 2013-10-03_Our_Country's_Good . . .
Backup created
Renaming 6 ".tif"s in 2013-10-03_Our_Country's_Good . . .
 Renamed 6 ".tif"s

Creating /Volumes/fluffy/ContinuingPublications/BacklogApril2019/0.toProcessForUpload/Playbills/2013-10-03_Our_Country's_Good.yml
adminDB: "0012_003049_000873"
Title: "Our Country's Good"
date_Issued: "October 3, 2013"
date_Issued_edtf: "2013-10-03"
Processing 6 images in 2013-10-03_Our_Country's_Good
Renaming ._2013-10-03_Our_Country's_Good_original.pdf to /Volumes/fluffy/ContinuingPublications/BacklogApril2019/0.toProcessForUpload/Playbills/2013-10-03_Our_Country's_Good/ORIGINAL.pdf

Renaming 2013-10-03_Our_Country's_Good_original.pdf to /Volumes/fluffy/ContinuingPublications/BacklogApril2019/0.toProcessForUpload/Playbills/2013-10-03_Our_Country's_Good/ORIGINAL.pd

In [None]:
# rename PDFs to match directory name with "_original" at the end
directory_paths_list = [x for x in root_directory_path.iterdir() if x.is_dir()]

for directory_path in directory_paths_list:
    print('')
    print(f'Directory: {directory_path}')
    print('')

    pdf_path = list(directory_path.glob('*.pdf'))[0]
    new_pdf_path = directory_path.joinpath(f'{directory_path.name}_original.pdf')
    pdf_path.rename(new_pdf_path)

In [None]:
# just rename images to match directory name
directory_paths_list = [x for x in root_directory_path.iterdir() if x.is_dir()]

for directory_path in directory_paths_list:
    print('')
    print(f'Directory: {directory_path}')
    print('')

    # create Volume
    volume = ContinuingPublications_Volume(directory_path)

    # rename Adobe Acrobat .tiff files to directory and .tif extension
    volume.rename_tiffs_to_directory_name('.tiff')
    volume.rename_tiffs_to_directory_name('.tif')

In [None]:
# run ContinuingPub class's directory creation algorithm
directory_paths_list = [x for x in root_directory_path.iterdir() if x.is_dir()]

for directory_path in directory_paths_list:
    print('')
    print(f'Directory: {directory_path}')
    print('')

    # create Volume
    volume = ContinuingPublications_Volume(directory_path)

    # rename Adobe Acrobat .tiff and .tif files to directory name
    volume.rename_tiffs_to_directory_name('.tiff')
    volume.rename_tiffs_to_directory_name('.tif')

    # rename PDFs for ingest
    volume.rename_PDFs_for_ingest()

    # create Islanodra book ingest directory
    ingest_directory_path = volume.create_islandora_ingest_directory()

    # create book directory path as needed for Islandora
    book_directory_path = volume.directory_path.parents[0].joinpath('book')
    book_directory_path.mkdir(exist_ok=True)

    # move ingest directory into book directory
    final_path = book_directory_path.joinpath(ingest_directory_path.name)
    ingest_directory_path.replace(final_path)

    number_of_books = len([x for x in book_directory_path.iterdir() if x.is_dir()])
    print(f'{number_of_books} books in {book_directory_path} for ingest')
    print('')


In [None]:
directory_paths_list = [x for x in root_directory_path.iterdir() if x.is_dir()]
dir_test = directory_paths_list[-1]
dir_test

In [None]:
dir_test

In [None]:
volume = Playbills(dir_test, adminDB_collection, adminDB_next_item)
volume.title, volume.title_replace_underscores

In [None]:
volume.date, volume.yyyy, volume.mm, volume.month, volume.dd

In [None]:
volume.date_issued, volume.date_issued_edtf

In [None]:
volume.adminDB

In [None]:
volume.yaml_path, volume.directory_path

In [None]:
for row in volume.yaml_rows_list:
    print(row)

In [None]:
volume = Playbills(dir_test, adminDB_collection, adminDB_next_item)

In [None]:
volume.directory_path

In [None]:
directories_list = list(volume.directory_path.glob('**'))
directories_list

In [None]:
everything_paths_list = list(volume.directory_path.glob('**/*'))
everything_paths_list

In [None]:
test_backup_dir = volume.directory_path.parent.joinpath(f'{volume.directory_path.name}_backup')
test_backup_dir.mkdir()
print(test_backup_dir.is_dir())

In [None]:
test_backup_dir

In [None]:
dirs_list = [x for x in everything_paths_list if x.is_dir()]
files_list = [x for x in everything_paths_list if x.is_file()]

In [None]:
for dir_path in dirs_list:
    # get dir_path without volume.directory_path at the beginning
    local_path = str(dir_path).replace(str(volume.directory_path), '')
    # get a list of the directories by stripping the forward-slashes
    local_path = local_path.strip('/')
    # create output_dir and create it
    output_dir = test_backup_dir.joinpath(local_path)
    output_dir.mkdir(parents=True)

In [None]:
for file_path in files_list:
    # get local path for file (everything after volume.directory_path) as a list
    local_path = str(file_path).replace(str(volume.directory_path), '').strip('/')
    # set output_path and copy
    output_path = test_backup_dir.joinpath(local_path)
    copy2(file_path, output_path)

In [None]:
import hashlib

In [None]:
input_hash = md5_dir(volume.directory_path)
backup_hash = md5_dir(test_backup_dir)

In [None]:
print(input_hash)
print(backup_hash)

In [None]:
volume

In [None]:
volume.get_file_paths('.tif')

In [None]:
volume.directory_path