In [4]:
import shutil
from pathlib import Path

import numpy as np
from PIL import Image

In [5]:
def get_formatted_extension(from_extension, remediate=False):
    '''
    -- Purpose --
    Normalize a file extension so that it always starts with a period.
    When remediation is requested the extension is additionally:
    1. converted to lower-case
    2. shortened from .jpeg to .jpg and from .tiff to .tif

    -- Arguments --
    from_extension: type=string; file extension with or without a leading '.'
    remediate: type=bool; when True, lower-case the extension and apply the
               jpeg/tiff shortening (default False leaves the case untouched)

    -- Returns --
    formatted_extension: type=string; extension with a leading period
    '''
    # prepend the period only when the caller left it off
    dotted = from_extension if from_extension.startswith('.') else f'.{from_extension}'

    if not remediate:
        return dotted

    # remediation: lower-case first, then map the long spellings to short ones
    lowered = dotted.lower()
    short_spellings = {'.jpeg': '.jpg', '.tiff': '.tif'}
    return short_spellings.get(lowered, lowered)

In [1]:
class HathiTrust_Volume:

    '''Common base class for all Hathi Trust Volumes'''

    def __init__(self, directory, object_id):
        '''
        -- Purpose --
        Remember where a volume lives on disk and which object ID it should carry

        -- Arguments --
        directory: type=string or Path-like object; directory holding the volume's files
        object_id: identifier the directory is expected to be named after
                   (compared and joined via str(), so non-string IDs are accepted)
        '''
        self.directory_path = Path(directory)
        self.object_id = object_id

    def backup_volume(self):
        '''
        -- Purpose --
        Copy all files in directory to backup directory with name: <directory>_backup
        The backup is created next to the directory; an existing backup with
        the same name is deleted and replaced.

        -- Arguments --
        None

        -- Returns --
        backup_directory_path: type=Path-like object; returns absolute path to backup directory
        '''
        backup_directory_name = f'{self.directory_path.name}_backup'
        backup_directory_path = self.directory_path.parent.joinpath(backup_directory_name)

        if backup_directory_path.exists():  # shutil.copytree requires directory to NOT exist
            shutil.rmtree(backup_directory_path)

        # copytree raises on failure, so reaching the return means the backup exists
        shutil.copytree(self.directory_path, backup_directory_path)

        return backup_directory_path.resolve()

    def get_image_paths(self, with_extension):
        '''
        -- Purpose --
        List the files directly inside the directory whose names end in {with_extension}

        -- Arguments --
        with_extension: type=string; file extension with or without a '.'
                        (used as given: no case-folding is applied to it)

        -- Returns --
        image_paths_list: type=list of Path-like objects; matching paths in sorted order
        '''
        formatted_extension = get_formatted_extension(with_extension)
        image_paths_list = sorted(self.directory_path.glob(f'*{formatted_extension}'))
        return image_paths_list

    def rename_directory(self):
        '''
        -- Purpose --
        Rename {self.directory_path} to match {self.object_id}

        -- Arguments --
        None

        -- Returns --
        book: type=class object; HathiTrust_Volume as book at {new_directory_path}
              NOTE: this instance keeps its old directory_path; continue with
              the returned object.
        '''
        # str() keeps this in line with report(), which also compares against str(object_id)
        new_directory_path = self.directory_path.parent.joinpath(str(self.object_id))
        self.directory_path.rename(new_directory_path)
        # pass the instance's own ID on; a bare `object_id` is not defined in this scope
        book = HathiTrust_Volume(new_directory_path, self.object_id)
        return book

    def rename_images(self, with_extension):
        '''
        -- Purpose --
        Rename images {with_extension} to an 8-digit zeropadded remediated filename
        Images are numbered from 00000001 in sorted filename order. The whole
        volume is backed up (see backup_volume) before any file is renamed.

        -- Arguments --
        with_extension: type=string; file extension with or without a '.'

        -- Returns --
        None
        '''
        formatted_extension = get_formatted_extension(with_extension)
        remediated_extension = get_formatted_extension(with_extension, remediate=True)

        # safety net: full copy of the volume before any file name changes
        self.backup_volume()

        image_paths_list = self.get_image_paths(formatted_extension)

        # Phase 1: park every image under a temporary name. Renaming straight
        # to the final name can land on an image that has not been processed
        # yet (e.g. 00000000.jpg -> 00000001.jpg while 00000001.jpg is still
        # waiting), and on POSIX that rename silently replaces the other file.
        staged_renames = []
        for index, image_path in enumerate(image_paths_list, start=1):
            final_path = image_path.with_name(f'{index:08d}{remediated_extension}')
            staging_path = image_path.with_name(f'{final_path.name}.renaming')
            image_path.rename(staging_path)
            staged_renames.append((staging_path, final_path))

        # Phase 2: every original name is out of the way, so the final names are free
        for staging_path, final_path in staged_renames:
            staging_path.rename(final_path)

    def report(self):
        '''
        Reports back with known list of errors and remediation suggestions

        The list is printed, not returned. The only check performed is whether
        the directory name equals str(object_id).
        '''
        error_list = []
        if self.directory_path.name != str(self.object_id):
            error_list.append('Directory name and object ID do NOT match: MUST remediate')

        print(error_list)