# HathiTrust Book Processing

This notebook contains a HathiTrust_Volume class to process a digitized item ready that has passed quality control and is ready for ingest into the HathiTrust

Tasks:
1. Rename directory for HathiTrust
2. Rename images and remediate file extensions for HathiTrust
3. Clean up folder for ingest
4. Verify bitonal images

In [1]:
# importing and options
import shutil
from pathlib import Path

%matplotlib notebook
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image

# == display 95% width
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [2]:
# functions
def get_formatted_extension(from_extension, remediate=False):
    '''
    -- Purpose --
    Returns an extension that:
    1. has a period in the front
    2. Optional: is lower-case
    3. Optional: return jpeg as jpg and tiff as tif

    -- Arguments --
    from_extension: type=string; file extension with or without a '.'

    -- Returns --
    formatted_extension: type=string; formatted extension
    '''
    # make sure there's a period at the front of the extension
    if from_extension.startswith('.'):  # do nothing
        formatted_extension = from_extension
    else:  # add a period
        formatted_extension = f'.{from_extension}'

    # make it lower-case
    if remediate:
        formatted_extension = formatted_extension.lower()
        # hard-coded alterations for jpeg and tiff
        if formatted_extension == '.jpeg':
            formatted_extension = '.jpg'
        elif formatted_extension == '.tiff':
            formatted_extension = '.tif'

    return formatted_extension

In [3]:
# classes
class HathiTrust_Volume:
    
    '''Common base class for all Hathi Trust Volumes'''
    
    def __init__(self, directory, object_id):
        self.directory_path = Path(directory)
        self.object_id = object_id
        
    def backup_volume(self):
        '''
        -- Purpose --
        Copy all files in directory to backup directory with name: <directory>_backup

        -- Arguments --
        None

        -- Returns --
        backup_directory_path: type=Path-like object; returns absolute path to backup directory
        '''
        backup_directory_name = f'{self.directory_path.name}_backup'
        backup_directory_path = self.directory_path.parents[0].joinpath(backup_directory_name)

        if backup_directory_path.exists():  # shutil.copytree requires directory to NOT exist
            shutil.rmtree(backup_directory_path)

        shutil.copytree(self.directory_path, backup_directory_path)

        if backup_directory_path.exists():
            return backup_directory_path.resolve()
        
    def get_file_paths(self, with_extension):
        
        # get formatted extension and return sorted list of file paths that match
        formatted_extension = get_formatted_extension(with_extension)
        
        file_paths_list = sorted(self.directory_path.glob(f'*{formatted_extension}'))
        
        return file_paths_list
        
    def rename_directory(self):
        '''
        -- Purpose --
        Rename {self.directory_path} to match {self.object_id}
        
        -- Arguments --
        None
        
        -- Returns --
        book: type=class object; HathiTrust_Volume as book at {new_directory_path}
        '''
        
        # backup directory before proceeding
        # backup_directory_path = self.backup_volume()
        
        # set new directory path with the object_id then rename
        new_directory_path = self.directory_path.parents[0].joinpath(self.object_id)
        try:
            self.directory_path.replace(new_directory_path)
        except OSError:
            print(f'{new_directory_path} already exists and was not replaced')
        
        # return HathiTrust_Volume as new book
        book = HathiTrust_Volume(new_directory_path, self.object_id)
        
        return book
    
    def rename_images(self, with_extension):
        '''
        -- Purpose --
        Rename images {with_extension} to an 8-digit zeropadded remediated filename
        '''
        formatted_extension = get_formatted_extension(with_extension)
        
        remediated_extension = get_formatted_extension(with_extension, remediate=True)
        
        new_directory_path = self.directory_path.parents[0].joinpath(self.object_id)
        
        backup_directory_path = self.backup_volume()
        
        file_paths_list = self.get_file_paths(formatted_extension)
        number_of_images = len(file_paths_list)
        
        for index, file_path in enumerate(file_paths_list, start=1):
            new_image_name = f'{str(index).zfill(8)}{remediated_extension}'
            new_file_path = file_path.parents[0].joinpath(new_image_name)
            file_path.rename(new_file_path)
        
    def report(self):
        '''
        Reports back with known list of errors and remediation suggestions
        '''
        
        print('Report assumes you are starting with a directory of TIFFs that have not been converted for HathiTrust yet')
        
        error_count = 0
        error_list = []
        
        # verify directory name is the object_id
        if self.directory_path.name != str(self.object_id):
            error_count += 1
            error_list.append('! Must Redmediate: Directory name and object ID do NOT match')
            error_list.append('------ Remediate with: book = book.rename_directory() then re-run book.report()')
        
        # check if .tiff is an extension
        extensions_list = set([x.suffix for x in self.directory_path.iterdir() if x.is_file()])
        
        if '.tiff' in extensions_list:
            error_count += 1
            error_list.append('! Must Remediate: Images have extension .tiff')
            error_list.append('------ Remediate with: book.rename_images(".tiff") then re-run book.report()')
        
        # verify image names
        image_paths_list = self.get_file_paths('.tif')
        
        number_of_images = len(image_paths_list)
        if number_of_images == 0:
            error_count += 1
            error_list.append('!!! FATAL ERROR: No *.tif files to process')
            error_list.append(f'Check your directory with: !open {self.directory_path}')
        else:
            number_of_naming_errors = 0
    
            # check that first image is correctly named 8-digit, zeropadded 1.tif
            if image_paths_list[0].name != '00000001.tif':
                error_count += 1
                error_list.append(f'! Must Remediate: image_paths_list[0].name is NOT 00000001.tif')
                error_list.append('------ Remediate with: book.rename_images(".tif") then re-run book.report()')
            # check that each .tif image is correctly numbered starting with 1
            else:
                for number in range(number_of_images):
            
                    # create name and path to test against image_path
                    name_to_test = f'str(number + 1).zfill(8).tif'  # add 1 to number for filename
                    path_to_test = self.directory_path.joinpath(name_to_test)
                    
                    image_path = image_paths_list[number]
                
                    if path_to_test != image_path:  # sorted list of filenames is incorrect, but file may still exist
                        if not path_to_test.is_file():
                            number_of_naming_errors += 1
                        else:
                            naming_error = f'A naming error exists, investigate manually'
        
        print(f'There are {error_count} errors')
        if error_count > 0:
            print('------ Remediate errors in the order that they appear below')
            for line in error_list:
                print(line)
        else:
            print(f'Continue to process images')
        
        

In [4]:
# Technical Metadata

# individual object and institution info to verify
object_id = 'papers-andrew-v1-test'
scanner_user = 'University of Tennessee: John C. Hodges Library'  # Wrong
scanning_order = 'left-to-right'
reading_order = 'left-to-right'

# capture info to verify
capture_date = '2019-01-01T12:00:00-5:00'  # Wrong
scanner_make = 'Fujitsu'  # WRONG
scanner_model = '6670-fi'  # WRONG

# processing info to verify
bitonal_resolution_dpi = 600  # HathiTrust min bitonal dpi: 600
contone_resolution_dpi = 300  # HathiTrust min contone dpi: 300
image_compression_date = '2019-02-12T12:00:00-5:00'  # Wrong
image_compression_agent = 'utk'  # HathiTrust organization code  Wrong
image_compression_tool: ['Pillow v.5.4.1']  # Assumes use of Python/Pillow for compression, check version number

In [31]:
# HathiTrust page-level metadata

# How to capture page-level metadata?

In [16]:
book_path = Path('data/PapersOfAndrewJackson/01_tif/papers-andrew-v1-test')
book = HathiTrust_Volume(book_path, object_id)

In [17]:
book.report()

Report assumes you are starting with a directory of TIFFs that have not been converted for HathiTrust yet
There are 0 errors
Continue to process images


In [8]:
book = book.rename_directory()

In [9]:
book.report()

Report assumes you are starting with a directory of TIFFs that have not been converted for HathiTrust yet
There are 2 errors
------ Remediate errors in the order that they appear below
! Must Remediate: Images have extension .tiff
------ Remediate with: book.rename_images(".tiff") then re-run book.report()
!!! FATAL ERROR: No *.tif files to process
Check your directory with: !open data/PapersOfAndrewJackson/01_tif/papers-andrew-v1-test


In [12]:
book = book.rename_images(".tiff")

In [13]:
book.report()

AttributeError: 'NoneType' object has no attribute 'report'

In [18]:
# can I get the necessary metadata I need from Pillow for HathiTrust?

# split images at: bitonal/contone is bitspersample -> 1/1+
# bitonal: format -> TIFF
# bitonal: photometric_interpretation -> 0
# bitonal: x_ and y_ resolution -> 600
# bitonal: compression -> Group4
# if bitonal passes add to OCR list
# if bitonal doens't pass -> remediate
# contone: format -> JPEG2000
# contone: compression -> Kakadu Part 1 Lossy
# contone: bit depth -> 8
# contone: colorspace sRGB *NOTE: icc not embedded

In [19]:
image_paths_list = book.get_file_paths('.tif')
len(image_paths_list)

574

In [28]:
bitonal_image_paths_list = []
contone_image_paths_list = []

for image_path in image_paths_list:
    image = Image.open(image_path)
    bits_per_sample = image.tag_v2[258]
    if bits_per_sample == (8, 8, 8):  # image is 24-bit rgb
        contone_image_paths_list.append(image_path)
    elif bits_per_sample == (8, ):  # image is 8-bit grayscale
        contone_image_paths_list.append(image_path)
    else:  # image should be bitonal
        bitonal_image_paths.append(image_path)

In [30]:
contone_image_paths_list[:4]

[PosixPath('data/PapersOfAndrewJackson/01_tif/papers-andrew-v1-test/00000001.tif'),
 PosixPath('data/PapersOfAndrewJackson/01_tif/papers-andrew-v1-test/00000002.tif'),
 PosixPath('data/PapersOfAndrewJackson/01_tif/papers-andrew-v1-test/00000003.tif'),
 PosixPath('data/PapersOfAndrewJackson/01_tif/papers-andrew-v1-test/00000004.tif')]