# UT Record -- Batch Resize & OCR ScanTailor Individual Volumes

In [None]:
# magic that lets us plot directly in the notebook
%matplotlib inline

# imports
import shutil
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from ipywidgets import IntProgress, Label, VBox
from IPython.display import display
from PIL import Image

# import Jeremy's code from img_qc/img_qc.py
import img_qc.img_qc as img_qc  # don't want to have to type it twice so import it as img_qc

In [None]:
# matplotlib & pandas options

# parameters for matplotlib to increase our default figure size -- NOTE: figure sizes are in INCHES
# the tuple is (width, height); it becomes the default for every figure created after this cell runs
plt.rcParams["figure.figsize"] = (12,12)  # set as needed for your screen and eyes

# on a high-dpi monitor this will increase the quality of plots on-screen
# %config InlineBackend.figure_format = 'retina'

# set max_colwidth for a wider column so data (e.g. long file paths) isn't truncated
# use the full option name: the 'max_colwidth' shorthand is resolved by pattern matching
# and would become ambiguous if pandas ever adds another option with a similar name
pd.set_option('display.max_colwidth', 240)

In [None]:
# project root on the network share, plus the stage folders this notebook reads from / refers to
data_directory = Path('/Volumes/fluffy/0_ActiveProjects/utk_UTRecord/')
input_directory = data_directory / '2.toResizeAndPDF'
output_directory = data_directory / '3.toQC'

# every immediate sub-directory of the input folder, in sorted order (plain files are ignored)
directories_to_convert_to_pdf_paths_list = sorted(
    entry for entry in input_directory.iterdir() if entry.is_dir()
)

print(f'{len(directories_to_convert_to_pdf_paths_list)} directories to process')

In [None]:
# find all ScanTailor cache directories
# (recursive search: anything named 'cache' at any depth below the input folder)
cache_directory_paths_list = sorted(input_directory.rglob('cache'))
print(f'Number of cache directories: {len(cache_directory_paths_list)}')

In [None]:
# delete all ScanTailor cache directories
for cache_directory_path in cache_directory_paths_list:
    # only remove paths that are still directories on disk: the list may be stale
    # (cell re-run, or a 'cache' nested inside one that was just removed), and the
    # glob also matches plain files named 'cache', which rmtree cannot delete
    if cache_directory_path.is_dir():
        shutil.rmtree(cache_directory_path)

# search again so the printed count reflects what is actually left on disk (expected: 0)
cache_directory_paths_list = sorted(input_directory.glob('**/cache'))
print(f'Number of cache directories: {len(cache_directory_paths_list)}')

In [None]:
# run Jeremy's function (img_qc.get_images_df) to load metadata using ExifTool into
# a DataFrame (think Excel: rows & columns of data)
# NOTE(review): 'tif' is presumably the file extension to look for under input_directory -- confirm in img_qc
images_df = img_qc.get_images_df(input_directory, 'tif')

# bare name on the last line so the notebook displays the DataFrame
images_df

In [None]:
# unique resolutions
# (distinct values found in the 'EXIF:XResolution' column; more than one value means the images were not all saved at the same resolution)
images_df['EXIF:XResolution'].unique()

In [None]:
# first 5 unique widths
# (the distinct 'EXIF:ImageWidth' values are sorted ascending first, so these are the 5 smallest widths)
sorted(images_df['EXIF:ImageWidth'].unique())[:5]

In [None]:
# get the horizontal images; where width > height
is_horizontal = images_df['EXIF:ImageWidth'] > images_df['EXIF:ImageHeight']
horizontal_df = images_df[is_horizontal]

if horizontal_df.empty:  # nothing is wider than it is tall
    print('No horizontal images')
else:
    # list the full path of every horizontal image, one per line
    for sourcefile in horizontal_df['SourceFile']:
        print(sourcefile)

In [None]:
# get all of the directories
# (distinct values of the 'File:Directory' column, i.e. every directory that appears in the loaded metadata)
images_df['File:Directory'].unique()

In [None]:
# get dataframe of first image directory
# ("first" = the directory that sorts first among the distinct 'File:Directory' values)
first_directory = sorted(images_df['File:Directory'].unique())[0]
in_first_directory = images_df['File:Directory'] == first_directory
image_directory_df = images_df[in_first_directory]

# show the file names of the images in that directory
image_directory_df['File:FileName']

In [None]:
# distinct 'EXIF:ImageWidth' values among the images of the selected directory
image_directory_df['EXIF:ImageWidth'].unique()

In [None]:
# keep only the rows of the selected directory whose width is exactly 2534
# NOTE(review): 2534 is hard-coded -- presumably picked by eye from the unique widths listed for this directory; adjust per volume
width_is_2534 = image_directory_df['EXIF:ImageWidth'] == 2534
small_images_df = image_directory_df[width_is_2534]

In [None]:
# take the path that sorts first among the small images, then display it
image_path = sorted(small_images_df['SourceFile'])[0]
image_path

In [None]:
# open image
image = Image.open(image_path)
print(image)
print()

# basic properties reported by the image object
print(f'image.format: {image.format}')
print(f'image.size: {image.size}')  # (width, height)
print(f'image.mode: {image.mode}')
print()

# dump every entry of the image.info dictionary, numbered from 0
for index, (data, value) in enumerate(image.info.items()):
    print(f'{index} (key): {data}')
    print(f'{index} (value): {value}')
    print()

In [None]:
# draw the image opened from image_path inline in the notebook
plt.imshow(image)

In [None]:
# loop over sourcefiles and display them, one figure per image
for sourcefile in small_images_df['SourceFile']:

    # open image inside a context manager so its file handle is released
    # as soon as the image has been drawn, instead of staying open for
    # every image in the loop
    with Image.open(sourcefile) as small_image:

        # start a new figure so each image gets its own plot
        plt.figure()

        # display image -- imshow copies the pixel data into the figure,
        # so the plot stays valid after the file is closed
        # NOTE: small_image itself is closed once this block ends
        plt.imshow(small_image)