# Create Project CERES Deliverables for UT Extension Special Circulars (agrutesc)

 - OCR TIFF with Tesseract and output PDF into new directory as 001.pdf, 002.pdf, 00n.pdf

PDFs will be separated into directories based on publication date after all PDFs have been created. The work to identify each publication date, and therefore the correct directory for each deliverable, will be done concurrently with PDF construction.

In [20]:
# importing and options
import time
from pathlib import Path
from shutil import copy

import pandas as pd
from ipywidgets import IntProgress, Label, VBox
from IPython.display import display

# == display 95% width
# Import from the public IPython.display module; importing `display` from
# IPython.core.display is deprecated (since IPython 7.14).
from IPython.display import display, HTML
# Inject a CSS override so the notebook container uses 95% of the window width.
display(HTML("<style>.container { width:95% !important; }</style>"))

# set pandas option to display full column contents without truncation.
# `None` means "no limit" (pandas >= 1.0); the older `-1` spelling is deprecated
# and rejected by recent pandas, so it is only used as a fallback for old versions.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:  # pandas < 1.0 only accepts an integer here
    pd.set_option('display.max_colwidth', -1)

In [21]:
# exported Google Docs spreadsheet as csv
# load it into the module-level DataFrame `meta_df` used by the later cells
meta_df = pd.read_csv('data/agrutesc_crl_deliverables.csv')
# preview the first five rows
meta_df.head(5)

Unnamed: 0,adminDB,title,crl_deliverable_directory,date,crl_deliverable_by_date
0,0012_004268_000001,Circular 24,circular_24,1933-01-27,1933_n01
1,0012_004268_000002,Circular 25,circular_25,1932-02,1932_n01
2,0012_004268_000003,Circular 26,circular_26,1932-03,1932_n02
3,0012_004268_000004,Circular 28,circular_28,1932-04,1932_n03
4,0012_004268_000005,Circular 29,circular_29,1932-10,1932_n04


In [22]:
# preview only (nothing is stored): titles lower-cased, with every space
# turned into an underscore
meta_df['title'].str.lower().str.replace(' ', '_').head(5)

0    circular_24
1    circular_25
2    circular_26
3    circular_28
4    circular_29
Name: title, dtype: object

In [23]:
# store the normalized title (lower case, spaces -> underscores) as the
# per-circular deliverable directory name, then preview the result
meta_df['crl_deliverable_directory'] = (
    meta_df['title']
    .str.lower()
    .str.replace(' ', '_')
)
meta_df.head(5)

Unnamed: 0,adminDB,title,crl_deliverable_directory,date,crl_deliverable_by_date
0,0012_004268_000001,Circular 24,circular_24,1933-01-27,1933_n01
1,0012_004268_000002,Circular 25,circular_25,1932-02,1932_n01
2,0012_004268_000003,Circular 26,circular_26,1932-03,1932_n02
3,0012_004268_000004,Circular 28,circular_28,1932-04,1932_n03
4,0012_004268_000005,Circular 29,circular_29,1932-10,1932_n04


In [24]:
def _date_stub(date):
    """Return the year part of a 'YYYY[-MM[-DD]]' value, or 'unknown_date' if missing."""
    if pd.isna(date):
        return 'unknown_date'
    # a column holding only bare years may be parsed as numbers (e.g. 1944.0)
    if isinstance(date, float) and date.is_integer():
        date = int(date)
    # everything before the first '-' is the year, however many parts follow
    return str(date).strip().split('-')[0]


def _number_dates_by_year(dates):
    """Label each date '<year>_nNN', numbering repeats of a year in row order."""
    seen_per_stub = {}  # stub -> how many rows have used it so far
    labels = []
    for date in dates:
        stub = _date_stub(date)
        seen_per_stub[stub] = seen_per_stub.get(stub, 0) + 1
        labels.append(f'{stub}_n{seen_per_stub[stub]:02d}')
    return labels


# one label per metadata row, in row order; iterating the date column directly
# avoids a per-row lookup by adminDB (slow, and it breaks on missing adminDB values)
processed_date_list = _number_dates_by_year(meta_df['date'])
processed_date_list

['1933_n01',
 '1932_n01',
 '1932_n02',
 '1932_n03',
 '1932_n04',
 '1938_n01',
 '1938_n02',
 '1933_n02',
 '1933_n03',
 '1934_n01',
 '1937_n01',
 '1937_n02',
 '1937_n03',
 '1940_n01',
 '1936_n01',
 '1936_n02',
 '1936_n03',
 '1936_n04',
 '1936_n05',
 '1937_n04',
 '1937_n05',
 '1937_n06',
 '1937_n07',
 '1937_n08',
 '1937_n09',
 '1937_n10',
 '1937_n11',
 '1937_n12',
 '1937_n13',
 '1937_n14',
 '1937_n15',
 '1937_n16',
 '1937_n17',
 '1937_n18',
 '1937_n19',
 '1937_n20',
 '1937_n21',
 '1937_n22',
 '1937_n23',
 '1938_n03',
 '1937_n24',
 '1937_n25',
 '1937_n26',
 '1938_n04',
 '1938_n05',
 '1938_n06',
 '1938_n07',
 '1938_n08',
 '1938_n09',
 '1938_n10',
 '1938_n11',
 '1938_n12',
 '1938_n13',
 '1938_n14',
 '1938_n15',
 '1938_n16',
 '1938_n17',
 '1938_n18',
 '1938_n19',
 '1938_n20',
 '1938_n21',
 '1938_n22',
 '1938_n23',
 '1938_n24',
 '1939_n01',
 '1939_n02',
 '1939_n03',
 '1939_n04',
 '1939_n05',
 '1939_n06',
 '1939_n07',
 '1939_n08',
 '1939_n09',
 '1939_n10',
 '1939_n11',
 '1939_n12',
 '1939_n13',

In [25]:
# attach the labels in processed_date_list (one per row, in row order) as the
# by-date deliverable directory name for each row
meta_df['crl_deliverable_by_date'] = processed_date_list

In [26]:
# inspect the last five rows, including the crl_deliverable_by_date column
meta_df.tail(5)

Unnamed: 0,adminDB,title,crl_deliverable_directory,date,crl_deliverable_by_date
220,0012_004268_000223,Special Circular 200,special_circular_200,1943-12,1943_n05
221,0012_004268_000224,Special Circular 284,special_circular_284,1947-03,1947_n14
222,0012_004268_000225,Circular UNKNOWN_2,circular_unknown_2,1945-06,1945_n10
223,0012_004268_000226,Circular UNKNOWN_3,circular_unknown_3,,unknown_date_n24
224,0012_004268_000227,Circular UNKNOWN_4,circular_unknown_4,1944,1944_n02


## Batch process directories

In [27]:
# create class for processing individual volumes
class Metadata:
    """OCR one input directory of TIFF page images and copy the resulting PDFs.

    Call ``get_image_paths_list()`` first: it records ``number_of_images``,
    which ``ocr()`` and ``copy_pdfs()`` use for their progress bars and for
    their PDF-count checks.

    ``copy_pdfs()`` reads three notebook-level names: ``meta_df`` (the
    deliverables metadata frame) and ``batch_output_directory_path_1`` /
    ``batch_output_directory_path_2`` (the two output root directories).
    """

    def __init__(self, directory_path):
        """Remember the input directory (str or Path) holding the ``*.tif`` images."""
        self.directory_path = Path(directory_path)

    @staticmethod
    def _list_visible_files(directory, pattern):
        """Return the sorted paths in *directory* matching *pattern*.

        Path.glob also matches hidden files, so macOS index files whose name
        starts with '.' are deleted here and left out of the result.
        """
        visible_paths = []
        for path in sorted(directory.glob(pattern)):
            if path.name.startswith('.'):
                path.unlink()  # delete
            else:
                visible_paths.append(path)
        return visible_paths

    @staticmethod
    def _select_metadata_rows(frame, directory_name):
        """Return the rows of *frame* whose 'adminDB' identifies *directory_name*.

        An exact match is preferred so that '..._1' cannot also select
        '..._10'. Only if nothing matches exactly does this fall back to a
        literal (non-regex) substring match.
        """
        exact_rows = frame[frame['adminDB'] == directory_name]
        if len(exact_rows) > 0:
            return exact_rows
        contains_name = frame['adminDB'].str.contains(directory_name, regex=False, na=False)
        return frame[contains_name]

    def get_image_paths_list(self):
        """Collect the sorted ``*.tif`` paths of the input directory.

        Sets ``self.image_paths_list`` and ``self.number_of_images`` and
        returns the list. Dot-files matching ``*.tif`` are deleted.
        """
        self.image_paths_list = self._list_visible_files(self.directory_path, '*.tif')
        self.number_of_images = len(self.image_paths_list)
        return self.image_paths_list

    def ocr(self):
        """Convert every TIFF into an OCRed PDF with Tesseract.

        PDFs are written next to the images and named with a 3-digit index
        (001.pdf, 002.pdf, ...). Requires ``get_image_paths_list()`` to have
        been called.

        Returns:
            The sorted list of PDF paths, or None (after printing the counts)
            when the number of PDFs differs from the number of images.

        Raises:
            FileNotFoundError: if the ``tesseract`` executable is not on PATH.
        """
        # local import so this cell does not depend on the notebook's import cell
        import subprocess

        # progress bar
        progress_label = Label('Images being processed')
        progress_bar = IntProgress(min=0, max=self.number_of_images)
        progress_widget = VBox([progress_label, progress_bar])
        display(progress_widget)

        failed_images = []
        for index, image_path in enumerate(self.image_paths_list, start=1):

            # update progress bar label
            progress_label.value = f'Processing image: {image_path.name} . . . {index}/{self.number_of_images}'

            # NO extension at the end: Tesseract appends '.pdf' itself
            pdf_output_base = self.directory_path.joinpath(f'{index:03d}')

            # OCR with Tesseract; an argument list (no shell) keeps paths that
            # contain spaces intact. stderr is discarded, as before.
            result = subprocess.run(
                ['tesseract', str(image_path), str(pdf_output_base), 'pdf'],
                stderr=subprocess.DEVNULL,
                check=False,
            )
            if result.returncode != 0:
                failed_images.append(image_path.name)

            # update progress bar value
            progress_bar.value = index

        if failed_images:
            print(f'tesseract failed on: {", ".join(failed_images)}')

        self.pdf_paths_list = self._list_visible_files(self.directory_path, '*.pdf')
        self.number_of_pdfs = len(self.pdf_paths_list)

        if self.number_of_pdfs != self.number_of_images:
            print(f'# of PDFs != # of images to OCR')
            print(f'PDFs: {self.number_of_pdfs}')
            print(f'images: {self.number_of_images}')
            return None
        return self.pdf_paths_list

    def copy_pdfs(self):
        """Copy this directory's PDFs into both deliverable directory trees.

        The target directory names come from the ``meta_df`` row whose
        'adminDB' matches this input directory's name: the
        'crl_deliverable_directory' value is used under
        ``batch_output_directory_path_1`` and the 'crl_deliverable_by_date'
        value under ``batch_output_directory_path_2``. Requires
        ``get_image_paths_list()`` to have been called.

        Problems (no unique metadata row, PDF count differing from the image
        count before or after copying) are printed and end the method early.

        Returns:
            None in every case.

        Raises:
            FileExistsError: if either target directory already exists.
        """
        # progress bar
        progress_label = Label('Copying PDFs . . .')
        progress_bar = IntProgress(min=0, max=self.number_of_images)
        progress_widget = VBox([progress_label, progress_bar])
        display(progress_widget)

        # get output directory names from the metadata row for THIS directory
        matching_rows = self._select_metadata_rows(meta_df, self.directory_path.name)
        if len(matching_rows) != 1:
            print(f'input_dir: {self.directory_path}')
            print(f'expected exactly 1 matching adminDB row in metadata, found {len(matching_rows)}')
            return None
        metadata_row = matching_rows.iloc[0]
        self.output_directory_name_1 = str(metadata_row['crl_deliverable_directory']).strip()
        self.output_directory_name_2 = str(metadata_row['crl_deliverable_by_date']).strip()
        self.output_directory_path_1 = batch_output_directory_path_1.joinpath(self.output_directory_name_1)
        self.output_directory_path_2 = batch_output_directory_path_2.joinpath(self.output_directory_name_2)

        # check the source PDFs BEFORE creating anything, so a skipped input
        # does not leave empty deliverable directories behind
        self.pdf_paths_list = self._list_visible_files(self.directory_path, '*.pdf')
        self.number_of_pdfs = len(self.pdf_paths_list)
        if self.number_of_images != self.number_of_pdfs:
            print(f'input_dir: {self.directory_path}')
            print(f'# of PDFs != # of images to OCR')
            print(f'PDFs: {self.number_of_pdfs}')
            print(f'images: {self.number_of_images}')
            return None

        # deliberately no exist_ok: an existing target raises FileExistsError
        # rather than letting two inputs write into the same output directory
        self.output_directory_path_1.mkdir()
        self.output_directory_path_2.mkdir()
        output_directory_paths = (self.output_directory_path_1, self.output_directory_path_2)

        # for each *.pdf in self.directory_path
        for index, pdf_path in enumerate(self.pdf_paths_list, start=1):

            # update progress bar label
            progress_label.value = f'Copying PDF: {pdf_path.name} . . . {index}/{self.number_of_pdfs}'

            # copy file into both trees, keeping its name
            for output_directory_path in output_directory_paths:
                copy(pdf_path, output_directory_path.joinpath(pdf_path.name))

            # update progress bar value
            progress_bar.value = index

        # verify the copies: each output directory must hold one PDF per image
        for output_directory_path in output_directory_paths:
            copied_pdf_paths = self._list_visible_files(output_directory_path, '*.pdf')
            if self.number_of_images != len(copied_pdf_paths):
                print(f'input_dir: {self.directory_path}')
                print(f'output_dir: {output_directory_path}')
                print(f'# of PDFs != # of images to OCR')
                print(f'PDFs: {len(copied_pdf_paths)}')
                print(f'images: {self.number_of_images}')
                return None

        # leave the by-circular copies as the recorded PDF list
        self.pdf_paths_list = self._list_visible_files(self.output_directory_path_1, '*.pdf')
        self.number_of_pdfs = len(self.pdf_paths_list)
        return None

In [28]:
# Set batch_input and batch_output directories

# set project identifier and root directory path
project_identifier = 'agrutesc'
root_directory_path = Path('/Volumes/fluffy/ProjectCeres/00_for_CRL/')

# set batch_input and batch_output directory paths from root and project identifier
batch_input_directory_path = root_directory_path.joinpath(project_identifier)
batch_output_directory_path_1 = root_directory_path.joinpath(f'{project_identifier}_for_crl_circular')
batch_output_directory_path_2 = root_directory_path.joinpath(f'{project_identifier}_for_crl_date')
# exist_ok=True so this cell can be re-run (e.g. after a kernel restart) without
# failing before batch_input_directory_paths_list is defined
batch_output_directory_path_1.mkdir(exist_ok=True)
batch_output_directory_path_2.mkdir(exist_ok=True)

# create batch_input directory paths list (sorted: iterdir() order is arbitrary)
batch_input_directory_paths_list = sorted(x for x in batch_input_directory_path.iterdir() if x.is_dir())
number_of_input_dirs = len(batch_input_directory_paths_list)

print(f'batch_input directory: {batch_input_directory_path}')
print(f'\t{number_of_input_dirs} directories to batch process\n')
print(f'batch_output directory 1: {batch_output_directory_path_1}')
print(f'batch_output directory 2: {batch_output_directory_path_2}')

batch_input directory: /Volumes/fluffy/ProjectCeres/00_for_CRL/agrutesc
	225 directories to batch process

batch_output directory 1: /Volumes/fluffy/ProjectCeres/00_for_CRL/agrutesc_for_crl_circular
batch_output directory 2: /Volumes/fluffy/ProjectCeres/00_for_CRL/agrutesc_for_crl_date


In [30]:
# start batch process to copy PDFs

# progress bar
progress_label = Label('Directories being processed')
progress_bar = IntProgress(min=0, max=number_of_input_dirs)
progress_widget = VBox([progress_label, progress_bar])
display(progress_widget)

# pre-set so the summary below also works when there are no input directories
index = 0
for index, directory_path in enumerate(batch_input_directory_paths_list, start=1):

    # update progress bar label
    progress_label.value = f'Processing directory: {directory_path.name} . . . {index}/{number_of_input_dirs}'

    # load directory path as class & process;
    # the image list must be built first because copy_pdfs() compares against its count
    issue = Metadata(directory_path)
    issue.get_image_paths_list()
    issue.copy_pdfs()

    # update progress bar value
    progress_bar.value = index

print(f'Processed {index} directories')

VBox(children=(Label(value='Directories being processed'), IntProgress(value=0, max=225)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=6)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=3)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=4)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=4)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=9)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=5)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=6)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=7)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=6)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=5)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=10)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=133)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=40)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=8)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=10)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=5)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=6)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=1)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=9)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=1)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=7)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=10)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=9)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=5)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=8)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=10)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=5)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=9)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=8)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=9)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=4)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=8)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=11)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=8)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=10)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=13)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=18)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=10)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=12)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=6)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=5)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=8)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=6)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=9)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=4)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=7)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=6)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=11)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=4)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=11)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=13)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=14)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=7)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=7)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=4)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=13)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=3)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=3)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=7)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=7)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=5)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=4)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=4)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=28)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=5)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=5)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=3)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=20)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=8)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=8)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=76)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=3)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=6)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=22)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=17)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=3)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=6)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=4)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=9)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=16)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=4)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=1)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=4)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=5)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=10)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=3)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=5)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=8)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=9)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=4)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=5)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=1)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=4)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=5)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=8)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=27)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=4)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=6)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=9)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=6)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=6)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=58)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=6)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=5)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=67)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=27)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=84)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=4)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=1)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=4)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=6)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=3)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=4)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=4)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=3)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=1)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=53)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=26)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=6)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=4)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=6)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=17)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=13)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=68)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=54)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=17)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=8)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=24)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=3)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=7)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=17)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=16)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=11)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=12)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=10)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=8)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=10)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=17)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=7)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=7)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=5)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=4)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=7)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=4)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=6)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=8)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=5)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=5)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=3)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=26)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=13)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=5)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=4)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=10)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=6)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=119)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=4)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=7)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=8)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=15)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=15)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=1)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=15)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=27)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=8)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=23)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=4)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=6)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=4)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=1)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

VBox(children=(Label(value='Copying PDFs . . .'), IntProgress(value=0, max=2)))

Processed 225 directories


In [None]:
# start batch process to OCR the TIFFs of every input directory

# progress bar
progress_label = Label('Directories being processed')
progress_bar = IntProgress(min=0, max=number_of_input_dirs)
progress_widget = VBox([progress_label, progress_bar])
display(progress_widget)

# pre-set so the summary below also works when there are no input directories
index = 0
for index, directory_path in enumerate(batch_input_directory_paths_list, start=1):

    # update progress bar label
    progress_label.value = f'Processing directory: {directory_path.name} . . . {index}/{number_of_input_dirs}'

    # load directory path as class & process
    # (the class defined in this notebook is Metadata; `Agrutesc` is not defined)
    issue = Metadata(directory_path)
    issue.get_image_paths_list()
    issue.ocr()

    # update progress bar value
    progress_bar.value = index

print(f'Processed {index} directories')