# Create Project CERES Deliverables for Tennessee farm and home science (agrtfhs)



## Processing Steps
1. Import csv with titles into DataFrame
 - ```Tennessee farm and home science, progress report 1, January - March 1952```
1. convert title into starting month and year
 - ```January 1952```
1. convert month and year into yyyy_mm
 - ```1952_01```
1. map adminDB directory to new yyyy_mm name
 - old_directory_name: ```0012_004267_000001```
 - new_directory_name: ```1952_01```
1. get page count as a simple integer
 - ```12 pages``` -> ```12```
1. process each directory using DataFrame
 - verify number of images on disk vs. metadata
 - create new directory with yyyy_mm
 - OCR TIFF with Tesseract and output PDF into new directory as 001.pdf, 002.pdf, 00n.pdf

In [1]:
# importing and options
import subprocess
import time
from pathlib import Path

import pandas as pd
from ipywidgets import IntProgress, Label, VBox
from IPython.display import display

# == display 95% width
# import from IPython.display: importing display/HTML from IPython.core.display is deprecated
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

# set pandas option to display full column contents; None disables truncation
# (the old -1 sentinel is deprecated since pandas 1.0 and rejected by newer releases)
pd.set_option('display.max_colwidth', None)

## Load data from CSV

In [2]:
# the titles spreadsheet was exported from Google Docs as a csv file
titles_csv_path = 'data/tennessee_farm_and_home_science.csv'
df_tn_farm_and_home_science = pd.read_csv(titles_csv_path)

# preview the first five rows
df_tn_farm_and_home_science.head(n=5)

Unnamed: 0,adminDB,title,extent
0,0012_004267_000001,"Tennessee farm and home science, progress report 1, January - March 1952",12 pages
1,0012_004267_000002,"Tennessee farm and home science, progress report 2, April - June 1952",12 pages
2,0012_004267_000003,"Tennessee farm and home science, progress report 3, July - September 1952",12 pages
3,0012_004267_000004,"Tennessee farm and home science, progress report 4, October - December 1952",12 pages
4,0012_004267_000005,"Tennessee farm and home science, progress report 5, January - March 1953",12 pages


## Split date range from title

In [3]:
# preview only: the date range is the text that follows the last comma of the title
title_fields = df_tn_farm_and_home_science['title'].str.rsplit(',')
title_fields.str.get(-1).head(5)

0     January - March 1952   
1     April - June 1952      
2     July - September 1952  
3     October - December 1952
4     January - March 1953   
Name: title, dtype: object

In [4]:
# add date_range to Dataframe: the text after the LAST comma of the title
# - n=1 splits once from the right instead of splitting on every comma
# - strip() drops the space that follows the comma so the stored value is clean
df_tn_farm_and_home_science['date_range'] = (
    df_tn_farm_and_home_science['title']
    .str.rsplit(',', n=1)
    .str[-1]
    .str.strip()
)
df_tn_farm_and_home_science.head(5)

Unnamed: 0,adminDB,title,extent,date_range
0,0012_004267_000001,"Tennessee farm and home science, progress report 1, January - March 1952",12 pages,January - March 1952
1,0012_004267_000002,"Tennessee farm and home science, progress report 2, April - June 1952",12 pages,April - June 1952
2,0012_004267_000003,"Tennessee farm and home science, progress report 3, July - September 1952",12 pages,July - September 1952
3,0012_004267_000004,"Tennessee farm and home science, progress report 4, October - December 1952",12 pages,October - December 1952
4,0012_004267_000005,"Tennessee farm and home science, progress report 5, January - March 1953",12 pages,January - March 1953


## Convert date range into yyyy and mm

In [5]:
# preview only: the first whitespace-separated token of the date range is the starting month
date_range_tokens = df_tn_farm_and_home_science['date_range'].str.split()
date_range_tokens.str.get(0).head(5)

0    January
1    April  
2    July   
3    October
4    January
Name: date_range, dtype: object

In [6]:
# store the starting month name (first token of date_range) in a 'Month' column
starting_month = df_tn_farm_and_home_science['date_range'].str.split().str.get(0)
df_tn_farm_and_home_science['Month'] = starting_month
df_tn_farm_and_home_science.head(n=5)

Unnamed: 0,adminDB,title,extent,date_range,Month
0,0012_004267_000001,"Tennessee farm and home science, progress report 1, January - March 1952",12 pages,January - March 1952,January
1,0012_004267_000002,"Tennessee farm and home science, progress report 2, April - June 1952",12 pages,April - June 1952,April
2,0012_004267_000003,"Tennessee farm and home science, progress report 3, July - September 1952",12 pages,July - September 1952,July
3,0012_004267_000004,"Tennessee farm and home science, progress report 4, October - December 1952",12 pages,October - December 1952,October
4,0012_004267_000005,"Tennessee farm and home science, progress report 5, January - March 1953",12 pages,January - March 1953,January


In [7]:
# store the year (last token of date_range) in a 'yyyy' column
year_token = df_tn_farm_and_home_science['date_range'].str.split().str.get(-1)
df_tn_farm_and_home_science['yyyy'] = year_token
df_tn_farm_and_home_science.head(n=5)

Unnamed: 0,adminDB,title,extent,date_range,Month,yyyy
0,0012_004267_000001,"Tennessee farm and home science, progress report 1, January - March 1952",12 pages,January - March 1952,January,1952
1,0012_004267_000002,"Tennessee farm and home science, progress report 2, April - June 1952",12 pages,April - June 1952,April,1952
2,0012_004267_000003,"Tennessee farm and home science, progress report 3, July - September 1952",12 pages,July - September 1952,July,1952
3,0012_004267_000004,"Tennessee farm and home science, progress report 4, October - December 1952",12 pages,October - December 1952,October,1952
4,0012_004267_000005,"Tennessee farm and home science, progress report 5, January - March 1953",12 pages,January - March 1953,January,1953


In [8]:
# spot check on the first row: full month name -> zero-padded month number, e.g. January -> 01
first_month_name = df_tn_farm_and_home_science.loc[0, 'Month']
first_month_number = time.strptime(first_month_name, '%B').tm_mon
f'{first_month_number:02d}'

'01'

In [9]:
# build the values for the mm column as a plain list:
# each full month name is parsed with time.strptime and its number zero-padded to 2 digits
mm_list = [
    str(time.strptime(month_name, '%B').tm_mon).zfill(2)
    for month_name in df_tn_farm_and_home_science['Month']
]
mm_list[:6]

['01', '04', '07', '10', '01', '04']

In [10]:
# attach the zero-padded month numbers as the 'mm' column
df_tn_farm_and_home_science = df_tn_farm_and_home_science.assign(mm=mm_list)
df_tn_farm_and_home_science.tail(n=5)

Unnamed: 0,adminDB,title,extent,date_range,Month,yyyy,mm
130,0012_004267_000131,"Tennessee farm and home science, progress report 152, October - December 1989",52 pages,October - December 1989,October,1989,10
131,0012_004267_000132,"Tennessee farm and home science, progress report 153, January - March 1990",28 pages,January - March 1990,January,1990,1
132,0012_004267_000133,"Tennessee farm and home science, progress report 154, April - June 1990",28 pages,April - June 1990,April,1990,4
133,0012_004267_000134,"Tennessee farm and home science, progress report 155, July - September 1990",32 pages,July - September 1990,July,1990,7
134,0012_004267_000135,"Tennessee farm and home science, progress report 156, October - December 1990",32 pages,October - December 1990,October,1990,10


## Map old to new directory

In [11]:
# the adminDB identifier doubles as the name of the existing (old) directory;
# copy it into an explicitly named column
df_tn_farm_and_home_science = df_tn_farm_and_home_science.assign(
    old_directory_name=df_tn_farm_and_home_science['adminDB']
)
df_tn_farm_and_home_science.head(n=5)

Unnamed: 0,adminDB,title,extent,date_range,Month,yyyy,mm,old_directory_name
0,0012_004267_000001,"Tennessee farm and home science, progress report 1, January - March 1952",12 pages,January - March 1952,January,1952,1,0012_004267_000001
1,0012_004267_000002,"Tennessee farm and home science, progress report 2, April - June 1952",12 pages,April - June 1952,April,1952,4,0012_004267_000002
2,0012_004267_000003,"Tennessee farm and home science, progress report 3, July - September 1952",12 pages,July - September 1952,July,1952,7,0012_004267_000003
3,0012_004267_000004,"Tennessee farm and home science, progress report 4, October - December 1952",12 pages,October - December 1952,October,1952,10,0012_004267_000004
4,0012_004267_000005,"Tennessee farm and home science, progress report 5, January - March 1953",12 pages,January - March 1953,January,1953,1,0012_004267_000005


In [12]:
# new_directory_name is '<yyyy>_<mm>', e.g. 1952_01
year_strings = df_tn_farm_and_home_science['yyyy'].map(str)
month_strings = df_tn_farm_and_home_science['mm'].map(str)
df_tn_farm_and_home_science['new_directory_name'] = year_strings + '_' + month_strings
df_tn_farm_and_home_science.head(n=5)

Unnamed: 0,adminDB,title,extent,date_range,Month,yyyy,mm,old_directory_name,new_directory_name
0,0012_004267_000001,"Tennessee farm and home science, progress report 1, January - March 1952",12 pages,January - March 1952,January,1952,1,0012_004267_000001,1952_01
1,0012_004267_000002,"Tennessee farm and home science, progress report 2, April - June 1952",12 pages,April - June 1952,April,1952,4,0012_004267_000002,1952_04
2,0012_004267_000003,"Tennessee farm and home science, progress report 3, July - September 1952",12 pages,July - September 1952,July,1952,7,0012_004267_000003,1952_07
3,0012_004267_000004,"Tennessee farm and home science, progress report 4, October - December 1952",12 pages,October - December 1952,October,1952,10,0012_004267_000004,1952_10
4,0012_004267_000005,"Tennessee farm and home science, progress report 5, January - March 1953",12 pages,January - March 1953,January,1953,1,0012_004267_000005,1953_01


In [13]:
# verify that we have a matching number of old_ and new_directory_name(s)
# fail here, before any OCR work starts, rather than part-way through the batch:
# - every row needs both names
# - names must be unique (each new_directory_name is created with mkdir(), so a
#   duplicate would collide on disk)
directory_columns = ['old_directory_name', 'new_directory_name']
assert df_tn_farm_and_home_science[directory_columns].notna().all().all(), \
    'missing old_/new_directory_name value(s)'
for directory_column in directory_columns:
    assert df_tn_farm_and_home_science[directory_column].is_unique, \
        f'duplicate values in {directory_column}'

# non-null counts per column, for visual confirmation
df_tn_farm_and_home_science.count()

adminDB               135
title                 135
extent                135
date_range            135
Month                 135
yyyy                  135
mm                    135
old_directory_name    135
new_directory_name    135
dtype: int64

## Convert pages value to integer

In [14]:
# Add the number of pages, without " pages", from the extent field
# e.g. "12 pages" -> 12
# astype(int) stores a real integer (as the section title promises) and raises
# immediately if any extent value does not start with a whole number
df_tn_farm_and_home_science['pages'] = (
    df_tn_farm_and_home_science['extent']
    .str.split()
    .str[0]
    .astype(int)
)
df_tn_farm_and_home_science.head(5)

Unnamed: 0,adminDB,title,extent,date_range,Month,yyyy,mm,old_directory_name,new_directory_name,pages
0,0012_004267_000001,"Tennessee farm and home science, progress report 1, January - March 1952",12 pages,January - March 1952,January,1952,1,0012_004267_000001,1952_01,12
1,0012_004267_000002,"Tennessee farm and home science, progress report 2, April - June 1952",12 pages,April - June 1952,April,1952,4,0012_004267_000002,1952_04,12
2,0012_004267_000003,"Tennessee farm and home science, progress report 3, July - September 1952",12 pages,July - September 1952,July,1952,7,0012_004267_000003,1952_07,12
3,0012_004267_000004,"Tennessee farm and home science, progress report 4, October - December 1952",12 pages,October - December 1952,October,1952,10,0012_004267_000004,1952_10,12
4,0012_004267_000005,"Tennessee farm and home science, progress report 5, January - March 1953",12 pages,January - March 1953,January,1953,1,0012_004267_000005,1953_01,12


In [15]:
# spot check: new directory name of the first row
df_tn_farm_and_home_science.loc[0, 'new_directory_name']

'1952_01'

## Batch process directories

In [16]:
# create class for processing individual volumes
class Agrtfn:
    """One issue: a directory of TIFF page images that is OCR'd into one PDF per page.

    Metadata for the issue (page count, output directory name) is looked up in
    the module-level ``df_tn_farm_and_home_science`` DataFrame by matching the
    input directory's name against the ``old_directory_name`` column.
    Output is written below the module-level ``batch_output_directory_path``.
    """

    def __init__(self, directory_path):
        """Record the input directory and look up its expected page count.

        Args:
            directory_path: ``str`` or ``Path`` of the issue's input directory.

        Raises:
            ValueError: if the directory name does not match exactly one
                metadata row, or the ``pages`` value is not a whole number.
        """
        self.directory_path = Path(directory_path)
        self.number_of_pages = int(str(self._lookup_metadata('pages')).strip())

    def _lookup_metadata(self, column):
        """Return the value of `column` from the single metadata row of this directory."""
        directory_name = self.directory_path.name
        # exact comparison: str.contains() would treat the name as a regex and
        # also accept it as a substring of a longer identifier
        old_directory_names = (
            df_tn_farm_and_home_science['old_directory_name'].astype(str).str.strip()
        )
        matches = df_tn_farm_and_home_science.loc[old_directory_names == directory_name, column]
        if len(matches) != 1:
            raise ValueError(
                f'expected exactly 1 metadata row for {directory_name!r}, found {len(matches)}'
            )
        return matches.iloc[0]

    def get_image_paths_list(self):
        """Collect the sorted ``*.tif`` paths and compare their count with the metadata.

        Side effects:
            Deletes every ``*.tif`` entry whose name starts with ``.`` (macOS index
            files) and sets ``self.image_paths_list`` / ``self.number_of_images``.

        Returns:
            The sorted list of image paths, or ``None`` (after printing a warning)
            when the number of images differs from ``self.number_of_pages``.
        """
        image_paths_list = []
        for image_path in sorted(self.directory_path.glob('*.tif')):
            if image_path.name.startswith('.'):
                image_path.unlink()  # delete macOS index file
            else:
                image_paths_list.append(image_path)

        self.image_paths_list = image_paths_list
        self.number_of_images = len(self.image_paths_list)

        if self.number_of_images != self.number_of_pages:
            print('# of images != # of pages described in metadata')
            print(f'images: {self.number_of_images}')
            print(f'pages: {self.number_of_pages}')
            return None

        return self.image_paths_list

    def process(self):
        """OCR every image with Tesseract into ``<output directory>/NNN.pdf``.

        Side effects:
            Creates the output directory (named after the metadata's
            ``new_directory_name``), displays a progress widget, and sets
            ``self.output_directory_name``, ``self.output_directory_path``,
            ``self.pdf_paths_list`` and ``self.number_of_pdfs``.

        Returns:
            The sorted list of PDF paths, or ``None`` (after printing a warning)
            when the number of PDFs differs from the number of images.

        Raises:
            ValueError: if the metadata lookup does not find exactly one row.
            FileExistsError: if the output directory already exists.
        """
        # allow process() to be called without a prior get_image_paths_list()
        if not hasattr(self, 'image_paths_list'):
            self.get_image_paths_list()

        # progress bar
        progress_label = Label('Images being processed')
        progress_bar = IntProgress(min=0, max=self.number_of_images)
        progress_widget = VBox([progress_label, progress_bar])
        display(progress_widget)

        # get output directory name for THIS instance's directory
        self.output_directory_name = str(self._lookup_metadata('new_directory_name')).strip()
        # set output directory path and make directory
        self.output_directory_path = batch_output_directory_path.joinpath(self.output_directory_name)
        self.output_directory_path.mkdir()

        # for each *.tif in self.directory_path
        for index, image_path in enumerate(self.image_paths_list, start=1):

            # update progress bar label
            progress_label.value = f'Processing image: {image_path.name} . . . {index}/{self.number_of_images}'

            # NO extension at the end: Tesseract adds '.pdf' to the output base itself
            pdf_output_base_path = self.output_directory_path.joinpath(f'{index:03d}')

            # OCR with Tesseract; an argument list (no shell) keeps paths with
            # spaces or shell metacharacters intact, stderr is discarded as before
            completed = subprocess.run(
                ['tesseract', str(image_path), str(pdf_output_base_path), 'pdf'],
                stderr=subprocess.DEVNULL,
            )
            if completed.returncode != 0:
                print(f'tesseract exited with status {completed.returncode} for {image_path.name}')

            # update progress bar value
            progress_bar.value = index

        self.pdf_paths_list = sorted(self.output_directory_path.glob('*.pdf'))
        self.number_of_pdfs = len(self.pdf_paths_list)

        # compare PDFs against IMAGES (comparing number_of_pdfs with itself never fails)
        if self.number_of_pdfs != self.number_of_images:
            print('# of PDFs != # of images to OCR')
            print(f'PDFs: {self.number_of_pdfs}')
            print(f'images: {self.number_of_images}')
            return None

        return self.pdf_paths_list

In [17]:
# Set batch_input and batch_output directories

# set project identifier and root directory path
project_identifier = 'agrtfhs'
root_directory_path = Path('/Volumes/fluffy/ProjectCeres/00_for_CRL/')

# set batch_input and batch_output directory paths from root and project identifier
batch_input_directory_path = root_directory_path.joinpath(project_identifier)
batch_output_directory_path = root_directory_path.joinpath(f'{project_identifier}_batch_output')
# exist_ok=True so re-running this cell does not raise FileExistsError
batch_output_directory_path.mkdir(exist_ok=True)

# create batch_input directory paths list
# sorted because iterdir() yields entries in arbitrary order; this keeps the
# processing order the same on every run
batch_input_directory_paths_list = sorted(
    x for x in batch_input_directory_path.iterdir() if x.is_dir()
)
number_of_input_dirs = len(batch_input_directory_paths_list)

print(f'batch_input directory: {batch_input_directory_path}')
print(f'\t{number_of_input_dirs} directories to batch process\n')
print(f'batch_output directory: {batch_output_directory_path}')

batch_input directory: /Volumes/fluffy/ProjectCeres/00_for_CRL/agrtfhs
	135 directories to batch process

batch_output directory: /Volumes/fluffy/ProjectCeres/00_for_CRL/agrtfhs_batch_output


In [None]:
# start batch process

# progress bar
progress_label = Label('Directories being processed')
progress_bar = IntProgress(min=0, max=number_of_input_dirs)
progress_widget = VBox([progress_label, progress_bar])
display(progress_widget)

# counted separately from the loop variable so the summary below also works
# when there are no input directories (the loop variable would be undefined)
number_of_processed_dirs = 0

for index, directory_path in enumerate(batch_input_directory_paths_list, start=1):

    # update progress bar label
    progress_label.value = f'Processing directory: {directory_path.name} . . . {index}/{number_of_input_dirs}'

    # load directory path as class & process
    issue = Agrtfn(directory_path)
    # prints a warning when the image count differs from the metadata; the
    # return value is not used, so the directory is processed either way
    issue.get_image_paths_list()
    issue.process()

    # update progress bar value
    progress_bar.value = index
    number_of_processed_dirs = index

print(f'Processed {number_of_processed_dirs} directories')

VBox(children=(Label(value='Directories being processed'), IntProgress(value=0, max=135)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=12)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=12)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=12)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=12)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=12)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=12)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=12)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=12)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=12)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=12)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=12)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=12)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=12)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=12)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=12)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=12)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=12)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=12)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=12)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=12)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=12)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=12)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=12)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=12)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=16)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=12)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=12)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=12)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=12)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=12)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=12)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=12)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=16)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=12)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=12)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=12)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=12)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=16)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=16)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=16)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=20)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=16)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=16)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=24)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=20)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=16)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=12)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=16)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=20)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=24)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=16)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=16)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=24)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=20)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=20)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=24)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=32)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=20)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=20)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=24)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=36)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=32)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=36)))

VBox(children=(Label(value='Images being processed'), IntProgress(value=0, max=28)))