# Importing and Options

In [1]:
# importing and options
import time
from pathlib import Path
from shutil import copy, rmtree

import numpy as np
import pandas as pd
from PIL import Image
from tqdm.auto import tqdm

# == display 95% width
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

# set pandas option to display wider column info, use -1 to expand to largest necessary width
pd.set_option('max_colwidth', -1)

# Create folder structure for CRL deliverables

Deprecated Structure:

Root:
oclc#/
       
       v.1/
        
            1934-01-01/
            
                        001.tif
                        002.tif
                        last pg.tif
            1934-02-01/
            
                        001.tif
                        002.tif
                        last pg.tif
            1934-03-01/
            
                        001.tif
                        002.tif
                        last pg.tif
                        
        v.2/
        
            1935-01-01/
            
                        001.tif
                        002.tif
                        last pg.tif
                        
Levels:
1. oclc number for print title
2. volume, written as v. with number
3. issue level, written as yyyy-mm-dd or yyyy-mm if no day
4. individual page TIFFs 001.tif through last page

## New folder structure received from CRL on 2019-04-04

1. issue level date, written yyyy_mm_dd, or yyyy_mm if no day
1. individual PDFs 0001.pdf, 0002.pdf, through last page
1. no instructions given on creating PDFs
 - create PDFs with Pillow?
 
From Amy Wood (Wood@crl.edu)

*When you are ready to deliver the images and the metadata, please email my colleague, Maritza  Kellum, at mkellum@crl.edu  She is the best person to advise you on the file naming as well.  I have copied her on this email.*

*We have recently migrated to a new digital delivery system.  To load files into that system we use the following file naming:
Scans are placed into folders separated by issue. The folders are labeled using issue dates and underscores (yyyy_mm_dd). If there are multiple issues in a single year without month information, then the folders are labeled according to year, volume, and issue number (yyyy_v.#_no.#).*



# Processing Tennessee Farm and Home Science
1. Import csv with titles
 - ```Tennessee farm and home science, progress report 1, January - March 1952```
1. convert title into starting month and year
 - ```January 1952```
1. convert month and year into yyyy_mm
 - ```1952_01```

## Load data

In [2]:
# exported Google Docs spreadsheet as csv
df_tn_farm_and_home_science = pd.read_csv('data/tennessee_farm_and_home_science.csv')

In [3]:
df_tn_farm_and_home_science.head(5)

Unnamed: 0,adminDB,title,extent
0,0012_004267_000001,"Tennessee farm and home science, progress report 1, January - March 1952",12 pages
1,0012_004267_000002,"Tennessee farm and home science, progress report 2, April - June 1952",12 pages
2,0012_004267_000003,"Tennessee farm and home science, progress report 3, July - September 1952",12 pages
3,0012_004267_000004,"Tennessee farm and home science, progress report 4, October - December 1952",12 pages
4,0012_004267_000005,"Tennessee farm and home science, progress report 5, January - March 1953",12 pages


In [4]:
df_tn_farm_and_home_science.head(5)

Unnamed: 0,adminDB,title,extent
0,0012_004267_000001,"Tennessee farm and home science, progress report 1, January - March 1952",12 pages
1,0012_004267_000002,"Tennessee farm and home science, progress report 2, April - June 1952",12 pages
2,0012_004267_000003,"Tennessee farm and home science, progress report 3, July - September 1952",12 pages
3,0012_004267_000004,"Tennessee farm and home science, progress report 4, October - December 1952",12 pages
4,0012_004267_000005,"Tennessee farm and home science, progress report 5, January - March 1953",12 pages


In [5]:
for value in df_tn_farm_and_home_science:
    print(value)

adminDB
title
extent


In [6]:
df_tn_farm_and_home_science['title'][0]

'Tennessee farm and home science, progress report 1, January - March 1952'

In [7]:
# rsplit the title value for the first index in dataframe with spaces and get the last field
df_tn_farm_and_home_science['yyyy'] = df_tn_farm_and_home_science['title'][0].rsplit(' ')[-1]

In [8]:
# add yyyy to DataFrame

# https://stackoverflow.com/a/52269469
# rsplit by ' ' (the default) and get the last field
df_tn_farm_and_home_science['yyyy'] = df_tn_farm_and_home_science['title'].str.rsplit().str[-1]
df_tn_farm_and_home_science.head(5)

Unnamed: 0,adminDB,title,extent,yyyy
0,0012_004267_000001,"Tennessee farm and home science, progress report 1, January - March 1952",12 pages,1952
1,0012_004267_000002,"Tennessee farm and home science, progress report 2, April - June 1952",12 pages,1952
2,0012_004267_000003,"Tennessee farm and home science, progress report 3, July - September 1952",12 pages,1952
3,0012_004267_000004,"Tennessee farm and home science, progress report 4, October - December 1952",12 pages,1952
4,0012_004267_000005,"Tennessee farm and home science, progress report 5, January - March 1953",12 pages,1953


In [9]:
# drop yyyy from DataFrame as it makes more sent to rsplit by ',' and get the last field
df_tn_farm_and_home_science = df_tn_farm_and_home_science.drop(columns='yyyy')
df_tn_farm_and_home_science.head(5)

Unnamed: 0,adminDB,title,extent
0,0012_004267_000001,"Tennessee farm and home science, progress report 1, January - March 1952",12 pages
1,0012_004267_000002,"Tennessee farm and home science, progress report 2, April - June 1952",12 pages
2,0012_004267_000003,"Tennessee farm and home science, progress report 3, July - September 1952",12 pages
3,0012_004267_000004,"Tennessee farm and home science, progress report 4, October - December 1952",12 pages
4,0012_004267_000005,"Tennessee farm and home science, progress report 5, January - March 1953",12 pages


## Date Range from Title

In [10]:
# rsplit by comma and get the last field
df_tn_farm_and_home_science['title'].str.rsplit(',').str[-1].head(5)

0     January - March 1952   
1     April - June 1952      
2     July - September 1952  
3     October - December 1952
4     January - March 1953   
Name: title, dtype: object

In [11]:
# add date_range to Dataframe by rsplitting on comma and taking the last field
df_tn_farm_and_home_science['date_range'] = df_tn_farm_and_home_science['title'].str.rsplit(',').str[-1]
df_tn_farm_and_home_science.head(5)

Unnamed: 0,adminDB,title,extent,date_range
0,0012_004267_000001,"Tennessee farm and home science, progress report 1, January - March 1952",12 pages,January - March 1952
1,0012_004267_000002,"Tennessee farm and home science, progress report 2, April - June 1952",12 pages,April - June 1952
2,0012_004267_000003,"Tennessee farm and home science, progress report 3, July - September 1952",12 pages,July - September 1952
3,0012_004267_000004,"Tennessee farm and home science, progress report 4, October - December 1952",12 pages,October - December 1952
4,0012_004267_000005,"Tennessee farm and home science, progress report 5, January - March 1953",12 pages,January - March 1953


## Month

In [12]:
# split date range by spaces and get the first month
df_tn_farm_and_home_science['date_range'].str.split().str[0].head(5)

0    January
1    April  
2    July   
3    October
4    January
Name: date_range, dtype: object

In [13]:
# add the month to the DataFrame
df_tn_farm_and_home_science['Month'] = df_tn_farm_and_home_science['date_range'].str.split().str[0]
df_tn_farm_and_home_science.head(5)

Unnamed: 0,adminDB,title,extent,date_range,Month
0,0012_004267_000001,"Tennessee farm and home science, progress report 1, January - March 1952",12 pages,January - March 1952,January
1,0012_004267_000002,"Tennessee farm and home science, progress report 2, April - June 1952",12 pages,April - June 1952,April
2,0012_004267_000003,"Tennessee farm and home science, progress report 3, July - September 1952",12 pages,July - September 1952,July
3,0012_004267_000004,"Tennessee farm and home science, progress report 4, October - December 1952",12 pages,October - December 1952,October
4,0012_004267_000005,"Tennessee farm and home science, progress report 5, January - March 1953",12 pages,January - March 1953,January


## yyyy

In [14]:
# add yyyy to the DataFrame
df_tn_farm_and_home_science['yyyy'] = df_tn_farm_and_home_science['date_range'].str.split().str[-1]
df_tn_farm_and_home_science.head(5)

Unnamed: 0,adminDB,title,extent,date_range,Month,yyyy
0,0012_004267_000001,"Tennessee farm and home science, progress report 1, January - March 1952",12 pages,January - March 1952,January,1952
1,0012_004267_000002,"Tennessee farm and home science, progress report 2, April - June 1952",12 pages,April - June 1952,April,1952
2,0012_004267_000003,"Tennessee farm and home science, progress report 3, July - September 1952",12 pages,July - September 1952,July,1952
3,0012_004267_000004,"Tennessee farm and home science, progress report 4, October - December 1952",12 pages,October - December 1952,October,1952
4,0012_004267_000005,"Tennessee farm and home science, progress report 5, January - March 1953",12 pages,January - March 1953,January,1953


## mm

In [15]:
# convert month to mm, e.g. January -> 01
str(time.strptime(df_tn_farm_and_home_science['Month'][0], '%B').tm_mon).zfill(2)

'01'

In [16]:
# create list of mm as that's easier to understand than trying to time.strptime a Pandas column
mm_list = []
for value in df_tn_farm_and_home_science['Month']:
    mm = str(time.strptime(value, '%B').tm_mon).zfill(2)
    mm_list.append(mm)
mm_list[:6]

['01', '04', '07', '10', '01', '04']

In [17]:
# add mm to DataFrame
df_tn_farm_and_home_science['mm'] = mm_list
df_tn_farm_and_home_science.tail(5)

Unnamed: 0,adminDB,title,extent,date_range,Month,yyyy,mm
130,0012_004267_000131,"Tennessee farm and home science, progress report 152, October - December 1989",52 pages,October - December 1989,October,1989,10
131,0012_004267_000132,"Tennessee farm and home science, progress report 153, January - March 1990",28 pages,January - March 1990,January,1990,1
132,0012_004267_000133,"Tennessee farm and home science, progress report 154, April - June 1990",28 pages,April - June 1990,April,1990,4
133,0012_004267_000134,"Tennessee farm and home science, progress report 155, July - September 1990",32 pages,July - September 1990,July,1990,7
134,0012_004267_000135,"Tennessee farm and home science, progress report 156, October - December 1990",32 pages,October - December 1990,October,1990,10


## old_directory_name

In [18]:
df_tn_farm_and_home_science['old_directory_name'] = df_tn_farm_and_home_science['adminDB']
df_tn_farm_and_home_science.head(5)

Unnamed: 0,adminDB,title,extent,date_range,Month,yyyy,mm,old_directory_name
0,0012_004267_000001,"Tennessee farm and home science, progress report 1, January - March 1952",12 pages,January - March 1952,January,1952,1,0012_004267_000001
1,0012_004267_000002,"Tennessee farm and home science, progress report 2, April - June 1952",12 pages,April - June 1952,April,1952,4,0012_004267_000002
2,0012_004267_000003,"Tennessee farm and home science, progress report 3, July - September 1952",12 pages,July - September 1952,July,1952,7,0012_004267_000003
3,0012_004267_000004,"Tennessee farm and home science, progress report 4, October - December 1952",12 pages,October - December 1952,October,1952,10,0012_004267_000004
4,0012_004267_000005,"Tennessee farm and home science, progress report 5, January - March 1953",12 pages,January - March 1953,January,1953,1,0012_004267_000005


## new_directory_name

In [19]:
df_tn_farm_and_home_science['new_directory_name'] = df_tn_farm_and_home_science['yyyy'].map(str) + '_' + df_tn_farm_and_home_science['mm'].map(str)
df_tn_farm_and_home_science.count() ## verify that we have a unique number of old_ and new_directory_name(s)

adminDB               135
title                 135
extent                135
date_range            135
Month                 135
yyyy                  135
mm                    135
old_directory_name    135
new_directory_name    135
dtype: int64

## pages

In [20]:
# get just the number of pages, without " pages"
df_tn_farm_and_home_science['pages'] = df_tn_farm_and_home_science['extent'].str.split().str[0]
df_tn_farm_and_home_science.head(5)

Unnamed: 0,adminDB,title,extent,date_range,Month,yyyy,mm,old_directory_name,new_directory_name,pages
0,0012_004267_000001,"Tennessee farm and home science, progress report 1, January - March 1952",12 pages,January - March 1952,January,1952,1,0012_004267_000001,1952_01,12
1,0012_004267_000002,"Tennessee farm and home science, progress report 2, April - June 1952",12 pages,April - June 1952,April,1952,4,0012_004267_000002,1952_04,12
2,0012_004267_000003,"Tennessee farm and home science, progress report 3, July - September 1952",12 pages,July - September 1952,July,1952,7,0012_004267_000003,1952_07,12
3,0012_004267_000004,"Tennessee farm and home science, progress report 4, October - December 1952",12 pages,October - December 1952,October,1952,10,0012_004267_000004,1952_10,12
4,0012_004267_000005,"Tennessee farm and home science, progress report 5, January - March 1953",12 pages,January - March 1953,January,1953,1,0012_004267_000005,1953_01,12


## Process directory path

In [21]:
# set data path
data_directory_path = Path('/Volumes/fluffy/ProjectCeres/00_for_CRL/')
tn_farm_and_home_science_directory_path = data_directory_path.joinpath('agrtfhs')

In [22]:
# get list of directores in data path
directories_path_list = sorted([x for x in tn_farm_and_home_science_directory_path.iterdir() if x.is_dir()])
len(directories_path_list)

135

In [23]:
# get new directory name from old directory name
df_tn_farm_and_home_science[df_tn_farm_and_home_science["old_directory_name"].str.contains(directories_path_list[0].name)]['new_directory_name']

0    1952_01
Name: new_directory_name, dtype: object

In [24]:
# create new directory using crl deliverable name
# verify the number of pages from CVS is correct
# copy images to new directory with new crl deliverable name
# create PDF from TIF image using PIL

output_directory_path = data_directory_path.joinpath('agrtfhs_for_crl')
output_directory_path.mkdir(exist_ok=True)

dir_progress_bar = tqdm(directories_path_list, desc='Processing Directories', unit='directories')
for directory_path in dir_progress_bar:
    
    # get new_directory_name by searching for old_directory_name
    new_name = df_tn_farm_and_home_science[df_tn_farm_and_home_science["old_directory_name"].str.contains(directory_path.name)]['new_directory_name'].to_string(index=False)
    # print(f'{directory_path.name} -> {new_name}')
    
    # get number of pages from CSV and number of images from directory_path
    number_of_pages = int(df_tn_farm_and_home_science[df_tn_farm_and_home_science["old_directory_name"].str.contains(directory_path.name)]['pages'].to_string(index=False))
    image_paths_list = sorted(directory_path.glob('*.tif'))
    
    # delete all macOS index files that start with a '.'
    for image_path in image_paths_list:
        if image_path.name.startswith('.'):
            image_path.unlink()
    image_paths_list = sorted(directory_path.glob('*.tif'))
    
    number_of_images = len(image_paths_list)
    
    if number_of_pages != number_of_images:
        print(f'Check # of pages ({number_of_pages}) vs images ({number_of_images}): {directory_path.name}')    
    
    # create new directory
    new_directory_path = output_directory_path.joinpath(new_name)
    new_directory_path.mkdir()
    
    image_progress_bar = tqdm(image_paths_list, desc='Processing Images', unit='images', leave=False)
    # copy each image over to new directory
    for index, image_path in enumerate(image_progress_bar, start=1):
        new_file_name = f'{str(index).zfill(4)}.tif'
        new_file_path = new_directory_path.joinpath(new_file_name)
        
        # copy image
        copy(image_path, new_file_path)
        
        if new_file_path.is_file():
            # convert image to PDF
            image = Image.open(new_file_path)
            pdf_file_name = f'{new_file_path.stem}.pdf'
            pdf_file_path = new_directory_path.joinpath(pdf_file_name)
            image.save(pdf_file_path)
    

HBox(children=(IntProgress(value=0, description='Processing Directories', max=135, style=ProgressStyle(descrip…

HBox(children=(IntProgress(value=0, description='Processing Images', max=12, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=12, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=12, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=12, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=12, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=12, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=12, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=12, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=12, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=12, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=12, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=12, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=12, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=12, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=12, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=12, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=12, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=12, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=12, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=12, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=12, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=12, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=12, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=12, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=16, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=12, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=12, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=12, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=12, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=12, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=12, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=12, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=16, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=12, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=12, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=12, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=12, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=16, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=16, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=16, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=20, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=16, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=16, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=24, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=20, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=16, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=12, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=16, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=20, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=24, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=16, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=16, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=24, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=20, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=20, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=24, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=32, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=20, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=20, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=24, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=36, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=32, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=36, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=28, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=36, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=32, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=36, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=24, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=36, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=40, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=36, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=36, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=40, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=32, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=20, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=40, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=40, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=36, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=36, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=40, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=36, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=36, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=40, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=48, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=40, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=40, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=28, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=20, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=36, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=44, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=48, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=48, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=56, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=48, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=24, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=44, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=36, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=32, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=36, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=16, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=32, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=32, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=32, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=16, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=32, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=28, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=16, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=20, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=16, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=12, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=12, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=16, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=24, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=24, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=36, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=24, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=24, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=20, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=16, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=24, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=24, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=24, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=44, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=20, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=24, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=24, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=24, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=24, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=32, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=28, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=52, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=28, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=28, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=32, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Processing Images', max=32, style=ProgressStyle(description_w…




# Tennessee Farm and Home News

## Load data - as much as I have on 2019-04-10

In [None]:
# exported Google Docs spreadsheet as csv
df_tn_farm_and_home_news = pd.read_csv('data/tennessee_farm_and_home_news.csv')
df_tn_farm_and_home_news.head(5)

## Date from title

In [None]:
# split on comma, but only split once to get the date
df_tn_farm_and_home_news['title'].str.split(',', n=1).str[-1].head(5)

In [None]:
df_tn_farm_and_home_news['date'] = df_tn_farm_and_home_news['title'].str.split(',', n=1).str[-1]
df_tn_farm_and_home_news.head(5)

## Month

In [None]:
df_tn_farm_and_home_news['date'].str.split().str[0].tail(5)

In [None]:
df_tn_farm_and_home_news['Month'] = df_tn_farm_and_home_news['date'].str.split().str[0]
df_tn_farm_and_home_news.head(5)

## yyyy

In [None]:
df_tn_farm_and_home_news['date'].str.split().str[-1].tail(5)

In [None]:
df_tn_farm_and_home_news['yyyy'] = df_tn_farm_and_home_news['date'].str.split().str[-1]

## mm

In [None]:
# convert month to mm, e.g. January -> 01
str(time.strptime(df_tn_farm_and_home_news['Month'][2000], '%B').tm_mon).zfill(2)

In [None]:
# create list of mm
mm_list = []
for index, value in enumerate(df_tn_farm_and_home_news['Month'], start=1):
    try:
        try:
            mm = str(time.strptime(value, '%B').tm_mon).zfill(2)
            mm_list.append(mm)
        except TypeError:  # errors if NAN value
            if np.isnan(value):
                pass
            else:
                print(f'index: {index}, value: {value}')
    except ValueError:  # had errors in the metadata for the month, e.g. March11
        print(index, value)
mm_list[:6]

In [None]:
df_tn_farm_and_home_news.iloc[1887:1890, :]

In [None]:
# add mm to DataFrame
df_tn_farm_and_home_news['mm'] = mm_list
df_tn_farm_and_home_news.tail(5)

## dd

In [None]:
# get day from date field by splitting on comma and taking first value,
# then splitting on space and taking 2nd value
df_tn_farm_and_home_news['date'].str.split(',').str[0].str.split().str[1].head(5)

In [None]:
# split using above, but also zero-pad 2 digits
df_tn_farm_and_home_news['date'].str.split(',').str[0].str.split().str[1].str.zfill(2).head(5)

In [None]:
# use above code to set as 'dd' column
df_tn_farm_and_home_news['dd'] = df_tn_farm_and_home_news['date'].str.split(',').str[0].str.split().str[1].str.zfill(2)
df_tn_farm_and_home_news.tail(5)

## old_directory_name

In [None]:
df_tn_farm_and_home_news['old_directory_name'] = df_tn_farm_and_home_news['adminDB']
df_tn_farm_and_home_news.head(5)

## new_directory_name

In [None]:
df_tn_farm_and_home_news['new_directory_name'] = df_tn_farm_and_home_news['yyyy'].map(str) + '_' + df_tn_farm_and_home_news['mm'].map(str) + '_' + df_tn_farm_and_home_news['dd'].map(str)
df_tn_farm_and_home_news.count() ## verify that we have a unique number of old_ and new_directory_name(s)

## pages

In [None]:
# get just the number of pages, without " pages"
df_tn_farm_and_home_news['pages'] = df_tn_farm_and_home_news['extent'].str.split().str[0]
df_tn_farm_and_home_news.head(5)

## Process directory path

In [None]:
tn_farm_and_home_news_directory_path = Path('data/tn_farm_and_home_news_adminDB')