# Importing and Options

In [46]:
# importing and options
import time
from pathlib import Path

import pandas as pd
from PIL import Image

# == display 95% width
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

# set pandas option to display wider column info, use -1 to expand to largest necessary width
pd.set_option('max_colwidth', -1)

# Create folder structure for CRL deliverables

Deprecated Structure:

Root:
oclc#/
       
       v.1/
        
            1934-01-01/
            
                        001.tif
                        002.tif
                        last pg.tif
            1934-02-01/
            
                        001.tif
                        002.tif
                        last pg.tif
            1934-03-01/
            
                        001.tif
                        002.tif
                        last pg.tif
                        
        v.2/
        
            1935-01-01/
            
                        001.tif
                        002.tif
                        last pg.tif
                        
Levels:
1. oclc number for print title
2. volume, written as v. with number
3. issue level, written as yyyy-mm-dd or yyyy-mm if no day
4. individual page TIFFs 001.tif through last page

## New folder structure received from CRL on 2019-04-04

1. issue level date, written yyyy_mm_dd, or yyyy_mm if no day
1. individual PDFs 0001.pdf, 0002.pdf, through last page
1. no instructions given on creating PDFs
 - create PDFs with Pillow?
 
From Amy Wood (Wood@crl.edu)

*When you are ready to deliver the images and the metadata, please email my colleague, Maritza  Kellum, at mkellum@crl.edu  She is the best person to advise you on the file naming as well.  I have copied her on this email.*

*We have recently migrated to a new digital delivery system.  To load files into that system we use the following file naming:
Scans are placed into folders separated by issue. The folders are labeled using issue dates and underscores (yyyy_mm_dd). If there are multiple issues in a single year without month information, then the folders are labeled according to year, volume, and issue number (yyyy_v.#_no.#).*



# Processing Tennessee Farm and Home Science
1. Import csv with titles
 - ```Tennessee farm and home science, progress report 1, January - March 1952```
1. convert title into starting month and year
 - ```January 1952```
1. convert month and year into yyyy_mm
 - ```1952_01```

## Load data

In [3]:
# exported Google Docs spreadsheet as csv
df_tn_farm_and_home_science = pd.read_csv('data/tennessee_farm_and_home_science.csv')

In [4]:
df_tn_farm_and_home_science.head(5)

Unnamed: 0,adminDB,title,extent
0,0012_004267_000001,"Tennessee farm and home science, progress repo...",12 pages
1,0012_004267_000002,"Tennessee farm and home science, progress repo...",12 pages
2,0012_004267_000003,"Tennessee farm and home science, progress repo...",12 pages
3,0012_004267_000004,"Tennessee farm and home science, progress repo...",12 pages
4,0012_004267_000005,"Tennessee farm and home science, progress repo...",12 pages


In [6]:
df_tn_farm_and_home_science.head(5)

Unnamed: 0,adminDB,title,extent
0,0012_004267_000001,"Tennessee farm and home science, progress report 1, January - March 1952",12 pages
1,0012_004267_000002,"Tennessee farm and home science, progress report 2, April - June 1952",12 pages
2,0012_004267_000003,"Tennessee farm and home science, progress report 3, July - September 1952",12 pages
3,0012_004267_000004,"Tennessee farm and home science, progress report 4, October - December 1952",12 pages
4,0012_004267_000005,"Tennessee farm and home science, progress report 5, January - March 1953",12 pages


In [7]:
for value in df_tn_farm_and_home_science:
    print(value)

adminDB
title
extent


In [8]:
df_tn_farm_and_home_science['title'][0]

'Tennessee farm and home science, progress report 1, January - March 1952'

In [9]:
# rsplit the title value for the first index in dataframe with spaces and get the last field
df_tn_farm_and_home_science['yyyy'] = df_tn_farm_and_home_science['title'][0].rsplit(' ')[-1]

In [10]:
# add yyyy to DataFrame

# https://stackoverflow.com/a/52269469
# rsplit by ' ' (the default) and get the last field
df_tn_farm_and_home_science['yyyy'] = df_tn_farm_and_home_science['title'].str.rsplit().str[-1]
df_tn_farm_and_home_science.head(5)

Unnamed: 0,adminDB,title,extent,yyyy
0,0012_004267_000001,"Tennessee farm and home science, progress report 1, January - March 1952",12 pages,1952
1,0012_004267_000002,"Tennessee farm and home science, progress report 2, April - June 1952",12 pages,1952
2,0012_004267_000003,"Tennessee farm and home science, progress report 3, July - September 1952",12 pages,1952
3,0012_004267_000004,"Tennessee farm and home science, progress report 4, October - December 1952",12 pages,1952
4,0012_004267_000005,"Tennessee farm and home science, progress report 5, January - March 1953",12 pages,1953


In [11]:
# drop yyyy from DataFrame as it makes more sent to rsplit by ',' and get the last field
df_tn_farm_and_home_science = df_tn_farm_and_home_science.drop(columns='yyyy')
df_tn_farm_and_home_science.head(5)

Unnamed: 0,adminDB,title,extent
0,0012_004267_000001,"Tennessee farm and home science, progress report 1, January - March 1952",12 pages
1,0012_004267_000002,"Tennessee farm and home science, progress report 2, April - June 1952",12 pages
2,0012_004267_000003,"Tennessee farm and home science, progress report 3, July - September 1952",12 pages
3,0012_004267_000004,"Tennessee farm and home science, progress report 4, October - December 1952",12 pages
4,0012_004267_000005,"Tennessee farm and home science, progress report 5, January - March 1953",12 pages


## Date Range from Title

In [12]:
# rsplit by comma and get the last field
df_tn_farm_and_home_science['title'].str.rsplit(',').str[-1].head(5)

0     January - March 1952   
1     April - June 1952      
2     July - September 1952  
3     October - December 1952
4     January - March 1953   
Name: title, dtype: object

In [13]:
# add date_range to Dataframe by rsplitting on comma and taking the last field
df_tn_farm_and_home_science['date_range'] = df_tn_farm_and_home_science['title'].str.rsplit(',').str[-1]
df_tn_farm_and_home_science.head(5)

Unnamed: 0,adminDB,title,extent,date_range
0,0012_004267_000001,"Tennessee farm and home science, progress report 1, January - March 1952",12 pages,January - March 1952
1,0012_004267_000002,"Tennessee farm and home science, progress report 2, April - June 1952",12 pages,April - June 1952
2,0012_004267_000003,"Tennessee farm and home science, progress report 3, July - September 1952",12 pages,July - September 1952
3,0012_004267_000004,"Tennessee farm and home science, progress report 4, October - December 1952",12 pages,October - December 1952
4,0012_004267_000005,"Tennessee farm and home science, progress report 5, January - March 1953",12 pages,January - March 1953


## Month

In [14]:
# split date range by spaces and get the first month
df_tn_farm_and_home_science['date_range'].str.split().str[0].head(5)

0    January
1    April  
2    July   
3    October
4    January
Name: date_range, dtype: object

In [15]:
# add the month to the DataFrame
df_tn_farm_and_home_science['Month'] = df_tn_farm_and_home_science['date_range'].str.split().str[0]
df_tn_farm_and_home_science.head(5)

Unnamed: 0,adminDB,title,extent,date_range,month
0,0012_004267_000001,"Tennessee farm and home science, progress report 1, January - March 1952",12 pages,January - March 1952,January
1,0012_004267_000002,"Tennessee farm and home science, progress report 2, April - June 1952",12 pages,April - June 1952,April
2,0012_004267_000003,"Tennessee farm and home science, progress report 3, July - September 1952",12 pages,July - September 1952,July
3,0012_004267_000004,"Tennessee farm and home science, progress report 4, October - December 1952",12 pages,October - December 1952,October
4,0012_004267_000005,"Tennessee farm and home science, progress report 5, January - March 1953",12 pages,January - March 1953,January


## yyyy

In [16]:
# add yyyy to the DataFrame
df_tn_farm_and_home_science['yyyy'] = df_tn_farm_and_home_science['date_range'].str.split().str[-1]
df_tn_farm_and_home_science.head(5)

Unnamed: 0,adminDB,title,extent,date_range,month,yyyy
0,0012_004267_000001,"Tennessee farm and home science, progress report 1, January - March 1952",12 pages,January - March 1952,January,1952
1,0012_004267_000002,"Tennessee farm and home science, progress report 2, April - June 1952",12 pages,April - June 1952,April,1952
2,0012_004267_000003,"Tennessee farm and home science, progress report 3, July - September 1952",12 pages,July - September 1952,July,1952
3,0012_004267_000004,"Tennessee farm and home science, progress report 4, October - December 1952",12 pages,October - December 1952,October,1952
4,0012_004267_000005,"Tennessee farm and home science, progress report 5, January - March 1953",12 pages,January - March 1953,January,1953


## mm

In [17]:
# convert month to mm, e.g. January -> 01
str(time.strptime(df_tn_farm_and_home_science['Month'][0], '%B').tm_mon).zfill(2)

'01'

In [18]:
# create list of mm as that's easier to understand than trying to time.strptime a Pandas column
mm_list = []
for value in df_tn_farm_and_home_science['Month']:
    mm = str(time.strptime(value, '%B').tm_mon).zfill(2)
    mm_list.append(mm)
mm_list[:6]

['01', '04', '07', '10', '01', '04']

In [19]:
# add mm to DataFrame
df_tn_farm_and_home_science['mm'] = mm_list
df_tn_farm_and_home_science.tail(5)

Unnamed: 0,adminDB,title,extent,date_range,month,yyyy,mm
130,0012_004267_000131,"Tennessee farm and home science, progress report 152, October - December 1989",52 pages,October - December 1989,October,1989,10
131,0012_004267_000132,"Tennessee farm and home science, progress report 153, January - March 1990",28 pages,January - March 1990,January,1990,1
132,0012_004267_000133,"Tennessee farm and home science, progress report 154, April - June 1990",28 pages,April - June 1990,April,1990,4
133,0012_004267_000134,"Tennessee farm and home science, progress report 155, July - September 1990",32 pages,July - September 1990,July,1990,7
134,0012_004267_000135,"Tennessee farm and home science, progress report 156, October - December 1990",32 pages,October - December 1990,October,1990,10


## old_directory_name

In [20]:
df_tn_farm_and_home_science['old_directory_name'] = df_tn_farm_and_home_science['adminDB']
df_tn_farm_and_home_science.head(5)

Unnamed: 0,adminDB,title,extent,date_range,month,yyyy,mm,old_directory_name
0,0012_004267_000001,"Tennessee farm and home science, progress report 1, January - March 1952",12 pages,January - March 1952,January,1952,1,0012_004267_000001
1,0012_004267_000002,"Tennessee farm and home science, progress report 2, April - June 1952",12 pages,April - June 1952,April,1952,4,0012_004267_000002
2,0012_004267_000003,"Tennessee farm and home science, progress report 3, July - September 1952",12 pages,July - September 1952,July,1952,7,0012_004267_000003
3,0012_004267_000004,"Tennessee farm and home science, progress report 4, October - December 1952",12 pages,October - December 1952,October,1952,10,0012_004267_000004
4,0012_004267_000005,"Tennessee farm and home science, progress report 5, January - March 1953",12 pages,January - March 1953,January,1953,1,0012_004267_000005


## new_directory_name

In [21]:
df_tn_farm_and_home_science['new_directory_name'] = df_tn_farm_and_home_science['yyyy'].map(str) + '_' + df_tn_farm_and_home_science['mm'].map(str)
df_tn_farm_and_home_science.count() ## verify that we have a unique number of old_ and new_directory_name(s)

Unnamed: 0,adminDB,title,extent,date_range,month,yyyy,mm,old_directory_name,new_directory_name
0,0012_004267_000001,"Tennessee farm and home science, progress report 1, January - March 1952",12 pages,January - March 1952,January,1952,01,0012_004267_000001,1952_01
1,0012_004267_000002,"Tennessee farm and home science, progress report 2, April - June 1952",12 pages,April - June 1952,April,1952,04,0012_004267_000002,1952_04
2,0012_004267_000003,"Tennessee farm and home science, progress report 3, July - September 1952",12 pages,July - September 1952,July,1952,07,0012_004267_000003,1952_07
3,0012_004267_000004,"Tennessee farm and home science, progress report 4, October - December 1952",12 pages,October - December 1952,October,1952,10,0012_004267_000004,1952_10
4,0012_004267_000005,"Tennessee farm and home science, progress report 5, January - March 1953",12 pages,January - March 1953,January,1953,01,0012_004267_000005,1953_01
5,0012_004267_000006,"Tennessee farm and home science, progress report 6, April - June 1953",12 pages,April - June 1953,April,1953,04,0012_004267_000006,1953_04
6,0012_004267_000007,"Tennessee farm and home science, progress report 7, July - September 1953",12 pages,July - September 1953,July,1953,07,0012_004267_000007,1953_07
7,0012_004267_000008,"Tennessee farm and home science, progress report 8, October - December 1953",12 pages,October - December 1953,October,1953,10,0012_004267_000008,1953_10
8,0012_004267_000009,"Tennessee farm and home science, progress report 9, January - March 1954",12 pages,January - March 1954,January,1954,01,0012_004267_000009,1954_01
9,0012_004267_000010,"Tennessee farm and home science, progress report 10, April - June 1954",12 pages,April - June 1954,April,1954,04,0012_004267_000010,1954_04


## pages

In [22]:
# get just the number of pages, without " pages"
df_tn_farm_and_home_science['pages'] = df_tn_farm_and_home_science['extent'].str.split().str[0]
df_tn_farm_and_home_science.head(5)

Unnamed: 0,adminDB,title,extent,date_range,month,yyyy,mm,old_directory_name,new_directory_name,pages
0,0012_004267_000001,"Tennessee farm and home science, progress report 1, January - March 1952",12 pages,January - March 1952,January,1952,1,0012_004267_000001,1952_01,12
1,0012_004267_000002,"Tennessee farm and home science, progress report 2, April - June 1952",12 pages,April - June 1952,April,1952,4,0012_004267_000002,1952_04,12
2,0012_004267_000003,"Tennessee farm and home science, progress report 3, July - September 1952",12 pages,July - September 1952,July,1952,7,0012_004267_000003,1952_07,12
3,0012_004267_000004,"Tennessee farm and home science, progress report 4, October - December 1952",12 pages,October - December 1952,October,1952,10,0012_004267_000004,1952_10,12
4,0012_004267_000005,"Tennessee farm and home science, progress report 5, January - March 1953",12 pages,January - March 1953,January,1953,1,0012_004267_000005,1953_01,12


## Process directory path

In [23]:
tn_farm_and_home_science_directory_path = Path('data/tn_farm_and_home_science_adminDB')

In [29]:
directories_path_list = sorted([x for x in tn_farm_and_home_science_directory_path.iterdir() if x.is_dir()])
directories_path_list

[PosixPath('data/tn_home_and_farm_science_adminDB/0012_004267_000001'),
 PosixPath('data/tn_home_and_farm_science_adminDB/0012_004267_000132')]

In [30]:
df_tn_farm_and_home_science[df_tn_farm_and_home_science["old_directory_name"].str.contains(directories_path_list[0].name)]['new_directory_name']

0    1952_01
Name: new_directory_name, dtype: object

In [31]:
for directory_path in directories_path_list:
    # get new_directory_name by searching for old_directory_name
    new_name = df_tn_farm_and_home_science[df_tn_farm_and_home_science["old_directory_name"].str.contains(directory_path.name)]['new_directory_name'].to_string(index=False)
    print(f'{directory_path.name} -> {new_name}')
    

0012_004267_000001 -> 1952_01
0012_004267_000132 -> 1990_01


# Tennessee Farm and Home News

## Load data - as much as I have on 2019-04-10

In [47]:
# exported Google Docs spreadsheet as csv
df_tn_farm_and_home_news = pd.read_csv('data/tennessee_farm_and_home_news_processed_2019-04-10.csv')
df_tn_farm_and_home_news.head(5)

Unnamed: 0,adminDB,title,extent
0,0012_004266_000957,"Tennessee farm and home news, January 1, 1940",7
1,0012_004266_000958,"Tennessee farm and home news, January 8, 1940",7
2,0012_004266_000959,"Tennessee farm and home news, January 15, 1940",7
3,0012_004266_000960,"Tennessee farm and home news, January 22, 1940",7
4,0012_004266_000961,"Tennessee farm and home news, January 29, 1940",7


## Date from title

In [48]:
# split on comma, but only split once to get the date
df_tn_farm_and_home_news['title'].str.split(',', n=1).str[-1].head(5)

0     January 1, 1940 
1     January 8, 1940 
2     January 15, 1940
3     January 22, 1940
4     January 29, 1940
Name: title, dtype: object

In [49]:
df_tn_farm_and_home_news['date'] = df_tn_farm_and_home_news['title'].str.split(',', n=1).str[-1]
df_tn_farm_and_home_news.head(5)

Unnamed: 0,adminDB,title,extent,date
0,0012_004266_000957,"Tennessee farm and home news, January 1, 1940",7,"January 1, 1940"
1,0012_004266_000958,"Tennessee farm and home news, January 8, 1940",7,"January 8, 1940"
2,0012_004266_000959,"Tennessee farm and home news, January 15, 1940",7,"January 15, 1940"
3,0012_004266_000960,"Tennessee farm and home news, January 22, 1940",7,"January 22, 1940"
4,0012_004266_000961,"Tennessee farm and home news, January 29, 1940",7,"January 29, 1940"


## Month

In [50]:
df_tn_farm_and_home_news['date'].str.split().str[0].tail(5)

2315    June     
2316    April    
2317    September
2318    April    
2319    January  
Name: date, dtype: object

In [51]:
df_tn_farm_and_home_news['Month'] = df_tn_farm_and_home_news['date'].str.split().str[0]
df_tn_farm_and_home_news.head(5)

Unnamed: 0,adminDB,title,extent,date,Month
0,0012_004266_000957,"Tennessee farm and home news, January 1, 1940",7,"January 1, 1940",January
1,0012_004266_000958,"Tennessee farm and home news, January 8, 1940",7,"January 8, 1940",January
2,0012_004266_000959,"Tennessee farm and home news, January 15, 1940",7,"January 15, 1940",January
3,0012_004266_000960,"Tennessee farm and home news, January 22, 1940",7,"January 22, 1940",January
4,0012_004266_000961,"Tennessee farm and home news, January 29, 1940",7,"January 29, 1940",January


## yyyy

In [52]:
df_tn_farm_and_home_news['date'].str.split().str[-1].tail(5)

2315    1977
2316    1946
2317    1947
2318    1951
2319    1947
Name: date, dtype: object

In [53]:
df_tn_farm_and_home_news['yyyy'] = df_tn_farm_and_home_news['date'].str.split().str[-1]

## mm

In [54]:
# convert month to mm, e.g. January -> 01
str(time.strptime(df_tn_farm_and_home_news['Month'][0], '%B').tm_mon).zfill(2)

'01'

In [55]:
# create list of mm
mm_list = []
for index, value in enumerate(df_tn_farm_and_home_news['Month'], start=1):
    try:
        mm = str(time.strptime(value, '%B').tm_mon).zfill(2)
        mm_list.append(mm)
    except ValueError:  # had errors in the metadata for the month, e.g. March11
        print(index, value)
mm_list[:6]

['01', '01', '01', '01', '01', '02']

In [56]:
# add mm to DataFrame
df_tn_farm_and_home_news['mm'] = mm_list
df_tn_farm_and_home_news.tail(5)

Unnamed: 0,adminDB,title,extent,date,Month,yyyy,mm
2315,0012_004266_003446,"Tennessee farm and home news, June 27, 1977",3 pages,"June 27, 1977",June,1977,6
2316,0012_004266_003447,"Tennessee farm and home news, April 15, 1946",7 pages,"April 15, 1946",April,1946,4
2317,0012_004266_003448,"Tennessee farm and home news, September 15, 1947",7 pages,"September 15, 1947",September,1947,9
2318,0012_004266_003449,"Tennessee farm and home news, April 2, 1951",7 pages,"April 2, 1951",April,1951,4
2319,0012_004266_003450,"Tennessee farm and home news, January 27, 1947",1 page,"January 27, 1947",January,1947,1


## dd

In [61]:
# get day from date field by splitting on comma and taking first value,
# then splitting on space and taking 2nd value
df_tn_farm_and_home_news['date'].str.split(',').str[0].str.split().str[1].head(5)

0    1 
1    8 
2    15
3    22
4    29
Name: date, dtype: object

In [62]:
# split using above, but also zero-pad 2 digits
df_tn_farm_and_home_news['date'].str.split(',').str[0].str.split().str[1].str.zfill(2).head(5)

0    01
1    08
2    15
3    22
4    29
Name: date, dtype: object

In [64]:
# use above code to set as 'dd' column
df_tn_farm_and_home_news['dd'] = df_tn_farm_and_home_news['date'].str.split(',').str[0].str.split().str[1].str.zfill(2)
df_tn_farm_and_home_news.tail(5)

Unnamed: 0,adminDB,title,extent,date,Month,yyyy,mm,dd
2315,0012_004266_003446,"Tennessee farm and home news, June 27, 1977",3 pages,"June 27, 1977",June,1977,6,27
2316,0012_004266_003447,"Tennessee farm and home news, April 15, 1946",7 pages,"April 15, 1946",April,1946,4,15
2317,0012_004266_003448,"Tennessee farm and home news, September 15, 1947",7 pages,"September 15, 1947",September,1947,9,15
2318,0012_004266_003449,"Tennessee farm and home news, April 2, 1951",7 pages,"April 2, 1951",April,1951,4,2
2319,0012_004266_003450,"Tennessee farm and home news, January 27, 1947",1 page,"January 27, 1947",January,1947,1,27


## old_directory_name

In [66]:
df_tn_farm_and_home_news['old_directory_name'] = df_tn_farm_and_home_news['adminDB']
df_tn_farm_and_home_news.head(5)

Unnamed: 0,adminDB,title,extent,date,Month,yyyy,mm,dd,old_directory_name
0,0012_004266_000957,"Tennessee farm and home news, January 1, 1940",7,"January 1, 1940",January,1940,1,1,0012_004266_000957
1,0012_004266_000958,"Tennessee farm and home news, January 8, 1940",7,"January 8, 1940",January,1940,1,8,0012_004266_000958
2,0012_004266_000959,"Tennessee farm and home news, January 15, 1940",7,"January 15, 1940",January,1940,1,15,0012_004266_000959
3,0012_004266_000960,"Tennessee farm and home news, January 22, 1940",7,"January 22, 1940",January,1940,1,22,0012_004266_000960
4,0012_004266_000961,"Tennessee farm and home news, January 29, 1940",7,"January 29, 1940",January,1940,1,29,0012_004266_000961


## new_directory_name

In [68]:
df_tn_farm_and_home_news['new_directory_name'] = df_tn_farm_and_home_news['yyyy'].map(str) + '_' + df_tn_farm_and_home_news['mm'].map(str) + '_' + df_tn_farm_and_home_news['dd'].map(str)
df_tn_farm_and_home_news.count() ## verify that we have a unique number of old_ and new_directory_name(s)

adminDB               2320
title                 2320
extent                2316
date                  2320
Month                 2320
yyyy                  2320
mm                    2320
dd                    2320
old_directory_name    2320
new_directory_name    2320
dtype: int64

## pages

In [69]:
# get just the number of pages, without " pages"
df_tn_farm_and_home_news['pages'] = df_tn_farm_and_home_news['extent'].str.split().str[0]
df_tn_farm_and_home_news.head(5)

Unnamed: 0,adminDB,title,extent,date,Month,yyyy,mm,dd,old_directory_name,new_directory_name,pages
0,0012_004266_000957,"Tennessee farm and home news, January 1, 1940",7,"January 1, 1940",January,1940,1,1,0012_004266_000957,1940_01_01,7
1,0012_004266_000958,"Tennessee farm and home news, January 8, 1940",7,"January 8, 1940",January,1940,1,8,0012_004266_000958,1940_01_08,7
2,0012_004266_000959,"Tennessee farm and home news, January 15, 1940",7,"January 15, 1940",January,1940,1,15,0012_004266_000959,1940_01_15,7
3,0012_004266_000960,"Tennessee farm and home news, January 22, 1940",7,"January 22, 1940",January,1940,1,22,0012_004266_000960,1940_01_22,7
4,0012_004266_000961,"Tennessee farm and home news, January 29, 1940",7,"January 29, 1940",January,1940,1,29,0012_004266_000961,1940_01_29,7


## Process directory path

In [None]:
tn_farm_and_home_news_directory_path = Path('data/tn_farm_and_home_news_adminDB')