In [1]:
import os
import extract, date

In [2]:
# path to the PDF file to process
file_path = "../test_files/LS_index_1920.pdf" # has three indentation types
file_name = os.path.basename(file_path)  # bare file name, passed to the date helpers in later cells

# extract the index entries from the document; date extraction is switched off
# here so that the following cells can demonstrate it step by step
ind_df = extract.extract_indexes_pdf(file_path, date_extraction=False)

ind_df

Reading pdf from ../test_files/LS_index_1920.pdf
...
Finished reading 11 page(s)
Starting extraction for LS_index_1920.pdf...
Finished extraction


Unnamed: 0,country,region,text,page
0,INTERNATIONAL,,Convention between France and Poland respectin...,1
1,INTERNATIONAL,,Labour Treaty between France and Italy. 30th S...,1
2,INTERNATIONAL,,Convention between the Minister of Foreign Aff...,1
3,INTERNATIONAL,,"either country. 27th November, 1919. T., mt. &...",1
4,INTERNATIONAL,,Convention between the Minister of Foreign Aff...,1
...,...,...,...,...
235,URUGIJAY,,Decreeissuing new regulations under the Act re...,11
236,URUGIJAY,,Decree respecting the supervision of hours of ...,11
237,URUGIJAY,,Decree to include messengers under the Eight-H...,11
238,URUGIJAY,,Act respecting occupational accidents. 15th No...,11


In [3]:
# keep only the entries of a single page so the demonstration stays small;
# .copy() yields an independent frame instead of a view on ind_df
page = 3
df = ind_df[ind_df["page"].eq(page)].copy()

df

Unnamed: 0,country,region,text,page
32,AUsTRIA,,Order of the Federal Department for Social Adm...,3
33,BELGIUM,,Royal Order instituting an Industrial Medical ...,3
34,BELGIUM,,"1919. T., Bel. 1—3 . . . . Ministerial Order i...",3
35,BELGIUM,,Royal Order defining the to be taken by the in...,3
36,BELGIUM,,Royal Decree respecting electrical plant in co...,3
37,BELGIUM,,Royal Order respecting surface works in connec...,3
38,BELGIUM,,Royal Order : Institution of a staff of labour...,3
39,BELGIUM,,"Royal Orders Dangerous, unhealthy or noxious u...",3
40,BELGIUM,,"Royal Order Dangerous, unhealthy or noxious un...",3
41,BELGIUM,,Royal Order defining the action to be taken by...,3


### Date types with examples
1. Nov. 4 | July 25th
    - at the beginning of an entry
2. 13/III/1986
    - at the beginning of an entry
3. 25th February, 1929
    - towards the end of an entry
4. 16th Dec. 1965 | 7 May
    - at the beginning of an entry

In [4]:
# determine the date type of this document from the filtered entries
# NOTE(review): the returned number presumably refers to the numbered
# "Date types with examples" list above - confirm against date.get_date_type
date_type = date.get_date_type(df)

date_type

3

In [5]:
# extract dates from the entry text using the detected date type;
# as the output below shows, the result carries additional extracted_date,
# extracted_day, extracted_month, extracted_year and full_text columns,
# and entries without a recognised date are left empty
df_date = date.extract_dates_of_type(df, date_type)

df_date

Unnamed: 0,country,region,text,page,extracted_date,extracted_day,extracted_month,extracted_year,full_text
32,AUsTRIA,,Order of the Federal Department for Social Adm...,3,"10th November, 1920",10,November,1920,Order of the Federal Department for Social Adm...
33,BELGIUM,,Royal Order instituting an Industrial Medical ...,3,,,,,Royal Order instituting an Industrial Medical ...
34,BELGIUM,,"1919. T., Bel. 1—3 . . . . Ministerial Order i...",3,"28th April, 1884",28,April,1884,"1919. T., Bel. 1—3 . . . . Ministerial Order i..."
35,BELGIUM,,Royal Order defining the to be taken by the in...,3,"5th September, 1919",5,September,1919,Royal Order defining the to be taken by the in...
36,BELGIUM,,Royal Decree respecting electrical plant in co...,3,"15th September, 5919",15,September,5919,Royal Decree respecting electrical plant in co...
37,BELGIUM,,Royal Order respecting surface works in connec...,3,"5th September, 2919",5,September,2919,Royal Order respecting surface works in connec...
38,BELGIUM,,Royal Order : Institution of a staff of labour...,3,,,,,Royal Order : Institution of a staff of labour...
39,BELGIUM,,"Royal Orders Dangerous, unhealthy or noxious u...",3,"i5th November, i9r9",i5,November,i9r9,"Royal Orders Dangerous, unhealthy or noxious u..."
40,BELGIUM,,"Royal Order Dangerous, unhealthy or noxious un...",3,"i5th November, 1919",i5,November,1919,"Royal Order Dangerous, unhealthy or noxious un..."
41,BELGIUM,,Royal Order defining the action to be taken by...,3,"11th March, 1920",11,March,1920,Royal Order defining the action to be taken by...


In [6]:
# normalize the extracted dates into a "d.m." date column plus a separate year column;
# this step also tries to correct misread digits (in the output below,
# "i5th" becomes day 15 and the years "5919" / "i9r9" become 1919)
df_norm = date.norm_dates(df_date, date_type, file_name)

df_norm

Unnamed: 0,country,region,text,date,year,page,extracted_date,extracted_day,extracted_month,extracted_year,full_text
32,AUsTRIA,,Order of the Federal Department for Social Adm...,10.11.,1920.0,3,"10th November, 1920",10,November,1920,Order of the Federal Department for Social Adm...
33,BELGIUM,,Royal Order instituting an Industrial Medical ...,,,3,,,,,Royal Order instituting an Industrial Medical ...
34,BELGIUM,,"1919. T., Bel. 1—3 . . . . Ministerial Order i...",28.4.,1884.0,3,"28th April, 1884",28,April,1884,"1919. T., Bel. 1—3 . . . . Ministerial Order i..."
35,BELGIUM,,Royal Order defining the to be taken by the in...,5.9.,1919.0,3,"5th September, 1919",5,September,1919,Royal Order defining the to be taken by the in...
36,BELGIUM,,Royal Decree respecting electrical plant in co...,15.9.,1919.0,3,"15th September, 5919",15,September,5919,Royal Decree respecting electrical plant in co...
37,BELGIUM,,Royal Order respecting surface works in connec...,5.9.,1919.0,3,"5th September, 2919",5,September,2919,Royal Order respecting surface works in connec...
38,BELGIUM,,Royal Order : Institution of a staff of labour...,,,3,,,,,Royal Order : Institution of a staff of labour...
39,BELGIUM,,"Royal Orders Dangerous, unhealthy or noxious u...",15.11.,1919.0,3,"i5th November, i9r9",i5,November,i9r9,"Royal Orders Dangerous, unhealthy or noxious u..."
40,BELGIUM,,"Royal Order Dangerous, unhealthy or noxious un...",15.11.,1919.0,3,"i5th November, 1919",i5,November,1919,"Royal Order Dangerous, unhealthy or noxious un..."
41,BELGIUM,,Royal Order defining the action to be taken by...,11.3.,1920.0,3,"11th March, 1920",11,March,1920,Royal Order defining the action to be taken by...


In [7]:
# convenience function that combines the previous date steps in one call and removes
# unnecessary columns (the output below no longer has extracted_day/month/year)
date.extract_dates(df, file_name)

Unnamed: 0,country,region,text,date,year,page,extracted_date,full_text
32,AUsTRIA,,Order of the Federal Department for Social Adm...,10.11.,1920.0,3,"10th November, 1920",Order of the Federal Department for Social Adm...
33,BELGIUM,,Royal Order instituting an Industrial Medical ...,,,3,,Royal Order instituting an Industrial Medical ...
34,BELGIUM,,"1919. T., Bel. 1—3 . . . . Ministerial Order i...",28.4.,1884.0,3,"28th April, 1884","1919. T., Bel. 1—3 . . . . Ministerial Order i..."
35,BELGIUM,,Royal Order defining the to be taken by the in...,5.9.,1919.0,3,"5th September, 1919",Royal Order defining the to be taken by the in...
36,BELGIUM,,Royal Decree respecting electrical plant in co...,15.9.,1919.0,3,"15th September, 5919",Royal Decree respecting electrical plant in co...
37,BELGIUM,,Royal Order respecting surface works in connec...,5.9.,1919.0,3,"5th September, 2919",Royal Order respecting surface works in connec...
38,BELGIUM,,Royal Order : Institution of a staff of labour...,,,3,,Royal Order : Institution of a staff of labour...
39,BELGIUM,,"Royal Orders Dangerous, unhealthy or noxious u...",15.11.,1919.0,3,"i5th November, i9r9","Royal Orders Dangerous, unhealthy or noxious u..."
40,BELGIUM,,"Royal Order Dangerous, unhealthy or noxious un...",15.11.,1919.0,3,"i5th November, 1919","Royal Order Dangerous, unhealthy or noxious un..."
41,BELGIUM,,Royal Order defining the action to be taken by...,11.3.,1920.0,3,"11th March, 1920",Royal Order defining the action to be taken by...
