# Summary

The following code iterates over the source PDF documents (one document at a time) and extracts from each the desired table, identified by the headings:<br><br> "**ANNUAL RETURN: FORM 6**", "**General: Singapore Insurance Fund**", "**General: Offshore Insurance Fund**"<br><br>
This table may be on a single page, or may be split across 2 pages.<br><br>
After the table is extracted, it is saved as a PDF file in an output folder.

# Import Packages

In [1]:
from pdfminer.pdfinterp import resolve1
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.high_level import extract_text
import PyPDF4 as pypdf4
import pandas as pd
import os
import io

# Define Function: extract_tables()

In [20]:
def extract_tables(pdf_document, output_folder, search_start_page=10):
    """Extract the Singapore Insurance Fund "FORM 6" table page(s) from a PDF.

    Scans ``pdf_document`` page by page, starting at ``search_start_page``,
    for the first page whose text contains both 'ANNUAL RETURN: FORM 6' and
    'General: Singapore Insurance Fund'. That page is extracted; the page
    after it is extracted as well when it exists and contains neither the
    'ANNUAL RETURN: FORM 6' heading nor the 'General: Offshore Insurance Fund'
    heading (i.e. it is treated as a continuation of the same table).

    Each extracted page is written to ``output_folder`` as its own
    single-page PDF named ``<source file stem>_page_<1-based page no>.pdf``.

    Args:
        pdf_document: Path of the source PDF file.
        output_folder: Directory that receives the extracted page files.
        search_start_page: Zero-indexed page at which the search begins.

    Raises:
        IndexError: If no page in the searched range contains the table
            headings. (The type matches what this function has always
            raised in that situation, so existing callers keep working.)
    """
    search_string = ['ANNUAL RETURN: FORM 6', 'General: Singapore Insurance Fund', 'General: Offshore Insurance Fund']
    print(pdf_document)

    with open(pdf_document, "rb") as filehandle:  # mode: 'r'=> read, 'b'=> binary
        parser = PDFParser(filehandle)
        document = PDFDocument(parser)
        num_pages = resolve1(document.catalog['Pages'])['Count']

        pages_to_extract = []  # zero-indexed pages that make up the table
        for i in range(search_start_page, num_pages):
            pagetext = extract_text(pdf_document, page_numbers=[i])
            if search_string[0] not in pagetext or search_string[1] not in pagetext:
                continue  # not the onshore table; search the next page

            pages_to_extract.append(i)

            # Only look at the next page if there is one: a table found on
            # the last page cannot continue onto a page that does not exist.
            if i + 1 < num_pages:
                next_pagetext = extract_text(pdf_document, page_numbers=[i + 1])
                starts_new_table = (search_string[0] in next_pagetext
                                    or search_string[2] in next_pagetext)
                if not starts_new_table:
                    pages_to_extract.append(i + 1)

            break  # first match wins; no need to search further

        if not pages_to_extract:
            raise IndexError(
                f'table headings not found in {pdf_document} '
                f'(searched zero-indexed pages {search_start_page} to {num_pages - 1})'
            )

        # extract table and save as new file (1 file per page)
        filename = os.path.splitext(os.path.basename(pdf_document))[0]

        for page_index in pages_to_extract:
            # zero-indexed pages, so add 1 to the page number in the file name
            output_filename = f'{filename}_page_{page_index + 1}.pdf'
            output_filepath = os.path.join(output_folder, output_filename)

            # A fresh reader/writer pair per output file, as before.
            pdf = pypdf4.PdfFileReader(filehandle)
            pdf_writer = pypdf4.PdfFileWriter()
            # Fetch the page before opening the output file so that a failure
            # here does not leave an empty file behind.
            pdf_writer.addPage(pdf.getPage(page_index))

            with open(output_filepath, 'wb') as out:
                pdf_writer.write(out)
            print(f'Created: {output_filepath}')

# Define Exception Handler

In [21]:
def handle_extract_tables_exception(file_name):
    """Record ``file_name`` in the module-level ``file_exception_list``."""
    # The list is mutated in place and never rebound, so the module-level
    # name is simply looked up here.
    file_exception_list.append(file_name)

# Iterate through source folder to extract tables

 Also get exception file names

In [None]:
# get file names of source directory into a list
source_folder = '../data/source/'
list_of_files = os.listdir(path=source_folder)
file_exception_list = []  # global variable for files that throw exceptions

# Run the extraction on every file. A file that fails is recorded and skipped
# so that one bad document does not abort the whole batch.
for file_name in list_of_files:
    filepath = os.path.join(source_folder, file_name)
    try:
        extract_tables(filepath, '../data/output/')
    except Exception:
        # Catch `Exception` rather than using a bare `except`, so that
        # KeyboardInterrupt / SystemExit still stop a long-running batch.
        handle_extract_tables_exception(file_name)

print(f'\ntable extraction completed with {len(file_exception_list)} exceptions:\n{file_exception_list}')

# Write the failures without a header row. The default for `header` differs
# between pandas versions, and the file is later reloaded as a header-less
# CSV whose first column is the index.
file_exceptions = pd.Series(file_exception_list)
file_exceptions.to_csv('file_exceptions.csv', header=False)

In [24]:
# load file exceptions list
# `pd.Series.from_csv` was removed in pandas 1.0. This is the equivalent
# `read_csv` call: no header row, first column is the index, and the single
# remaining column is the data.
file_exceptions = pd.read_csv('file_exceptions.csv', header=None, index_col=0).iloc[:, 0]
# `from_csv` returned an unnamed Series with an unnamed index; do the same.
file_exceptions.index.name = file_exceptions.name = None
# check file exceptions
file_exceptions[:5]

0    c014_2007.pdf
1    c014_2008.pdf
2    c017_2013.pdf
3    c017_2014.pdf
4    c017_2015.pdf
dtype: object

In [25]:
def check_pages(pdf_document, output_folder):
    """Debugging aid: print which page(s) of a PDF hold the FORM 6 table.

    Applies the same search rule as ``extract_tables``: starting at
    zero-indexed page 10, find the first page whose text contains both
    'ANNUAL RETURN: FORM 6' and 'General: Singapore Insurance Fund', then
    check whether the following page exists and contains neither the
    'ANNUAL RETURN: FORM 6' nor the 'General: Offshore Insurance Fund'
    heading (i.e. continues the table). Nothing is written to disk.

    Prints ``pages to extract: (i, i + 1)`` for a two-page table,
    ``(i, -1)`` for a single-page table, or ``()`` when no page matched.

    Args:
        pdf_document: Path of the PDF file to inspect.
        output_folder: Unused; kept so the signature parallels
            ``extract_tables``.
    """
    search_string = ['ANNUAL RETURN: FORM 6', 'General: Singapore Insurance Fund', 'General: Offshore Insurance Fund']
    print(pdf_document)

    pages_to_extract = ()

    with open(pdf_document, "rb") as filehandle:  # mode: 'r'=> read, 'b'=> binary
        parser = PDFParser(filehandle)
        document = PDFDocument(parser)
        num_pages = resolve1(document.catalog['Pages'])['Count']

        # Start searching at page 10 (zero-indexed) for speed
        for i in range(10, num_pages):
            pagetext = extract_text(pdf_document, page_numbers=[i])
            if search_string[0] not in pagetext or search_string[1] not in pagetext:
                continue  # not the onshore table; search the next page

            # Only a page that actually exists can be a continuation.
            continues_on_next_page = False
            if i + 1 < num_pages:
                next_pagetext = extract_text(pdf_document, page_numbers=[i + 1])
                continues_on_next_page = (search_string[0] not in next_pagetext
                                          and search_string[2] not in next_pagetext)

            # -1 in the second slot means only one page would be extracted
            pages_to_extract = (i, i + 1) if continues_on_next_page else (i, -1)

            # Stop at the first match, as extract_tables does, so the report
            # describes the page that function would actually pick.
            break

    print('pages to extract:', pages_to_extract)

# Iterate through file exceptions to debug


In [22]:
# # check on a single file
# # file_to_check = '../data/source/c014_2007.pdf'
# for i in file_exceptions:
#     filepath = '../data/source/' + i
#     check_pages(filepath, '../data/output/')

##### No tables in these docs, hence no data from them
1) ../data/source/c058_2005.pdf<br>
2) ../data/source/c105_2005.pdf<br>
3) ../data/source/c105_2006.pdf<br>
4) ../data/source/c105_2007.pdf<br>
5) ../data/source/c105_2008.pdf<br>
6) ../data/source/c105_2009.pdf<br>
7) ../data/source/c105_2010.pdf<br>
8) ../data/source/c151_2009.pdf<br>
9) ../data/source/c160_2013.pdf<br>


# Final pass of file_exceptions to extract tables

In [26]:
source_folder = '../data/source/'
list_of_files = list(file_exceptions)  # retry only the files that failed before

file_exception_list = []  # global variable for files that throw exceptions

# Re-run the extraction on each previously failing file. A file that fails
# again is recorded and skipped so the remaining files are still processed.
for file_name in list_of_files:
    filepath = os.path.join(source_folder, file_name)
    try:
        extract_tables(filepath, '../data/output/')
    except Exception:
        # Catch `Exception` rather than using a bare `except`, so that
        # KeyboardInterrupt / SystemExit still stop a long-running batch.
        handle_extract_tables_exception(file_name)

print(f'\ntable extraction completed with {len(file_exception_list)} exceptions:\n{file_exception_list}')

# Write without a header row: the default for `header` differs between pandas
# versions, and pinning it keeps the file layout (index, file name) stable.
file_exceptions = pd.Series(file_exception_list)
file_exceptions.to_csv('file_exceptions_round2.csv', header=False)

../data/source/c014_2007.pdf
Created: ../data/output/c014_2007_page_50.pdf
../data/source/c014_2008.pdf
Created: ../data/output/c014_2008_page_50.pdf
../data/source/c017_2013.pdf
Created: ../data/output/c017_2013_page_15.pdf
../data/source/c017_2014.pdf
Created: ../data/output/c017_2014_page_15.pdf
../data/source/c017_2015.pdf
Created: ../data/output/c017_2015_page_14.pdf
Created: ../data/output/c017_2015_page_15.pdf
../data/source/c017_2016.pdf
Created: ../data/output/c017_2016_page_14.pdf
Created: ../data/output/c017_2016_page_15.pdf
../data/source/c017_2017.pdf
Created: ../data/output/c017_2017_page_14.pdf
Created: ../data/output/c017_2017_page_15.pdf
../data/source/c017_2018.pdf
Created: ../data/output/c017_2018_page_15.pdf
Created: ../data/output/c017_2018_page_16.pdf
../data/source/c029_2012.pdf
Created: ../data/output/c029_2012_page_50.pdf
../data/source/c029_2013.pdf
Created: ../data/output/c029_2013_page_50.pdf
../data/source/c029_2014.pdf
Created: ../data/output/c029_2014_pag

Created: ../data/output/c152_2017_page_13.pdf
Created: ../data/output/c152_2017_page_14.pdf
../data/source/c152_2018.pdf
Created: ../data/output/c152_2018_page_14.pdf
Created: ../data/output/c152_2018_page_15.pdf
../data/source/c154_2018.pdf
Created: ../data/output/c154_2018_page_13.pdf
Created: ../data/output/c154_2018_page_14.pdf
../data/source/c160_2013.pdf
../data/source/c163_2005.pdf
Created: ../data/output/c163_2005_page_15.pdf
../data/source/c163_2006.pdf
Created: ../data/output/c163_2006_page_15.pdf
../data/source/c163_2007.pdf
Created: ../data/output/c163_2007_page_15.pdf
../data/source/c163_2008.pdf
Created: ../data/output/c163_2008_page_15.pdf
../data/source/c163_2009.pdf
Created: ../data/output/c163_2009_page_15.pdf
../data/source/c163_2010.pdf
Created: ../data/output/c163_2010_page_16.pdf
../data/source/c163_2011.pdf
Created: ../data/output/c163_2011_page_16.pdf
../data/source/c163_2012.pdf
Created: ../data/output/c163_2012_page_14.pdf
../data/source/c163_2013.pdf
Created:

