In [1]:
# Load all the packages needed
from bs4 import BeautifulSoup
import nltk
import re
from nltk.corpus import stopwords
# English stop-word list from NLTK; preprocess() drops every token found in it.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded beforehand -- confirm.
stop = stopwords.words('english')

In [2]:
# HTML input: path of the statement file that is opened and parsed with BeautifulSoup in the next cell.
html_document = 'bankstatement.html'

In [3]:
# Load the html file 
from bs4 import BeautifulSoup

# Parse inside a context manager so the file handle is closed as soon as
# BeautifulSoup has consumed the markup (the bare open() call leaked it).
with open(html_document) as html_file:
    soup = BeautifulSoup(html_file, 'html.parser')

# Every <page> element of the document; pages[0] is the first page.
pages = soup.findAll('page')

In [4]:
# For each line element, find the spans that store its letters and join them to form the complete text.
# Only elements that have a 'baseline' attribute are kept; a baseline marks a line or block whose words are assembled from individual letters.
# Each text is returned together with its attributes so that it can be used for the later text-extraction steps.

# center_coord_hor is the horizontal center coordinate of each text
# center_coord_ver is the vertical center coordinate of each text


def page_details(lines):
    """Collect the text and geometry of every element that carries a 'baseline'.

    For each element of ``lines`` the text of the spans returned by
    ``findAll('span')`` is concatenated into one string. Elements whose
    'baseline' attribute is missing or empty are skipped.

    Args:
        lines: iterable of parsed elements exposing ``findAll``, ``get`` and
            item access for the attributes 'baseline', 'l', 'r', 'b' and 't'.

    Returns:
        A list of dicts with the keys 'string', 'baseline', 'l', 'r', 'b',
        't' (attribute values converted to int), 'center_coord_hor' (midpoint
        of 'l' and 'r') and 'center_coord_ver' (midpoint of 'b' and 't').
    """
    details = []
    for element in lines:
        text = ''.join(span.text for span in element.findAll('span'))
        if not element.get('baseline'):
            continue

        baseline = int(element['baseline'])
        left = int(element['l'])
        right = int(element['r'])
        bottom = int(element['b'])
        top = int(element['t'])

        details.append({
            'string': text,
            'baseline': baseline,
            'l': left,
            'r': right,
            'b': bottom,
            't': top,
            # Horizontal center of the text; used later to select text by column.
            'center_coord_hor': (right + left) / 2,
            # Vertical center of the text; used later to select text by position.
            'center_coord_ver': (bottom + top) / 2,
        })

    return details

In [5]:
# Preprocess the input: filter out the stop words, split the text into sentences, tokenize each sentence into words and POS-tag them

def preprocess(document):
    """Remove stop words from ``document`` and return POS-tagged sentences.

    Args:
        document: text to process, as a single string.

    Returns:
        A list with one entry per sentence; each entry is the result of
        ``nltk.pos_tag`` applied to the word tokens of that sentence.
    """
    # The membership test is an exact, case-sensitive comparison.
    # NOTE(review): if `stop` only holds lower-case words, upper-case input
    # passes through unfiltered -- confirm this is intended.
    kept_words = [word for word in document.split() if word not in stop]
    filtered_text = ' '.join(kept_words)

    tagged_sentences = []
    for sentence in nltk.sent_tokenize(filtered_text):
        tokens = nltk.word_tokenize(sentence)
        tagged_sentences.append(nltk.pos_tag(tokens))
    return tagged_sentences

In [6]:
# Keep only the input strings that are entirely uppercase,
# join them into one text and use Named-Entity Recognition to extract the person names from it

def extract_name(document):
    """Return the person names detected among the upper-case strings.

    Args:
        document: list of dicts that each have a 'string' key.

    Returns:
        A list of names, one per chunk labelled 'PERSON' by ``nltk.ne_chunk``,
        with the tokens of each chunk joined by single spaces.
    """
    uppercase_strings = [entry['string'] for entry in document
                         if entry['string'].isupper()]
    tagged_sentences = preprocess(' '.join(uppercase_strings))

    person_names = []
    for tagged in tagged_sentences:
        for node in nltk.ne_chunk(tagged):
            # Plain (token, tag) pairs are not entities; only Tree nodes carry a label.
            if type(node) != nltk.tree.Tree or node.label() != 'PERSON':
                continue
            person_names.append(' '.join(leaf[0] for leaf in node))

    return person_names

In [7]:
#load name with script below
#extract_name(page_details(pages[0]))

In [8]:
# First, treat the person name as a single string instead of a list,
# then split the page text after the name; we assume the address is usually printed right after the name.
# Finally, find the address pattern in the remaining text using a regex.


def extract_address(document):
    """Extract address candidates printed after the account holder's name.

    The name is detected in ``document`` itself (previously the global
    ``pages[0]`` was re-parsed regardless of the argument). The text following
    the first occurrence of that name is searched for a run of characters that
    ends with a space and a six-digit number.

    Args:
        document: list of dicts that each have a 'string' key, as produced by
            ``page_details``.

    Returns:
        A list of matched address strings. The list is empty when no person
        name is detected or when the name does not occur in the joined text;
        the earlier implementation raised ValueError / IndexError in those
        cases.
    """
    names = extract_name(document)
    if not names:
        return []

    # Anchor on the first detected name; concatenating several names without
    # a separator would produce a string that never occurs in the text.
    person_name = names[0]

    text = ' '.join(entry['string'] for entry in document)
    _, found, after_name = text.partition(person_name)
    if not found:
        return []

    return re.findall(r'[A-Za-z0-9]{1}.+ [0-9]{6}', after_name)

In [9]:
#load address with script below
#extract_address(page_details(pages[0]))

In [10]:
# Select the text located below a given reference text.
# 'l' is the left coordinate, 'r' is the right coordinate, 't' is the top border coordinate of a line, and center_coord_hor is explained above.
# A text counts as "below" when all of the following hold:
#   - its 'l' value is less than or equal to the horizontal center coordinate of the reference text above,
#   - its 'r' value is greater than or equal to that same horizontal center coordinate,
#   - its 't' value is greater than the 't' value of the reference text above.
# Why? Distances are measured from the top of the page, so a text that sits lower on the page has a greater 't' coordinate than the text above it.
# Since 'l' is measured from left to right, requiring 'l' <= center_coord_hor <= 'r' means the lower text
# horizontally spans the center of the reference text, i.e. it sits in the same column.


def get_text_below(text_below, text_above):
    """Tell whether ``text_below`` sits underneath ``text_above``.

    Args:
        text_below: dict with the keys 't', 'l' and 'r'.
        text_above: dict with the keys 't' and 'center_coord_hor'.

    Returns:
        True when ``text_below`` has a strictly greater 't' than
        ``text_above`` and its ['l', 'r'] range contains the horizontal
        center of ``text_above``; False otherwise.
    """
    if not text_below['t'] > text_above['t']:
        return False

    anchor_x = text_above['center_coord_hor']
    return text_below['l'] <= anchor_x <= text_below['r']


In [11]:
# Get the account number by using the 'account number' header.
# While looping over the document, a line whose text equals the header is remembered as the header;
# every later line that is located below that header is then searched with the account-number regex.


def account_number(document):
    """Find account numbers printed below the 'account number' header.

    The document is scanned in order. A line whose text equals
    'account number' (case-insensitive) becomes the current header; if the
    header occurs more than once, the most recent one is used. Every other
    line seen after a header and located below it (see ``get_text_below``)
    is searched for the pattern digits-digits-digits.

    Args:
        document: list of dicts as produced by ``page_details``.

    Returns:
        A list of matched account-number strings, in document order. Empty
        when no header is found or nothing below it matches.
    """
    pattern = re.compile(r'\d{2,3}-\d{6,11}-\d{1,2}')

    # None means "no header seen yet". The previous sentinel was an empty list
    # guarded by `!= None`, a check that could never be false.
    header = None
    accounts = []

    for doc in document:
        if doc['string'].lower() == 'account number':
            header = doc
        elif header is not None and get_text_below(doc, header):
            accounts.extend(pattern.findall(doc['string']))

    return accounts

In [12]:
#load account number with script below
#account_number(page_details(pages[0]))

In [13]:
# Get the statement date by finding every text fragment that matches the date-format regex

def extract_statement_date(document):
    """Collect every 'D[D] Mon YYYY' style date found in the document.

    Args:
        document: list of dicts that each have a 'string' key.

    Returns:
        A flat list of the matched date strings, in document order.
    """
    date_pattern = re.compile(r'[0-9]{1,2} [a-zA-Z]{3} [0-9]{4}')
    return [
        found
        for entry in document
        for found in date_pattern.findall(entry['string'])
    ]

In [14]:
#load statement date with script below
#extract_statement_date(page_details(pages[0]))

In [15]:
# Get the text located to the right of another text,
# by using the coordinates l, r, t and b.
# right text should have an 'l' coordinate greater than or equal to that of the left text
# right text should have an 'r' coordinate greater than or equal to that of the left text
# right text should have a 'b' coordinate greater than or equal to the 't' coordinate of the left text
# right text should have a 't' coordinate less than or equal to the 'b' coordinate of the left text (together: the two texts overlap vertically, i.e. share a row)

def get_right_text(right_text, left_text):
    """Tell whether ``right_text`` lies on the same row as, and not left of, ``left_text``.

    Args:
        right_text: dict with the keys 'l', 'r', 't' and 'b'.
        left_text: dict with the keys 'l', 'r', 't' and 'b'.

    Returns:
        True when both the 'l' and 'r' of ``right_text`` are greater than or
        equal to those of ``left_text`` and the two ['t', 'b'] ranges overlap;
        False otherwise. A text compared with itself yields True.
    """
    if not right_text['l'] >= left_text['l']:
        return False
    if not right_text['r'] >= left_text['r']:
        return False

    # Vertical overlap: neither box ends before the other one starts.
    reaches_top = right_text['b'] >= left_text['t']
    return reaches_top and right_text['t'] <= left_text['b']


In [16]:
def get_transaction(document, max_continuation_gap=50):
    """Rebuild the transaction rows of a statement table from positioned text.

    The table headers 'Date', 'Description', 'Withdrawal' and 'Deposit' are
    located by case-insensitive exact match on 'string' (the last occurrence
    of each wins). Every text below the 'Date' header that starts with a
    'D[D] Mon' date opens one transaction; the texts on the same row (see
    ``get_right_text``) are then assigned to a column according to the header
    they sit below (see ``get_text_below``).

    Args:
        document: list of dicts as produced by ``page_details``.
        max_continuation_gap: largest allowed difference between the 't' of a
            description and the 't' of a text below it for that text to be
            appended to the description as a continuation line. Defaults to
            50, the value that used to be hard-coded.

    Returns:
        A list of dicts, one per detected date, always holding 'Date' and,
        when a matching cell is found, 'Description', 'Amount(SGD)' and
        'Type' ('Withdrawal' or 'Deposit'). Returns an empty list when the
        'Date' header is absent; a missing 'Description', 'Withdrawal' or
        'Deposit' header simply leaves that column unfilled. The earlier
        implementation raised KeyError in both situations.
    """
    # Canonical column name -> accepted header spellings.
    header_aliases = {
        'Date': ['Date'],
        'Description': ['Description'],
        'Withdrawal': ['Withdrawal'],
        'Deposit': ['Deposit'],
    }
    canonical_by_text = {
        alias.lower(): name
        for name, aliases in header_aliases.items()
        for alias in aliases
    }

    headers = {}
    for doc in document:
        name = canonical_by_text.get(doc['string'].lower())
        if name is not None:
            headers[name] = doc

    date_header = headers.get('Date')
    if date_header is None:
        # No transaction table on this page.
        return []

    def is_under(row, column):
        """True when the column header exists and `row` sits below it."""
        header = headers.get(column)
        return header is not None and get_text_below(row, header)

    date_pattern = re.compile(r'[0-9]{1,2} [a-zA-Z]{3}')
    dates = [
        doc for doc in document
        if get_text_below(doc, date_header) and date_pattern.match(doc['string'])
    ]

    transactions = []
    for date in dates:
        transaction = {'Date': date['string']}

        for row in document:
            if not get_right_text(row, date):
                continue

            if is_under(row, 'Description'):
                description = row['string']
                # Append the lines printed directly under this description,
                # i.e. before the next main description starts.
                for other in document:
                    if (get_text_below(other, row)
                            and other['t'] - max_continuation_gap <= row['t']):
                        description += ', ' + other['string']
                transaction['Description'] = description

            if is_under(row, 'Withdrawal'):
                transaction['Amount(SGD)'] = row['string']
                transaction['Type'] = 'Withdrawal'

            if is_under(row, 'Deposit'):
                transaction['Amount(SGD)'] = row['string']
                transaction['Type'] = 'Deposit'

        transactions.append(transaction)

    return transactions
        

In [17]:
#map the output

# Parse each page once and reuse the result for every extractor instead of
# re-running page_details(pages[0]) for each field.
first_page_details = page_details(pages[0])
second_page_details = page_details(pages[1])

{'name': extract_name(first_page_details),
 'address': extract_address(first_page_details),
 'account_number': account_number(first_page_details),
 'statement_date': extract_statement_date(first_page_details),
 'transactions': get_transaction(second_page_details)
}

{'name': ['JOHN RIEGER'],
 'address': ['KTGIF SINGAPORE PTE. LTD. 26B TEMPLE STREET #03-00 SINGAPORE 058571'],
 'account_number': ['12-145753-2'],
 'statement_date': ['31 Aug 2018'],
 'transactions': [{'Date': '28 Aug',
   'Amount(SGD)': '1.254.12',
   'Type': 'Deposit',
   'Description': 'Quick Cheque Deposit'},
  {'Date': '30 Aug',
   'Description': 'Point-of-Sale Transaction, TRANSIT LINK PTE LTD',
   'Amount(SGD)': '20.00',
   'Type': 'Withdrawal'},
  {'Date': '30 Aug',
   'Description': 'Point-of-Sale Transaction, S & S LINKERS PTE LTD',
   'Amount(SGD)': '465.00',
   'Type': 'Withdrawal'},
  {'Date': '31 Aug',
   'Description': 'Point-of-Sale Transaction, GAYATRI RESTAURANT',
   'Amount(SGD)': '26.50',
   'Type': 'Withdrawal'},
  {'Date': '31 Aug',
   'Description': 'Point-of-Sale Transaction, FOCUS NETWORK AGENCIES (S) PTE LTD',
   'Amount(SGD)': '16.00',
   'Type': 'Withdrawal'}]}