In [None]:
# Extract the XML dataset archive into the current working directory.
!unzip po_fre_xml_cmu.zip

In [None]:
# Install fuzzywuzzy, which provides the `fuzz` / `process` helpers imported further down.
pip install fuzzywuzzy

In [None]:
# Build and install the libpostal C library from source; the `postal` Python
# package installed in the next cell is a binding to it.
# Step 1: build tool-chain prerequisites.
!sudo apt-get install curl autoconf automake libtool pkg-config
# Step 2: fetch the sources and switch into the checkout (note: %cd changes the
# notebook's working directory for all following cells until it is changed back).
!git clone https://github.com/openvenues/libpostal
%cd libpostal
# Step 3: configure with /content as the data directory, compile with 4 jobs, install system-wide.
!./bootstrap.sh
!./configure --datadir=/content
!make -j4
!sudo make install

In [None]:
# Refresh the shared-library cache after `make install`, then install the
# Python bindings and import the address parser used by the functions below.
!sudo ldconfig
!pip install postal
from postal.parser import parse_address
# The build cell left the working directory inside the libpostal checkout;
# show where we are and return to /content, where the data and model files are expected.
!pwd
%cd /content

In [None]:
import xml.etree.ElementTree as ET 
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [None]:
import pandas as pd
import numpy as np

from tensorflow import keras
import pickle

In [None]:
# Load the trained address-detection network (a BiLSTM, judging by the file name)
# and the tokenizer that was pickled alongside it. Both objects are used as
# module-level globals by address_detect().
model = keras.models.load_model('address_detection_bilstm.h5')

# NOTE: pickle.load can execute arbitrary code; only load tokenizer files you trust.
with open("address_detection_bilstm_tokenizer.pickle", "rb") as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

# summary() prints the layer table itself and returns None, so it is called
# directly (wrapping it in print() would additionally emit a stray "None").
model.summary()

In [None]:
# str1 - company name, str2 - address that contains company name
def string_matcher(str1, str2):
    """Tell whether most words of ``str1`` occur in ``str2``.

    ``str1`` is split on single spaces; each resulting word is scored against
    the whole of ``str2`` with ``fuzz.token_set_ratio`` and counts as a hit
    only when the score is exactly 100.

    Returns:
        bool: True when strictly more than half of the words are hits.
    """
    tokens = str1.split(' ')
    perfect_hits = sum(
        1 for token in tokens if fuzz.token_set_ratio(token, str2) == 100
    )
    return perfect_hits / len(tokens) > 0.5

In [None]:
def get_address(row):
    """Rebuild address string(s) from a parsed, labelled token sequence.

    Args:
        row: sequence of ``(text, label)`` pairs, as produced by
            ``parse_address``.

    Returns:
        list[str] | float: one space-joined string (each token followed by a
        single space) per detected address, or ``np.nan`` when the row
        contains no ``"postcode"`` token.

    Behaviour:
        * Tokens labelled ``"house"`` are kept only when they are neither the
          first nor the last token and neither neighbour is also ``"house"``;
          every other token is kept.
        * Exactly one postcode: all kept tokens form a single address.
        * More than one postcode: the kept tokens are cut after each postcode.
          A ``"country"`` token directly following a postcode is attached to
          that address. Tokens after the last postcode are discarded.
    """
    # --- Pass 1: filter the tokens and count postcodes -------------------
    kept = []
    postcode_count = 0
    last_pos = len(row) - 1
    # enumerate() gives the true position; list.index() would return the first
    # equal tuple and therefore the wrong neighbours for repeated tokens.
    for pos, item in enumerate(row):
        label = item[1]
        if label != "house":
            if label == "postcode":
                postcode_count += 1
            kept.append(item)
        elif 0 < pos < last_pos:
            # Compare the *labels* of both neighbours (the original compared the
            # whole next tuple to "house", which could never be equal).
            if row[pos - 1][1] != "house" and row[pos + 1][1] != "house":
                kept.append(item)

    # --- Pass 2: assemble the address strings -----------------------------
    if postcode_count == 0:
        return np.nan

    if postcode_count == 1:
        return ["".join(item[0] + " " for item in kept)]

    address_list = []
    current = ""
    skip_next = False
    for pos, item in enumerate(kept):
        if skip_next:
            # This country token was already attached to the previous address;
            # do not let it start the next one as well.
            skip_next = False
            continue
        current += item[0] + " "
        if item[1] != "postcode":
            continue
        if pos + 1 < len(kept) and kept[pos + 1][1] == "country":
            current += kept[pos + 1][0] + " "
            skip_next = True
        address_list.append(current)
        current = ""
    return address_list

In [None]:
def GenerateAddressEntitiesText(text_data):
    """Replace a text by the sequence of its parsed entity labels.

    ``text_data`` is run through ``parse_address``; the token texts are dropped
    and each label is wrapped in square brackets.

    Returns:
        str: the bracketed labels joined by single spaces, e.g.
        ``"[house_number] [road]"``; an empty string when nothing was parsed.
    """
    return ' '.join('[%s]' % label for _, label in parse_address(text_data))

In [None]:

def address_detect(df):
    """Classify candidate lines with the loaded model and parse the positives.

    Args:
        df: DataFrame with at least the columns ``'text'`` (raw line) and
            ``'text_transformed'`` (model input text). The columns
            ``'pred_sigmoid'`` and ``'pred_class'`` are added to it in place.

    Returns:
        DataFrame: the rows predicted as class 1 (score >= 0.5) for which
        ``get_address`` found an address, re-indexed from 0, with the extra
        columns ``'Parsed'`` (``parse_address`` output) and ``'Address'``
        (``get_address`` output).

    Relies on the module-level ``model`` and ``tokenizer`` objects.
    """
    from tensorflow.keras.preprocessing.sequence import pad_sequences

    # Sequences are padded/truncated to this length; presumably it has to match
    # the length used when the model was trained -- TODO confirm.
    max_length = 120

    testing_values = df['text_transformed'].values
    if len(testing_values) == 0:
        # Nothing to score: skip the model call instead of sending it an empty batch.
        predicted_probs = np.empty(0, dtype=float)
    else:
        testing_sequences = tokenizer.texts_to_sequences(testing_values)
        testing_padded = pad_sequences(testing_sequences, maxlen=max_length)
        # Flatten so that a 1-D vector is assigned to the column (the model
        # presumably returns one score per row, shape (n, 1)).
        predicted_probs = np.ravel(model.predict(testing_padded))

    df['pred_sigmoid'] = predicted_probs
    df['pred_class'] = (df['pred_sigmoid'] >= 0.5).astype(int)

    data_address = df[df["pred_class"] == 1].reset_index(drop=True)

    # str() guards against non-string cells in the 'text' column.
    data_address["Parsed"] = data_address["text"].apply(lambda x: parse_address(str(x)))
    data_address["Address"] = data_address["Parsed"].apply(get_address)

    # get_address returns NaN when no address could be assembled; drop those rows.
    return data_address.dropna(subset=['Address']).reset_index(drop=True)

In [None]:
import os

In [None]:
# Batch driver: for every XML file in TRAIN_DIR
#   1. flatten the <block>/<formatting> text into "<text>\t<l>\t<t>\t<r>\t<b>" lines,
#   2. keep the lines that match one of the address keys,
#   3. run the address classifier on them,
#   4. handle lines the classifier rejected: "below"/"above"/"previous"
#      references and company-name lines.
TRAIN_DIR = "/content/po_fre_xml_cmu/po_fre_xml_cmu/train"
ERROR_LIST_FILE = "/content/error_xml.txt"
ADDRESS_KEY_FILE = "true_address_key_case_sen.txt"
CANDIDATE_FILE = 'should_be_address_or_not_item.txt'
MAX_CANDIDATE_LEN = 140  # filter out the text that is too long


def _block_coords(block_item):
    """Return the tab-separated l/t/r/b attributes of a block, newline-terminated."""
    attrib = block_item.attrib
    return '\t' + attrib['l'] + '\t' + attrib['t'] + '\t' + attrib['r'] + '\t' + attrib['b'] + '\n'


cnt1 = 0   # files in which a rejected line matched a detected address (company name case)
cnt2 = 0   # files containing a "below" / "above" / "previous" reference line
cnt3 = 0   # files that produced no output at all
total = 0  # every directory entry seen, including the ones skipped via ERROR_LIST_FILE

# File names to skip; read once instead of once per XML file.
# A missing list is treated as "skip nothing".
if os.path.exists(ERROR_LIST_FILE):
    with open(ERROR_LIST_FILE, "r") as error_list:
        skipped_xml = set(error_list.read().splitlines())
else:
    skipped_xml = set()

# Address keys; read once instead of once per candidate line.
with open(ADDRESS_KEY_FILE, 'r') as key_file:
    address_keys = [line.strip() for line in key_file]

for xmlfile in os.listdir(TRAIN_DIR):
    total += 1

    print(xmlfile + ":")
    print(total)
    print(cnt1)
    print(cnt2)
    print(cnt3)

    if xmlfile in skipped_xml:
        continue

    xml_path = os.path.join(TRAIN_DIR, xmlfile)

    # The second line of the file is replaced by a bare <document> tag and the
    # file is rewritten IN PLACE before parsing. Presumably this removes the
    # attributes of the root tag so that the un-namespaced findall() paths
    # below match -- TODO confirm.
    with open(xml_path, 'r') as xml_in:
        xml_lines = xml_in.readlines()
    xml_lines[1] = "<document>\n"
    with open(xml_path, 'w') as xml_out:
        xml_out.write(''.join(xml_lines))

    root = ET.parse(xml_path).getroot()

    # --- 1. flatten the blocks into <xmlfile>.parsed ----------------------
    parsed_path = xmlfile + '.parsed'
    with open(parsed_path, 'w') as parsed_out:
        for block_item in root.findall(".//*/block"):
            block_str = ''
            for line_item in block_item.findall(".//*/formatting"):
                if line_item.text is None:  # To Solve the bug - TypeError: 'NoneType' object is not iterable
                    continue
                # A piece of text containing ':' starts a new output line,
                # closing the text collected so far with the block coordinates.
                if ':' in line_item.text and block_str != '':
                    block_str += _block_coords(block_item)
                block_str += line_item.text + ' '
            if block_str != '':
                parsed_out.write(block_str + _block_coords(block_item))

    # --- 2. keep the lines that match an address key ----------------------
    with open(parsed_path, 'r') as parsed_in:
        parsed_lines = parsed_in.readlines()
    with open(CANDIDATE_FILE, 'w') as candidate_out:
        for parsed_line in parsed_lines:
            fields = parsed_line.split('\t')
            if len(fields) < 5:  # To Solve the List out of Range Error
                # NOTE(review): this stops at the first malformed line and drops
                # everything after it; `continue` may have been intended.
                break
            bottom = fields[4].strip()
            text = fields[0].strip()
            if len(text) >= MAX_CANDIDATE_LEN:
                continue
            if any(string_matcher(key, text) for key in address_keys):
                candidate_out.write(text + '\t' + bottom + '\n')

    # --- 3. classify the candidates ----------------------------------------
    if os.path.getsize(CANDIDATE_FILE) > 0:
        data = pd.read_csv(CANDIDATE_FILE, sep='\t', header=None, names=['text', 'position'])
        data['text_transformed'] = data['text'].apply(GenerateAddressEntitiesText)
        data_address = address_detect(data)
        data_address.to_csv(r'/content/output.csv', index=False)

        address_candidate = data_address['text'].values.tolist()
        position_data = data_address[['text', 'position']].values.tolist()
    else:
        # pd.read_csv raises on an empty file; no candidates means no addresses.
        address_candidate = []
        position_data = []

    results = list(address_candidate)

    # --- 4. special cases among the lines the classifier rejected ---------
    flag_cnt1 = False
    flag_cnt2 = False
    with open(CANDIDATE_FILE, 'r') as candidate_in:
        candidate_lines = candidate_in.readlines()

    for candidate_line in candidate_lines:
        item = candidate_line.strip().split('\t')
        if item[0] in address_candidate:
            continue
        # Special cases: 1. Company name. 2. See below / above / previous
        lowered = item[0].lower()
        if string_matcher('below', lowered):
            flag_cnt2 = True
            below_b = float(item[1])
            # NOTE(review): keeps candidates whose 'b' coordinate is SMALLER than
            # the reference line's; whether that means "below" on the page
            # depends on the coordinate origin -- confirm.
            # ideally, only append address whose key matches the below's key
            below_candidate = [candidate for candidate, candidate_b in position_data
                               if candidate_b < below_b]
            results.append(item[0] + ' ' + ''.join(below_candidate))
        elif string_matcher('above', lowered) or string_matcher('previous', lowered):
            flag_cnt2 = True
            above_b = float(item[1])
            # Collected into its own list: appending to address_candidate would
            # change the list being searched and leave the reference empty.
            above_candidate = [candidate for candidate, candidate_b in position_data
                               if candidate_b > above_b]
            results.append(item[0] + ' ' + ''.join(above_candidate))
        else:
            for candidate in address_candidate:
                if string_matcher(item[0], candidate):  # Company name.
                    flag_cnt1 = True
                    results.append(item[0])  # We need the original company name's key + the real matched address
                    break

    # The counters are per file (at most +1 each), so the ratios printed at the
    # end are genuine fractions of `total`.
    if flag_cnt1:
        cnt1 += 1
    if flag_cnt2:
        cnt2 += 1
    print(len(address_candidate))
    print(address_candidate)
    print(len(results))
    print(results)
    if len(results) == 0:
        cnt3 += 1

if total:
    print("Company Name Percentage: " + str(cnt1 / total))
    print("Above or Below Percentage: " + str(cnt2 / total))
else:
    print("No files found in " + TRAIN_DIR)