In [1]:
import spacy
from spacy.tokens import DocBin
from spacy.util import filter_spans
import pandas as pd
import numpy as np
import re
import csv
from fuzzywuzzy import fuzz
from geopy.geocoders import Nominatim
import pgeocode

In [2]:
def check_consecutive_digits(address):
    '''
    Extract all instances of consecutive 6 digit substrings in input address.

    Args:
        address (str): Input raw address.

    Returns:
        List: List containing all instances of consecutive 6 digit substrings
    '''
    # regex pattern for six consecutive digits
    pattern = r"\d{6}"  
    # note: there may be multiple substrings with 6 consecutive digits
    matches = re.findall(pattern, address)
    return matches

In [15]:
def validate_postal_code(postal_code):
    '''
    Validate if postal code provided is valid in Singapore.
    
    Args:
        postal_code (str): input postal code
    
    Returns:
        bool: True if postal code is valid in Singapore, False otherwise.
    '''
    # validate postal code using pgeocode
    geocoder = pgeocode.Nominatim('sg')
    result = geocoder.query_postal_code(postal_code)
    
    if not result.empty:
        print("[pgeocode] Postal code: " + str(postal_code) + ", Result: (" + str(result.latitude) + ", " + str(result.longitude) + ")")
        if check_location_in_sg(result.latitude, result.longitude):
            return True  # postal code is valid in SG

    else:
        # do a secondary validation using geopy
#         geolocator = Nominatim(user_agent="myGeocoder")
#         location = geolocator.geocode(postal_code)
#         if location is not None:
#             print("[geopy___] Postal code: " + str(postal_code) + ", Result: (" + str(location.latitude) + ", " + str(location.longitude) + ")")
#             if check_location_in_sg(location.latitude, location.longitude):
#                 return True # postal code is valid in SG

        print("[pgeocode] Postal code: " + str(postal_code) + ", Result: " + str(result))
        return False  # postal code is NOT valid in SG

In [4]:
def check_location_in_sg(latitude, longitude):
    '''
    Checks if coordinates lie in Singapore boundary.
    
    Args:
        latitude (float): latitude of extracted location
        longitude (float): longitude of extracted location
        
    Returns:
        bool: True if coordinates lie in Singapore boundary, False otherwise.
    '''
    # latitude [1.15, 1.47]
    # longitude [103.6, 104.1]

    min_lat_sg = 1.15
    max_lat_sg = 1.47
    min_lng_sg = 103.6
    max_lng_sg = 104.1

    if (min_lat_sg <= latitude <= max_lat_sg) and (min_lng_sg <= longitude <= max_lng_sg):
        return True
    return False  

In [5]:
def validate_singapore_address(input_address):
    '''
    Extract all instances of consecutive 6 digit substrings and validate them.
    
    Args:
        input_address (str): Input address.
        
    Returns:
        "SINGAPORE" if one of the extracted pattern is a valid postal code in Singapore,
        "FOREIGN" otherwise.
    '''
     
    consecutive_digit_matches = check_consecutive_digits(input_address)
    if consecutive_digit_matches:
        for match in consecutive_digit_matches:
            if validate_postal_code(match):
                return "SINGAPORE"
    return "FOREIGN"

In [6]:
##############################################################################
# workaround to extract postal code. may remove this portion if model becomes more robust in future
# can consider using this portion as a cross check with the postal code predicted by model
def get_singapore_postal_code(address):
    consecutive_digit_matches = check_consecutive_digits(address)
    if consecutive_digit_matches:
        for match in consecutive_digit_matches:
            if validate_postal_code(match):
                return match
    return None
##############################################################################

In [7]:
# Methods for SG address segmentation
# Note: to be used only if entire dataframe contains only SG addresses

def predict_fields_sg(df):
    block_list = []
    street_list = []
    building_list = []
    floor_list = []
    unit_list = []
    country_list = []
    postal_code_list = []

    for index, address in enumerate(df['address']):

        # note: ent.label is an integer identifier for entity while ent.label_ uses a string representation
        entities = {ent.label_: ent.text for ent in nlp(str(address)).ents}

        block_list.append(entities.get('BLOCK', ''))
        street_list.append(entities.get('STREET', ''))
        building_list.append(entities.get('BUILDING', ''))
        floor_list.append(entities.get('FLOOR_NUM', ''))
        unit_list.append(entities.get('UNIT_NUM', ''))
        country_list.append(entities.get('COUNTRY', ''))
#         postal_code_list.append(entities.get('POSTAL_CODE', ''))

        ##############################################################################
        # workaround to extract postal code. may remove this portion if model becomes more robust in future
        # can consider using this portion as a cross check with the postal code predicted by model
        postal_code = get_singapore_postal_code(address)
        postal_code_list.append(postal_code)
        ##############################################################################

        print(f"Iteration: {index+1}/{len(df)}")

    tested_data = df.assign(
        predicted_block = block_list,
        predicted_street = street_list,
        predicted_building = building_list,
        predicted_floor_num = floor_list,
        predicted_unit_num = unit_list,
        predicted_country = country_list,
        predicted_postal_code = postal_code_list
    )
    
    tested_data = tested_data.replace(r'^\s*$', np.nan, regex=True)
    return tested_data

In [8]:
# Methods for foreign address segmentation
# Note: to be used only if entire dataframe contains only MY and ID addresses

def predict_fields_foreign(df):
    street_list = []
    zipcode_list = []
    city_list = []
    province_list = []
    country_list = []

    for index, address in enumerate(df['address']):

        # note: ent.label is an integer identifier for entity while ent.label_ uses a string representation
        entities = {ent.label_: ent.text for ent in nlp(str(address)).ents}

        street_list.append(entities.get('STREET', ''))
        zipcode_list.append(entities.get('ZIPCODE', ''))
        city_list.append(entities.get('CITY', ''))
        province_list.append(entities.get('PROVINCE', ''))
        country_list.append(entities.get('COUNTRY', ''))

        print(f"Iteration: {index+1}/{len(df)}")

    tested_data = df.assign(
        predicted_street = street_list,
        predicted_zipcode = zipcode_list,
        predicted_city = city_list,
        predicted_province = province_list,
        predicted_country = country_list
    )
    
    tested_data = tested_data.replace(r'^\s*$', np.nan, regex=True)
    return tested_data

### Input test addresses

In [9]:
model_mapping = {
    'SINGAPORE': './ner-sg/output/models/model-best',
    'FOREIGN': './ner-foreign/output/models/model-best',
    'DEFAULT': './ner-sg/output/models/model-best'  # TODO: Add default model path
}

# Load all models
print("Loading models...")
loaded_models = {}
for country, model_directory in model_mapping.items():
    loaded_models[country] = spacy.load(model_directory)

for model in loaded_models:
    print(model)

print("Models loaded!")

Loading models...
SINGAPORE
FOREIGN
DEFAULT
Models loaded!


In [34]:
# TEST DATA
# NOTE: this test shows the flaws in current implementation. workaround done in test with csv below

address_list = [
    "CIKARANG TIMUR JATIREJA NO. 425 BEKASI KABUPATEN JAWA BARAT 17530 INDONESIA",
    "KOTA RINGIN MEMPURA 895 SIAK RIAU 28773 INDONESIA",
    "MOJOLEBAK JETIS 547L MOJOKERTO JAWA TIMUR 61352 INDONESIA"
    "NO. 44 JALAN DESA MELUR 56000 CHERAS KUALA LUMPUR MALAYSIA",
    "LOT 3595 JALAN HASAN 84000 MUAR JOHOR MALAYSIA",
    "210K KAMPUNG RAJA 48000 RAWANG SELANGOR MALAYSIA",
    "101 THOMSON ROAD #01-23 UNITED SQUARE 123456 SINGAPORE 307591",
    "101 THOMSON ROAD #01-23 UNITED SQUARE S(307591)",
    "101 THOMSON ROAD #01-23 UNITED SQUARE S307591",
    "101 THOMSON ROAD #01-23 UNITED SQUARE 307591",
    "53 ANG MO KIO AVENUE 3 SINGAPORE 569933",
    "53 ANG MO KIO AVENUE 3 S(569933)",
    "53 ANG MO KIO AVENUE 3 S 569933",
    "53 ANG MO KIO AVENUE 3 569933"
]

In [35]:
for address in address_list:
    
    cleaned_address = address.replace(',', '').strip()    
    identified_country = validate_singapore_address(cleaned_address)
    
    # Select the corresponding model or use the default model
    nlp = loaded_models.get(identified_country, loaded_models['DEFAULT'])
    doc = nlp(cleaned_address)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    
    print("Address string:", cleaned_address)
    print("Identified country:", identified_country)
    print("Parsed address:", str(entities))
    print("--------------------------")

Address string: CIKARANG TIMUR JATIREJA NO. 425 BEKASI KABUPATEN JAWA BARAT 17530 INDONESIA
Identified country: FOREIGN
Parsed address: [('CIKARANG TIMUR JATIREJA NO. 425', 'STREET'), ('BEKASI KABUPATEN', 'CITY'), ('JAWA BARAT', 'PROVINCE'), ('17530', 'ZIPCODE'), ('INDONESIA', 'COUNTRY')]
--------------------------
Address string: KOTA RINGIN MEMPURA 895 SIAK RIAU 28773 INDONESIA
Identified country: FOREIGN
Parsed address: [('KOTA RINGIN MEMPURA 895', 'STREET'), ('SIAK', 'CITY'), ('RIAU', 'PROVINCE'), ('28773', 'ZIPCODE'), ('INDONESIA', 'COUNTRY')]
--------------------------
Address string: MOJOLEBAK JETIS 547L MOJOKERTO JAWA TIMUR 61352 INDONESIANO. 44 JALAN DESA MELUR 56000 CHERAS KUALA LUMPUR MALAYSIA
Identified country: FOREIGN
Parsed address: [('MOJOLEBAK JETIS 547L', 'STREET'), ('MOJOKERTO', 'CITY'), ('JAWA TIMUR', 'PROVINCE'), ('61352', 'ZIPCODE'), ('56000', 'ZIPCODE'), ('CHERAS', 'CITY'), ('KUALA LUMPUR', 'PROVINCE'), ('MALAYSIA', 'COUNTRY')]
--------------------------
Address 

### Testing with data in csv files

In [36]:
# TEST DATA
test_data = pd.read_csv("./test-data/test_data.csv")
test_data

Unnamed: 0,address
0,CIKARANG TIMUR JATIREJA NO. 425 BEKASI KABUPAT...
1,KOTA RINGIN MEMPURA 895 SIAK RIAU 28773 INDONESIA
2,MOJOLEBAK JETIS 547L MOJOKERTO JAWA TIMUR 6135...
3,NO. 44 JALAN DESA MELUR 56000 CHERAS KUALA LUM...
4,LOT 3595 JALAN HASAN 84000 MUAR JOHOR MALAYSIA
5,210K KAMPUNG RAJA 48000 RAWANG SELANGOR MALAYSIA
6,101 THOMSON ROAD #01-23 UNITED SQUARE 123456 S...
7,101 THOMSON ROAD #01-23 UNITED SQUARE S(307591)
8,101 THOMSON ROAD #01-23 UNITED SQUARE S307591
9,101 THOMSON ROAD #01-23 UNITED SQUARE 307591


In [37]:
model_mapping = {
    'SINGAPORE': './ner-sg/output/models/model-best',
    'FOREIGN': './ner-foreign/output/models/model-best',
    'DEFAULT': './ner-sg/output/models/model-best'  # TODO: Add default model path
}

# Load all models
print("Loading models...")
loaded_models = {}
for country, model_directory in model_mapping.items():
    loaded_models[country] = spacy.load(model_directory)

for model in loaded_models:
    print(model)
    
print("Models loaded!")

Loading models...
SINGAPORE
FOREIGN
DEFAULT
Models loaded!


In [None]:
##############################################################################
# workaround: may remove this portion if model becomes more robust in future
# can consider using this portion as a cross check with the postal code predicted by model
    if component == "postal_code":
        postal_code = get_singapore_postal_code(cleaned_address)
        ent = postal_code
    else:
        row[component] = row[component].str.replace[postal_code, '']
##############################################################################

In [38]:
component_lists = {
    'block': [],
    'street': [],
    'building': [],
    'floor_num': [],
    'unit_num': [],
    'country': [],
    'postal_code': [],
    'city': [],
    'province': [],
    'zipcode': []
}

for index, row in test_data.iterrows():
    print(f"Iteration: {index+1}/{len(test_data)}")
    cleaned_address = row['address'].replace(',', '').strip()
    identified_country = validate_singapore_address(cleaned_address)
    
    nlp = loaded_models.get(identified_country, loaded_models['DEFAULT'])

    doc = nlp(cleaned_address)

    entities = {ent.label_: ent.text for ent in doc.ents}

    for component in component_lists:
        try:
            ent = entities.get(component.upper(), '')
        except:
            ent = np.nan
        
        if identified_country=="SINGAPORE":
            if component == "country":
                ent = "SINGAPORE"
        ##############################################################################
        # workaround: may remove this portion if model becomes more robust in future
        # can consider using this portion as a cross check with the postal code predicted by model
            elif component == "postal_code":
                ent = get_singapore_postal_code(cleaned_address)
        ##############################################################################
            
        component_lists[component].append(ent)

    print("Address string:", cleaned_address)
    print("Identified country:", identified_country)
    print("--------------------------")

tested_data = pd.DataFrame(component_lists)
tested_data = tested_data.replace(r'^\s*$', np.nan, regex=True)
tested_data['postcode'] = tested_data['postal_code'].fillna(tested_data['zipcode'])
tested_data = tested_data.drop(['zipcode', 'postal_code'], axis=1)

# remove postal code from other columns (for cases where there is > 1 substrings with 6 consecutive digits)
for index, row in tested_data.iterrows():
    for col in tested_data.columns:
        if col != 'postcode':
            component_value = row[col]
            if not pd.isnull(component_value):
                tested_data.loc[index, col] = component_value.replace(row['postcode'], '')

tested_data

Iteration: 1/14
Address string: CIKARANG TIMUR JATIREJA NO. 425 BEKASI KABUPATEN JAWA BARAT 17530 INDONESIA
Identified country: FOREIGN
--------------------------
Iteration: 2/14
Address string: KOTA RINGIN MEMPURA 895 SIAK RIAU 28773 INDONESIA
Identified country: FOREIGN
--------------------------
Iteration: 3/14
Address string: MOJOLEBAK JETIS 547L MOJOKERTO JAWA TIMUR 61352 INDONESIA
Identified country: FOREIGN
--------------------------
Iteration: 4/14
Address string: NO. 44 JALAN DESA MELUR 56000 CHERAS KUALA LUMPUR MALAYSIA
Identified country: FOREIGN
--------------------------
Iteration: 5/14
Address string: LOT 3595 JALAN HASAN 84000 MUAR JOHOR MALAYSIA
Identified country: FOREIGN
--------------------------
Iteration: 6/14
Address string: 210K KAMPUNG RAJA 48000 RAWANG SELANGOR MALAYSIA
Identified country: FOREIGN
--------------------------
Iteration: 7/14
[pgeocode] Postal code: 123456, Result: (nan, nan)
[pgeocode] Postal code: 307591, Result: (1.3172, 103.8437)
[pgeocode] Po

Unnamed: 0,block,street,building,floor_num,unit_num,country,city,province,postcode
0,,CIKARANG TIMUR JATIREJA NO. 425,,,,INDONESIA,BEKASI KABUPATEN,JAWA BARAT,17530
1,,KOTA RINGIN MEMPURA 895,,,,INDONESIA,SIAK,RIAU,28773
2,,MOJOLEBAK JETIS 547L,,,,INDONESIA,MOJOKERTO,JAWA TIMUR,61352
3,,NO. 44 JALAN DESA MELUR,,,,MALAYSIA,CHERAS,KUALA LUMPUR,56000
4,,LOT 3595 JALAN HASAN,,,,MALAYSIA,MUAR,JOHOR,84000
5,,210K KAMPUNG RAJA,,,,MALAYSIA,RAWANG,SELANGOR,48000
6,101.0,THOMSON ROAD,UNITED SQUARE 123456,1.0,23.0,SINGAPORE,,,307591
7,101.0,THOMSON ROAD,UNITED SQUARE,1.0,23.0,SINGAPORE,,,307591
8,101.0,THOMSON ROAD,UNITED SQUARE,1.0,23.0,SINGAPORE,,,307591
9,101.0,THOMSON ROAD,UNITED SQUARE,1.0,23.0,SINGAPORE,,,307591


In [49]:
tested_data.to_csv("./test-data/tested_data.csv")