In [18]:
import spacy
from spacy.tokens import DocBin
from spacy.util import filter_spans
import pandas as pd
import numpy as np
import re
import csv
from fuzzywuzzy import fuzz

from geopy.geocoders import Nominatim
import pgeocode

In [19]:
def check_consecutive_digits(address):
    pattern = r"\d{6}"  # regex pattern for six consecutive digits
    matches = re.findall(pattern, address)
    return matches

def validate_postal_code(postal_code):
    geocoder = pgeocode.Nominatim('sg')
    result = geocoder.query_postal_code(postal_code)
    if not result.empty:
        print("[pgeocode] Postal code: " + str(postal_code) + ", Result: (" + str(result.latitude) + ", " + str(result.longitude) + ")")
        if result.country_code.upper() == 'SG' and check_location_in_sg(result.latitude, result.longitude):
            return True  # Postal code is valid in Singapore

    else:
        geolocator = Nominatim(user_agent="myGeocoder")
        location = geolocator.geocode(postal_code)
        if location is not None:
            print("[geopy___] Postal code: " + str(postal_code) + ", Result: (" + str(location.latitude) + ", " + str(location.longitude) + ")")
            if check_location_in_sg(location.latitude, location.longitude):
                return True

        print("[pgeocode] Postal code: " + str(postal_code) + ", Result: " + str(result))
        return False  # Postal code is not valid in Singapore

def check_location_in_sg(latitude, longitude):
    # latitude [1.15, 1.47]
    # longitude [103.6, 104.1]

    min_lat_sg = 1.15
    max_lat_sg = 1.47
    min_lng_sg = 103.6
    max_lng_sg = 104.1

    if (min_lat_sg <= latitude <= max_lat_sg) and (min_lng_sg <= longitude <= max_lng_sg):
        return True
    return False  
    
def validate_singapore_address(input_address):
    consecutive_digit_matches = check_consecutive_digits(input_address)
    if consecutive_digit_matches:
        for match in consecutive_digit_matches:
            if validate_postal_code(match):
                return "SINGAPORE"  # At least one valid postal code found in Singapore
    return "FOREIGN"  # No valid postal code found in Singapore

In [20]:
# Methods for SG address segmentation

##########################################
# workaround to extract postal code. to remove this portion after training with new data.
# can consider using this portion as a cross check with the postal code predicted by model
def get_singapore_postal_code(address):
    consecutive_digit_matches = check_consecutive_digits(address)
    if consecutive_digit_matches:
        for match in consecutive_digit_matches:
            if validate_postal_code(match):
                return match
    return None
##########################################


def predict_fields_sg(df):
    block_list = []
    street_list = []
    building_list = []
    floor_list = []
    unit_list = []
    country_list = []
    postal_code_list = []

    for index, address in enumerate(df['address']):

        # note: ent.label is an integer identifier for entity while ent.label_ uses a string representation
        entities = {ent.label_: ent.text for ent in nlp(str(address)).ents}

        block_list.append(entities.get('BLOCK', ''))
        street_list.append(entities.get('STREET', ''))
        building_list.append(entities.get('BUILDING', ''))
        floor_list.append(entities.get('FLOOR_NUM', ''))
        unit_list.append(entities.get('UNIT_NUM', ''))
        country_list.append(entities.get('COUNTRY', ''))
#         postal_code_list.append(entities.get('POSTAL_CODE', ''))

        ####################################
        # workaround to extract postal code. to remove this portion after training with new data.
        # can consider using this portion as a cross check with the postal code predicted by model
        postal_code = get_singapore_postal_code(address)
        postal_code_list.append(postal_code)
        ####################################

        print(f"Iteration: {index+1}/{len(df)}")

    tested_data = df.assign(
        predicted_block = block_list,
        predicted_street = street_list,
        predicted_building = building_list,
        predicted_floor_num = floor_list,
        predicted_unit_num = unit_list,
        predicted_country = country_list,
        predicted_postal_code = postal_code_list
    )
    
    tested_data = tested_data.replace(r'^\s*$', np.nan, regex=True)
    return tested_data

In [21]:
# Methods for foreign address segmentation

def predict_fields_foreign(df):
    street_list = []
    zipcode_list = []
    city_list = []
    province_list = []
    country_list = []

    for index, address in enumerate(df['address']):

        # note: ent.label is an integer identifier for entity while ent.label_ uses a string representation
        entities = {ent.label_: ent.text for ent in nlp(str(address)).ents}

        street_list.append(entities.get('STREET', ''))
        zipcode_list.append(entities.get('ZIPCODE', ''))
        city_list.append(entities.get('CITY', ''))
        province_list.append(entities.get('PROVINCE', ''))
        country_list.append(entities.get('COUNTRY', ''))

        print(f"Iteration: {index+1}/{len(df)}")

    tested_data = df.assign(
        predicted_street = street_list,
        predicted_zipcode = zipcode_list,
        predicted_city = city_list,
        predicted_province = province_list,
        predicted_country = country_list
    )
    
    tested_data = tested_data.replace(r'^\s*$', np.nan, regex=True)
    return tested_data

### Input test addresses

In [15]:
model_mapping = {
    'SINGAPORE': './ner-sg/output/models/model-best',
    'FOREIGN': './ner-foreign/output/models/model-best',
    'DEFAULT': './ner-sg/output/models/model-best'  # TODO: Add default model path
}

# Load all models
print("Loading models...")
loaded_models = {}
for country, model_directory in model_mapping.items():
    loaded_models[country] = spacy.load(model_directory)

for model in loaded_models:
    print(model)

print("Models loaded!")

Loading models...
SINGAPORE
FOREIGN
DEFAULT
Models loaded!


In [32]:
# TEST DATA

address_list = [
    "CIKARANG TIMUR JATIREJA NO. 425 BEKASI KABUPATEN JAWA BARAT 17530 INDONESIA",
    "NO. 44 JALAN DESA MELUR 56000 CHERAS KUALA LUMPUR MALAYSIA",
    "9 TEMASEK ROAD #07-01 SUNTEC TOWER 2 SINGAPORE 038989",
    "9 TEMASEK ROAD #07-01 SUNTEC TOWER 2 S038989",
    "9 TEMASEK ROAD #07-01 SUNTEC TOWER 2 S(038989)",
    "9 TEMASEK ROAD #07-01 SUNTEC TOWER 2 038989"
]

In [33]:
for address in address_list:
    
    cleaned_address = address.replace(',', '').strip()
    
    identified_country = validate_singapore_address(cleaned_address)
    
    # Select the corresponding model or use the default model
    nlp = loaded_models.get(identified_country, loaded_models['DEFAULT'])

    doc = nlp(cleaned_address)

    entities = [(ent.text, ent.label_) for ent in doc.ents]

#     formatted_entities = ', '.join(f'({text.replace(",", "").strip()}, {label})' for text, label in entities if text)
    
    print("Address string:", cleaned_address)
    print("Identified country:", identified_country)
    print("Parsed address:", str(entities))
    print("--------------------------")

Address string: CIKARANG TIMUR JATIREJA NO. 425 BEKASI KABUPATEN JAWA BARAT 17530 INDONESIA
Identified country: FOREIGN
Parsed address: [('CIKARANG TIMUR JATIREJA NO. 425', 'STREET'), ('BEKASI KABUPATEN', 'CITY'), ('JAWA BARAT', 'PROVINCE'), ('17530', 'ZIPCODE'), ('INDONESIA', 'COUNTRY')]
--------------------------
Address string: NO. 44 JALAN DESA MELUR 56000 CHERAS KUALA LUMPUR MALAYSIA
Identified country: FOREIGN
Parsed address: [('NO. 44 JALAN DESA MELUR', 'STREET'), ('56000', 'ZIPCODE'), ('CHERAS', 'CITY'), ('KUALA LUMPUR', 'PROVINCE'), ('MALAYSIA', 'COUNTRY')]
--------------------------
[pgeocode] Postal code: 038989, Result: (1.2956, 103.859)
Address string: 9 TEMASEK ROAD #07-01 SUNTEC TOWER 2 SINGAPORE 038989
Identified country: SINGAPORE
Parsed address: [('9', 'BLOCK'), ('TEMASEK ROAD', 'STREET'), ('07', 'FLOOR_NUM'), ('01', 'UNIT_NUM'), ('SUNTEC TOWER 2', 'BUILDING'), ('SINGAPORE', 'COUNTRY'), ('038989', 'POSTAL_CODE')]
--------------------------
[pgeocode] Postal code: 0389

### Testing with data in csv files

In [34]:
# TEST DATA
test_data = pd.read_csv("./test-data/test_data.csv")
test_data

Unnamed: 0,address
0,CIKARANG TIMUR JATIREJA NO. 425 BEKASI KABUPAT...
1,NO. 44 JALAN DESA MELUR 56000 CHERAS KUALA LUM...
2,9 TEMASEK ROAD #07-01 SUNTEC TOWER 2 SINGAPORE...
3,9 TEMASEK ROAD #07-01 SUNTEC TOWER 2 S038989
4,9 TEMASEK ROAD #07-01 SUNTEC TOWER 2 S(038989)
5,9 TEMASEK ROAD #07-01 SUNTEC TOWER 2 038989


In [35]:
model_mapping = {
    'SINGAPORE': './ner-sg/output/models/model-best',
    'FOREIGN': './ner-foreign/output/models/model-best',
    'DEFAULT': './ner-sg/output/models/model-best'  # TODO: Add default model path
}

# Load all models
print("Loading models...")
loaded_models = {}
for country, model_directory in model_mapping.items():
    loaded_models[country] = spacy.load(model_directory)

for model in loaded_models:
    print(model)
    
print("Models loaded!")

Loading models...
SINGAPORE
FOREIGN
DEFAULT
Models loaded!


In [40]:
component_lists = {
    'block': [],
    'street': [],
    'building': [],
    'floor_num': [],
    'unit_num': [],
    'postal_code': [],
    'country': [],
    'city': [],
    'province': [],
    'zipcode': []
}

for index, row in test_data.iterrows():
    cleaned_address = row['address'].replace(',', '').strip()
    identified_country = validate_singapore_address(cleaned_address)
    
    
    
    nlp = loaded_models.get(identified_country, loaded_models['DEFAULT'])

    doc = nlp(cleaned_address)

    entities = {ent.label_: ent.text for ent in doc.ents}

    for component in address_components:
        try:
            ent = entities.get(component.upper(), '')
        except:
            ent = np.nan
        component_lists[component].append(ent)
        ####################################
        # workaround to extract postal code. to remove this portion after training with new data.
        # can consider using this portion as a cross check with the postal code predicted by model
        if identified_country == "SINGAPORE":
            postal_code = get_singapore_postal_code(address)
            component_lists['postal_code'].append(postal_code)
        ####################################

    print(f"Iteration: {index+1}/{len(test_data)}")
    print("Address string:", cleaned_address)
    print("Identified country:", identified_country)
    print("Parsed address:", str(entities))
    print("--------------------------")

tested_data = pd.DataFrame(component_lists)
tested_data = tested_data.replace(r'^\s*$', np.nan, regex=True)
tested_data['postcode'] = tested_data['postal_code'].fillna(tested_data['zipcode'])
tested_data = tested_data.drop(['zipcode', 'postal_code'], axis=1)
tested_data

Iteration: 1/6
Address string: CIKARANG TIMUR JATIREJA NO. 425 BEKASI KABUPATEN JAWA BARAT 17530 INDONESIA
Identified country: FOREIGN
Parsed address: {'STREET': 'CIKARANG TIMUR JATIREJA NO. 425', 'CITY': 'BEKASI KABUPATEN', 'PROVINCE': 'JAWA BARAT', 'ZIPCODE': '17530', 'COUNTRY': 'INDONESIA'}
--------------------------
Iteration: 2/6
Address string: NO. 44 JALAN DESA MELUR 56000 CHERAS KUALA LUMPUR MALAYSIA
Identified country: FOREIGN
Parsed address: {'STREET': 'NO. 44 JALAN DESA MELUR', 'ZIPCODE': '56000', 'CITY': 'CHERAS', 'PROVINCE': 'KUALA LUMPUR', 'COUNTRY': 'MALAYSIA'}
--------------------------
[pgeocode] Postal code: 038989, Result: (1.2956, 103.859)
[pgeocode] Postal code: 038989, Result: (1.2956, 103.859)
[pgeocode] Postal code: 038989, Result: (1.2956, 103.859)
[pgeocode] Postal code: 038989, Result: (1.2956, 103.859)
[pgeocode] Postal code: 038989, Result: (1.2956, 103.859)
[pgeocode] Postal code: 038989, Result: (1.2956, 103.859)
[pgeocode] Postal code: 038989, Result: (1

ValueError: All arrays must be of the same length

In [60]:
tested_data.to_csv("./test-data/tested_data.csv")