In [1]:
import spacy
from spacy.tokens import DocBin
from spacy.util import filter_spans
import pandas as pd
import numpy as np
import re
import csv
from fuzzywuzzy import fuzz

In [2]:
# Validate whether the given address is a Singapore (SG) address.
# TODO: consider also matching the postal code pattern (6 digits?).

def validate_singapore_address(address):
    """Classify an address string as "SINGAPORE" or "FOREIGN".

    The address counts as Singapore when it contains SINGAPORE, SGP or SG as a
    standalone word, matched case-insensitively; anything else is "FOREIGN".
    """
    singapore_token = re.compile(r"\b(SINGAPORE|SGP|SG)\b", re.IGNORECASE)
    return "SINGAPORE" if singapore_token.search(address) else "FOREIGN"

In [4]:
# Methods for SG address segmentation

def predict_fields_sg(df, model=None):
    """Segment Singapore addresses into predicted component columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an 'address' column. Each value is passed through str()
        before being given to the model.
    model : callable, optional
        NER pipeline, called as ``model(text)``; the result must expose
        ``.ents`` whose items have ``.label_`` and ``.text``. When omitted, the
        module-level ``nlp`` is used (the original behaviour), which raises
        NameError if no cell has bound ``nlp`` yet.

    Returns
    -------
    pandas.DataFrame
        A new frame (``df`` itself is not modified) with the ``predicted_*``
        columns added. Empty or whitespace-only strings anywhere in the frame
        are replaced by NaN.
    """
    if model is None:
        # Backward-compatible fallback: earlier callers relied on whichever
        # pipeline was last assigned to the global `nlp`. Pass `model`
        # explicitly to avoid depending on cell execution order.
        model = nlp

    # Output column -> entity label emitted by the model (insertion order is
    # the column order of the result).
    column_labels = {
        'predicted_block': 'BLOCK',
        'predicted_street': 'STREET',
        'predicted_building': 'BUILDING',
        'predicted_floor_num': 'FLOOR_NUM',
        'predicted_unit_num': 'UNIT_NUM',
        'predicted_country': 'COUNTRY',
        'predicted_postal_code': 'POSTAL_CODE',
    }
    predictions = {column: [] for column in column_labels}
    total = len(df)

    for index, address in enumerate(df['address']):

        # note: ent.label is an integer identifier for entity while ent.label_ uses a string representation
        # If a label occurs more than once, the last occurrence wins.
        entities = {ent.label_: ent.text for ent in model(str(address)).ents}

        for column, label in column_labels.items():
            predictions[column].append(entities.get(label, ''))

        print(f"Iteration: {index+1}/{total}")

    tested_data = df.assign(**predictions)

    # Labels the model did not find were stored as '' above; turn them into NaN.
    tested_data = tested_data.replace(r'^\s*$', np.nan, regex=True)
    return tested_data

In [5]:
# Methods for foreign address segmentation

def predict_fields_foreign(df, model=None):
    """Segment non-Singapore addresses into predicted component columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an 'address' column. Each value is passed through str()
        before being given to the model.
    model : callable, optional
        NER pipeline, called as ``model(text)``; the result must expose
        ``.ents`` whose items have ``.label_`` and ``.text``. When omitted, the
        module-level ``nlp`` is used (the original behaviour), which raises
        NameError if no cell has bound ``nlp`` yet.

    Returns
    -------
    pandas.DataFrame
        A new frame (``df`` itself is not modified) with the ``predicted_*``
        columns added. Empty or whitespace-only strings anywhere in the frame
        are replaced by NaN.
    """
    if model is None:
        # Backward-compatible fallback: earlier callers relied on whichever
        # pipeline was last assigned to the global `nlp`. Pass `model`
        # explicitly to avoid depending on cell execution order.
        model = nlp

    # Output column -> entity label emitted by the model (insertion order is
    # the column order of the result).
    column_labels = {
        'predicted_street': 'STREET',
        'predicted_zipcode': 'ZIPCODE',
        'predicted_city': 'CITY',
        'predicted_province': 'PROVINCE',
        'predicted_country': 'COUNTRY',
    }
    predictions = {column: [] for column in column_labels}
    total = len(df)

    for index, address in enumerate(df['address']):

        # note: ent.label is an integer identifier for entity while ent.label_ uses a string representation
        # If a label occurs more than once, the last occurrence wins.
        entities = {ent.label_: ent.text for ent in model(str(address)).ents}

        for column, label in column_labels.items():
            predictions[column].append(entities.get(label, ''))

        print(f"Iteration: {index+1}/{total}")

    tested_data = df.assign(**predictions)

    # Labels the model did not find were stored as '' above; turn them into NaN.
    tested_data = tested_data.replace(r'^\s*$', np.nan, regex=True)
    return tested_data

### Input test addresses

In [13]:
# TEST DATA
# Sample address strings for the in-notebook demo: the first two end in a
# foreign country name (INDONESIA, MALAYSIA); the third contains SINGAPORE.

address_list = [
    "CIKARANG TIMUR JATIREJA NO. 425 BEKASI KABUPATEN JAWA BARAT 17530 INDONESIA",
    "NO. 44 JALAN DESA MELUR 56000 CHERAS KUALA LUMPUR MALAYSIA",
    "9 TEMASEK BOULEVARD #07-01 SUNTEC TOWER 2 SINGAPORE 038989"
]

In [20]:
# Country key -> directory of the trained spaCy pipeline used for that key.
model_mapping = {
    'SINGAPORE': './ner-sg/output/models/model-best',
    'FOREIGN': './ner-foreign/output/models/model-best',
    'DEFAULT': './ner-sg/output/models/model-best'  # TODO: Add default model path
}

# Load all models
print("Loading models...")
loaded_models = {}
# Several keys may point at the same directory (DEFAULT currently reuses the
# SINGAPORE path); load each distinct directory once and share the pipeline.
models_by_directory = {}
for country, model_directory in model_mapping.items():
    if model_directory not in models_by_directory:
        models_by_directory[model_directory] = spacy.load(model_directory)
    loaded_models[country] = models_by_directory[model_directory]

# Print the keys that now have a loaded pipeline.
for model in loaded_models:
    print(model)

print("Models loaded!")

Loading models...
SINGAPORE
FOREIGN
DEFAULT
Models loaded!


In [14]:
# Parse every sample address with the pipeline matching its detected country
# and print the recognised (text, label) pairs.
for address in address_list:
    # Commas are dropped before both country detection and parsing.
    cleaned_address = address.replace(',', '').strip()
    identified_country = validate_singapore_address(cleaned_address)

    # Select the corresponding model or use the default model.
    # `nlp` is deliberately left as a module-level name.
    default_model = loaded_models['DEFAULT']
    nlp = loaded_models.get(identified_country, default_model)

    doc = nlp(cleaned_address)

    entities = []
    for ent in doc.ents:
        entities.append((ent.text, ent.label_))

    print("Address string:", cleaned_address)
    print("Identified country:", identified_country)
    print("Parsed address:", str(entities))
    print("--------------------------")

Address string: CIKARANG TIMUR JATIREJA NO. 425 BEKASI KABUPATEN JAWA BARAT 17530 INDONESIA
Identified country: FOREIGN
Parsed address: [('CIKARANG TIMUR JATIREJA NO. 425', 'STREET'), ('BEKASI KABUPATEN', 'CITY'), ('JAWA BARAT', 'PROVINCE'), ('17530', 'ZIPCODE'), ('INDONESIA', 'COUNTRY')]
--------------------------
Address string: NO. 44 JALAN DESA MELUR 56000 CHERAS KUALA LUMPUR MALAYSIA
Identified country: FOREIGN
Parsed address: [('NO. 44 JALAN DESA MELUR', 'STREET'), ('56000', 'ZIPCODE'), ('CHERAS', 'CITY'), ('KUALA LUMPUR', 'PROVINCE'), ('MALAYSIA', 'COUNTRY')]
--------------------------
Address string: 9 TEMASEK BOULEVARD #07-01 SUNTEC TOWER 2 SINGAPORE 038989
Identified country: SINGAPORE
Parsed address: [('9', 'BLOCK'), ('TEMASEK BOULEVARD', 'STREET'), ('07', 'FLOOR_NUM'), ('01', 'UNIT_NUM'), ('SUNTEC TOWER 2', 'BUILDING'), ('SINGAPORE', 'COUNTRY'), ('038989', 'POSTAL_CODE')]
--------------------------


### Testing with data in CSV files

In [56]:
# TEST DATA
# Read the test addresses from CSV; later cells use the 'address' column.
test_data_path = "./test-data/test_data.csv"
test_data = pd.read_csv(test_data_path)
test_data

Unnamed: 0,address
0,CIKARANG TIMUR JATIREJA NO. 425 BEKASI KABUPAT...
1,NO. 44 JALAN DESA MELUR 56000 CHERAS KUALA LUM...
2,9 TEMASEK BOULEVARD #07-01 SUNTEC TOWER 2 SING...


In [57]:
# Country key -> directory of the trained spaCy pipeline used for that key.
model_mapping = {
    'SINGAPORE': './ner-sg/output/models/model-best',
    'FOREIGN': './ner-foreign/output/models/model-best',
    'DEFAULT': './ner-sg/output/models/model-best'  # TODO: Add default model path
}

# Load all models
print("Loading models...")
loaded_models = {}
# Several keys may point at the same directory (DEFAULT currently reuses the
# SINGAPORE path); load each distinct directory once and share the pipeline.
models_by_directory = {}
for country, model_directory in model_mapping.items():
    if model_directory not in models_by_directory:
        models_by_directory[model_directory] = spacy.load(model_directory)
    loaded_models[country] = models_by_directory[model_directory]

# Print the keys that now have a loaded pipeline.
for model in loaded_models:
    print(model)

print("Models loaded!")

Loading models...
SINGAPORE
FOREIGN
DEFAULT
Models loaded!


In [58]:
# Lower-case component names; their upper-case forms are the entity labels
# looked up in the parsed results.
address_components = [
    'block',
    'street',
    'building',
    'floor_num',
    'unit_num',
    'postal_code',
    'zipcode',
    'city',
    'province',
    'country',
]
# One independent, initially empty list per component.
component_lists = dict((component, []) for component in address_components)

In [59]:
# Start from empty per-component lists so that re-running this cell does not
# append a second copy of every row to lists left over from a previous run.
component_lists = {component: [] for component in address_components}

total_rows = len(test_data)

# Count rows by position rather than by index label, so the progress line is
# correct even when the DataFrame index is not 0..n-1.
for position, (_, row) in enumerate(test_data.iterrows(), start=1):
    # Commas are dropped before both country detection and parsing.
    cleaned_address = row['address'].replace(',', '').strip()
    identified_country = validate_singapore_address(cleaned_address)

    # Select the corresponding model or use the default model.
    nlp = loaded_models.get(identified_country, loaded_models['DEFAULT'])

    doc = nlp(cleaned_address)

    # label -> text; if a label occurs more than once the last occurrence wins.
    entities = {ent.label_: ent.text for ent in doc.ents}

    # dict.get never raises, so no try/except is needed: labels the model did
    # not produce are stored as '' and converted to NaN after the loop.
    for component in address_components:
        component_lists[component].append(entities.get(component.upper(), ''))

    print(f"Iteration: {position}/{total_rows}")
    print("Address string:", cleaned_address)
    print("Identified country:", identified_country)
    print("Parsed address:", str(entities))
    print("--------------------------")

tested_data = pd.DataFrame(component_lists)
# Empty / whitespace-only cells become NaN so fillna below can merge columns.
tested_data = tested_data.replace(r'^\s*$', np.nan, regex=True)
# Fold the two postcode columns into a single 'postcode' column, preferring
# 'postal_code' and falling back to 'zipcode' where it is missing.
tested_data['postcode'] = tested_data['postal_code'].fillna(tested_data['zipcode'])
tested_data = tested_data.drop(['zipcode', 'postal_code'], axis=1)
tested_data

Iteration: 1/3
Address string: CIKARANG TIMUR JATIREJA NO. 425 BEKASI KABUPATEN JAWA BARAT 17530 INDONESIA
Identified country: FOREIGN
Parsed address: {'STREET': 'CIKARANG TIMUR JATIREJA NO. 425', 'CITY': 'BEKASI KABUPATEN', 'PROVINCE': 'JAWA BARAT', 'ZIPCODE': '17530', 'COUNTRY': 'INDONESIA'}
--------------------------
Iteration: 2/3
Address string: NO. 44 JALAN DESA MELUR 56000 CHERAS KUALA LUMPUR MALAYSIA
Identified country: FOREIGN
Parsed address: {'STREET': 'NO. 44 JALAN DESA MELUR', 'ZIPCODE': '56000', 'CITY': 'CHERAS', 'PROVINCE': 'KUALA LUMPUR', 'COUNTRY': 'MALAYSIA'}
--------------------------
Iteration: 3/3
Address string: 9 TEMASEK BOULEVARD #07-01 SUNTEC TOWER 2 SINGAPORE 038989
Identified country: SINGAPORE
Parsed address: {'BLOCK': '9', 'STREET': 'TEMASEK BOULEVARD', 'FLOOR_NUM': '07', 'UNIT_NUM': '01', 'BUILDING': 'SUNTEC TOWER 2', 'COUNTRY': 'SINGAPORE', 'POSTAL_CODE': '038989'}
--------------------------


Unnamed: 0,block,street,building,floor_num,unit_num,city,province,country,postcode
0,,CIKARANG TIMUR JATIREJA NO. 425,,,,BEKASI KABUPATEN,JAWA BARAT,INDONESIA,17530
1,,NO. 44 JALAN DESA MELUR,,,,CHERAS,KUALA LUMPUR,MALAYSIA,56000
2,9.0,TEMASEK BOULEVARD,SUNTEC TOWER 2,7.0,1.0,,,SINGAPORE,38989


In [60]:
# Persist the parsed components. With the default arguments, to_csv also writes
# the DataFrame index as the first column.
# NOTE(review): numeric-looking text such as postcodes presumably needs
# dtype=str when this file is read back, or leading zeros may be lost - confirm.
tested_data.to_csv("./test-data/tested_data.csv")