In [16]:
import pandas as pd
import requests
import time
from tqdm import tqdm

def geocode_with_nominatim(block, street_name):
    url = "https://nominatim.openstreetmap.org/search"
    
    address_formats = [
        f"{block} {street_name}, Singapore",
        f"Block {block} {street_name}, Singapore",
        f"{block} {street_name}, SG",
    ]
    
    headers = {'User-Agent': 'HDB-Geocoder/1.0'}
    
    for address in address_formats:
        params = {
            'q': address,
            'format': 'json',
            'limit': 1,
            'countrycodes': 'sg'
        }
        
        try:
            response = requests.get(url, params=params, headers=headers, timeout=10)
            data = response.json()
            
            if len(data) > 0:
                result = data[0]
                lat = float(result['lat'])
                lon = float(result['lon'])
                
                if 1.15 <= lat <= 1.47 and 103.6 <= lon <= 104.0:
                    return (lat, lon)
            
            time.sleep(1.1)
        except:
            time.sleep(1.1)
    
    return (None, None)

# Load dataset
df = pd.read_csv("01_Original_datasets/HDB_with_coordinates.csv")
print(f"Loaded {len(df):,} rows")

# Replaceing C'WEALTH to COMMONWEALTH
df['street_name'] = df['street_name'].str.replace("C'WEALTH", "COMMONWEALTH", regex=False)

# Find missing
missing_mask = df['latitude'].isna() | df['longitude'].isna()
missing_df = df[missing_mask]
unique_missing = missing_df[['block', 'street_name']].drop_duplicates()

print(f"Geocoding {len(unique_missing):,} unique addresses...")

# Geocode
address_coords = {}
for idx, row in tqdm(unique_missing.iterrows(), total=len(unique_missing)):
    key = (str(row['block']), str(row['street_name']))
    lat, lon = geocode_with_nominatim(row['block'], row['street_name'])
    address_coords[key] = (lat, lon)

# Map back
for idx, row in missing_df.iterrows():
    key = (str(row['block']), str(row['street_name']))
    if key in address_coords:
        lat, lon = address_coords[key]
        df.at[idx, 'latitude'] = lat
        df.at[idx, 'longitude'] = lon

# Save
df.to_csv("HDB_geocoded.csv", index=False)
print(f"Done! Missing: {df['latitude'].isna().sum()}")

Loaded 217,372 rows
Geocoding 92 unique addresses...


100%|███████████████████████████████████████████████████████████████████████████████████████████████| 92/92 [01:39<00:00,  1.08s/it]


Done! Missing: 0
