# Geocode Singapore Addresses

In [9]:
import glob
import os

import geopandas as gpd
import pandas as pd
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import MapBox

##### Read in CSV

In [10]:
# This notebook's last cell writes ./resale.csv into the same folder that is
# globbed here, so that file must be skipped: otherwise a second run would
# read its own output back in as additional input rows.
OUTPUT_CSV_NAME = 'resale.csv'

# glob gives no ordering guarantee; sort so the concatenated row index is the
# same on every run.
resale_price_all = sorted(
    path for path in glob.glob("./*.csv")
    if os.path.basename(path) != OUTPUT_CSV_NAME
)
if not resale_price_all:
    raise FileNotFoundError("No input CSV files found in the current directory")

# Read every input file and stack them into one frame with a fresh 0..n-1 index.
frames = [pd.read_csv(filename, index_col=None, header=0) for filename in resale_price_all]
resale_df = pd.concat(frames, axis=0, ignore_index=True)

# Free-text address sent to the geocoder: "<block> <street_name>, Singapore".
resale_df['ADDRESS'] = resale_df['block'].astype(str) + ' ' + resale_df['street_name'] + ',' + ' Singapore'
resale_df.info()
resale_df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 746207 entries, 0 to 746206
Data columns (total 12 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   month                746207 non-null  object 
 1   town                 746207 non-null  object 
 2   flat_type            746207 non-null  object 
 3   block                746207 non-null  object 
 4   street_name          746207 non-null  object 
 5   storey_range         746207 non-null  object 
 6   floor_area_sqm       746207 non-null  float64
 7   flat_model           746207 non-null  object 
 8   lease_commence_date  746207 non-null  int64  
 9   resale_price         746207 non-null  float64
 10  remaining_lease      37153 non-null   float64
 11  ADDRESS              746207 non-null  object 
dtypes: float64(3), int64(1), object(8)
memory usage: 68.3+ MB


Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price,remaining_lease,ADDRESS
0,2012-03,ANG MO KIO,2 ROOM,172,ANG MO KIO AVE 4,06 TO 10,45.0,Improved,1986,250000.0,,"172 ANG MO KIO AVE 4, Singapore"
1,2012-03,ANG MO KIO,2 ROOM,510,ANG MO KIO AVE 8,01 TO 05,44.0,Improved,1980,265000.0,,"510 ANG MO KIO AVE 8, Singapore"
2,2012-03,ANG MO KIO,3 ROOM,610,ANG MO KIO AVE 4,06 TO 10,68.0,New Generation,1980,315000.0,,"610 ANG MO KIO AVE 4, Singapore"
3,2012-03,ANG MO KIO,3 ROOM,474,ANG MO KIO AVE 10,01 TO 05,67.0,New Generation,1984,320000.0,,"474 ANG MO KIO AVE 10, Singapore"
4,2012-03,ANG MO KIO,3 ROOM,604,ANG MO KIO AVE 5,06 TO 10,67.0,New Generation,1980,321000.0,,"604 ANG MO KIO AVE 5, Singapore"
...,...,...,...,...,...,...,...,...,...,...,...,...
746202,2016-12,YISHUN,5 ROOM,297,YISHUN ST 20,13 TO 15,112.0,Improved,2000,488000.0,82.0,"297 YISHUN ST 20, Singapore"
746203,2016-12,YISHUN,5 ROOM,838,YISHUN ST 81,01 TO 03,122.0,Improved,1987,455000.0,69.0,"838 YISHUN ST 81, Singapore"
746204,2016-12,YISHUN,EXECUTIVE,664,YISHUN AVE 4,10 TO 12,181.0,Apartment,1992,778000.0,74.0,"664 YISHUN AVE 4, Singapore"
746205,2016-12,YISHUN,EXECUTIVE,325,YISHUN CTRL,01 TO 03,146.0,Maisonette,1988,575000.0,70.0,"325 YISHUN CTRL, Singapore"


##### Check bad data transforms
This should return no rows: every record should have a non-null `ADDRESS`.

In [11]:
# Rows whose ADDRESS could not be built (expected: none).
resale_df.loc[resale_df['ADDRESS'].isna()]

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price,remaining_lease,ADDRESS


##### Get unique addresses and expand street-name abbreviations for geocoding
Geocoding each distinct address only once lowers the number of API requests, and therefore the cost incurred.

In [12]:
# Substring rewrites applied, in this order, to every address before it is
# sent to the geocoder.
# NOTE(review): 'NILE' -> 'NIEL' is kept exactly as written; confirm the
# intended spelling of the target street name.
address_rewrites = {
    'BT': 'BUKIT',
    "C'WEALTH": "COMMONWEALTH",
    'JLN': 'JALAN',
    'NILE': 'NIEL',
}

# One row per distinct address; the row labels are those of the first
# occurrence of each address in resale_df.
unique_address_df = resale_df['ADDRESS'].drop_duplicates().to_frame()
for short_form, long_form in address_rewrites.items():
    unique_address_df['ADDRESS'] = unique_address_df['ADDRESS'].str.replace(short_form, long_form)
unique_address_df

Unnamed: 0,ADDRESS
0,"172 ANG MO KIO AVE 4, Singapore"
1,"510 ANG MO KIO AVE 8, Singapore"
2,"610 ANG MO KIO AVE 4, Singapore"
3,"474 ANG MO KIO AVE 10, Singapore"
4,"604 ANG MO KIO AVE 5, Singapore"
5,"154 ANG MO KIO AVE 5, Singapore"
6,"110 ANG MO KIO AVE 4, Singapore"
7,"445 ANG MO KIO AVE 10, Singapore"
8,"476 ANG MO KIO AVE 10, Singapore"
9,"631 ANG MO KIO AVE 4, Singapore"


# Geocoding APIs

### MapBox Geocoding

##### Initialise and test

In [13]:
# Read the access token from the environment so that it is never saved
# (and committed) together with the notebook.
mapbox_api_key = os.environ.get("MAPBOX_API_KEY")
if not mapbox_api_key:
    raise RuntimeError("Set the MAPBOX_API_KEY environment variable to your MapBox access token")

geolocator_mapbox = MapBox(api_key=mapbox_api_key, timeout=5)

# Smoke test: geocode one known address before starting the bulk run.
location = geolocator_mapbox.geocode("602C PUNGGOL CTRL, Singapore")
if location is None:
    # geocode() returns None when nothing matches; fail here with a clear
    # message instead of an AttributeError on the lines below.
    raise RuntimeError("MapBox returned no result for the test address")
print(location.point)
print(location.point.latitude)
location

1 24m 9.9576s N, 103 54m 25.7976s E
1.402766


Location(602c Punggol Central, Singapore 823602, Singapore, (1.402766, 103.907166, 0.0))

### Begin Geocoding

In [14]:
# Minimum pause between two requests. The previous value (1/5000 s) switched
# the limiter off in practice.
# NOTE(review): 0.1 s matches a 600 requests/minute quota, which is what the
# Mapbox Geocoding docs list as the default - confirm for your account/plan.
MAPBOX_MIN_DELAY_SECONDS = 0.1

mapbox_geocode_df = unique_address_df.copy()
mapbox_geocode = RateLimiter(geolocator_mapbox.geocode, min_delay_seconds=MAPBOX_MIN_DELAY_SECONDS)

# Addresses were de-duplicated before their abbreviations were rewritten, so
# the same query string can appear on several rows. Call the API once per
# distinct string and fan the answer out to every row that uses it.
geocode_by_address = {
    address: mapbox_geocode(address)
    for address in mapbox_geocode_df['ADDRESS'].unique()
}
mapbox_geocode_df['geocode'] = mapbox_geocode_df['ADDRESS'].map(geocode_by_address.get)

# Raise the display cap so every geocoded address can be inspected by eye.
pd.options.display.max_rows = 9000
mapbox_geocode_df

Unnamed: 0,ADDRESS,geocode
0,"172 ANG MO KIO AVE 4, Singapore","(172 Ang Mo Kio Avenue 4, Singapore 561172, Si..."
1,"510 ANG MO KIO AVE 8, Singapore","(510 Ang Mo Kio Avenue 8, Singapore 560510, Si..."
2,"610 ANG MO KIO AVE 4, Singapore","(610 Ang Mo Kio Avenue 4, Singapore 560610, Si..."
3,"474 ANG MO KIO AVE 10, Singapore","(474 Ang Mo Kio Avenue 10, Singapore 560474, S..."
4,"604 ANG MO KIO AVE 5, Singapore","(604 Ang Mo Kio Avenue 5, Singapore 560604, Si..."
5,"154 ANG MO KIO AVE 5, Singapore","(154 Ang Mo Kio Avenue 5, Singapore 560154, Si..."
6,"110 ANG MO KIO AVE 4, Singapore","(110 Ang Mo Kio Avenue 4, Singapore 560110, Si..."
7,"445 ANG MO KIO AVE 10, Singapore","(445 Ang Mo Kio Avenue 10, Singapore 560445, S..."
8,"476 ANG MO KIO AVE 10, Singapore","(476 Ang Mo Kio Avenue 10, Singapore 560476, S..."
9,"631 ANG MO KIO AVE 4, Singapore","(631 Ang Mo Kio Avenue 4, Singapore 560631, Si..."


#### Missing results
If things go well, the MapBox geolocator should not have returned `None` for any address.

In [50]:
# Addresses for which no geocode was obtained (expected: none).
mapbox_geocode_df.loc[mapbox_geocode_df['geocode'].isna()]

Unnamed: 0,ADDRESS,geocode


##### Bad results
Geolocators often return coordinates outside Singapore, so we should filter those results out and rectify them.
If things go well, the check below should return nothing.

In [19]:
bad_mapbox_df = mapbox_geocode_df.copy()

bad_mapbox_df.info()
# List every result whose MapBox place name does not mention Singapore.
# Lookups that returned None are skipped here (the missing-results check
# covers them); without the guard a single None would abort this cell with
# an AttributeError on `.raw`.
[
    g for g in bad_mapbox_df['geocode']
    if g is not None and 'Singapore' not in g.raw['place_name']
]

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8712 entries, 0 to 745815
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   ADDRESS  8712 non-null   object
 1   geocode  8712 non-null   object
dtypes: object(2)
memory usage: 204.2+ KB


[]

#### Merge unique addresses to original resale_df

In [40]:
# Remove a 'geocode' column left over from an earlier merge attempt, if any.
# errors='ignore' keeps this cell runnable on a fresh kernel, where resale_df
# has no such column and a plain drop would raise KeyError.
resale_df = resale_df.drop(columns=['geocode'], errors='ignore')

In [45]:
merge_address_df = mapbox_geocode_df.copy()
original_resale_df = resale_df.copy()

# Restore the join key to the exact spelling used in resale_df.
# mapbox_geocode_df still carries the row labels it inherited from
# resale_df['ADDRESS'].drop_duplicates(), so the original address can be
# looked up by label. Undoing the abbreviation rewrite with substring
# replacements is not a true inverse: any address that already contained
# 'BUKIT', 'JALAN', 'COMMONWEALTH' or 'NIEL' would be altered, and the inner
# merge would then silently drop (or duplicate) those rows.
merge_address_df['ADDRESS'] = original_resale_df.loc[merge_address_df.index, 'ADDRESS']

# validate= makes pandas raise if an address appears more than once on the
# right-hand side, which would multiply resale rows.
geocoded_resale_df = pd.merge(
    original_resale_df,
    merge_address_df,
    on=['ADDRESS'],
    how='inner',
    validate='many_to_one',
)

# Every resale record must come through the merge exactly once.
assert len(geocoded_resale_df) == len(original_resale_df), "merge lost or duplicated resale rows"

geocoded_resale_df

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price,remaining_lease,ADDRESS,geocode
0,2012-03,ANG MO KIO,2 ROOM,172,ANG MO KIO AVE 4,06 TO 10,45.0,Improved,1986,250000.0,,"172 ANG MO KIO AVE 4, Singapore","(172 Ang Mo Kio Avenue 4, Singapore 561172, Si..."
1,2012-04,ANG MO KIO,3 ROOM,172,ANG MO KIO AVE 4,06 TO 10,60.0,Improved,1986,302500.0,,"172 ANG MO KIO AVE 4, Singapore","(172 Ang Mo Kio Avenue 4, Singapore 561172, Si..."
2,2012-05,ANG MO KIO,3 ROOM,172,ANG MO KIO AVE 4,06 TO 10,60.0,Improved,1986,295000.0,,"172 ANG MO KIO AVE 4, Singapore","(172 Ang Mo Kio Avenue 4, Singapore 561172, Si..."
3,2012-06,ANG MO KIO,2 ROOM,172,ANG MO KIO AVE 4,04 TO 06,45.0,Improved,1986,259000.0,,"172 ANG MO KIO AVE 4, Singapore","(172 Ang Mo Kio Avenue 4, Singapore 561172, Si..."
4,2012-08,ANG MO KIO,2 ROOM,172,ANG MO KIO AVE 4,07 TO 09,45.0,Improved,1986,263000.0,,"172 ANG MO KIO AVE 4, Singapore","(172 Ang Mo Kio Avenue 4, Singapore 561172, Si..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
746202,2016-12,PUNGGOL,4 ROOM,273A,PUNGGOL PL,13 TO 15,93.0,Premium Apartment,2013,560000.0,95.0,"273A PUNGGOL PL, Singapore","(273a Punggol Place, Singapore 821273, Singapo..."
746203,2016-12,PUNGGOL,5 ROOM,617C,PUNGGOL DR,13 TO 15,120.0,Premium Apartment,2013,570000.0,95.0,"617C PUNGGOL DR, Singapore","(617c Punggol Drive, Singapore 823617, Singapo..."
746204,2016-12,PUNGGOL,5 ROOM,618B,PUNGGOL DR,16 TO 18,113.0,Premium Apartment,2013,538000.0,95.0,"618B PUNGGOL DR, Singapore","(618b Punggol Drive, Singapore 822618, Singapo..."
746205,2016-12,SENGKANG,4 ROOM,435A,FERNVALE RD,04 TO 06,94.0,Premium Apartment,2013,410000.0,95.0,"435A FERNVALE RD, Singapore","(435a Fernvale Road, Singapore 791435, Singapo..."


#### Missing results
If things go well, no merged row should have a missing (`None`) MapBox geocode.

In [46]:
# Merged rows that ended up without a geocode (expected: none).
geocoded_resale_df.loc[geocoded_resale_df['geocode'].isna()]

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price,remaining_lease,ADDRESS,geocode


### Finalise DataFrame for output

In [48]:
# Replace the geocode object with plain latitude / longitude columns.
# Reading the attributes directly (with a NaN fallback) also works when a row
# has no geocode; the previous tuple expansion produced a lone None for such
# rows, which cannot be spread over three columns.
latlng_resale_df = geocoded_resale_df.copy()
latlng_resale_df['latitude'] = geocoded_resale_df['geocode'].map(
    lambda loc: getattr(loc, 'latitude', float('nan'))
)
latlng_resale_df['longitude'] = geocoded_resale_df['geocode'].map(
    lambda loc: getattr(loc, 'longitude', float('nan'))
)
# The helper columns are not part of the exported data set.
latlng_resale_df = latlng_resale_df.drop(columns=['ADDRESS', 'geocode'])
latlng_resale_df

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price,remaining_lease,latitude,longitude
0,2012-03,ANG MO KIO,2 ROOM,172,ANG MO KIO AVE 4,06 TO 10,45.0,Improved,1986,250000.0,,1.374573,103.836692
1,2012-04,ANG MO KIO,3 ROOM,172,ANG MO KIO AVE 4,06 TO 10,60.0,Improved,1986,302500.0,,1.374573,103.836692
2,2012-05,ANG MO KIO,3 ROOM,172,ANG MO KIO AVE 4,06 TO 10,60.0,Improved,1986,295000.0,,1.374573,103.836692
3,2012-06,ANG MO KIO,2 ROOM,172,ANG MO KIO AVE 4,04 TO 06,45.0,Improved,1986,259000.0,,1.374573,103.836692
4,2012-08,ANG MO KIO,2 ROOM,172,ANG MO KIO AVE 4,07 TO 09,45.0,Improved,1986,263000.0,,1.374573,103.836692
...,...,...,...,...,...,...,...,...,...,...,...,...,...
746202,2016-12,PUNGGOL,4 ROOM,273A,PUNGGOL PL,13 TO 15,93.0,Premium Apartment,2013,560000.0,95.0,1.402284,103.902104
746203,2016-12,PUNGGOL,5 ROOM,617C,PUNGGOL DR,13 TO 15,120.0,Premium Apartment,2013,570000.0,95.0,1.401941,103.912461
746204,2016-12,PUNGGOL,5 ROOM,618B,PUNGGOL DR,16 TO 18,113.0,Premium Apartment,2013,538000.0,95.0,1.401458,103.912119
746205,2016-12,SENGKANG,4 ROOM,435A,FERNVALE RD,04 TO 06,94.0,Premium Apartment,2013,410000.0,95.0,1.393394,103.876290


In [49]:
# Write the geocoded records next to the notebook, without the row index.
latlng_resale_df.to_csv(path_or_buf='./resale.csv', index=False)