### Getting the MRT stations within 1km radius of HDB flat

We will first load the `hdb_working_data.csv` file

In [36]:
import sys
sys.path.append('../api')

import pandas as pd
from openstreetmap_api import get_bto_coordinates
from datetime import datetime
import geopandas as gpd
from shapely.geometry import Point

hdb_cleaned_df = pd.read_csv('../data/modified/hdb_working_data.csv')
hdb_cleaned_df.head()

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,resale_price,address,latitude,longitude,sora
0,2015-01,ANG MO KIO,3 ROOM,174,ANG MO KIO AVE 4,07 TO 09,60.0,Improved,1986,70.0,255000.0,174 ANG MO KIO AVE 4,1.375097,103.837619,0.129019
1,2015-01,ANG MO KIO,3 ROOM,541,ANG MO KIO AVE 10,01 TO 03,68.0,New Generation,1981,65.0,275000.0,541 ANG MO KIO AVE 10,1.373922,103.855621,0.129019
2,2015-01,ANG MO KIO,3 ROOM,163,ANG MO KIO AVE 4,01 TO 03,69.0,New Generation,1980,64.0,285000.0,163 ANG MO KIO AVE 4,1.373552,103.838169,0.129019
3,2015-01,ANG MO KIO,3 ROOM,446,ANG MO KIO AVE 10,01 TO 03,68.0,New Generation,1979,63.0,290000.0,446 ANG MO KIO AVE 10,1.367761,103.855357,0.129019
4,2015-01,ANG MO KIO,3 ROOM,557,ANG MO KIO AVE 10,07 TO 09,68.0,New Generation,1980,64.0,290000.0,557 ANG MO KIO AVE 10,1.371626,103.857736,0.129019


Then we load the `mrt_stations` dataset

In [37]:
mrt_stations = pd.read_csv('../data/modified/mrt_stations.csv')
mrt_stations.head()

Unnamed: 0,mrt_station_name,latitude,longitude,opening_date
0,Jurong East MRT Station,1.333333,103.742222,5 November 1988
1,Bukit Batok MRT Station,1.349167,103.749722,10 March 1990
2,Bukit Gombak MRT Station,1.358611,103.751667,10 March 1990
3,Yew Tee MRT Station,1.396986,103.747239,10 February 1996
4,Kranji MRT Station,1.425047,103.761853,10 February 1996


Next we create a method to run through the entire `mrt_stations.csv` to compare the distance between the flat and the mrt stations provided that opening date of the mrt stations is before the transaction record's date. For simplicity sake, we will assume that the transaction record is taken on the first day of the month. It will return the mrt stations that are within 1km of the flat.

In [38]:
# method to calculate the distance between two points
def calculate_geospatial_distance(start_coords, end_coords):
    
    # Unpack the tuples
    lat1, lon1 = start_coords
    lat2, lon2 = end_coords
    
    # Create GeoSeries from the points
    point1 = gpd.GeoSeries([Point(lon1, lat1)], crs="EPSG:4326")
    point2 = gpd.GeoSeries([Point(lon2, lat2)], crs="EPSG:4326")
    
    # Reproject to a CRS that uses meters as distance units (e.g., World Mercator)
    point1 = point1.to_crs("EPSG:3395")
    point2 = point2.to_crs("EPSG:3395")
    
    # Calculate the distance between the points
    distance = point1.distance(point2).iloc[0]  # distance in meters
    
    return distance/1000  # convert to kilometers

# Example usage
start_coords = (1.352083, 103.819836) 
end_coords = (1.290270, 103.851959)  

distance = calculate_geospatial_distance(start_coords, end_coords)
print(f"Distance: {distance} kilometers")

Distance: 7.715477804836227 kilometers


We noted that some HDB flats may be slightly over 1km from the nearest MRT. If that's the case then we will find the next closest MRT station from the HDB flat.

In [47]:
# create a method to run through the entire mrt_stations.csv to compare the distance between the flat and the mrt stations provided that opening date of the mrt stations is before the transaction record year
def get_mrt_stations_within_1km_and_nearest(flat_coordinates, mrt_stations_df, transaction_record_year):
    # iterate through the mrt stations dataframe
    mrt = []
    mrt_dic = {}
    for index, row in mrt_stations_df.iterrows():
        mrt_latitude, mrt_longitude = float(row['latitude']), float(row['longitude'])
        mrt_coordinates = (mrt_latitude, mrt_longitude)  # Use a tuple for coordinates
        # if mrt opening year is before the transaction record date, then we query the distance else we skip
        opening_date = row['opening_date']
        if datetime.strptime(opening_date, '%d %B %Y') < datetime.strptime(transaction_record_year, '%Y-%m-%d'):
            distance = calculate_geospatial_distance(flat_coordinates, mrt_coordinates)
            # Check if distance rounded down to nearest whole number is less than or equal to 1 km
            if distance <= 1:
                mrt.append(f"{row['mrt_station_name']} ({distance * 1000:.2f}m)")
            mrt_dic[row['mrt_station_name']] = round(distance * 1000, 2)
    # get the nearest mrt station
    nearest_mrt = min(mrt_dic, key=mrt_dic.get)
    nearest_mrt = f"{nearest_mrt} ({mrt_dic[nearest_mrt]}m)"
    return mrt, nearest_mrt

# Example usage
print(get_mrt_stations_within_1km_and_nearest((1.37509746867904, 103.83761896123), mrt_stations, '2015-01-01')) # for flat 174 ANG MO KIO AVE 4 - Mayflower MRT station only opened in 2021, so it won't be included in the list

([], 'Yio Chu Kang MRT Station (1099.56m)')


Next we create a new column in the `hdb_cleaned_df` that will store `mrt_stations_within_1km`

In [48]:
processed_flats = {} # declare this outisde so that we can use it next time in the event we exceed the API limit

# create a new column in hdb_cleaned_df mrt_stations_within_1km 
def create_mrt_stations_within_1km_column_and_nearest_mrt(hdb_cleaned_df):
    mrt_stations_within_1km = []
    nearest_mrt_station = []
    for index, row in hdb_cleaned_df.iterrows():
        # get the address
        address = row['address']
        if address in processed_flats:
            mrt_stations_within_1km.append(processed_flats[address][0])
            nearest_mrt_station.append(processed_flats[address][1])
        else:
            # get the coordinates from hdb_cleaned_df
            flat_coordinates = (float(row['latitude']), float(row['longitude']))
            transaction_record_year = row['month'] + '-01'
            stations_within_1km, nearest_station = get_mrt_stations_within_1km_and_nearest(flat_coordinates, mrt_stations, transaction_record_year)
            mrt_stations_within_1km.append(stations_within_1km)
            nearest_mrt_station.append(nearest_station)
            processed_flats[address] = (stations_within_1km, nearest_station)

    hdb_cleaned_df['mrt_stations_within_1km'] = mrt_stations_within_1km
    hdb_cleaned_df['nearest_mrt_station'] = nearest_mrt_station
    return hdb_cleaned_df

hdb_cleaned_df = create_mrt_stations_within_1km_column_and_nearest_mrt(hdb_cleaned_df)
hdb_cleaned_df.head(10)


Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,resale_price,address,latitude,longitude,sora,mrt_stations_within_1km,nearest_mrt_station
0,2015-01,ANG MO KIO,3 ROOM,174,ANG MO KIO AVE 4,07 TO 09,60.0,Improved,1986,70.0,255000.0,174 ANG MO KIO AVE 4,1.375097,103.837619,0.129019,[],Yio Chu Kang MRT Station (1099.56m)
1,2015-01,ANG MO KIO,3 ROOM,541,ANG MO KIO AVE 10,01 TO 03,68.0,New Generation,1981,65.0,275000.0,541 ANG MO KIO AVE 10,1.373922,103.855621,0.129019,[Ang Mo Kio MRT Station (811.53m)],Ang Mo Kio MRT Station (811.53m)
2,2015-01,ANG MO KIO,3 ROOM,163,ANG MO KIO AVE 4,01 TO 03,69.0,New Generation,1980,64.0,285000.0,163 ANG MO KIO AVE 4,1.373552,103.838169,0.129019,[],Yio Chu Kang MRT Station (1183.8m)
3,2015-01,ANG MO KIO,3 ROOM,446,ANG MO KIO AVE 10,01 TO 03,68.0,New Generation,1979,63.0,290000.0,446 ANG MO KIO AVE 10,1.367761,103.855357,0.129019,[Ang Mo Kio MRT Station (703.32m)],Ang Mo Kio MRT Station (703.32m)
4,2015-01,ANG MO KIO,3 ROOM,557,ANG MO KIO AVE 10,07 TO 09,68.0,New Generation,1980,64.0,290000.0,557 ANG MO KIO AVE 10,1.371626,103.857736,0.129019,[Ang Mo Kio MRT Station (939.42m)],Ang Mo Kio MRT Station (939.42m)
5,2015-01,ANG MO KIO,3 ROOM,603,ANG MO KIO AVE 5,07 TO 09,67.0,New Generation,1980,64.0,290000.0,603 ANG MO KIO AVE 5,1.380201,103.835756,0.129019,[],Yio Chu Kang MRT Station (1026.13m)
6,2015-01,ANG MO KIO,3 ROOM,709,ANG MO KIO AVE 8,01 TO 03,68.0,New Generation,1980,64.0,290000.0,709 ANG MO KIO AVE 8,1.371137,103.847662,0.129019,[Ang Mo Kio MRT Station (234.44m)],Ang Mo Kio MRT Station (234.44m)
7,2015-01,ANG MO KIO,3 ROOM,333,ANG MO KIO AVE 1,01 TO 03,68.0,New Generation,1981,65.0,293000.0,333 ANG MO KIO AVE 1,1.361343,103.851699,0.129019,[Ang Mo Kio MRT Station (991.53m)],Ang Mo Kio MRT Station (991.53m)
8,2015-01,ANG MO KIO,3 ROOM,109,ANG MO KIO AVE 4,01 TO 03,67.0,New Generation,1978,62.0,300000.0,109 ANG MO KIO AVE 4,1.370097,103.837688,0.129019,[],Ang Mo Kio MRT Station (1309.39m)
9,2015-01,ANG MO KIO,3 ROOM,564,ANG MO KIO AVE 3,13 TO 15,68.0,New Generation,1985,69.0,307500.0,564 ANG MO KIO AVE 3,1.369848,103.859404,0.129019,[],Ang Mo Kio MRT Station (1108.25m)


Note that for one HDB flat, we will need to query using the API **129** times (since we have 129 rows of mrt stations). OpenRouteService API gives us **2000** queries for querying distances per 24h, so that will allow us to process 2000/129 which is about **15** flats per account. Since we have around **7921** unique flats in the dataset, that means we need 7921/15 which is approximately **528** accounts.

That is why we chose to use geometric distance using python libraries (GeoPandas) instead of the API. The tradeoff would be that we lose out on the geospatial context.

### Getting the supply of BTO flats within 4km of the flat address in the respective time periods

In [49]:
# load bto_supply dataset
bto_supply = pd.read_csv('../data/modified/bto_supply_cleaned.csv')
bto_supply.head()

Unnamed: 0,Town name,Launch,Estimated,Number,Brochure,Unnamed: 6,BTO name,BTO,latitude,longitude
0,Yishun,24-Nov-11,1Q 2015,834,,Info Plans,Acacia Breeze,Yishun Acacia Breeze,1.418407,103.844548
1,Woodlands,22-May-14,2Q 2017,402,Brochure,Info Plans,Admiralty Flora,Woodlands Admiralty Flora,1.44063,103.806198
2,Woodlands,26-Nov-13,3Q 2017,993,Brochure,Info Plans,Admiralty Grove,Woodlands Admiralty Grove,1.43793,103.804571
3,Bukit Merah,05-Dec-23,59 months,904,Brochure,,Alexandra Peaks,Bukit Merah Alexandra Peaks,1.291536,103.821537
4,Bukit Merah,30-Aug-22,2Q 2028,782,Brochure,,Alexandra Vale,Bukit Merah Alexandra Vale,1.290035,103.818436


In [50]:
get_bto_coordinates('Yishun Acacia Breeze')

('1.4184874', '103.84414047972172')

Now we do the same thing as the MRTs but this time identify the BTOs within 4km of the flat in that period of time - we will compare the transaction record with the launch date of the BTO.

For the outputs, we will get the number of BTOs within a 4km radius and the total supply of BTO units in the vicinity.

In [51]:
def get_btos_within_4km(flat_coordinates, bto_supply_df, transaction_record_year):
    btos = []
    overall_supply = 0
    for index, row in bto_supply_df.iterrows():
        bto = row['BTO name']
        launch_date = row['Launch']
        supply = int(row['Number'])
        bto_latitude, bto_longitude = float(row['latitude']), float(row['longitude'])
        coordinates = (bto_latitude, bto_longitude)
        # compare the launch date with the transaction record year
        if datetime.strptime(launch_date, '%d-%b-%y') < datetime.strptime(transaction_record_year, '%Y-%m-%d'):
            distance = calculate_geospatial_distance(flat_coordinates, coordinates)
            if round(distance) <= 4:
                btos.append(bto)
                overall_supply += supply
    return len(btos), overall_supply

In [52]:
def create_bto_within_4km_supply_columns(hdb_cleaned_df, bto_supply_df):
    processed_flats = {}
    bto_within_4km = []
    bto_supply = []
    for index, row in hdb_cleaned_df.iterrows():
        address = row['address']
        if address in processed_flats:
            bto_within_4km.append(processed_flats[address][0])
            bto_supply.append(processed_flats[address][1])
        else:
            flat_coordinates = (float(row['latitude']), float(row['longitude']))
            transaction_record_year = row['month'] + '-01'
            num_btos, supply = get_btos_within_4km(flat_coordinates, bto_supply_df, transaction_record_year)
            bto_within_4km.append(num_btos)
            bto_supply.append(supply)
            processed_flats[address] = (num_btos, supply)
    
    hdb_cleaned_df['bto_within_4km'] = bto_within_4km
    hdb_cleaned_df['bto_supply_within_4km'] = bto_supply
    return hdb_cleaned_df

hdb_cleaned_df = create_bto_within_4km_supply_columns(hdb_cleaned_df, bto_supply)
hdb_cleaned_df.head(10)

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,resale_price,address,latitude,longitude,sora,mrt_stations_within_1km,nearest_mrt_station,bto_within_4km,bto_supply_within_4km
0,2015-01,ANG MO KIO,3 ROOM,174,ANG MO KIO AVE 4,07 TO 09,60.0,Improved,1986,70.0,255000.0,174 ANG MO KIO AVE 4,1.375097,103.837619,0.129019,[],Yio Chu Kang MRT Station (1099.56m),12,6587
1,2015-01,ANG MO KIO,3 ROOM,541,ANG MO KIO AVE 10,01 TO 03,68.0,New Generation,1981,65.0,275000.0,541 ANG MO KIO AVE 10,1.373922,103.855621,0.129019,[Ang Mo Kio MRT Station (811.53m)],Ang Mo Kio MRT Station (811.53m),39,23252
2,2015-01,ANG MO KIO,3 ROOM,163,ANG MO KIO AVE 4,01 TO 03,69.0,New Generation,1980,64.0,285000.0,163 ANG MO KIO AVE 4,1.373552,103.838169,0.129019,[],Yio Chu Kang MRT Station (1183.8m),10,4941
3,2015-01,ANG MO KIO,3 ROOM,446,ANG MO KIO AVE 10,01 TO 03,68.0,New Generation,1979,63.0,290000.0,446 ANG MO KIO AVE 10,1.367761,103.855357,0.129019,[Ang Mo Kio MRT Station (703.32m)],Ang Mo Kio MRT Station (703.32m),34,20043
4,2015-01,ANG MO KIO,3 ROOM,557,ANG MO KIO AVE 10,07 TO 09,68.0,New Generation,1980,64.0,290000.0,557 ANG MO KIO AVE 10,1.371626,103.857736,0.129019,[Ang Mo Kio MRT Station (939.42m)],Ang Mo Kio MRT Station (939.42m),45,26356
5,2015-01,ANG MO KIO,3 ROOM,603,ANG MO KIO AVE 5,07 TO 09,67.0,New Generation,1980,64.0,290000.0,603 ANG MO KIO AVE 5,1.380201,103.835756,0.129019,[],Yio Chu Kang MRT Station (1026.13m),16,9077
6,2015-01,ANG MO KIO,3 ROOM,709,ANG MO KIO AVE 8,01 TO 03,68.0,New Generation,1980,64.0,290000.0,709 ANG MO KIO AVE 8,1.371137,103.847662,0.129019,[Ang Mo Kio MRT Station (234.44m)],Ang Mo Kio MRT Station (234.44m),27,14734
7,2015-01,ANG MO KIO,3 ROOM,333,ANG MO KIO AVE 1,01 TO 03,68.0,New Generation,1981,65.0,293000.0,333 ANG MO KIO AVE 1,1.361343,103.851699,0.129019,[Ang Mo Kio MRT Station (991.53m)],Ang Mo Kio MRT Station (991.53m),28,14701
8,2015-01,ANG MO KIO,3 ROOM,109,ANG MO KIO AVE 4,01 TO 03,67.0,New Generation,1978,62.0,300000.0,109 ANG MO KIO AVE 4,1.370097,103.837688,0.129019,[],Ang Mo Kio MRT Station (1309.39m),8,3717
9,2015-01,ANG MO KIO,3 ROOM,564,ANG MO KIO AVE 3,13 TO 15,68.0,New Generation,1985,69.0,307500.0,564 ANG MO KIO AVE 3,1.369848,103.859404,0.129019,[],Ang Mo Kio MRT Station (1108.25m),46,27317


In [53]:
# check bto_supply_within_4km column for any rows with 0 supply
hdb_na = hdb_cleaned_df[hdb_cleaned_df['bto_supply_within_4km'] == 0]
hdb_na

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,resale_price,address,latitude,longitude,sora,mrt_stations_within_1km,nearest_mrt_station,bto_within_4km,bto_supply_within_4km


In [54]:
# save the dataframe to csv 
hdb_cleaned_df.to_csv('../data/modified/hdb_working_data.csv', index=False)