# Table of Contents
01. Import Libraries
02. Import Data
03. Change Date Columns From String To Datetime
04. Convert Data to GeoData
05. Change CRS from EPSG:4326 (longitude & latitude) to EPSG:3414 (projected planar x and y coordinates for Singapore)
06. Fill N/A Train Station Closure Dates with Dummies
07. Reduce Granularity for Train Station Opening and Closure Dates
08. Get Nearest Train Station and Its Distance for Each Flat
09. Data Validation
10. Combine The Result of Nearest Train Station into Main Dataframe
11. Export Data

# 01. Import Libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import geopandas as gpd
import os

In [2]:
# Adjust setting to allow seeing all the rows in the output of this notebook
pd.options.display.max_rows = None

In [3]:
# Adjust setting to allow seeing all the columns in the output of this notebook
pd.options.display.max_columns = None

# 02. Import Data

In [4]:
# Define the main project folder path
# Set the HDB_PROJECT_PATH environment variable to run this notebook on another machine;
# when it is not set, the original local project folder is used as the default.
path = os.environ.get('HDB_PROJECT_PATH', r'C:\Users\saich\Desktop\CareerFoundry\Data Immersion\Achievement 6 Advanced Analytics & Dashboard Design\11-2023 HDB Flat Resale Analysis')

In [5]:
# 1) 'flat_resale' 

# Import 'flat_resale_all_w_coord.csv' from 'Prepared Data' folder
flat_resale = pd.read_csv(os.path.join(path, '02 Data', 'Prepared Data', 'flat_resale_all_w_coord.csv'), index_col = 0)

In [6]:
flat_resale.head()

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price,remaining_lease,row_id,address,latitude,longitude
0,1990-01-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,10 TO 12,31.0,Improved,1977,9000.0,86,0,309 ANG MO KIO AVE 1,1.365517,103.843968
1,1990-01-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,04 TO 06,31.0,Improved,1977,6000.0,86,1,309 ANG MO KIO AVE 1,1.365517,103.843968
2,1990-01-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,10 TO 12,31.0,Improved,1977,8000.0,86,2,309 ANG MO KIO AVE 1,1.365517,103.843968
3,1990-01-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,07 TO 09,31.0,Improved,1977,6000.0,86,3,309 ANG MO KIO AVE 1,1.365517,103.843968
4,1990-01-01,ANG MO KIO,3 ROOM,216,ANG MO KIO AVE 1,04 TO 06,73.0,New Generation,1976,47200.0,85,4,216 ANG MO KIO AVE 1,1.366197,103.841505


In [7]:
flat_resale.shape

(915371, 15)

In [8]:
# 2) 'train_station'

# Import 'train_station_w_coord.csv' from 'Prepared Data' folder
train_station = pd.read_csv(os.path.join(path, '02 Data', 'Prepared Data', 'train_station_w_coord.csv'), index_col = 0)

In [9]:
train_station.head()

Unnamed: 0,station_code,station_name,line,opening,closure,station_name_w_code,latitude,longitude
0,NS1,Jurong East,North-South Line,1990-03-10,,Jurong East MRT Station (NS1),1.333153,103.742286
1,NS2,Bukit Batok,North-South Line,1990-03-10,,Bukit Batok MRT Station (NS2),1.349033,103.749566
2,NS3,Bukit Gombak,North-South Line,1990-03-10,,Bukit Gombak MRT Station (NS3),1.358612,103.751791
3,NS4,Choa Chu Kang,North-South Line,1990-03-10,,Choa Chu Kang MRT Station (NS4),1.385363,103.744371
4,NS5,Yew Tee,North-South Line,1996-02-10,,Yew Tee MRT Station (NS5),1.397535,103.747405


In [10]:
train_station.shape

(204, 8)

# 03. Change Date Columns From String To Datetime

In [11]:
flat_resale['month'] = pd.to_datetime(flat_resale['month'])

In [12]:
train_station['opening'] = pd.to_datetime(train_station['opening'])

In [13]:
train_station['closure'] = pd.to_datetime(train_station['closure'])

# 04. Convert Data to GeoData

In [14]:
# 1) 'flat_resale'

type(flat_resale)

pandas.core.frame.DataFrame

In [15]:
# Create geometry for 'flat_resale' with its longitude and latitude 

# geopandas.points_from_xy() - Generate GeometryArray of shapely Point geometries from x, y(, z) coordinates.
# In case of geographic coordinates, it is assumed that longitude is captured by x coordinates and latitude by y.

flat_resale_geometry = gpd.points_from_xy(flat_resale['longitude'], flat_resale['latitude'])

In [16]:
# Convert 'flat_resale' into geodataframe
flat_resale_gdf = gpd.GeoDataFrame(flat_resale, geometry = flat_resale_geometry, crs = 'EPSG:4326')
# EPSG:4326 is a coordinate system that defines latitude and longitude coordinates. 

In [17]:
type(flat_resale_gdf)

geopandas.geodataframe.GeoDataFrame

In [18]:
flat_resale_gdf.head()

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price,remaining_lease,row_id,address,latitude,longitude,geometry
0,1990-01-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,10 TO 12,31.0,Improved,1977,9000.0,86,0,309 ANG MO KIO AVE 1,1.365517,103.843968,POINT (103.84397 1.36552)
1,1990-01-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,04 TO 06,31.0,Improved,1977,6000.0,86,1,309 ANG MO KIO AVE 1,1.365517,103.843968,POINT (103.84397 1.36552)
2,1990-01-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,10 TO 12,31.0,Improved,1977,8000.0,86,2,309 ANG MO KIO AVE 1,1.365517,103.843968,POINT (103.84397 1.36552)
3,1990-01-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,07 TO 09,31.0,Improved,1977,6000.0,86,3,309 ANG MO KIO AVE 1,1.365517,103.843968,POINT (103.84397 1.36552)
4,1990-01-01,ANG MO KIO,3 ROOM,216,ANG MO KIO AVE 1,04 TO 06,73.0,New Generation,1976,47200.0,85,4,216 ANG MO KIO AVE 1,1.366197,103.841505,POINT (103.84151 1.36620)


In [19]:
# Check the crs (Coordinate Reference System) of 'flat_resale_gdf'
flat_resale_gdf.crs

<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: World.
- bounds: (-180.0, -90.0, 180.0, 90.0)
Datum: World Geodetic System 1984 ensemble
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

In [20]:
# 2) 'train_station'

type(train_station)

pandas.core.frame.DataFrame

In [21]:
# Create geometry for 'train_station' with its longitude and latitude 

# geopandas.points_from_xy() - Generate GeometryArray of shapely Point geometries from x, y(, z) coordinates.
# In case of geographic coordinates, it is assumed that longitude is captured by x coordinates and latitude by y.

train_station_geometry = gpd.points_from_xy(train_station['longitude'], train_station['latitude'])

In [22]:
# Convert 'train_station' into geodataframe
train_station_gdf = gpd.GeoDataFrame(train_station, geometry = train_station_geometry, crs = 'EPSG:4326')
# EPSG:4326 is a coordinate system that defines latitude and longitude coordinates. 

In [23]:
type(train_station_gdf)

geopandas.geodataframe.GeoDataFrame

In [24]:
train_station_gdf.head()

Unnamed: 0,station_code,station_name,line,opening,closure,station_name_w_code,latitude,longitude,geometry
0,NS1,Jurong East,North-South Line,1990-03-10,NaT,Jurong East MRT Station (NS1),1.333153,103.742286,POINT (103.74229 1.33315)
1,NS2,Bukit Batok,North-South Line,1990-03-10,NaT,Bukit Batok MRT Station (NS2),1.349033,103.749566,POINT (103.74957 1.34903)
2,NS3,Bukit Gombak,North-South Line,1990-03-10,NaT,Bukit Gombak MRT Station (NS3),1.358612,103.751791,POINT (103.75179 1.35861)
3,NS4,Choa Chu Kang,North-South Line,1990-03-10,NaT,Choa Chu Kang MRT Station (NS4),1.385363,103.744371,POINT (103.74437 1.38536)
4,NS5,Yew Tee,North-South Line,1996-02-10,NaT,Yew Tee MRT Station (NS5),1.397535,103.747405,POINT (103.74741 1.39754)


In [25]:
# Check the crs (Coordinate Reference System) of 'train_station_gdf'
train_station_gdf.crs

<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: World.
- bounds: (-180.0, -90.0, 180.0, 90.0)
Datum: World Geodetic System 1984 ensemble
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

# 05. Change CRS from EPSG:4326 (longitude & latitude) to EPSG:3414 (projected planar x and y coordinates for Singapore)

Localised datums (such as SVY21 in Singapore's case) can provide a more accurate representation of the area than the global WGS 84 datum. <br>
https://epsg.io/?q=Singapore <br>
https://app.sla.gov.sg/sirent/About/PlaneCoordinateSystem

In [26]:
# 1) 'flat_resale_gdf'

# geopandas.to_crs(): Transform geometries to a new coordinate reference system.
flat_resale_gdf_reprojected = flat_resale_gdf.to_crs('EPSG:3414')

In [27]:
flat_resale_gdf_reprojected.head()

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price,remaining_lease,row_id,address,latitude,longitude,geometry
0,1990-01-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,10 TO 12,31.0,Improved,1977,9000.0,86,0,309 ANG MO KIO AVE 1,1.365517,103.843968,POINT (29185.176 38617.449)
1,1990-01-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,04 TO 06,31.0,Improved,1977,6000.0,86,1,309 ANG MO KIO AVE 1,1.365517,103.843968,POINT (29185.176 38617.449)
2,1990-01-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,10 TO 12,31.0,Improved,1977,8000.0,86,2,309 ANG MO KIO AVE 1,1.365517,103.843968,POINT (29185.176 38617.449)
3,1990-01-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,07 TO 09,31.0,Improved,1977,6000.0,86,3,309 ANG MO KIO AVE 1,1.365517,103.843968,POINT (29185.176 38617.449)
4,1990-01-01,ANG MO KIO,3 ROOM,216,ANG MO KIO AVE 1,04 TO 06,73.0,New Generation,1976,47200.0,85,4,216 ANG MO KIO AVE 1,1.366197,103.841505,POINT (28911.052 38692.617)


In [28]:
flat_resale_gdf_reprojected.crs

<Projected CRS: EPSG:3414>
Name: SVY21 / Singapore TM
Axis Info [cartesian]:
- N[north]: Northing (metre)
- E[east]: Easting (metre)
Area of Use:
- name: Singapore - onshore and offshore.
- bounds: (103.59, 1.13, 104.07, 1.47)
Coordinate Operation:
- name: Singapore Transverse Mercator
- method: Transverse Mercator
Datum: SVY21
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

In [29]:
# 2) 'train_station_gdf'

# geopandas.to_crs(): Transform geometries to a new coordinate reference system.
train_station_gdf_reprojected = train_station_gdf.to_crs('EPSG:3414')

In [30]:
train_station_gdf_reprojected.head()

Unnamed: 0,station_code,station_name,line,opening,closure,station_name_w_code,latitude,longitude,geometry
0,NS1,Jurong East,North-South Line,1990-03-10,NaT,Jurong East MRT Station (NS1),1.333153,103.742286,POINT (17869.057 35038.969)
1,NS2,Bukit Batok,North-South Line,1990-03-10,NaT,Bukit Batok MRT Station (NS2),1.349033,103.749566,POINT (18679.322 36794.926)
2,NS3,Bukit Gombak,North-South Line,1990-03-10,NaT,Bukit Gombak MRT Station (NS3),1.358612,103.751791,POINT (18926.913 37854.036)
3,NS4,Choa Chu Kang,North-South Line,1990-03-10,NaT,Choa Chu Kang MRT Station (NS4),1.385363,103.744371,POINT (18101.247 40812.122)
4,NS5,Yew Tee,North-South Line,1996-02-10,NaT,Yew Tee MRT Station (NS5),1.397535,103.747405,POINT (18438.983 42158.018)


In [31]:
train_station_gdf_reprojected.crs

<Projected CRS: EPSG:3414>
Name: SVY21 / Singapore TM
Axis Info [cartesian]:
- N[north]: Northing (metre)
- E[east]: Easting (metre)
Area of Use:
- name: Singapore - onshore and offshore.
- bounds: (103.59, 1.13, 104.07, 1.47)
Coordinate Operation:
- name: Singapore Transverse Mercator
- method: Transverse Mercator
Datum: SVY21
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

# 06. Fill N/A Train Station Closure Dates with Dummies

This avoids the need for additional handling of N/A dates (NaT) later. 

In [32]:
# Check the number of N/A dates in 'closure' column
train_station_gdf_reprojected['closure'].value_counts(dropna = False)

closure
NaT           203
2019-01-13      1
Name: count, dtype: int64

In [33]:
# Create a column 'closure_w_dummy' that equals to 'closure'
train_station_gdf_reprojected['closure_w_dummy'] = train_station_gdf_reprojected['closure']

In [34]:
# Fill the N/A dates with a dummy date (2099-01-01)
train_station_gdf_reprojected['closure_w_dummy'] = train_station_gdf_reprojected['closure_w_dummy'].fillna('2099-01-01')

In [35]:
# Verify the changes
train_station_gdf_reprojected.head()

Unnamed: 0,station_code,station_name,line,opening,closure,station_name_w_code,latitude,longitude,geometry,closure_w_dummy
0,NS1,Jurong East,North-South Line,1990-03-10,NaT,Jurong East MRT Station (NS1),1.333153,103.742286,POINT (17869.057 35038.969),2099-01-01
1,NS2,Bukit Batok,North-South Line,1990-03-10,NaT,Bukit Batok MRT Station (NS2),1.349033,103.749566,POINT (18679.322 36794.926),2099-01-01
2,NS3,Bukit Gombak,North-South Line,1990-03-10,NaT,Bukit Gombak MRT Station (NS3),1.358612,103.751791,POINT (18926.913 37854.036),2099-01-01
3,NS4,Choa Chu Kang,North-South Line,1990-03-10,NaT,Choa Chu Kang MRT Station (NS4),1.385363,103.744371,POINT (18101.247 40812.122),2099-01-01
4,NS5,Yew Tee,North-South Line,1996-02-10,NaT,Yew Tee MRT Station (NS5),1.397535,103.747405,POINT (18438.983 42158.018),2099-01-01


In [36]:
train_station_gdf_reprojected['closure_w_dummy'].value_counts(dropna = False)

closure_w_dummy
2099-01-01    203
2019-01-13      1
Name: count, dtype: int64

In [37]:
train_station_gdf_reprojected['closure_w_dummy'].dtype

dtype('<M8[ns]')

# 07. Reduce Granularity for Train Station Opening and Closure Dates

<b>Reduce the date granularity of 'train_station' from day (yyyy-mm-dd) to month (yyyy-mm-01) because the date granularity in 'flat_resale' is up to month only. </b><br>
If the date granularity is not reduced, the flat filtering in section 08 below will be inaccurate. 

Example: <br>
<u>Before</u> <br>
Date: <b>1990-03-10</b> (only elements earlier than this date are included) <br>
Train station: from 1987-11-07 to 1989-12-16 (40 in total) <br>
Flat: from 1990-01-01 to 1990-03-01 (3588 in total) <br>
Flats resold in March 1990 (stored as 1990-03-01) should be matched against train stations opened up to 1990-03-10, but that is not the case here.

<u>After</u> <br>
Date: <b>1990-03-01</b> (only elements earlier than this date are included) <br>
Train station: from 1987-11-07 to 1989-12-16 (40 in total) <br>
Flat: from 1990-01-01 to 1990-02-01 (2438 in total) <br>
After the change, flats resold in March 1990 (stored as 1990-03-01) are no longer included in this timeframe.

Next date: <b>1990-07-01</b> (only elements earlier than this date are included) <br>
Train station: from 1987-11-07 to 1990-03-10 (44 in total) <br>
Flat: from 1990-03-01 to 1990-06-01 (3799 in total) <br>
Now flats resold in March 1990 (stored as 1990-03-01) are matched against train stations opened up to 1990-03-10. 

In [38]:
# 1) train_station_gdf_reprojected['opening']

# Extract the year and month from existing opening dates as 'YYYY-MM' strings and store into new column 'opening_yymm'
# The vectorised .dt accessor replaces the row-wise apply/lambda; unlike calling strftime on each value,
# it does not raise when a date is missing (NaT).
train_station_gdf_reprojected['opening_yymm'] = train_station_gdf_reprojected['opening'].dt.strftime('%Y-%m')

In [39]:
train_station_gdf_reprojected.head()

Unnamed: 0,station_code,station_name,line,opening,closure,station_name_w_code,latitude,longitude,geometry,closure_w_dummy,opening_yymm
0,NS1,Jurong East,North-South Line,1990-03-10,NaT,Jurong East MRT Station (NS1),1.333153,103.742286,POINT (17869.057 35038.969),2099-01-01,1990-03
1,NS2,Bukit Batok,North-South Line,1990-03-10,NaT,Bukit Batok MRT Station (NS2),1.349033,103.749566,POINT (18679.322 36794.926),2099-01-01,1990-03
2,NS3,Bukit Gombak,North-South Line,1990-03-10,NaT,Bukit Gombak MRT Station (NS3),1.358612,103.751791,POINT (18926.913 37854.036),2099-01-01,1990-03
3,NS4,Choa Chu Kang,North-South Line,1990-03-10,NaT,Choa Chu Kang MRT Station (NS4),1.385363,103.744371,POINT (18101.247 40812.122),2099-01-01,1990-03
4,NS5,Yew Tee,North-South Line,1996-02-10,NaT,Yew Tee MRT Station (NS5),1.397535,103.747405,POINT (18438.983 42158.018),2099-01-01,1996-02


In [40]:
# The extracted value is in string form (Object)
train_station_gdf_reprojected['opening_yymm'].dtype

dtype('O')

In [41]:
# Convert 'opening_yymm' into datetime
train_station_gdf_reprojected['opening_yymm'] = pd.to_datetime(train_station_gdf_reprojected['opening_yymm'])

In [42]:
train_station_gdf_reprojected['opening_yymm'].dtype

dtype('<M8[ns]')

In [43]:
train_station_gdf_reprojected.head()

Unnamed: 0,station_code,station_name,line,opening,closure,station_name_w_code,latitude,longitude,geometry,closure_w_dummy,opening_yymm
0,NS1,Jurong East,North-South Line,1990-03-10,NaT,Jurong East MRT Station (NS1),1.333153,103.742286,POINT (17869.057 35038.969),2099-01-01,1990-03-01
1,NS2,Bukit Batok,North-South Line,1990-03-10,NaT,Bukit Batok MRT Station (NS2),1.349033,103.749566,POINT (18679.322 36794.926),2099-01-01,1990-03-01
2,NS3,Bukit Gombak,North-South Line,1990-03-10,NaT,Bukit Gombak MRT Station (NS3),1.358612,103.751791,POINT (18926.913 37854.036),2099-01-01,1990-03-01
3,NS4,Choa Chu Kang,North-South Line,1990-03-10,NaT,Choa Chu Kang MRT Station (NS4),1.385363,103.744371,POINT (18101.247 40812.122),2099-01-01,1990-03-01
4,NS5,Yew Tee,North-South Line,1996-02-10,NaT,Yew Tee MRT Station (NS5),1.397535,103.747405,POINT (18438.983 42158.018),2099-01-01,1996-02-01


In [44]:
# Verify the changes
train_station_gdf_reprojected['opening_yymm'].value_counts(dropna = False)

opening_yymm
2017-10-01    16
1999-11-01    14
2003-06-01    14
2011-10-01    12
2015-12-01    12
2010-04-01    11
2022-11-01    11
1987-12-01    11
2005-01-01    11
1989-11-01    10
1988-03-01     6
2021-08-01     6
2013-12-01     6
2003-01-01     6
1996-02-01     6
1987-11-01     5
2009-05-01     5
1990-03-01     4
2017-06-01     4
1989-12-01     3
2020-01-01     3
1988-11-01     3
2014-06-01     3
2012-01-01     2
1988-12-01     2
2009-02-01     2
2011-06-01     2
2002-02-01     1
2001-10-01     1
2006-01-01     1
2014-11-01     1
2001-01-01     1
1990-07-01     1
2007-11-01     1
2015-06-01     1
2019-11-01     1
2007-06-01     1
2016-02-01     1
2016-12-01     1
2017-03-01     1
2013-01-01     1
Name: count, dtype: int64

In [45]:
train_station_gdf_reprojected['opening'].value_counts(dropna = False)

opening
2017-10-21    16
1999-11-06    14
2003-06-20    14
2011-10-08    12
2015-12-27    12
2010-04-17    11
2022-11-13    11
1987-12-12    11
2005-01-29    11
1989-11-04    10
1988-03-12     6
2021-08-28     6
2013-12-22     6
2003-01-18     6
1996-02-10     6
1987-11-07     5
2009-05-28     5
1990-03-10     4
2017-06-18     4
1989-12-16     3
2020-01-31     3
1988-11-05     3
2014-06-29     3
2012-01-14     2
1988-12-20     2
2009-02-28     2
2011-06-20     2
2002-02-08     1
2001-10-18     1
2006-01-15     1
2014-11-23     1
2001-01-10     1
1990-07-06     1
2007-11-15     1
2015-06-27     1
2019-11-02     1
2007-06-15     1
2016-02-29     1
2016-12-29     1
2017-03-31     1
2013-01-01     1
Name: count, dtype: int64

In [46]:
# 2) train_station_gdf_reprojected['closure_w_dummy']

# Extract the year and month from existing closure dates (with dummy) as 'YYYY-MM' strings and store into new column 'closure_w_dummy_yymm'
# The vectorised .dt accessor replaces the row-wise apply/lambda; unlike calling strftime on each value,
# it does not raise when a date is missing (NaT).
train_station_gdf_reprojected['closure_w_dummy_yymm'] = train_station_gdf_reprojected['closure_w_dummy'].dt.strftime('%Y-%m')

In [47]:
# The extracted value is in string form (Object)
train_station_gdf_reprojected['closure_w_dummy_yymm'].dtype

dtype('O')

In [48]:
# Convert 'closure_w_dummy_yymm' into datetime
train_station_gdf_reprojected['closure_w_dummy_yymm'] = pd.to_datetime(train_station_gdf_reprojected['closure_w_dummy_yymm'])

In [49]:
train_station_gdf_reprojected['closure_w_dummy_yymm'].dtype

dtype('<M8[ns]')

In [50]:
train_station_gdf_reprojected.head()

Unnamed: 0,station_code,station_name,line,opening,closure,station_name_w_code,latitude,longitude,geometry,closure_w_dummy,opening_yymm,closure_w_dummy_yymm
0,NS1,Jurong East,North-South Line,1990-03-10,NaT,Jurong East MRT Station (NS1),1.333153,103.742286,POINT (17869.057 35038.969),2099-01-01,1990-03-01,2099-01-01
1,NS2,Bukit Batok,North-South Line,1990-03-10,NaT,Bukit Batok MRT Station (NS2),1.349033,103.749566,POINT (18679.322 36794.926),2099-01-01,1990-03-01,2099-01-01
2,NS3,Bukit Gombak,North-South Line,1990-03-10,NaT,Bukit Gombak MRT Station (NS3),1.358612,103.751791,POINT (18926.913 37854.036),2099-01-01,1990-03-01,2099-01-01
3,NS4,Choa Chu Kang,North-South Line,1990-03-10,NaT,Choa Chu Kang MRT Station (NS4),1.385363,103.744371,POINT (18101.247 40812.122),2099-01-01,1990-03-01,2099-01-01
4,NS5,Yew Tee,North-South Line,1996-02-10,NaT,Yew Tee MRT Station (NS5),1.397535,103.747405,POINT (18438.983 42158.018),2099-01-01,1996-02-01,2099-01-01


In [51]:
# Verify the changes
train_station_gdf_reprojected['closure_w_dummy_yymm'].value_counts(dropna = False)

closure_w_dummy_yymm
2099-01-01    203
2019-01-01      1
Name: count, dtype: int64

In [52]:
train_station_gdf_reprojected['closure_w_dummy'].value_counts(dropna = False)

closure_w_dummy
2099-01-01    203
2019-01-13      1
Name: count, dtype: int64

# 08. Get Nearest Train Station and Its Distance for Each Flat

### 08.1 Create a date filter list based on train station opening and closure dates

This filter will be used later to create smaller dataframes in searching nearest train stations, improving the code efficiency. 

In [53]:
# 1) List of opening dates (reduced granularity)
opening_date_list = train_station_gdf_reprojected['opening_yymm'].tolist()

In [54]:
len(opening_date_list)

204

In [55]:
# Get the unique elements in a list using set(), then convert the set into list using list()
opening_date_list = list(set(opening_date_list))

In [56]:
len(opening_date_list)

41

In [57]:
# 2) List of closure dates, including the 2099-01-01 dummy (reduced granularity)
closure_date_list = train_station_gdf_reprojected['closure_w_dummy_yymm'].tolist()

In [58]:
len(closure_date_list)

204

In [59]:
# Get the unique elements in a list using set(), then convert the set into list using list()
closure_date_list = list(set(closure_date_list))

In [60]:
len(closure_date_list)

2

In [61]:
# 3) Combine 'opening_date_list' and 'closure_date_list'
date_filter_list = opening_date_list + closure_date_list

In [62]:
len(date_filter_list)

43

In [63]:
date_filter_list

[Timestamp('2006-01-01 00:00:00'),
 Timestamp('1988-12-01 00:00:00'),
 Timestamp('2016-12-01 00:00:00'),
 Timestamp('2013-12-01 00:00:00'),
 Timestamp('2009-05-01 00:00:00'),
 Timestamp('2003-06-01 00:00:00'),
 Timestamp('2016-02-01 00:00:00'),
 Timestamp('1989-12-01 00:00:00'),
 Timestamp('1988-11-01 00:00:00'),
 Timestamp('2014-11-01 00:00:00'),
 Timestamp('2017-06-01 00:00:00'),
 Timestamp('2011-06-01 00:00:00'),
 Timestamp('2017-03-01 00:00:00'),
 Timestamp('1999-11-01 00:00:00'),
 Timestamp('2022-11-01 00:00:00'),
 Timestamp('2015-06-01 00:00:00'),
 Timestamp('2009-02-01 00:00:00'),
 Timestamp('2001-10-01 00:00:00'),
 Timestamp('2011-10-01 00:00:00'),
 Timestamp('1987-11-01 00:00:00'),
 Timestamp('2015-12-01 00:00:00'),
 Timestamp('2003-01-01 00:00:00'),
 Timestamp('2020-01-01 00:00:00'),
 Timestamp('1987-12-01 00:00:00'),
 Timestamp('2019-11-01 00:00:00'),
 Timestamp('2010-04-01 00:00:00'),
 Timestamp('2005-01-01 00:00:00'),
 Timestamp('1996-02-01 00:00:00'),
 Timestamp('2014-06-

In [64]:
# 4) Sort the elements in the list in ascending order
date_filter_list.sort()

In [65]:
date_filter_list

[Timestamp('1987-11-01 00:00:00'),
 Timestamp('1987-12-01 00:00:00'),
 Timestamp('1988-03-01 00:00:00'),
 Timestamp('1988-11-01 00:00:00'),
 Timestamp('1988-12-01 00:00:00'),
 Timestamp('1989-11-01 00:00:00'),
 Timestamp('1989-12-01 00:00:00'),
 Timestamp('1990-03-01 00:00:00'),
 Timestamp('1990-07-01 00:00:00'),
 Timestamp('1996-02-01 00:00:00'),
 Timestamp('1999-11-01 00:00:00'),
 Timestamp('2001-01-01 00:00:00'),
 Timestamp('2001-10-01 00:00:00'),
 Timestamp('2002-02-01 00:00:00'),
 Timestamp('2003-01-01 00:00:00'),
 Timestamp('2003-06-01 00:00:00'),
 Timestamp('2005-01-01 00:00:00'),
 Timestamp('2006-01-01 00:00:00'),
 Timestamp('2007-06-01 00:00:00'),
 Timestamp('2007-11-01 00:00:00'),
 Timestamp('2009-02-01 00:00:00'),
 Timestamp('2009-05-01 00:00:00'),
 Timestamp('2010-04-01 00:00:00'),
 Timestamp('2011-06-01 00:00:00'),
 Timestamp('2011-10-01 00:00:00'),
 Timestamp('2012-01-01 00:00:00'),
 Timestamp('2013-01-01 00:00:00'),
 Timestamp('2013-12-01 00:00:00'),
 Timestamp('2014-06-

In [66]:
# 5) Remove the dummy date (2099-01-01) from 'date_filter_list'
# Filtering (instead of list.remove) keeps this cell idempotent: re-running it after the dummy
# has already been removed no longer raises ValueError.
dummy_closure_date = pd.Timestamp('2099-01-01')
date_filter_list = [filter_date for filter_date in date_filter_list if filter_date != dummy_closure_date]

In [67]:
date_filter_list

[Timestamp('1987-11-01 00:00:00'),
 Timestamp('1987-12-01 00:00:00'),
 Timestamp('1988-03-01 00:00:00'),
 Timestamp('1988-11-01 00:00:00'),
 Timestamp('1988-12-01 00:00:00'),
 Timestamp('1989-11-01 00:00:00'),
 Timestamp('1989-12-01 00:00:00'),
 Timestamp('1990-03-01 00:00:00'),
 Timestamp('1990-07-01 00:00:00'),
 Timestamp('1996-02-01 00:00:00'),
 Timestamp('1999-11-01 00:00:00'),
 Timestamp('2001-01-01 00:00:00'),
 Timestamp('2001-10-01 00:00:00'),
 Timestamp('2002-02-01 00:00:00'),
 Timestamp('2003-01-01 00:00:00'),
 Timestamp('2003-06-01 00:00:00'),
 Timestamp('2005-01-01 00:00:00'),
 Timestamp('2006-01-01 00:00:00'),
 Timestamp('2007-06-01 00:00:00'),
 Timestamp('2007-11-01 00:00:00'),
 Timestamp('2009-02-01 00:00:00'),
 Timestamp('2009-05-01 00:00:00'),
 Timestamp('2010-04-01 00:00:00'),
 Timestamp('2011-06-01 00:00:00'),
 Timestamp('2011-10-01 00:00:00'),
 Timestamp('2012-01-01 00:00:00'),
 Timestamp('2013-01-01 00:00:00'),
 Timestamp('2013-12-01 00:00:00'),
 Timestamp('2014-06-

In [68]:
len(date_filter_list)

42

In [69]:
# 6) Include the latest date of the flat resale record
# The membership check keeps this cell idempotent: re-running it (or a latest month that already
# coincides with a station date) does not add a duplicate boundary to the filter list.
latest_resale_month = flat_resale_gdf_reprojected['month'].max()
if latest_resale_month not in date_filter_list:
    date_filter_list.append(latest_resale_month)

In [70]:
date_filter_list

[Timestamp('1987-11-01 00:00:00'),
 Timestamp('1987-12-01 00:00:00'),
 Timestamp('1988-03-01 00:00:00'),
 Timestamp('1988-11-01 00:00:00'),
 Timestamp('1988-12-01 00:00:00'),
 Timestamp('1989-11-01 00:00:00'),
 Timestamp('1989-12-01 00:00:00'),
 Timestamp('1990-03-01 00:00:00'),
 Timestamp('1990-07-01 00:00:00'),
 Timestamp('1996-02-01 00:00:00'),
 Timestamp('1999-11-01 00:00:00'),
 Timestamp('2001-01-01 00:00:00'),
 Timestamp('2001-10-01 00:00:00'),
 Timestamp('2002-02-01 00:00:00'),
 Timestamp('2003-01-01 00:00:00'),
 Timestamp('2003-06-01 00:00:00'),
 Timestamp('2005-01-01 00:00:00'),
 Timestamp('2006-01-01 00:00:00'),
 Timestamp('2007-06-01 00:00:00'),
 Timestamp('2007-11-01 00:00:00'),
 Timestamp('2009-02-01 00:00:00'),
 Timestamp('2009-05-01 00:00:00'),
 Timestamp('2010-04-01 00:00:00'),
 Timestamp('2011-06-01 00:00:00'),
 Timestamp('2011-10-01 00:00:00'),
 Timestamp('2012-01-01 00:00:00'),
 Timestamp('2013-01-01 00:00:00'),
 Timestamp('2013-12-01 00:00:00'),
 Timestamp('2014-06-

In [71]:
len(date_filter_list)

43

### 08.2 Get the nearest station and its distance with the date filter

#### Introduction of geopandas.sindex.nearest() function

In [72]:
# Using geopandas.sindex.nearest() function, we could get the nearest train station for each input hdb geometry
# Parameter 'return_distance = True' will return distances in addition to indexes. By default it's False. 
# Parameter 'return_all = False' will return only a single nearest geometry if there are multiple equidistant or intersecting nearest geometries. By default it's True. 

# Example: Get the nearest train stations for the first 10 hdb address geometries
testing = train_station_gdf_reprojected.sindex.nearest(flat_resale_gdf_reprojected['geometry'][:10], return_distance = True, return_all = False)

In [73]:
testing

(array([[  0,   1,   2,   3,   4,   5,   6,   7,   8,   9],
        [ 14,  14,  14,  14, 147, 147,  14, 147, 147, 147]], dtype=int64),
 array([790.83121537, 790.83121537, 790.83121537, 790.83121537,
        800.63218697, 620.31021896, 584.67227213, 513.09179219,
        513.09179219, 351.6956691 ]))

The result is returned as two NumPy ndarrays (n-dimensional arrays), referred to here as "indices" and "distances". <br>
1. The first array is the <b>'indices' array</b>. It is a 2-dimensional array with two subarrays. 
   - The first subarray stores the indices of the input geometries. 
   - The second subarray stores the indices of the resulting tree geometries (the nearest train stations). 
2. The second array is the <b>'distances' array</b> (present only when return_distance = True). It is a 1-dimensional array with 'n' elements. 
   - Each element represents the distance between the corresponding input geometry and its resulting tree geometry.

In [74]:
# Retrieval of information

# This is the index of the first input hdb address geometry
testing[0][0][0]

0

In [75]:
flat_resale_gdf_reprojected['address'][testing[0][0][0]]

'309 ANG MO KIO AVE 1'

In [76]:
# This is the index of the resulted geometry (nearest train station) of the first input hdb address geometry
testing[0][1][0]

14

In [77]:
train_station_gdf_reprojected['station_name_w_code'][testing[0][1][0]]

'Ang Mo Kio MRT Station (NS16)'

In [78]:
# This is the distance between the input hdb address geometry and its nearest train station
# (in metres, the unit of the projected EPSG:3414 CRS used by both geodataframes)
testing[1][0]

790.8312153737169

#### Actual code

In [79]:
# Accumulators shared by every call to get_nearest_station().
# Each call appends one entry per matched flat resale record to every list, so all lists stay the same length.
month_list = []
row_id_list = []
address_list = []
nearest_station_list = []
station_distance_list = []
nearest_station_opening_list = []
nearest_station_closure_list = []
nearest_station_opening_yymm_list = []
nearest_station_closure_w_dummy_yymm_list = []


# Define a function to find the nearest train station and its distance using the filtered dataframes
def get_nearest_station(filtered_train_station, filtered_flat_resale):
    """Find the nearest train station of every flat and append the findings to the module-level lists.

    Parameters
    ----------
    filtered_train_station : GeoDataFrame
        Candidate train stations. Must expose a spatial index ('sindex') and the columns
        'station_name_w_code', 'opening', 'closure', 'opening_yymm' and 'closure_w_dummy_yymm'.
    filtered_flat_resale : GeoDataFrame
        Flat resale records to match. Must contain the columns 'geometry', 'month', 'row_id' and 'address'.

    Returns
    -------
    None
        Results are appended to the module-level '*_list' accumulators and a short summary is printed.
    """

    # geopandas.sindex.nearest(): Return the nearest geometry in the tree for each input geometry.
    # Parameter 'return_distance = True' will return distances in addition to indexes. By default it's False.
    # Parameter 'return_all = False' will return only a single nearest geometry if there are multiple equidistant or intersecting nearest geometries. By default it's True.
    result = filtered_train_station.sindex.nearest(filtered_flat_resale['geometry'], return_distance = True, return_all = False)

    # result[0] holds two parallel rows: positions of the input flats and positions of their nearest stations.
    # result[1] holds the matching distances.
    flat_positions, station_positions = result[0]
    distances = result[1]

    # Select both sides by POSITION using the indices returned by the spatial index.
    # Using the returned flat positions (instead of assuming they are 0, 1, 2, ...) keeps every flat paired
    # with its own station even if the spatial index leaves out an input geometry (e.g. a missing or empty one),
    # and '.iloc' makes the lookup independent of the index labels of the two dataframes.
    matched_flats = filtered_flat_resale.iloc[flat_positions]
    matched_stations = filtered_train_station.iloc[station_positions]

    month_list.extend(matched_flats['month'].tolist())
    row_id_list.extend(matched_flats['row_id'].tolist())
    address_list.extend(matched_flats['address'].tolist())

    nearest_station_list.extend(matched_stations['station_name_w_code'].tolist())
    station_distance_list.extend(distances.tolist())
    nearest_station_opening_list.extend(matched_stations['opening'].tolist())
    nearest_station_closure_list.extend(matched_stations['closure'].tolist())
    nearest_station_opening_yymm_list.extend(matched_stations['opening_yymm'].tolist())
    nearest_station_closure_w_dummy_yymm_list.extend(matched_stations['closure_w_dummy_yymm'].tolist())

    # Progress summary: sizes of the inputs, of this call's result, and running totals of the accumulators
    print('filtered_train_station: {}'.format(len(filtered_train_station)))
    print('filtered_flat_resale: {}'.format(len(filtered_flat_resale)))
    print('result: {}'.format(len(result[0][0])))
    print('nearest_station_list: {}'.format(len(nearest_station_list)))
    print('station_distance_list: {}'.format(len(station_distance_list)))
    print('nearest_station_opening_list: {}'.format(len(nearest_station_opening_list)))
    print('nearest_station_closure_list: {}'.format(len(nearest_station_closure_list)))
    print()

    
# Walk through 'date_filter_list': every cut-off date defines one time window, for which a smaller
# 'train_station' dataframe and a smaller 'flat_resale' dataframe are built and then matched
last_position = len(date_filter_list) - 1

for position, cutoff_date in enumerate(date_filter_list):

    print(cutoff_date)

    # 1) FILTERING OF TRAIN STATION
    # Keep the stations whose (month-granularity) opening date is strictly before the cut-off date
    # and whose (month-granularity, dummy-filled) closure date is strictly after it
    # NOTE(review): a station whose closure month equals the cut-off date is left out of the window that ends
    # at this cut-off - confirm this is intended
    station_is_available = ((train_station_gdf_reprojected['opening_yymm'] < cutoff_date) &
                            (train_station_gdf_reprojected['closure_w_dummy_yymm'] > cutoff_date))

    # Renumber the rows 0..n-1 after filtering, so the row labels line up with the
    # positions (0, 1, 2, ...) returned by sindex.nearest()
    filtered_train_station = train_station_gdf_reprojected.loc[station_is_available].reset_index(drop = True)

    # 2) FILTERING OF FLAT
    # Keep the resale records from the previous cut-off date (inclusive) up to the current one (exclusive);
    # only the very last window also includes the records of the cut-off date itself
    resale_month = flat_resale_gdf_reprojected['month']

    if position == 0:     # first window: no lower bound
        flat_in_window = resale_month < cutoff_date

    elif position == last_position:     # last window: upper bound is inclusive
        flat_in_window = ((resale_month <= cutoff_date) &
                          (resale_month >= date_filter_list[position - 1]))

    else:
        flat_in_window = ((resale_month < cutoff_date) &
                          (resale_month >= date_filter_list[position - 1]))

    # Renumber the rows 0..n-1 for the same reason as for the train stations
    filtered_flat_resale = flat_resale_gdf_reprojected.loc[flat_in_window].reset_index(drop = True)

    # 3) SEARCH FOR NEAREST STATION USING THE FILTERED 'TRAIN_STATION' AND 'FLAT_RESALE' DATAFRAMES
    get_nearest_station(filtered_train_station, filtered_flat_resale)


1987-11-01 00:00:00
filtered_train_station: 0
filtered_flat_resale: 0
result: 0
nearest_station_list: 0
station_distance_list: 0
nearest_station_opening_list: 0
nearest_station_closure_list: 0

1987-12-01 00:00:00
filtered_train_station: 5
filtered_flat_resale: 0
result: 0
nearest_station_list: 0
station_distance_list: 0
nearest_station_opening_list: 0
nearest_station_closure_list: 0

1988-03-01 00:00:00
filtered_train_station: 16
filtered_flat_resale: 0
result: 0
nearest_station_list: 0
station_distance_list: 0
nearest_station_opening_list: 0
nearest_station_closure_list: 0

1988-11-01 00:00:00
filtered_train_station: 22
filtered_flat_resale: 0
result: 0
nearest_station_list: 0
station_distance_list: 0
nearest_station_opening_list: 0
nearest_station_closure_list: 0

1988-12-01 00:00:00
filtered_train_station: 25
filtered_flat_resale: 0
result: 0
nearest_station_list: 0
station_distance_list: 0
nearest_station_opening_list: 0
nearest_station_closure_list: 0

1989-11-01 00:00:00
filtere

filtered_train_station: 182
filtered_flat_resale: 18428
result: 18428
nearest_station_list: 806701
station_distance_list: 806701
nearest_station_opening_list: 806701
nearest_station_closure_list: 806701

2020-01-01 00:00:00
filtered_train_station: 183
filtered_flat_resale: 3758
result: 3758
nearest_station_list: 810459
station_distance_list: 810459
nearest_station_opening_list: 810459
nearest_station_closure_list: 810459

2021-08-01 00:00:00
filtered_train_station: 186
filtered_flat_resale: 39678
result: 39678
nearest_station_list: 850137
station_distance_list: 850137
nearest_station_opening_list: 850137
nearest_station_closure_list: 850137

2022-11-01 00:00:00
filtered_train_station: 192
filtered_flat_resale: 35097
result: 35097
nearest_station_list: 885234
station_distance_list: 885234
nearest_station_opening_list: 885234
nearest_station_closure_list: 885234

2023-12-01 00:00:00
filtered_train_station: 203
filtered_flat_resale: 30137
result: 30137
nearest_station_list: 915371
station

The code completes in less than 30 seconds. 
The total number of elements in the output lists aligns with the total row number of flat_resale.

# 09. Data Validation
This section verifies that every nearest station was already open (and not yet closed) at the time of the flat resale transaction.

In [80]:
# Collect the accumulated result lists into one dataframe, one column per list
nearest_station_columns = {
    'month' : month_list,
    'row_id' : row_id_list,
    'address' : address_list,
    'nearest_station' : nearest_station_list,
    'station_distance' : station_distance_list,
    'station_opening' : nearest_station_opening_list,
    'station_closure' : nearest_station_closure_list,
    'station_opening_yymm' : nearest_station_opening_yymm_list,
    'station_closure_w_dummy_yymm' : nearest_station_closure_w_dummy_yymm_list,
}

df_nearest_station = pd.DataFrame(nearest_station_columns)

In [81]:
df_nearest_station.head()

Unnamed: 0,month,row_id,address,nearest_station,station_distance,station_opening,station_closure,station_opening_yymm,station_closure_w_dummy_yymm
0,1990-01-01,0,309 ANG MO KIO AVE 1,Ang Mo Kio MRT Station (NS16),790.831215,1987-11-07,NaT,1987-11-01,2099-01-01
1,1990-01-01,1,309 ANG MO KIO AVE 1,Ang Mo Kio MRT Station (NS16),790.831215,1987-11-07,NaT,1987-11-01,2099-01-01
2,1990-01-01,2,309 ANG MO KIO AVE 1,Ang Mo Kio MRT Station (NS16),790.831215,1987-11-07,NaT,1987-11-01,2099-01-01
3,1990-01-01,3,309 ANG MO KIO AVE 1,Ang Mo Kio MRT Station (NS16),790.831215,1987-11-07,NaT,1987-11-01,2099-01-01
4,1990-01-01,4,216 ANG MO KIO AVE 1,Ang Mo Kio MRT Station (NS16),986.842379,1987-11-07,NaT,1987-11-01,2099-01-01


In [82]:
df_nearest_station.tail()

Unnamed: 0,month,row_id,address,nearest_station,station_distance,station_opening,station_closure,station_opening_yymm,station_closure_w_dummy_yymm
915366,2023-12-01,915366,826 YISHUN ST 81,Khatib MRT Station (NS14),359.654884,1988-12-20,NaT,1988-12-01,2099-01-01
915367,2023-12-01,915367,834 YISHUN ST 81,Khatib MRT Station (NS14),278.011616,1988-12-20,NaT,1988-12-01,2099-01-01
915368,2023-01-01,915368,666 YISHUN AVE 4,Khatib MRT Station (NS14),863.123975,1988-12-20,NaT,1988-12-01,2099-01-01
915369,2023-12-01,915369,666 YISHUN AVE 4,Khatib MRT Station (NS14),863.123975,1988-12-20,NaT,1988-12-01,2099-01-01
915370,2023-06-01,915370,633 YISHUN ST 61,Khatib MRT Station (NS14),776.554343,1988-12-20,NaT,1988-12-01,2099-01-01


In [83]:
# Flag the records whose nearest station had opened by the resale month
# (station opening month is earlier than or equal to the flat resale month)
df_nearest_station_check_opening = df_nearest_station['station_opening_yymm'].le(df_nearest_station['month'])
df_nearest_station_check_opening.value_counts()

True    915371
Name: count, dtype: int64

In [84]:
# Flag the records whose nearest station had not yet closed in the resale month
# (station closure month is later than the flat resale month)
df_nearest_station_check_closure = df_nearest_station['station_closure_w_dummy_yymm'].gt(df_nearest_station['month'])
df_nearest_station_check_closure.value_counts()

True    915371
Name: count, dtype: int64

All nearest train stations are present for every HDB resale record. No issue.

# 10. Combine The Result of Nearest Train Station into Main Dataframe

In [85]:
# Attach the nearest-station result to the main dataframe 'flat_resale_gdf_reprojected'.
# An outer join with the '_merge' indicator column makes unmatched rows on either side visible.
merge_keys = ['month', 'row_id', 'address']

flat_resale_gdf_reprojected_merge = flat_resale_gdf_reprojected.merge(
    df_nearest_station,
    how = 'outer',
    on = merge_keys,
    indicator = True,
)

In [86]:
flat_resale_gdf_reprojected_merge.head()

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price,remaining_lease,row_id,address,latitude,longitude,geometry,nearest_station,station_distance,station_opening,station_closure,station_opening_yymm,station_closure_w_dummy_yymm,_merge
0,1990-01-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,10 TO 12,31.0,Improved,1977,9000.0,86,0,309 ANG MO KIO AVE 1,1.365517,103.843968,POINT (29185.176 38617.449),Ang Mo Kio MRT Station (NS16),790.831215,1987-11-07,NaT,1987-11-01,2099-01-01,both
1,1990-01-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,04 TO 06,31.0,Improved,1977,6000.0,86,1,309 ANG MO KIO AVE 1,1.365517,103.843968,POINT (29185.176 38617.449),Ang Mo Kio MRT Station (NS16),790.831215,1987-11-07,NaT,1987-11-01,2099-01-01,both
2,1990-01-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,10 TO 12,31.0,Improved,1977,8000.0,86,2,309 ANG MO KIO AVE 1,1.365517,103.843968,POINT (29185.176 38617.449),Ang Mo Kio MRT Station (NS16),790.831215,1987-11-07,NaT,1987-11-01,2099-01-01,both
3,1990-01-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,07 TO 09,31.0,Improved,1977,6000.0,86,3,309 ANG MO KIO AVE 1,1.365517,103.843968,POINT (29185.176 38617.449),Ang Mo Kio MRT Station (NS16),790.831215,1987-11-07,NaT,1987-11-01,2099-01-01,both
4,1990-01-01,ANG MO KIO,3 ROOM,216,ANG MO KIO AVE 1,04 TO 06,73.0,New Generation,1976,47200.0,85,4,216 ANG MO KIO AVE 1,1.366197,103.841505,POINT (28911.052 38692.617),Ang Mo Kio MRT Station (NS16),986.842379,1987-11-07,NaT,1987-11-01,2099-01-01,both


In [87]:
# Check the merging rate
flat_resale_gdf_reprojected_merge['_merge'].value_counts()

_merge
both          915371
left_only          0
right_only         0
Name: count, dtype: int64

In [88]:
# Remove the helper columns that were only needed for validation and for checking the merge
helper_columns = ['station_opening_yymm', 'station_closure_w_dummy_yymm', '_merge']
flat_resale_gdf_reprojected_merge = flat_resale_gdf_reprojected_merge.drop(columns = helper_columns)

In [89]:
flat_resale_gdf_reprojected_merge.head()

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price,remaining_lease,row_id,address,latitude,longitude,geometry,nearest_station,station_distance,station_opening,station_closure
0,1990-01-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,10 TO 12,31.0,Improved,1977,9000.0,86,0,309 ANG MO KIO AVE 1,1.365517,103.843968,POINT (29185.176 38617.449),Ang Mo Kio MRT Station (NS16),790.831215,1987-11-07,NaT
1,1990-01-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,04 TO 06,31.0,Improved,1977,6000.0,86,1,309 ANG MO KIO AVE 1,1.365517,103.843968,POINT (29185.176 38617.449),Ang Mo Kio MRT Station (NS16),790.831215,1987-11-07,NaT
2,1990-01-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,10 TO 12,31.0,Improved,1977,8000.0,86,2,309 ANG MO KIO AVE 1,1.365517,103.843968,POINT (29185.176 38617.449),Ang Mo Kio MRT Station (NS16),790.831215,1987-11-07,NaT
3,1990-01-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,07 TO 09,31.0,Improved,1977,6000.0,86,3,309 ANG MO KIO AVE 1,1.365517,103.843968,POINT (29185.176 38617.449),Ang Mo Kio MRT Station (NS16),790.831215,1987-11-07,NaT
4,1990-01-01,ANG MO KIO,3 ROOM,216,ANG MO KIO AVE 1,04 TO 06,73.0,New Generation,1976,47200.0,85,4,216 ANG MO KIO AVE 1,1.366197,103.841505,POINT (28911.052 38692.617),Ang Mo Kio MRT Station (NS16),986.842379,1987-11-07,NaT


In [90]:
flat_resale_gdf_reprojected_merge.shape

(915371, 20)

# 11. Export Data

<b>Geodata export in pickle: </b><br>
The exported geodata pickle file needs geopandas to be reopened/imported, because the file still contains geodata. <br>
If the active 'geometry' column is dropped before exporting to pickle, the pickle file is a normal data file and no longer needs geopandas to be reopened/imported. <br>

<b>Geodata export in csv: </b><br>
The exported geodata csv file does not need geopandas to be reopened/imported. The file is a normal data file, and the values of the active 'geometry' column are converted from Point objects into strings. 

In [91]:
# 1) 'flat_resale_gdf_reprojected_merge'

# Write 'flat_resale_gdf_reprojected_merge' as a csv file into the 'Prepared Data' folder
flat_resale_csv_path = os.path.join(path, '02 Data', 'Prepared Data', 'flat_resale_all_w_nearest_stn (geodata).csv')
flat_resale_gdf_reprojected_merge.to_csv(flat_resale_csv_path)

In [92]:
# Write 'flat_resale_gdf_reprojected_merge' as a pickle file into the 'Prepared Data' folder
flat_resale_pkl_path = os.path.join(path, '02 Data', 'Prepared Data', 'flat_resale_all_w_nearest_stn (geodata).pkl')
flat_resale_gdf_reprojected_merge.to_pickle(flat_resale_pkl_path)

In [93]:
# 2) 'train_station_gdf_reprojected'

train_station_gdf_reprojected.head()

Unnamed: 0,station_code,station_name,line,opening,closure,station_name_w_code,latitude,longitude,geometry,closure_w_dummy,opening_yymm,closure_w_dummy_yymm
0,NS1,Jurong East,North-South Line,1990-03-10,NaT,Jurong East MRT Station (NS1),1.333153,103.742286,POINT (17869.057 35038.969),2099-01-01,1990-03-01,2099-01-01
1,NS2,Bukit Batok,North-South Line,1990-03-10,NaT,Bukit Batok MRT Station (NS2),1.349033,103.749566,POINT (18679.322 36794.926),2099-01-01,1990-03-01,2099-01-01
2,NS3,Bukit Gombak,North-South Line,1990-03-10,NaT,Bukit Gombak MRT Station (NS3),1.358612,103.751791,POINT (18926.913 37854.036),2099-01-01,1990-03-01,2099-01-01
3,NS4,Choa Chu Kang,North-South Line,1990-03-10,NaT,Choa Chu Kang MRT Station (NS4),1.385363,103.744371,POINT (18101.247 40812.122),2099-01-01,1990-03-01,2099-01-01
4,NS5,Yew Tee,North-South Line,1996-02-10,NaT,Yew Tee MRT Station (NS5),1.397535,103.747405,POINT (18438.983 42158.018),2099-01-01,1996-02-01,2099-01-01


In [94]:
train_station_gdf_reprojected.shape

(204, 12)

In [95]:
# Write 'train_station_gdf_reprojected' as a csv file into the 'Prepared Data' folder
train_station_csv_path = os.path.join(path, '02 Data', 'Prepared Data', 'train_station_w_coord (geodata).csv')
train_station_gdf_reprojected.to_csv(train_station_csv_path)

In [96]:
# Write 'train_station_gdf_reprojected' as a pickle file into the 'Prepared Data' folder
train_station_pkl_path = os.path.join(path, '02 Data', 'Prepared Data', 'train_station_w_coord (geodata).pkl')
train_station_gdf_reprojected.to_pickle(train_station_pkl_path)