# HDB Resale Price Prediction

## Case Studies

### Import Libraries

In [1]:
import random
random.seed(42)
import numpy as np
import pandas as pd
import pickle
import folium
from datetime import datetime
from math import radians
from sklearn.metrics.pairwise import haversine_distances

# Warnings
import warnings
warnings.filterwarnings('ignore')

### Import Trained Random Forest Regressor

In [None]:
with open('price_model.pkl', 'rb') as f:
        price_model = pickle.load(f)

### Import HDB Test Dataset

In [3]:
# Importing Data I
data_hdb_test_x = pd.read_csv('../dataset/hdb_test_x.csv')
data_hdb_test_x.index += 1
data_hdb_test_x

Unnamed: 0,floor_area_sqm,date_sold,lease_commence_date,remaining_lease,nearest_distance_to_mrt,healthcare_within_1km_count,healthcare_within_1km_average_rating,healthcare_within_2km_count,healthcare_within_2km_average_rating,recreational_within_1km_count,...,storey_range_26 TO 30,storey_range_31 TO 35,storey_range_36 TO 40,storey_range_41 TO 50,region_Central,region_City,region_East,region_North,region_South,region_West
1,104.0,2023-01-01,1983,59.416667,0.777,4.0,2.925000,10.0,2.530000,0.0,...,False,False,False,False,False,False,False,False,False,True
2,122.0,2022-01-01,1996,73.666667,0.365,7.0,2.471429,74.0,2.728378,2.0,...,True,False,False,False,True,False,False,False,False,False
3,121.0,2011-08-01,1996,84.000000,0.421,1.0,4.800000,6.0,3.683333,4.0,...,False,False,False,False,False,False,False,False,False,True
4,104.0,2018-05-01,1998,79.166667,1.307,4.0,2.050000,22.0,2.054545,2.0,...,False,False,False,False,True,False,False,False,False,False
5,91.0,2014-08-01,1980,65.000000,0.773,9.0,2.444444,18.0,2.550000,1.0,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63368,112.0,2021-09-01,2016,93.833333,0.267,4.0,3.300000,12.0,2.791667,4.0,...,False,False,False,False,False,False,False,True,False,False
63369,74.0,2022-06-01,1986,62.833333,0.756,5.0,3.460000,9.0,2.555556,1.0,...,False,False,False,False,False,False,False,False,False,True
63370,113.0,2022-12-01,2016,92.416667,1.260,4.0,2.975000,16.0,2.306250,2.0,...,False,False,False,False,False,False,False,True,False,False
63371,61.0,2019-05-01,1974,54.000000,1.013,1.0,3.300000,10.0,2.350000,2.0,...,False,False,False,False,False,False,False,False,False,True


In [4]:
# Understanding Data I
data_hdb_test_x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63372 entries, 1 to 63372
Data columns (total 40 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   floor_area_sqm                          63372 non-null  float64
 1   date_sold                               63372 non-null  object 
 2   lease_commence_date                     63372 non-null  int64  
 3   remaining_lease                         63372 non-null  float64
 4   nearest_distance_to_mrt                 63372 non-null  float64
 5   healthcare_within_1km_count             63372 non-null  float64
 6   healthcare_within_1km_average_rating    63372 non-null  float64
 7   healthcare_within_2km_count             63372 non-null  float64
 8   healthcare_within_2km_average_rating    63372 non-null  float64
 9   recreational_within_1km_count           63372 non-null  float64
 10  recreational_within_1km_average_rating  63372 non-null  fl

In [5]:
# Importing Data II
data_hdb_test_y = pd.read_csv('../dataset/hdb_test_y.csv')
data_hdb_test_y.index += 1
data_hdb_test_y

Unnamed: 0,resale_price
1,528000.0
2,932000.0
3,506000.0
4,580000.0
5,460000.0
...,...
63368,669000.0
63369,345000.0
63370,670000.0
63371,190000.0


In [6]:
# Understanding Data II
data_hdb_test_y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63372 entries, 1 to 63372
Data columns (total 1 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   resale_price  63372 non-null  float64
dtypes: float64(1)
memory usage: 495.2 KB


### Import HDB Merged Dataset

In [8]:
# Import Cleaned Dataset 
# Replaced merged with cleaned as merged includes flats with remaining lease > 95 years
hdb_last15_cleaned = pd.read_csv('../dataset/hdb_last15_cleaned.csv')
hdb_last15_cleaned

Unnamed: 0,town,flat_type,flat_model,floor_area_sqm,resale_price,month,lease_commence_date,storey_range,block,remaining_lease,...,recreational_within_1km_average_rating,recreational_within_2km_count,recreational_within_2km_average_rating,education_within_1km_count,education_within_1km_average_rating,education_within_2km_count,education_within_2km_average_rating,postal_code,region,price_per_sqm
0,SEMBAWANG,5 ROOM,Premium Apartment,111.0,362000.0,2009-07-01,2001,01 TO 05,357A,91.000000,...,3.733333,6.0,3.916667,4.0,4.275,8.0,4.250,751357.0,North,3261.261261
1,SEMBAWANG,5 ROOM,Premium Apartment,110.0,370000.0,2009-08-01,2001,06 TO 10,357A,91.000000,...,3.733333,6.0,3.916667,4.0,4.275,8.0,4.250,751357.0,North,3363.636364
2,SEMBAWANG,5 ROOM,Premium Apartment,110.0,403000.0,2010-01-01,2001,16 TO 20,357A,90.000000,...,3.733333,6.0,3.916667,4.0,4.275,8.0,4.250,751357.0,North,3663.636364
3,SEMBAWANG,4 ROOM,Premium Apartment,95.0,350000.0,2010-07-01,2001,01 TO 05,357A,90.000000,...,3.733333,6.0,3.916667,4.0,4.275,8.0,4.250,751357.0,North,3684.210526
4,SEMBAWANG,4 ROOM,Premium Apartment,95.0,399000.0,2010-07-01,2001,10 TO 15,357A,90.000000,...,3.733333,6.0,3.916667,4.0,4.275,8.0,4.250,751357.0,North,4200.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
316853,BUKIT MERAH,5 ROOM,Improved,114.0,921000.0,2022-10-01,1974,10 TO 15,87,50.750000,...,4.057143,78.0,4.052564,3.0,2.700,25.0,3.348,160087.0,South,8078.947368
316854,BUKIT MERAH,5 ROOM,Improved,117.0,930000.0,2022-10-01,1974,01 TO 05,87,50.750000,...,4.057143,78.0,4.052564,3.0,2.700,25.0,3.348,160087.0,South,7948.717949
316855,BUKIT MERAH,5 ROOM,Improved,117.0,978000.0,2022-12-01,1974,10 TO 15,87,50.666667,...,4.057143,78.0,4.052564,3.0,2.700,25.0,3.348,160087.0,South,8358.974359
316856,BUKIT MERAH,5 ROOM,Improved,114.0,950000.0,2022-12-01,1974,21 TO 25,87,50.583333,...,4.057143,78.0,4.052564,3.0,2.700,25.0,3.348,160087.0,South,8333.333333


In [9]:
# Understanding Data 
hdb_last15_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 316858 entries, 0 to 316857
Data columns (total 35 columns):
 #   Column                                  Non-Null Count   Dtype  
---  ------                                  --------------   -----  
 0   town                                    316858 non-null  object 
 1   flat_type                               316858 non-null  object 
 2   flat_model                              316858 non-null  object 
 3   floor_area_sqm                          316858 non-null  float64
 4   resale_price                            316858 non-null  float64
 5   month                                   316858 non-null  object 
 6   lease_commence_date                     316858 non-null  int64  
 7   storey_range                            316858 non-null  object 
 8   block                                   316858 non-null  object 
 9   remaining_lease                         316858 non-null  float64
 10  address                                 3168

### Import POI Dataset

In [None]:
# Importing Data
data_poi_raw = pd.read_csv('../dataset/points_of_interest.csv')
data_poi_raw

## Mini Case Study: HDB Flat in Tampines

The objective of this case study is to provide a visualisation of whether and how a flat's features can contribute to its resale price. More specifically, we hope to present a flat in its geographical context such as its location, and the amenities around it to and uncover the impact that its geographical context can have on its resale price.

### Selection of Specific Flat

In alignment with the case study's objective, HDB flats that exhibit great performance in terms of POI-related data will be identified. Thus, we calculate the 75th percentile of each POI-related measure below to serve as a filter for potential target flats.

In [10]:
# Finding the 75th Percentile for POI Data
q3_healthcare_1km_count = np.percentile(data_hdb_test_x['healthcare_within_1km_count'], 75)
q3_healthcare_1km_rating = np.percentile(data_hdb_test_x['healthcare_within_1km_average_rating'], 75)

q3_healthcare_2km_count = np.percentile(data_hdb_test_x['healthcare_within_2km_count'], 75)
q3_healthcare_2km_rating = np.percentile(data_hdb_test_x['healthcare_within_2km_average_rating'], 75)

q3_education_1km_count = np.percentile(data_hdb_test_x['education_within_1km_count'], 75)
q3_education_1km_rating = np.percentile(data_hdb_test_x['education_within_1km_average_rating'], 75)

q3_education_2km_count = np.percentile(data_hdb_test_x['education_within_2km_count'], 75)
q3_education_2km_rating = np.percentile(data_hdb_test_x['education_within_2km_average_rating'], 75)

q3_recreational_1km_count = np.percentile(data_hdb_test_x['recreational_within_1km_count'], 75)
q3_recreational_1km_rating = np.percentile(data_hdb_test_x['recreational_within_1km_average_rating'], 75)

q3_recreational_2km_count = np.percentile(data_hdb_test_x['recreational_within_2km_count'], 75)
q3_recreational_2km_rating = np.percentile(data_hdb_test_x['recreational_within_2km_average_rating'], 75)

print("3rd Quartile for Healthcare POI Count (1km): ", q3_healthcare_1km_count)
print("3rd Quartile for Healthcare POI Avg Rating (1km): ", q3_healthcare_1km_rating)

print("3rd Quartile for Healthcare POI Count (2km): ", q3_healthcare_2km_count)
print("3rd Quartile for Healthcare POI Avg Rating (2km): ", q3_healthcare_2km_rating)

print("3rd Quartile for Education POI Count (1km): ", q3_education_1km_count)
print("3rd Quartile for Education POI Avg Rating (1km): ", q3_education_1km_rating)

print("3rd Quartile for Education POI Count (2km): ", q3_education_2km_count)
print("3rd Quartile for Education POI Avg Rating (2km): ", q3_education_2km_rating)

print("3rd Quartile for Recreational POI Count (1km): ", q3_recreational_1km_count)
print("3rd Quartile for Recreational POI Avg Rating (1km): ", q3_recreational_1km_rating)

print("3rd Quartile for Recreational POI Count (2km): ", q3_recreational_2km_count)
print("3rd Quartile for Recreational POI Avg Rating (2km): ", q3_recreational_2km_rating)


3rd Quartile for Healthcare POI Count (1km):  8.0
3rd Quartile for Healthcare POI Avg Rating (1km):  3.233333333333333
3rd Quartile for Healthcare POI Count (2km):  22.0
3rd Quartile for Healthcare POI Avg Rating (2km):  2.8
3rd Quartile for Education POI Count (1km):  6.0
3rd Quartile for Education POI Avg Rating (1km):  4.266666666666667
3rd Quartile for Education POI Count (2km):  17.0
3rd Quartile for Education POI Avg Rating (2km):  4.205263157894737
3rd Quartile for Recreational POI Count (1km):  5.0
3rd Quartile for Recreational POI Avg Rating (1km):  4.15
3rd Quartile for Recreational POI Count (2km):  13.0
3rd Quartile for Recreational POI Avg Rating (2km):  4.136363636363637


Upon trial and error, it is observed that the most encompassing combination of POI-measures for which there exists HDB flats above the 75th percentile across all measures includes:
1. Average rating of healthcare POIs within 2km
2. Average rating of healthcare POIs within 1km
3. Average rating of education POIs within 2km
4. Average rating of education POIs within 1km
5. Number of recreational POIs within 2km
6. Average rating of recreation POIs within 2km

In [12]:
# Finding the HDB Flats with the best POI statistics
# The maximum number/combination of POI-related attributes for which there exists flats with POI values above the 75th percentile has been used
potential_cases1_x = data_hdb_test_x[(data_hdb_test_x.healthcare_within_2km_average_rating > q3_healthcare_2km_rating) &
                                    (data_hdb_test_x.healthcare_within_1km_average_rating > q3_healthcare_1km_rating) &
                                    (data_hdb_test_x.education_within_2km_average_rating > q3_education_2km_rating) &
                                    (data_hdb_test_x.education_within_1km_average_rating > q3_education_1km_rating) &
                                    (data_hdb_test_x.recreational_within_2km_count > q3_recreational_2km_count) &
                                    (data_hdb_test_x.recreational_within_2km_average_rating > q3_recreational_2km_rating)]
                
potential_cases1_x

Unnamed: 0,floor_area_sqm,date_sold,lease_commence_date,remaining_lease,nearest_distance_to_mrt,healthcare_within_1km_count,healthcare_within_1km_average_rating,healthcare_within_2km_count,healthcare_within_2km_average_rating,recreational_within_1km_count,...,storey_range_26 TO 30,storey_range_31 TO 35,storey_range_36 TO 40,storey_range_41 TO 50,region_Central,region_City,region_East,region_North,region_South,region_West
1589,127.0,2020-01-01,1994,73.833333,0.194,4.0,4.075,18.0,2.822222,3.0,...,False,False,False,False,False,False,True,False,False,False
2377,124.0,2019-08-01,1994,74.333333,0.208,4.0,4.075,18.0,2.822222,3.0,...,False,False,False,False,False,False,True,False,False,False
8411,126.0,2011-06-01,1994,82.0,0.335,4.0,4.075,18.0,2.822222,3.0,...,False,False,False,False,False,False,True,False,False,False
9872,105.0,2018-04-01,1994,74.916667,0.374,4.0,4.075,18.0,2.822222,3.0,...,False,False,False,False,False,False,True,False,False,False
14003,137.0,2011-09-01,1994,82.0,0.374,4.0,4.075,18.0,2.822222,3.0,...,False,False,False,False,False,False,True,False,False,False
14528,121.0,2021-08-01,1994,72.333333,0.161,4.0,4.075,18.0,2.822222,3.0,...,False,False,False,False,False,False,True,False,False,False
19148,127.0,2017-08-01,1994,76.25,0.194,4.0,4.075,18.0,2.822222,3.0,...,False,False,False,False,False,False,True,False,False,False
21785,137.0,2013-03-01,1994,80.0,0.331,4.0,4.075,18.0,2.822222,3.0,...,False,False,False,False,False,False,True,False,False,False
22966,104.0,2015-10-01,1994,78.0,0.194,4.0,4.075,18.0,2.822222,3.0,...,False,False,False,False,False,False,True,False,False,False
23368,121.0,2015-04-01,1994,78.0,0.335,4.0,4.075,18.0,2.822222,3.0,...,False,False,False,False,False,False,True,False,False,False


To further narrow down the choices for the target flat, we will now take other non-POI related attributes into consideration and identify flats that perform poorly in these aspects. 

In [13]:
# Narrowing choices by looking at non-POI data
q3_floor_area = np.percentile(data_hdb_test_x['floor_area_sqm'], 75)
q2_remaining_lease = np.percentile(data_hdb_test_x['remaining_lease'], 50)
q1_nearest_distance_to_mrt = np.percentile(data_hdb_test_x['nearest_distance_to_mrt'], 25)

print("1st Quartile for Floor Area: ", q3_floor_area)
print("2nd Quartile for Remaining Lease: ", q2_remaining_lease)
print("3rd Quartile for Nearest Distance to Mrt:", q1_nearest_distance_to_mrt)

1st Quartile for Floor Area:  113.0
2nd Quartile for Remaining Lease:  75.0
3rd Quartile for Nearest Distance to Mrt: 0.338


In considering the various non-POI related numerical attributes, the combination that provides us with the least number of choices (> 0) is as illustrated below.

In [None]:
# In contrast to how we selected the appropriate flats using POI-related attributes, here flats that rank poorly in terms of non POI-related attributes are selected
# This is to better highlight any impact that the amenities near a flat may have on its resale price
potential_cases2_x = potential_cases1_x[(potential_cases1_x.floor_area_sqm < q3_floor_area) & 
                                    (potential_cases1_x.remaining_lease < q2_remaining_lease) &
                                    (potential_cases1_x.nearest_distance_to_mrt < q1_nearest_distance_to_mrt)]

pd.set_option('display.max_columns', None)
potential_cases2_x

Out of the 3 remaining potential cases, the flat at index 43737 is chosen. This is because it while its location and flat type are the same as the other 2, it has the lowest storey range. Given that in Singapore, the general

In [20]:
selected_case_x = data_hdb_test_x[data_hdb_test_x.index == 43737]
selected_case_y = data_hdb_test_y[data_hdb_test_y.index == 43737]

In [22]:
# Retrieve from cleaned dataset
actual_price = selected_case_y.iloc[0,0]
actual_fas = selected_case_x.iloc[0,0]
actual_month = selected_case_x.iloc[0,1]

hdb_last15_cleaned_selected = hdb_last15_cleaned[(hdb_last15_cleaned.resale_price == actual_price) &
                                                 (hdb_last15_cleaned.floor_area_sqm == actual_fas) &
                                                 (hdb_last15_cleaned.month == actual_month)]

hdb_last15_cleaned_selected

Unnamed: 0,town,flat_type,flat_model,floor_area_sqm,resale_price,month,lease_commence_date,storey_range,block,remaining_lease,address,full_address,lat,long,nearest_mrt,nearest_distance_to_mrt,flat_category,street_name,avg_long,avg_lat,healthcare_within_1km_count,healthcare_within_1km_average_rating,healthcare_within_2km_count,healthcare_within_2km_average_rating,recreational_within_1km_count,recreational_within_1km_average_rating,recreational_within_2km_count,recreational_within_2km_average_rating,education_within_1km_count,education_within_1km_average_rating,education_within_2km_count,education_within_2km_average_rating,postal_code,region,price_per_sqm
238602,TAMPINES,4 ROOM,Model A,105.0,530000.0,2022-10-01,1994,01 TO 05,390,70.5,390 TAMPINES AVE 7,390 TAMPINES AVENUE 7 SINGAPORE 520390,1.356182,103.957616,tampines east,0.331,4 ROOM Model A,TAMPINES AVE 7,103.956991,1.355567,4.0,4.075,18.0,2.822222,3.0,4.1,16.0,4.1375,5.0,4.38,16.0,4.28125,520390.0,East,5047.619048


In [1]:
latitude = hdb_last15_cleaned_selected.lat
longitude = hdb_last15_cleaned_selected.long

latitude
longitude

NameError: name 'hdb_last15_cleaned_selected' is not defined

### Model Prediction

In [77]:
# Predictor Columns
predictor_cols = ['floor_area_sqm', 'remaining_lease', 'nearest_distance_to_mrt', 
                  'healthcare_within_1km_count', 'healthcare_within_1km_average_rating',
                  'healthcare_within_2km_count', 'healthcare_within_2km_average_rating', 
                  'recreational_within_1km_count', 'recreational_within_1km_average_rating', 
                  'recreational_within_2km_count', 'education_within_1km_count',
                  'education_within_1km_average_rating', 'education_within_2km_count',
                  'price_per_sqm', 'flat_type_3 ROOM', 'flat_type_4 ROOM',
                  'flat_type_5 ROOM', 'flat_type_EXECUTIVE', 'storey_range_01 TO 05',
                  'storey_range_06 TO 10', 'storey_range_10 TO 15', 'region_Central',
                  'region_East', 'region_North', 'region_South', 'region_West']

predictors = data_hdb_selected_x[predictor_cols]

In [81]:
# Predicted Value
predicted_price = price_model.predict(predictors)[0]
print("Predicted Resale Price:", predicted_price)

# Actual Value
print("Actual Resale Price:", actual_price)

Predicted Resale Price: 854790.0
Actual Resale Price: 855000.0


From the Model's prediction, we can see that the predicted price differs from the actual price by only $210. 

### POIs Around HDB Flat

In [91]:
## Use Cells Here

### HDB Flat Map

In [90]:
hdb_map = folium.Map(location = [latitude, longitude], zoom_start = 16)

# HDB Flat
popup_content_hdb = f"Predicted Price: {predicted_price}<br>Actual Price: {actual_price}"
folium.Marker(location = [latitude, longitude], 
              icon = folium.Icon(color = 'red'), 
              popup = folium.Popup(popup_content_hdb, max_width=500)).add_to(hdb_map)


# POIs Near HDB Flat
## Write Here 


hdb_map

### Save Map

In [None]:
# Save the map to an HTML file
hdb_map.save('hdb_map.html')

### Archive

In [70]:
# Placeholder Flat
# random_index = random.choice(data_hdb_test_x.index)
# print(random_index)

# data_hdb_selected_x = data_hdb_test_x[data_hdb_test_x.index == random_index]
# data_hdb_selected_y = data_hdb_test_y[data_hdb_test_y.index == random_index]

14629


In [None]:
# data_hdb_selected_x

In [72]:
# data_hdb_selected_y

Unnamed: 0,resale_price
14629,855000.0


In [74]:
# Retrieve from Selected X Dataset
# actual_price = data_hdb_selected_y.iloc[0, 0]
# actual_fas =  data_hdb_selected_x.iloc[0, 0]
# actual_month = data_hdb_selected_x.iloc[0, 1]

# Retrieve from Merged Dataset
# hdb_last15_merged_selected = hdb_last15_merged[(hdb_last15_merged.resale_price == actual_price) & 
                                               # (hdb_last15_merged.floor_area_sqm == actual_fas) &
                                               # (hdb_last15_merged.month == actual_month)]
# hdb_last15_merged_selected

Unnamed: 0,town,flat_type,flat_model,floor_area_sqm,resale_price,month,lease_commence_date,storey_range,block,remaining_lease,...,recreational_within_1km_count,recreational_within_1km_average_rating,recreational_within_2km_count,recreational_within_2km_average_rating,education_within_1km_count,education_within_1km_average_rating,education_within_2km_count,education_within_2km_average_rating,postal_code,region
101195,QUEENSTOWN,4 ROOM,Premium Apartment,87.0,855000.0,2021-12-01,2016-01-01,16 TO 20,87,93.75,...,3.0,4.2,12.0,4.041667,4.0,3.85,12.0,3.991667,141087.0,South
