# HDB Resale Price Prediction

## Case Studies

### Import Libraries

In [23]:
import random
random.seed(42)
import numpy as np
import pandas as pd
import pickle
import folium
from datetime import datetime
from math import radians
from sklearn.metrics.pairwise import haversine_distances

# Warnings
import warnings
warnings.filterwarnings('ignore')

### Import Trained Random Forest Regressor

In [6]:
# Load the pre-trained price model from disk (the heading above describes it
# as a Random Forest Regressor).
# NOTE(review): pickle.load can execute arbitrary code while deserializing, so
# 'price_model.pkl' must come from a trusted source. Presumably it also needs a
# compatible scikit-learn version to unpickle — TODO confirm.
with open('price_model.pkl', 'rb') as f:
        price_model = pickle.load(f)

### Import HDB Test Dataset

In [66]:
# Importing Data I: test-set features, re-labelled with a 1-based row index
hdb_test_x_path = '../dataset/hdb_test_x.csv'
data_hdb_test_x = pd.read_csv(hdb_test_x_path)
data_hdb_test_x.index = data_hdb_test_x.index + 1
data_hdb_test_x

Unnamed: 0,floor_area_sqm,date_sold,lease_commence_date,remaining_lease,nearest_distance_to_mrt,healthcare_within_1km_count,healthcare_within_1km_average_rating,healthcare_within_2km_count,healthcare_within_2km_average_rating,recreational_within_1km_count,...,storey_range_26 TO 30,storey_range_31 TO 35,storey_range_36 TO 40,storey_range_41 TO 50,region_Central,region_City,region_East,region_North,region_South,region_West
1,104.0,2023-01-01,1983,59.416667,0.777,4.0,2.925000,10.0,2.530000,0.0,...,False,False,False,False,False,False,False,False,False,True
2,122.0,2022-01-01,1996,73.666667,0.365,7.0,2.471429,74.0,2.728378,2.0,...,True,False,False,False,True,False,False,False,False,False
3,121.0,2011-08-01,1996,84.000000,0.421,1.0,4.800000,6.0,3.683333,4.0,...,False,False,False,False,False,False,False,False,False,True
4,104.0,2018-05-01,1998,79.166667,1.307,4.0,2.050000,22.0,2.054545,2.0,...,False,False,False,False,True,False,False,False,False,False
5,91.0,2014-08-01,1980,65.000000,0.773,9.0,2.444444,18.0,2.550000,1.0,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63368,112.0,2021-09-01,2016,93.833333,0.267,4.0,3.300000,12.0,2.791667,4.0,...,False,False,False,False,False,False,False,True,False,False
63369,74.0,2022-06-01,1986,62.833333,0.756,5.0,3.460000,9.0,2.555556,1.0,...,False,False,False,False,False,False,False,False,False,True
63370,113.0,2022-12-01,2016,92.416667,1.260,4.0,2.975000,16.0,2.306250,2.0,...,False,False,False,False,False,False,False,True,False,False
63371,61.0,2019-05-01,1974,54.000000,1.013,1.0,3.300000,10.0,2.350000,2.0,...,False,False,False,False,False,False,False,False,False,True


In [67]:
# Understanding Data I: print the index range plus each feature column's
# non-null count and dtype, as a quick check for missing values
data_hdb_test_x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63372 entries, 1 to 63372
Data columns (total 40 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   floor_area_sqm                          63372 non-null  float64
 1   date_sold                               63372 non-null  object 
 2   lease_commence_date                     63372 non-null  int64  
 3   remaining_lease                         63372 non-null  float64
 4   nearest_distance_to_mrt                 63372 non-null  float64
 5   healthcare_within_1km_count             63372 non-null  float64
 6   healthcare_within_1km_average_rating    63372 non-null  float64
 7   healthcare_within_2km_count             63372 non-null  float64
 8   healthcare_within_2km_average_rating    63372 non-null  float64
 9   recreational_within_1km_count           63372 non-null  float64
 10  recreational_within_1km_average_rating  63372 non-null  fl

In [68]:
# Importing Data II: test-set targets (resale_price), re-labelled with the same
# 1-based row index as the features so rows can be paired by label
hdb_test_y_path = '../dataset/hdb_test_y.csv'
data_hdb_test_y = pd.read_csv(hdb_test_y_path)
data_hdb_test_y.index = data_hdb_test_y.index + 1
data_hdb_test_y

Unnamed: 0,resale_price
1,528000.0
2,932000.0
3,506000.0
4,580000.0
5,460000.0
...,...
63368,669000.0
63369,345000.0
63370,670000.0
63371,190000.0


In [69]:
# Understanding Data II: print the index range plus the target column's
# non-null count and dtype
data_hdb_test_y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63372 entries, 1 to 63372
Data columns (total 1 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   resale_price  63372 non-null  float64
dtypes: float64(1)
memory usage: 495.2 KB


### Import HDB Merged Dataset

In [21]:
# Import Merged Dataset: the un-encoded transaction records (town, block,
# storey_range, ...) used further down to look up the selected flat
hdb_merged_path = '../dataset/hdb_last15_merged.csv'
hdb_last15_merged = pd.read_csv(hdb_merged_path)
hdb_last15_merged

Unnamed: 0,town,flat_type,flat_model,floor_area_sqm,resale_price,month,lease_commence_date,storey_range,block,remaining_lease,...,recreational_within_1km_count,recreational_within_1km_average_rating,recreational_within_2km_count,recreational_within_2km_average_rating,education_within_1km_count,education_within_1km_average_rating,education_within_2km_count,education_within_2km_average_rating,postal_code,region
0,SEMBAWANG,5 ROOM,Premium Apartment,111.0,362000.0,2009-07-01,2001-01-01,01 TO 05,357A,91.000000,...,3.0,3.733333,6.0,3.916667,4.0,4.275,8.0,4.250,751357.0,North
1,SEMBAWANG,5 ROOM,Premium Apartment,110.0,370000.0,2009-08-01,2001-01-01,06 TO 10,357A,91.000000,...,3.0,3.733333,6.0,3.916667,4.0,4.275,8.0,4.250,751357.0,North
2,SEMBAWANG,5 ROOM,Premium Apartment,110.0,403000.0,2010-01-01,2001-01-01,16 TO 20,357A,90.000000,...,3.0,3.733333,6.0,3.916667,4.0,4.275,8.0,4.250,751357.0,North
3,SEMBAWANG,4 ROOM,Premium Apartment,95.0,350000.0,2010-07-01,2001-01-01,01 TO 05,357A,90.000000,...,3.0,3.733333,6.0,3.916667,4.0,4.275,8.0,4.250,751357.0,North
4,SEMBAWANG,4 ROOM,Premium Apartment,95.0,399000.0,2010-07-01,2001-01-01,10 TO 15,357A,90.000000,...,3.0,3.733333,6.0,3.916667,4.0,4.275,8.0,4.250,751357.0,North
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
325965,BUKIT MERAH,5 ROOM,Improved,114.0,921000.0,2022-10-01,1974-01-01,10 TO 15,87,50.750000,...,7.0,4.057143,78.0,4.052564,3.0,2.700,25.0,3.348,160087.0,South
325966,BUKIT MERAH,5 ROOM,Improved,117.0,930000.0,2022-10-01,1974-01-01,01 TO 05,87,50.750000,...,7.0,4.057143,78.0,4.052564,3.0,2.700,25.0,3.348,160087.0,South
325967,BUKIT MERAH,5 ROOM,Improved,117.0,978000.0,2022-12-01,1974-01-01,10 TO 15,87,50.666667,...,7.0,4.057143,78.0,4.052564,3.0,2.700,25.0,3.348,160087.0,South
325968,BUKIT MERAH,5 ROOM,Improved,114.0,950000.0,2022-12-01,1974-01-01,21 TO 25,87,50.583333,...,7.0,4.057143,78.0,4.052564,3.0,2.700,25.0,3.348,160087.0,South


In [22]:
# Understanding Data: print the index range plus each merged-dataset column's
# non-null count and dtype
hdb_last15_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 325970 entries, 0 to 325969
Data columns (total 34 columns):
 #   Column                                  Non-Null Count   Dtype  
---  ------                                  --------------   -----  
 0   town                                    325970 non-null  object 
 1   flat_type                               325970 non-null  object 
 2   flat_model                              325970 non-null  object 
 3   floor_area_sqm                          325970 non-null  float64
 4   resale_price                            325970 non-null  float64
 5   month                                   325970 non-null  object 
 6   lease_commence_date                     325970 non-null  object 
 7   storey_range                            325970 non-null  object 
 8   block                                   325970 non-null  object 
 9   remaining_lease                         325970 non-null  float64
 10  address                                 3259

### Import POI Dataset

In [9]:
# Importing Data: raw points-of-interest table (name, lat/lng, rating, ...)
# NOTE(review): the displayed frame contains an 'Unnamed: 0' column, which looks
# like an index that was written into the CSV; consider dropping it or reading
# with index_col=0 once downstream usage is confirmed.
poi_path = '../dataset/points_of_interest.csv'
data_poi_raw = pd.read_csv(poi_path)
data_poi_raw

Unnamed: 0.1,Unnamed: 0,place_id,name,lat,lng,rating,user_ratings_total,price_level,formatted_address,global_code,...,train_station,natural_feature,subpremise,SUBZONE_NO,SUBZONE_N,SUBZONE_C,PLN_AREA_N,PLN_AREA_C,REGION_N,REGION_C
0,0,ChIJ01fgzLUe2jERxlhvImcbZ7g,Quayside Isle,1.247681,103.842072,4.3,568.0,,"31 Ocean Way, Singapore 098375",6PH56RXR+3R,...,False,False,False,1.0,SENTOSA,SISZ01,SOUTHERN ISLANDS,SI,CENTRAL REGION,CR
1,1,ChIJ1S4qfY8Q2jERgb68gskzUbo,Sime Darby Centre,1.336644,103.783597,3.7,437.0,,"896 Dunearn Rd, Singapore 589472",6PH58QPM+MC,...,False,False,False,2.0,SWISS CLUB,BTSZ02,BUKIT TIMAH,BT,CENTRAL REGION,CR
2,2,ChIJ1ZAIkrwZ2jERxtZGC1JnrHM,PoMo,1.300192,103.849220,3.8,1285.0,,"1 Selegie Rd, Singapore 188306",6PH58R2X+3M,...,False,False,False,8.0,SELEGIE,RCSZ08,ROCHOR,RC,CENTRAL REGION,CR
3,3,ChIJ1ZYJOiAZ2jER1mvQqHstQII,LR boulangerie,1.293178,103.827194,4.3,12.0,,"491 River Valley Rd, #01-02 valley point shopp...",6PH57RVG+7V,...,False,False,False,2.0,CHATSWORTH,TNSZ02,TANGLIN,TN,CENTRAL REGION,CR
4,4,ChIJ2Y1DYBI92jERlFUKKSznJrY,Tampines Hub,1.353108,103.940361,4.6,227.0,,"1 Tampines Walk, Singapore 528523",6PH59W3R+64,...,False,False,False,3.0,TAMPINES WEST,TMSZ03,TAMPINES,TM,EAST REGION,ER
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8667,8667,ChIJic0FbG4T2jERlg__g9KNSUU,Old Chang Kee (Head Office),1.450192,103.805305,4.3,25.0,1.0,"2 Woodlands Terrace, Singapore",6PH5FR24+34,...,False,False,False,9.0,SENOKO SOUTH,SBSZ09,SEMBAWANG,SB,NORTH REGION,NR
8668,8668,ChIJ7cjSMX0T2jERGYqGQog7A2E,Old Chang Kee @ Sun Plaza,1.448144,103.819983,3.4,10.0,1.0,"30 Sembawang Dr, #B1-44 Sun Plaza, Singapore",6PH5CRX9+7X,...,False,False,False,3.0,SEMBAWANG CENTRAL,SBSZ03,SEMBAWANG,SB,NORTH REGION,NR
8669,8669,ChIJxXwRE24T2jERDGyVjkkHTTs,Old Chang Kee Bldg,1.449830,103.805229,4.0,2.0,,Singapore,6PH5CRX4+W3,...,False,False,False,9.0,SENOKO SOUTH,SBSZ09,SEMBAWANG,SB,NORTH REGION,NR
8670,8670,ChIJmQOMh1AT2jERs_1tteD7eTg,Old Chang Kee Coldstore,1.468055,103.812869,0.0,0.0,,"20 Senoko Way, Singapore",6PH5FR97+64,...,False,False,False,8.0,SENOKO NORTH,SBSZ08,SEMBAWANG,SB,NORTH REGION,NR


## Mini Case Study: HDB Flat in [ Location ]

### Selection of Specific Flat

In [70]:
# Placeholder Flat: draw one test-set row to use for the case study.
# A dedicated, locally seeded generator is used instead of the module-level
# `random` state: the global seed is set only once in the import cell, so every
# re-run of this cell would otherwise advance that state and pick a different flat.
# NOTE(review): the flat drawn here may differ from the one shown in previously
# saved outputs; refresh the narrative below after re-running.
flat_picker = random.Random(42)
random_index = flat_picker.choice(data_hdb_test_x.index)
print(random_index)

# Features and target are paired purely by row label, so both frames must
# carry exactly the same index.
assert data_hdb_test_x.index.equals(data_hdb_test_y.index), \
    "test features and targets are not aligned on the same index"

# Double brackets keep each selection as a one-row DataFrame.
data_hdb_selected_x = data_hdb_test_x.loc[[random_index]]
data_hdb_selected_y = data_hdb_test_y.loc[[random_index]]

14629


In [71]:
data_hdb_selected_x

Unnamed: 0,floor_area_sqm,date_sold,lease_commence_date,remaining_lease,nearest_distance_to_mrt,healthcare_within_1km_count,healthcare_within_1km_average_rating,healthcare_within_2km_count,healthcare_within_2km_average_rating,recreational_within_1km_count,...,storey_range_26 TO 30,storey_range_31 TO 35,storey_range_36 TO 40,storey_range_41 TO 50,region_Central,region_City,region_East,region_North,region_South,region_West
14629,87.0,2021-12-01,2016,93.75,0.43,4.0,3.475,49.0,2.581633,3.0,...,False,False,False,False,False,False,False,False,True,False


In [72]:
data_hdb_selected_y

Unnamed: 0,resale_price
14629,855000.0


In [74]:
# Retrieve from Selected X / Y Datasets.
# Values are read by column label rather than by position, so a change in
# column order cannot silently pick the wrong field.
actual_price = data_hdb_selected_y['resale_price'].iloc[0]
actual_fas = data_hdb_selected_x['floor_area_sqm'].iloc[0]
actual_month = data_hdb_selected_x['date_sold'].iloc[0]

# Retrieve from Merged Dataset.
# The matching transaction is located by resale price, floor area and sale
# month. This key is not guaranteed to be unique, so more than one row may
# come back; inspect the displayed result before relying on it.
is_same_sale = (
    (hdb_last15_merged['resale_price'] == actual_price)
    & (hdb_last15_merged['floor_area_sqm'] == actual_fas)
    & (hdb_last15_merged['month'] == actual_month)
)
hdb_last15_merged_selected = hdb_last15_merged[is_same_sale]
hdb_last15_merged_selected

Unnamed: 0,town,flat_type,flat_model,floor_area_sqm,resale_price,month,lease_commence_date,storey_range,block,remaining_lease,...,recreational_within_1km_count,recreational_within_1km_average_rating,recreational_within_2km_count,recreational_within_2km_average_rating,education_within_1km_count,education_within_1km_average_rating,education_within_2km_count,education_within_2km_average_rating,postal_code,region
101195,QUEENSTOWN,4 ROOM,Premium Apartment,87.0,855000.0,2021-12-01,2016-01-01,16 TO 20,87,93.75,...,3.0,4.2,12.0,4.041667,4.0,3.85,12.0,3.991667,141087.0,South


In [75]:
# Coordinates of the selected flat, extracted as plain floats.
# Accessing the column alone yields a pandas Series with one entry per matched
# row, not a scalar: that is ambiguous when the lookup matched several
# transactions and unusable when it matched none. Fail loudly on no match and
# take the first matched row explicitly otherwise.
if hdb_last15_merged_selected.empty:
    raise ValueError("No row in hdb_last15_merged matches the selected flat")
latitude = float(hdb_last15_merged_selected['lat'].iloc[0])
longitude = float(hdb_last15_merged_selected['long'].iloc[0])

### Model Prediction

In [77]:
# Predictor Columns, grouped by theme. The groups are concatenated in the same
# order as the original flat list (presumably the order the model was fitted
# on — TODO confirm before reordering).
property_cols = ['floor_area_sqm', 'remaining_lease', 'nearest_distance_to_mrt']
healthcare_cols = [
    'healthcare_within_1km_count',
    'healthcare_within_1km_average_rating',
    'healthcare_within_2km_count',
    'healthcare_within_2km_average_rating',
]
recreational_cols = [
    'recreational_within_1km_count',
    'recreational_within_1km_average_rating',
    'recreational_within_2km_count',
]
education_cols = [
    'education_within_1km_count',
    'education_within_1km_average_rating',
    'education_within_2km_count',
]
# NOTE(review): if price_per_sqm is derived from resale_price, feeding it to the
# model together with floor_area_sqm leaks the target — confirm how it is built.
price_cols = ['price_per_sqm']
flat_type_cols = ['flat_type_3 ROOM', 'flat_type_4 ROOM',
                  'flat_type_5 ROOM', 'flat_type_EXECUTIVE']
storey_range_cols = ['storey_range_01 TO 05', 'storey_range_06 TO 10',
                     'storey_range_10 TO 15']
region_cols = ['region_Central', 'region_East', 'region_North',
               'region_South', 'region_West']

predictor_cols = [
    *property_cols,
    *healthcare_cols,
    *recreational_cols,
    *education_cols,
    *price_cols,
    *flat_type_cols,
    *storey_range_cols,
    *region_cols,
]

# One-row frame restricted to the model's input columns
predictors = data_hdb_selected_x[predictor_cols]

In [81]:
# Predicted Value: predict() returns one value per input row; `predictors`
# holds the single selected flat, so the first element is its prediction
predicted_prices = price_model.predict(predictors)
predicted_price = predicted_prices[0]
print("Predicted Resale Price:", predicted_price)

# Actual Value: the recorded resale price of the same flat, for comparison
print("Actual Resale Price:", actual_price)

Predicted Resale Price: 854790.0
Actual Resale Price: 855000.0


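From the model's prediction, we can see that the predicted price differs from the actual price by only $210. Note, however, that `price_per_sqm` is one of the predictors alongside `floor_area_sqm`; if it is derived from the resale price, the model effectively sees the target, so this near-perfect match may reflect target leakage rather than genuine predictive accuracy.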

### POIs Around HDB Flat

In [91]:
# TODO: add cells here that find the POIs around the selected flat

### HDB Flat Map

In [90]:
# Centre the map on the selected flat
flat_location = [latitude, longitude]
hdb_map = folium.Map(location = flat_location, zoom_start = 16)

# HDB Flat: red marker whose popup compares the predicted and actual price
popup_content_hdb = f"Predicted Price: {predicted_price}<br>Actual Price: {actual_price}"
flat_popup = folium.Popup(popup_content_hdb, max_width=500)
flat_marker = folium.Marker(location = flat_location,
                            icon = folium.Icon(color = 'red'),
                            popup = flat_popup)
flat_marker.add_to(hdb_map)


# POIs Near HDB Flat
# TODO: add the POI markers here


# Bare last expression so the notebook renders the interactive map
hdb_map

### Save Map

In [None]:
# Save the map to an HTML file ('hdb_map.html', relative to the current working
# directory) so it can be opened outside the notebook
hdb_map.save('hdb_map.html')