# HDB Resale Price Prediction

## Data Blending

### Import Libraries

In [66]:
# General
import math
import re
import numpy as np
import pandas as pd
from datetime import datetime
from tabulate import tabulate
from math import radians
from sklearn.metrics.pairwise import haversine_distances

# Warnings
import warnings
warnings.filterwarnings('ignore')

### Import HDB Dataset

In [26]:
# Importing Data
data_hdb_raw = pd.read_csv('../dataset/hdb_latest_raw.csv')
print(data_hdb_raw.shape)
data_hdb_raw.head()

(881924, 18)


Unnamed: 0.1,Unnamed: 0,town,flat_type,flat_model,floor_area_sqm,street_name,resale_price,month,lease_commence_date,storey_range,block,remaining_lease,address,full_address,lat,long,nearest_mrt,nearest_distance_to_mrt
0,0,ANG MO KIO,3 ROOM,Improved,69.0,ANG MO KIO AVE 4,147000.0,2000-01,1986,07 TO 09,170,85,170 ANG MO KIO AVE 4,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 SING...,1.374001,103.836432,mayflower,0.283
1,1,ANG MO KIO,3 ROOM,Improved,60.0,ANG MO KIO AVE 4,130000.0,2000-02,1986,07 TO 09,170,85,170 ANG MO KIO AVE 4,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 SING...,1.374001,103.836432,mayflower,0.283
2,2,ANG MO KIO,2 ROOM,Improved,45.0,ANG MO KIO AVE 4,105000.0,2000-03,1986,04 TO 06,170,85,170 ANG MO KIO AVE 4,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 SING...,1.374001,103.836432,mayflower,0.283
3,3,ANG MO KIO,2 ROOM,Improved,45.0,ANG MO KIO AVE 4,125000.0,2000-05,1986,04 TO 06,170,85,170 ANG MO KIO AVE 4,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 SING...,1.374001,103.836432,mayflower,0.283
4,4,ANG MO KIO,2 ROOM,Improved,45.0,ANG MO KIO AVE 4,117000.0,2000-06,1986,07 TO 09,170,85,170 ANG MO KIO AVE 4,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 SING...,1.374001,103.836432,mayflower,0.283


### Truncate Dataset

In [22]:
# Reduce to Last 15 Years i.e. 2009 onwards
data_hdb_last15 = data_hdb_raw
data_hdb_last15['month'] = pd.to_datetime(data_hdb_last15['month'])
data_hdb_last15 = data_hdb_last15[data_hdb_last15['month'].dt.year >= 2009]
data_hdb_last15.head()

Unnamed: 0.1,Unnamed: 0,town,flat_type,flat_model,floor_area_sqm,street_name,resale_price,month,lease_commence_date,storey_range,block,remaining_lease,address,full_address,lat,long,nearest_mrt,nearest_distance_to_mrt
77,77,ANG MO KIO,3 ROOM,Improved,61.0,ANG MO KIO AVE 4,200000.0,2009-03-01,1986,04 TO 06,170,76,170 ANG MO KIO AVE 4,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 SING...,1.374001,103.836432,mayflower,0.283
78,78,ANG MO KIO,3 ROOM,Improved,60.0,ANG MO KIO AVE 4,200000.0,2009-04-01,1986,04 TO 06,170,76,170 ANG MO KIO AVE 4,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 SING...,1.374001,103.836432,mayflower,0.283
79,79,ANG MO KIO,3 ROOM,Improved,69.0,ANG MO KIO AVE 4,224000.0,2009-06-01,1986,04 TO 06,170,76,170 ANG MO KIO AVE 4,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 SING...,1.374001,103.836432,mayflower,0.283
80,80,ANG MO KIO,2 ROOM,Improved,45.0,ANG MO KIO AVE 4,170000.0,2009-07-01,1986,07 TO 09,170,76,170 ANG MO KIO AVE 4,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 SING...,1.374001,103.836432,mayflower,0.283
81,81,ANG MO KIO,3 ROOM,Improved,61.0,ANG MO KIO AVE 4,193000.0,2009-07-01,1986,04 TO 06,170,76,170 ANG MO KIO AVE 4,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 SING...,1.374001,103.836432,mayflower,0.283


### Data Wrangling

In [4]:
# Understanding Data I
data_hdb_last15.info()

<class 'pandas.core.frame.DataFrame'>
Index: 325970 entries, 77 to 602220
Data columns (total 18 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   Unnamed: 0               325970 non-null  int64         
 1   town                     325970 non-null  object        
 2   flat_type                325970 non-null  object        
 3   flat_model               325970 non-null  object        
 4   floor_area_sqm           325970 non-null  float64       
 5   street_name              325970 non-null  object        
 6   resale_price             325970 non-null  float64       
 7   month                    325970 non-null  datetime64[ns]
 8   lease_commence_date      325970 non-null  int64         
 9   storey_range             325970 non-null  object        
 10  block                    325970 non-null  object        
 11  remaining_lease          325970 non-null  object        
 12  address             

In [5]:
# Drop Unnamed First (Index) Column
data_hdb_last15 = data_hdb_last15.drop(data_hdb_last15.columns[0], axis = 1)

In [6]:
# Merge flat model with different names that refer to the same type
data_hdb_last15['flat_model'] = data_hdb_last15['flat_model'].str.replace('MODEL A','Model A')
data_hdb_last15['flat_model'] = data_hdb_last15['flat_model'].str.replace('IMPROVED','Improved')
data_hdb_last15['flat_model'] = data_hdb_last15['flat_model'].str.replace('NEW GENERATION','New Generation')
data_hdb_last15['flat_model'] = data_hdb_last15['flat_model'].str.replace('PREMIUM APARTMENT', 'Premium Apartment')
data_hdb_last15['flat_model'] = data_hdb_last15['flat_model'].str.replace('SIMPLIFIED','Simplified')
data_hdb_last15['flat_model'] = data_hdb_last15['flat_model'].str.replace('APARTMENT','Apartment')
data_hdb_last15['flat_model'] = data_hdb_last15['flat_model'].str.replace('STANDARD','Standard')
data_hdb_last15['flat_model'] = data_hdb_last15['flat_model'].str.replace('MAISONETTE','Maisonette')
data_hdb_last15['flat_model'] = data_hdb_last15['flat_model'].str.replace('2-ROOM', '2-room')
data_hdb_last15['flat_model'] = data_hdb_last15['flat_model'].str.replace('IMPROVED-MAISONETTE', 'Improved-Maisonette')
data_hdb_last15['flat_model'] = data_hdb_last15['flat_model'].str.replace('MODEL A-MAISONETTE', 'Model A-Maisonette')
data_hdb_last15['flat_model'] = data_hdb_last15['flat_model'].str.replace('MULTI GENERATION', 'Multi Generation')
data_hdb_last15['flat_model'] = data_hdb_last15['flat_model'].str.replace('TERRACE','Terrace')

In [7]:
# Finding out the different types of flat models for each flat type
grouped = data_hdb_last15.groupby('flat_type')['flat_model'].unique()
results = []

for flat_type, flat_models in grouped.items():
    results.append([flat_type, ', '.join(map(str, flat_models))])

print(tabulate(results, headers=["Flat Type", "Flat Models"], tablefmt="pretty"))

+------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+
|    Flat Type     |                                                                     Flat Models                                                                     |
+------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+
|      1 ROOM      |                                                                      Improved                                                                       |
|      2 ROOM      |                                            Improved, Standard, 2-room, Model A, Premium Apartment, DBSS                                             |
|      3 ROOM      |                              Improved, New Generation, Model A, Simplified, Standard, Terrace, Premium Apartment, DBSS      

In [8]:
# Combining 'flat_type' and 'flat_model' to give a better representation of the flat's category
data_hdb_last15['flat_category'] = data_hdb_last15['flat_type'] + " " + data_hdb_last15['flat_model']

In [9]:
# Change lease_commence_date to dt
data_hdb_last15['lease_commence_date'] = pd.to_datetime(data_hdb_last15['lease_commence_date'], format='%Y')

In [10]:
## USE THE CELLS HERE ##

In [11]:
## USE THE CELLS HERE ##
# Source : https://hecksrealty.com/singapore-district-guide/
data_hdb_last15['postal_code'] = data_hdb_last15['full_address'].str[-6:]

def determine_region(row):
    postal_sector = str(row['postal_code'])[:2]

    if postal_sector in ['53', '54', '55', '82', '56', '57', '72', '73', '77', '78', '75', '76', '79', '80']:
        return 'North'
    elif postal_sector in ['14', '15', '16', '09', '10']:
        return 'South'
    elif postal_sector in ['34', '36', '36', '37',  '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '81', '51', '52']:
        return 'East'
    elif postal_sector in ['11', '12', '13', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71']:
        return 'West'
    elif postal_sector in ['20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33']:
        return 'Central' 
    elif postal_sector in ['01', '02', '03', '04', '05', '06', '07', '08', '17', '18', '19']:
        return 'City'
    else:
        return 'Others'

    
data_hdb_last15['Region'] = data_hdb_last15.apply(determine_region, axis=1)

print(data_hdb_last15)


              town flat_type flat_model  floor_area_sqm       street_name  \
77      ANG MO KIO    3 ROOM   Improved            61.0  ANG MO KIO AVE 4   
78      ANG MO KIO    3 ROOM   Improved            60.0  ANG MO KIO AVE 4   
79      ANG MO KIO    3 ROOM   Improved            69.0  ANG MO KIO AVE 4   
80      ANG MO KIO    2 ROOM   Improved            45.0  ANG MO KIO AVE 4   
81      ANG MO KIO    3 ROOM   Improved            61.0  ANG MO KIO AVE 4   
...            ...       ...        ...             ...               ...   
602216      YISHUN    2 ROOM    Model A            38.0      YISHUN AVE 4   
602217      YISHUN    4 ROOM    Model A            93.0      YISHUN AVE 4   
602218      YISHUN    3 ROOM    Model A            68.0      YISHUN AVE 4   
602219      YISHUN    5 ROOM   Improved           112.0      YISHUN AVE 4   
602220      YISHUN    5 ROOM   Improved           112.0      YISHUN AVE 4   

        resale_price      month lease_commence_date storey_range block  \
7

In [12]:
# Check Updated HDB Dataset
data_hdb_last15.info()

<class 'pandas.core.frame.DataFrame'>
Index: 325970 entries, 77 to 602220
Data columns (total 20 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   town                     325970 non-null  object        
 1   flat_type                325970 non-null  object        
 2   flat_model               325970 non-null  object        
 3   floor_area_sqm           325970 non-null  float64       
 4   street_name              325970 non-null  object        
 5   resale_price             325970 non-null  float64       
 6   month                    325970 non-null  datetime64[ns]
 7   lease_commence_date      325970 non-null  datetime64[ns]
 8   storey_range             325970 non-null  object        
 9   block                    325970 non-null  object        
 10  remaining_lease          325970 non-null  object        
 11  address                  325970 non-null  object        
 12  full_address        

In [91]:
# Unique street_names
# Tagging the POI count to the street_name to save time?
print(data_hdb_last15['street_name'].nunique())
data_hdb_last15['street_name'].unique()

566


array(['ANG MO KIO AVE 4', 'ANG MO KIO AVE 1', 'ANG MO KIO AVE 10',
       'ANG MO KIO AVE 3', 'BEDOK NTH RD', 'ANG MO KIO AVE 5',
       'ANG MO KIO AVE 6', 'ANG MO KIO AVE 8', 'ANG MO KIO AVE 9',
       'ANG MO KIO AVE 2', 'ANG MO KIO ST 21', 'ANG MO KIO ST 31',
       'ANG MO KIO ST 11', 'BEDOK RESERVOIR RD', 'BEDOK NTH ST 3',
       'BEDOK STH AVE 1', 'BEDOK STH RD', 'CHAI CHEE AVE',
       'NEW UPP CHANGI RD', 'CHAI CHEE DR', 'CHAI CHEE RD',
       'BEDOK NTH AVE 1', 'BEDOK NTH AVE 2', 'BEDOK NTH AVE 3',
       'BEDOK NTH AVE 4', 'BEDOK NTH ST 1', 'BEDOK NTH ST 2',
       'BEDOK NTH ST 4', 'BEDOK STH AVE 2', 'BEDOK STH AVE 3',
       'CHAI CHEE ST', 'LENGKONG TIGA', 'JLN TENAGA', 'SIN MING RD',
       'SHUNFU RD', 'BISHAN ST 11', 'BISHAN ST 12', 'BISHAN ST 13',
       'SIN MING AVE', 'BISHAN ST 22', 'BISHAN ST 23', 'BISHAN ST 24',
       'BRIGHT HILL DR', 'BT BATOK ST 51', 'BT BATOK ST 52',
       'BT BATOK ST 11', 'BT BATOK ST 21', 'BT BATOK ST 34',
       'BT BATOK WEST AVE 4', 

In [108]:
# getting average coordinates of each street name
street_avg_coords = data_hdb_last15.groupby('street_name').agg({'long': 'mean', 'lat': 'mean'}).reset_index()
street_avg_coords.columns = ['street_name', 'avg_long', 'avg_lat']

ave_coord_df = pd.merge(data_hdb_last15, street_avg_coords, on='street_name', how='left')
ave_coord_df

Unnamed: 0.1,Unnamed: 0,town,flat_type,flat_model,floor_area_sqm,street_name,resale_price,month,lease_commence_date,storey_range,block,remaining_lease,address,full_address,lat,long,nearest_mrt,nearest_distance_to_mrt,avg_long,avg_lat
0,77,ANG MO KIO,3 ROOM,Improved,61.0,ANG MO KIO AVE 4,200000.0,2009-03-01,1986,04 TO 06,170,76,170 ANG MO KIO AVE 4,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 SING...,1.374001,103.836432,mayflower,0.283,103.837815,1.375219
1,78,ANG MO KIO,3 ROOM,Improved,60.0,ANG MO KIO AVE 4,200000.0,2009-04-01,1986,04 TO 06,170,76,170 ANG MO KIO AVE 4,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 SING...,1.374001,103.836432,mayflower,0.283,103.837815,1.375219
2,79,ANG MO KIO,3 ROOM,Improved,69.0,ANG MO KIO AVE 4,224000.0,2009-06-01,1986,04 TO 06,170,76,170 ANG MO KIO AVE 4,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 SING...,1.374001,103.836432,mayflower,0.283,103.837815,1.375219
3,80,ANG MO KIO,2 ROOM,Improved,45.0,ANG MO KIO AVE 4,170000.0,2009-07-01,1986,07 TO 09,170,76,170 ANG MO KIO AVE 4,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 SING...,1.374001,103.836432,mayflower,0.283,103.837815,1.375219
4,81,ANG MO KIO,3 ROOM,Improved,61.0,ANG MO KIO AVE 4,193000.0,2009-07-01,1986,04 TO 06,170,76,170 ANG MO KIO AVE 4,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 SING...,1.374001,103.836432,mayflower,0.283,103.837815,1.375219
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
325965,602216,YISHUN,2 ROOM,Model A,38.0,YISHUN AVE 4,335000.0,2023-01-01,2018,07 TO 09,675A,94 years 11 months,675A YISHUN AVE 4,675A YISHUN AVENUE 4 FERN GROVE @ YISHUN SINGA...,1.419458,103.843092,khatib,1.147,103.840084,1.419388
325966,602217,YISHUN,4 ROOM,Model A,93.0,YISHUN AVE 4,540000.0,2023-01-01,2018,04 TO 06,675A,94 years 11 months,675A YISHUN AVE 4,675A YISHUN AVENUE 4 FERN GROVE @ YISHUN SINGA...,1.419458,103.843092,khatib,1.147,103.840084,1.419388
325967,602218,YISHUN,3 ROOM,Model A,68.0,YISHUN AVE 4,450000.0,2023-01-01,2018,10 TO 12,673C,94 years 11 months,673C YISHUN AVE 4,673C YISHUN AVENUE 4 FERN GROVE @ YISHUN SINGA...,1.421049,103.842207,khatib,1.104,103.840084,1.419388
325968,602219,YISHUN,5 ROOM,Improved,112.0,YISHUN AVE 4,690000.0,2023-01-01,2018,07 TO 09,673C,94 years 11 months,673C YISHUN AVE 4,673C YISHUN AVENUE 4 FERN GROVE @ YISHUN SINGA...,1.421049,103.842207,khatib,1.104,103.840084,1.419388


### Import POI Dataset

In [27]:
# Importing Data
data_poi_raw = pd.read_csv('../dataset/points_of_interest.csv')
data_poi_raw

Unnamed: 0.1,Unnamed: 0,place_id,name,lat,lng,rating,user_ratings_total,price_level,formatted_address,global_code,...,train_station,natural_feature,subpremise,SUBZONE_NO,SUBZONE_N,SUBZONE_C,PLN_AREA_N,PLN_AREA_C,REGION_N,REGION_C
0,0,ChIJ01fgzLUe2jERxlhvImcbZ7g,Quayside Isle,1.247681,103.842072,4.3,568.0,,"31 Ocean Way, Singapore 098375",6PH56RXR+3R,...,False,False,False,1.0,SENTOSA,SISZ01,SOUTHERN ISLANDS,SI,CENTRAL REGION,CR
1,1,ChIJ1S4qfY8Q2jERgb68gskzUbo,Sime Darby Centre,1.336644,103.783597,3.7,437.0,,"896 Dunearn Rd, Singapore 589472",6PH58QPM+MC,...,False,False,False,2.0,SWISS CLUB,BTSZ02,BUKIT TIMAH,BT,CENTRAL REGION,CR
2,2,ChIJ1ZAIkrwZ2jERxtZGC1JnrHM,PoMo,1.300192,103.849220,3.8,1285.0,,"1 Selegie Rd, Singapore 188306",6PH58R2X+3M,...,False,False,False,8.0,SELEGIE,RCSZ08,ROCHOR,RC,CENTRAL REGION,CR
3,3,ChIJ1ZYJOiAZ2jER1mvQqHstQII,LR boulangerie,1.293178,103.827194,4.3,12.0,,"491 River Valley Rd, #01-02 valley point shopp...",6PH57RVG+7V,...,False,False,False,2.0,CHATSWORTH,TNSZ02,TANGLIN,TN,CENTRAL REGION,CR
4,4,ChIJ2Y1DYBI92jERlFUKKSznJrY,Tampines Hub,1.353108,103.940361,4.6,227.0,,"1 Tampines Walk, Singapore 528523",6PH59W3R+64,...,False,False,False,3.0,TAMPINES WEST,TMSZ03,TAMPINES,TM,EAST REGION,ER
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8667,8667,ChIJic0FbG4T2jERlg__g9KNSUU,Old Chang Kee (Head Office),1.450192,103.805305,4.3,25.0,1.0,"2 Woodlands Terrace, Singapore",6PH5FR24+34,...,False,False,False,9.0,SENOKO SOUTH,SBSZ09,SEMBAWANG,SB,NORTH REGION,NR
8668,8668,ChIJ7cjSMX0T2jERGYqGQog7A2E,Old Chang Kee @ Sun Plaza,1.448144,103.819983,3.4,10.0,1.0,"30 Sembawang Dr, #B1-44 Sun Plaza, Singapore",6PH5CRX9+7X,...,False,False,False,3.0,SEMBAWANG CENTRAL,SBSZ03,SEMBAWANG,SB,NORTH REGION,NR
8669,8669,ChIJxXwRE24T2jERDGyVjkkHTTs,Old Chang Kee Bldg,1.449830,103.805229,4.0,2.0,,Singapore,6PH5CRX4+W3,...,False,False,False,9.0,SENOKO SOUTH,SBSZ09,SEMBAWANG,SB,NORTH REGION,NR
8670,8670,ChIJmQOMh1AT2jERs_1tteD7eTg,Old Chang Kee Coldstore,1.468055,103.812869,0.0,0.0,,"20 Senoko Way, Singapore",6PH5FR97+64,...,False,False,False,8.0,SENOKO NORTH,SBSZ08,SEMBAWANG,SB,NORTH REGION,NR


In [14]:
# Understanding Data II
data_poi_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8672 entries, 0 to 8671
Columns: 125 entries, Unnamed: 0 to REGION_C
dtypes: bool(105), float64(6), int64(1), object(13)
memory usage: 2.2+ MB


In [15]:
# Understanding Data III
for i in range(len(data_poi_raw.columns)):
    print(data_poi_raw.columns[i])

Unnamed: 0
place_id
name
lat
lng
rating
user_ratings_total
price_level
formatted_address
global_code
compound_code
planning_area
brand
establishment
point_of_interest
store
food
health
restaurant
hospital
lodging
finance
cafe
convenience_store
clothing_store
atm
shopping_mall
grocery_or_supermarket
home_goods_store
school
bakery
beauty_salon
transit_station
place_of_worship
pharmacy
meal_takeaway
furniture_store
tourist_attraction
secondary_school
supermarket
doctor
shoe_store
dentist
jewelry_store
church
bank
primary_school
electronics_store
gym
spa
car_repair
pet_store
bus_station
university
park
general_contractor
subway_station
real_estate_agency
florist
hair_care
department_store
hardware_store
car_dealer
veterinary_care
travel_agency
bicycle_store
book_store
laundry
plumber
meal_delivery
lawyer
parking
mosque
physiotherapist
art_gallery
insurance_agency
bar
museum
storage
movie_theater
moving_company
liquor_store
gas_station
electrician
car_rental
locksmith
car_wash
post_office
e

In [48]:
# Retain POI types related to health, recreational areas and schools
columns_to_keep = ['name', 'lat', 'lng', 'hospital', 'doctor', 'dentist', 'park', 'shopping_mall', 'amusement_park', 'zoo', 'library',
                   'school', 'primary_school', 'secondary_school', 'university']

data_poi_filtered = data_poi_raw[columns_to_keep]

# Drop rows where all selected POI types are False
data_poi_filtered = data_poi_filtered[data_poi_filtered.iloc[:, 3:].any(axis=1)]

# Create new columns for POI archetypes
data_poi_filtered['Healthcare'] = data_poi_filtered[['hospital', 'doctor', 'dentist']].any(axis=1)
data_poi_filtered['Recreational'] = data_poi_filtered[['park', 'shopping_mall', 'amusement_park', 'zoo', 'library']].any(axis=1)
data_poi_filtered['Education'] = data_poi_filtered[['school', 'primary_school', 'secondary_school', 'university']].any(axis=1)

data_poi_filtered = data_poi_filtered[['name', 'lat', 'lng', 'Healthcare', 'Recreational', 'Education']]

data_poi_filtered

Unnamed: 0,name,lat,lng,Healthcare,Recreational,Education
0,Quayside Isle,1.247681,103.842072,False,True,False
1,Sime Darby Centre,1.336644,103.783597,False,True,False
2,PoMo,1.300192,103.849220,False,True,False
4,Tampines Hub,1.353108,103.940361,False,True,False
5,City Plaza,1.314764,103.893408,False,True,False
...,...,...,...,...,...,...
7681,Orchid Garden & Koi Pond,1.353986,103.989008,False,True,False
7689,JCube,1.333310,103.740199,False,True,False
7770,Giant Panda Forest - River Safari,1.403751,103.792624,False,True,False
7939,Tiong Bahru Plaza,1.286560,103.827543,False,True,False


### Merge Datasets

In [74]:
# Function to calculate Haversine distance between two points
def haversine_distance(lng1, lat1, lng2, lat2):
    lng1, lat1, lng2, lat2 = map(radians, [lng1, lat1, lng2, lat2])
    distances = haversine_distances([[lat1, lng1], [lat2, lng2]])
    return distances[1, 0] * 6371.0  # multiply by Earth radius to get km

# Function to count POIs of a certain type within a specified radius
def count_pois_within_radius(hdb_lng, hdb_lat, poi_data, poi_type, radius):
    count = 0
    for index, poi in data_poi_filtered.iterrows():
        poi_lng = poi['lng']
        poi_lat = poi['lat']
        distance = haversine_distance(hdb_lng, hdb_lat, poi_lng, poi_lat)
        if distance <= radius and poi[poi_type] == True:
            count += 1
    return count

In [109]:
hdb_resale_data = data_hdb_last15.sample(n = 10)
hdb_resale_data

Unnamed: 0.1,Unnamed: 0,town,flat_type,flat_model,floor_area_sqm,street_name,resale_price,month,lease_commence_date,storey_range,block,remaining_lease,address,full_address,lat,long,nearest_mrt,nearest_distance_to_mrt
470643,470643,SEMBAWANG,4 ROOM,Model A,100.0,SEMBAWANG CL,430000.0,2013-09-01,1999,07 TO 09,335,85,335 SEMBAWANG CL,335 SEMBAWANG CLOSE SINGAPORE 750335,1.447163,103.815014,sembawang,0.597
55851,55851,CLEMENTI,3 ROOM,New Generation,67.0,CLEMENTI AVE 4,350000.0,2022-10-01,1979,01 TO 03,305,55 years 05 months,305 CLEMENTI AVE 4,305 CLEMENTI AVENUE 4 PCF SPARKLETOTS PRESCHOO...,1.322061,103.766014,clementi,0.778
48879,48879,CHOA CHU KANG,4 ROOM,Model A,105.0,CHOA CHU KANG AVE 3,330000.0,2018-03-01,1993,01 TO 03,406,74 years 03 months,406 CHOA CHU KANG AVE 3,406 CHOA CHU KANG AVENUE 3 SINGAPORE 680406,1.380374,103.74021,south view,0.565
262215,262215,PASIR RIS,4 ROOM,Model A,106.0,PASIR RIS DR 6,466000.0,2013-04-01,1989,07 TO 09,406,75,406 PASIR RIS DR 6,406 PASIR RIS DRIVE 6 SINGAPORE 510406,1.373767,103.955388,pasir ris,0.683
247644,247644,CHOA CHU KANG,EXECUTIVE,Apartment,148.0,CHOA CHU KANG ST 51,500000.0,2016-03-01,1995,01 TO 03,531,78,531 CHOA CHU KANG ST 51,531 CHOA CHU KANG STREET 51 SINGAPORE 680531,1.392891,103.741534,yew tee,0.832
31595,31595,BUKIT BATOK,4 ROOM,Model A,103.0,BT BATOK ST 34,427000.0,2021-03-01,1986,07 TO 09,341,64 years 02 months,341 BT BATOK ST 34,341 BUKIT BATOK STREET 34 SINGAPORE 650341,1.362805,103.749157,bukit gombak,0.551
554209,554209,PUNGGOL,4 ROOM,Model A,85.0,PUNGGOL FIELD,390000.0,2019-05-01,2004,16 TO 18,203B,84 years 06 months,203B PUNGGOL FIELD,203B PUNGGOL FIELD SINGAPORE 822203,1.398048,103.903477,cove,0.308
557130,557130,SENGKANG,4 ROOM,Premium Apartment,95.0,FERNVALE RD,397000.0,2010-10-01,2004,10 TO 12,408A,93,408A FERNVALE RD,408A FERNVALE ROAD CORAL VALE SINGAPORE 791408,1.388764,103.876212,fernvale,0.347
29347,29347,BISHAN,3 ROOM,Simplified,64.0,BISHAN ST 13,488000.0,2022-10-01,1988,07 TO 09,195,64 years 04 months,195 BISHAN ST 13,195 BISHAN STREET 13 BISHAN SPRING SINGAPORE 5...,1.348362,103.850897,bishan,0.412
105857,105857,SERANGOON,3 ROOM,New Generation,67.0,SERANGOON AVE 4,355000.0,2017-05-01,1984,04 TO 06,227,66 years 07 months,227 SERANGOON AVE 4,227 SERANGOON AVENUE 4 BOUNDARY VILLE SINGAPOR...,1.356471,103.871127,serangoon,0.8


In [103]:
# Iterate through each unqiue Street Name
for index, hdb_unit in hdb_resale_data.iterrows():
    hdb_lng = hdb_unit['long']
    hdb_lat = hdb_unit['lat']

    healthcare_within_1km_count = 0
    recreational_within_1km_count = 0
    education_within_1km_count = 0
    
    # Count healthcare within 1km radius
    healthcare_within_1km_count = count_pois_within_radius(hdb_lng, hdb_lat, data_poi_filtered, 'Healthcare', 1) 
    # Count recreational areas within 1km radius
    recreational_within_1km_count = count_pois_within_radius(hdb_lng, hdb_lat, data_poi_filtered, 'Recreational', 1)
    # Count educational instituitions within 1km radius
    education_within_1km_count = count_pois_within_radius(hdb_lng, hdb_lat, data_poi_filtered, 'Education', 1)
    
    # Add the recreational_count and school_count columns to the original hdb dataset
    hdb_resale_data.loc[index, 'healthcare_within_1km_count'] = healthcare_within_1km_count
    hdb_resale_data.loc[index, 'recreational_within_1km_count'] = recreational_within_1km_count
    hdb_resale_data.loc[index, 'education_within_1km_count'] = education_within_1km_count
    


hdb_resale_data

Unnamed: 0.1,Unnamed: 0,town,flat_type,flat_model,floor_area_sqm,street_name,resale_price,month,lease_commence_date,storey_range,...,remaining_lease,address,full_address,lat,long,nearest_mrt,nearest_distance_to_mrt,healthcare_within_1km_count,recreational_within_1km_count,education_within_1km_count
77,77,ANG MO KIO,3 ROOM,Improved,61.0,ANG MO KIO AVE 4,200000.0,2009-03-01,1986,04 TO 06,...,76,170 ANG MO KIO AVE 4,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 SING...,1.374001,103.836432,mayflower,0.283,3.0,1.0,3.0
78,78,ANG MO KIO,3 ROOM,Improved,60.0,ANG MO KIO AVE 4,200000.0,2009-04-01,1986,04 TO 06,...,76,170 ANG MO KIO AVE 4,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 SING...,1.374001,103.836432,mayflower,0.283,3.0,1.0,3.0
79,79,ANG MO KIO,3 ROOM,Improved,69.0,ANG MO KIO AVE 4,224000.0,2009-06-01,1986,04 TO 06,...,76,170 ANG MO KIO AVE 4,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 SING...,1.374001,103.836432,mayflower,0.283,3.0,1.0,3.0
80,80,ANG MO KIO,2 ROOM,Improved,45.0,ANG MO KIO AVE 4,170000.0,2009-07-01,1986,07 TO 09,...,76,170 ANG MO KIO AVE 4,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 SING...,1.374001,103.836432,mayflower,0.283,3.0,1.0,3.0
81,81,ANG MO KIO,3 ROOM,Improved,61.0,ANG MO KIO AVE 4,193000.0,2009-07-01,1986,04 TO 06,...,76,170 ANG MO KIO AVE 4,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 SING...,1.374001,103.836432,mayflower,0.283,3.0,1.0,3.0
82,82,ANG MO KIO,3 ROOM,Improved,60.0,ANG MO KIO AVE 4,208000.0,2009-08-01,1986,04 TO 06,...,76,170 ANG MO KIO AVE 4,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 SING...,1.374001,103.836432,mayflower,0.283,3.0,1.0,3.0
83,83,ANG MO KIO,3 ROOM,Improved,60.0,ANG MO KIO AVE 4,201000.0,2009-08-01,1986,04 TO 06,...,76,170 ANG MO KIO AVE 4,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 SING...,1.374001,103.836432,mayflower,0.283,3.0,1.0,3.0
84,84,ANG MO KIO,2 ROOM,Improved,45.0,ANG MO KIO AVE 4,183000.0,2009-09-01,1986,07 TO 09,...,76,170 ANG MO KIO AVE 4,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 SING...,1.374001,103.836432,mayflower,0.283,3.0,1.0,3.0
85,85,ANG MO KIO,2 ROOM,Improved,45.0,ANG MO KIO AVE 4,166000.0,2009-12-01,1986,01 TO 03,...,76,170 ANG MO KIO AVE 4,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 SING...,1.374001,103.836432,mayflower,0.283,3.0,1.0,3.0
86,86,ANG MO KIO,2 ROOM,Improved,45.0,ANG MO KIO AVE 4,181000.0,2010-01-01,1986,04 TO 06,...,75,170 ANG MO KIO AVE 4,170 ANG MO KIO AVENUE 4 KEBUN BARU LINK 1 SING...,1.374001,103.836432,mayflower,0.283,3.0,1.0,3.0


In [19]:
# Check Merged Dataset
# data_hdb_last15.info()

### Saving Dataset

In [20]:
# Save Merged Dataset
# data_hdb_last15.to_csv('../dataset/hdb_last15_merged', index = False)