# HDB Resale Price Prediction

## Machine Learning Modelling

### Import Libraries

In [3]:
# General
import math
import random
random.seed(42)
import pickle
import numpy as np
import pandas as pd
from datetime import datetime

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="whitegrid")
colour_palette = sns.color_palette("hls", 8)

# Pre-Processing
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler      # Remove Mean and scale to Unit Variance
from sklearn.preprocessing import PowerTransformer    # Log Transformation
from sklearn.preprocessing import OneHotEncoder

# Feature Engineering
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import RFE

# Evaluation Metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error

# Validation
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

# Modelling
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
# from lightgbm import LGBMRegressor
# import lightgbm as ltb
import catboost as cb

# Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Warnings
import warnings
warnings.filterwarnings('ignore')

### Import Dataset

In [7]:
# Importing Data
data_hdb_cleaned = pd.read_csv('../dataset/hdb_last15_cleaned.csv')
data_hdb_cleaned.index += 1
data_hdb_cleaned

Unnamed: 0,town,flat_type,flat_model,floor_area_sqm,resale_price,month,lease_commence_date,storey_range,block,remaining_lease,...,recreational_within_1km_average_rating,recreational_within_2km_count,recreational_within_2km_average_rating,education_within_1km_count,education_within_1km_average_rating,education_within_2km_count,education_within_2km_average_rating,postal_code,region,price_per_sqm
1,SEMBAWANG,5 ROOM,Premium Apartment,111.0,362000.0,2009-07-01,2001,04 TO 06,357A,91.000000,...,3.733333,6.0,3.916667,4.0,4.275,8.0,4.250,751357,North,3261.261261
2,SEMBAWANG,5 ROOM,Premium Apartment,110.0,370000.0,2009-08-01,2001,07 TO 09,357A,91.000000,...,3.733333,6.0,3.916667,4.0,4.275,8.0,4.250,751357,North,3363.636364
3,SEMBAWANG,5 ROOM,Premium Apartment,110.0,403000.0,2010-01-01,2001,16 TO 18,357A,90.000000,...,3.733333,6.0,3.916667,4.0,4.275,8.0,4.250,751357,North,3663.636364
4,SEMBAWANG,4 ROOM,Premium Apartment,95.0,350000.0,2010-07-01,2001,01 TO 03,357A,90.000000,...,3.733333,6.0,3.916667,4.0,4.275,8.0,4.250,751357,North,3684.210526
5,SEMBAWANG,4 ROOM,Premium Apartment,95.0,399000.0,2010-07-01,2001,10 TO 12,357A,90.000000,...,3.733333,6.0,3.916667,4.0,4.275,8.0,4.250,751357,North,4200.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
316854,BUKIT MERAH,5 ROOM,Improved,114.0,921000.0,2022-10-01,1974,13 TO 15,87,50.750000,...,4.057143,78.0,4.052564,3.0,2.700,25.0,3.348,160087,South,8078.947368
316855,BUKIT MERAH,5 ROOM,Improved,117.0,930000.0,2022-10-01,1974,04 TO 06,87,50.750000,...,4.057143,78.0,4.052564,3.0,2.700,25.0,3.348,160087,South,7948.717949
316856,BUKIT MERAH,5 ROOM,Improved,117.0,978000.0,2022-12-01,1974,13 TO 15,87,50.666667,...,4.057143,78.0,4.052564,3.0,2.700,25.0,3.348,160087,South,8358.974359
316857,BUKIT MERAH,5 ROOM,Improved,114.0,950000.0,2022-12-01,1974,22 TO 24,87,50.583333,...,4.057143,78.0,4.052564,3.0,2.700,25.0,3.348,160087,South,8333.333333


In [8]:
# Understanding Data I
data_hdb_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 316858 entries, 1 to 316858
Data columns (total 35 columns):
 #   Column                                  Non-Null Count   Dtype  
---  ------                                  --------------   -----  
 0   town                                    316858 non-null  object 
 1   flat_type                               316858 non-null  object 
 2   flat_model                              316858 non-null  object 
 3   floor_area_sqm                          316858 non-null  float64
 4   resale_price                            316858 non-null  float64
 5   month                                   316858 non-null  object 
 6   lease_commence_date                     316858 non-null  int64  
 7   storey_range                            316858 non-null  object 
 8   block                                   316858 non-null  object 
 9   remaining_lease                         316858 non-null  float64
 10  address                                 3168

In [9]:
# Understanding Data II
data_hdb_cleaned.describe()

Unnamed: 0,floor_area_sqm,resale_price,lease_commence_date,remaining_lease,lat,long,nearest_distance_to_mrt,avg_long,avg_lat,healthcare_within_1km_count,...,healthcare_within_2km_average_rating,recreational_within_1km_count,recreational_within_1km_average_rating,recreational_within_2km_count,recreational_within_2km_average_rating,education_within_1km_count,education_within_1km_average_rating,education_within_2km_count,education_within_2km_average_rating,price_per_sqm
count,316858.0,316858.0,316858.0,316858.0,316858.0,316858.0,316858.0,316858.0,316858.0,316858.0,...,316858.0,316858.0,316858.0,316858.0,316858.0,316858.0,316858.0,316858.0,316858.0,316858.0
mean,97.479359,441256.6,1991.72968,75.267866,1.365842,103.838676,0.624044,103.83868,1.365844,5.988376,...,2.56045,3.837429,3.928042,11.887855,4.055404,4.855336,4.03746,14.403837,4.09573,4581.61763
std,24.742062,146768.9,11.84791,11.684112,0.042881,0.072799,0.378055,0.072687,0.042815,5.242842,...,0.600599,3.054143,0.688451,8.638953,0.134527,2.244488,0.574591,4.698377,0.183904,1181.663412
min,31.0,127000.0,1966.0,43.0,1.27038,103.644248,0.023,103.688247,1.272255,0.0,...,0.8,0.0,0.0,2.0,3.577778,0.0,0.0,2.0,2.786667,1445.92
25%,74.0,336000.0,1984.0,66.0,1.336264,103.774173,0.337,103.774336,1.33632,3.0,...,2.292857,2.0,3.933333,8.0,3.983333,3.0,3.975,12.0,3.994737,3798.076923
50%,97.0,415000.0,1989.0,75.0,1.362565,103.844027,0.552,103.844611,1.36235,5.0,...,2.55,3.0,4.05,10.0,4.075,5.0,4.133333,14.0,4.13,4368.932039
75%,113.0,515000.0,2000.0,85.0,1.39271,103.898351,0.828,103.898503,1.391577,8.0,...,2.8,5.0,4.15,13.0,4.136364,6.0,4.266667,17.0,4.205263,5074.626866
max,280.0,1418000.0,2018.0,94.916667,1.457071,103.964915,2.154,103.9624,1.45554,61.0,...,4.9,41.0,4.7,105.0,4.38,19.0,5.0,37.0,4.5,14731.182796


### Split Training & Testing Dataset

In [10]:
# Dependent Variable
target_list = ['resale_price']

# Independent Variables
feature_list = ['flat_type', 'floor_area_sqm', 'month', 'lease_commence_date', 'storey_range', 'remaining_lease',
                'nearest_distance_to_mrt', 'healthcare_within_1km_count', 'healthcare_within_1km_average_rating', 
                'healthcare_within_2km_count', 'healthcare_within_2km_average_rating', 'recreational_within_1km_count', 
                'recreational_within_1km_average_rating', 'recreational_within_2km_count', 'recreational_within_2km_average_rating', 
                'education_within_1km_count', 'education_within_1km_average_rating', 'education_within_2km_count',
                'education_within_2km_average_rating', 'region', 'price_per_sqm']

print(feature_list)

# Selecting Target Variable and Features from Dataset
X = data_hdb_cleaned[feature_list]
y = data_hdb_cleaned[target_list]

['town', 'flat_type', 'floor_area_sqm', 'month', 'lease_commence_date', 'storey_range', 'remaining_lease', 'nearest_distance_to_mrt', 'healthcare_within_1km_count', 'healthcare_within_1km_average_rating', 'healthcare_within_2km_count', 'healthcare_within_2km_average_rating', 'recreational_within_1km_count', 'recreational_within_1km_average_rating', 'recreational_within_2km_count', 'recreational_within_2km_average_rating', 'education_within_1km_count', 'education_within_1km_average_rating', 'education_within_2km_count', 'education_within_2km_average_rating', 'region', 'price_per_sqm']


In [11]:
# Split Training & Testing Dataset (80:20 Split)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(253486, 22) (63372, 22) (253486, 1) (63372, 1)


In [12]:
# Reset Indexes
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

### Data Preprocessing

#### Data Cleaning

In [13]:
# Check Missing Data
X_train.isnull().sum()[X_train.isnull().sum() > 0]
X_test.isnull().sum()[X_test.isnull().sum() > 0]

Series([], dtype: int64)

In [14]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253486 entries, 0 to 253485
Data columns (total 22 columns):
 #   Column                                  Non-Null Count   Dtype  
---  ------                                  --------------   -----  
 0   town                                    253486 non-null  object 
 1   flat_type                               253486 non-null  object 
 2   floor_area_sqm                          253486 non-null  float64
 3   month                                   253486 non-null  object 
 4   lease_commence_date                     253486 non-null  int64  
 5   storey_range                            253486 non-null  object 
 6   remaining_lease                         253486 non-null  float64
 7   nearest_distance_to_mrt                 253486 non-null  float64
 8   healthcare_within_1km_count             253486 non-null  float64
 9   healthcare_within_1km_average_rating    253486 non-null  float64
 10  healthcare_within_2km_count             2534

In [None]:
# Change Data Types

## For Month


#### Data Transformation

In [None]:
# DT Methods i.e. StandardScaler, PowerTransformer



### Feature Engineering

#### Encoding

In [None]:
# Encode using OneHot

## For flat_type


## For storey_range


## For region



### Feature Extraction 

In [None]:
# Principal Component Analysis



### Feature Selection

In [None]:
# Heatmap 



In [None]:
# FS Methods i.e. Variance Threshold, KBest, RFE



## Machine Learning Modelling