## Business Problem

- Use a `RandomForestRegressor`, tuned with a hyperparameter grid search, to predict the resale price of HDB flats.


In [None]:
#import streamlit as st
import datetime as dt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor
import pickle

In [None]:
import requests
import json

# URL(s) of the data.gov.sg `datastore_search` endpoint to download from.
# NOTE(review): `limit` is set very high, presumably so a single request
# returns the whole resource -- confirm against the API documentation.
query_string=['https://data.gov.sg/api/action/datastore_search?resource_id=f1765b54-a209-4718-8d38-a39237f502b3&limit=309885']
def hdb_api(query_string, timeout=120):
    """Download records from one or more datastore URLs into a single DataFrame.

    Parameters
    ----------
    query_string : str or iterable of str
        URL(s) to fetch. Each response must be JSON containing the list of
        records under ``result -> records``. A single URL may be passed as a
        plain string.
    timeout : float, default 120
        Timeout in seconds handed to ``requests.get`` so that a stalled
        connection cannot hang the notebook forever.

    Returns
    -------
    pandas.DataFrame
        The records of every URL, stacked in the order the URLs were given,
        with a fresh 0..n-1 index. An empty DataFrame when no URL is supplied.

    Raises
    ------
    requests.HTTPError
        If any response carries a 4xx/5xx status code.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(query_string, str):
        query_string = [query_string]

    frames = []
    for url in query_string:
        resp = requests.get(url, timeout=timeout)
        # Fail loudly on HTTP errors rather than with an obscure KeyError below.
        resp.raise_for_status()
        hdb_records = resp.json()['result']['records']
        frames.append(pd.DataFrame(hdb_records))

    # Previously the function returned inside the loop, so every URL after the
    # first was silently ignored; now all responses are combined.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [None]:
# Download the resale records into a DataFrame ...
hdb_data = hdb_api(query_string)
# ... and list the column names that came back
print(hdb_data.columns)

Index(['town', 'flat_type', 'flat_model', 'floor_area_sqm', 'street_name',
       'resale_price', 'month', 'remaining_lease', 'lease_commence_date',
       'storey_range', '_id', 'block'],
      dtype='object')


In [None]:
# Preview the first five rows to sanity-check the downloaded records
hdb_data.head(5)

Unnamed: 0,town,flat_type,flat_model,floor_area_sqm,street_name,resale_price,month,remaining_lease,lease_commence_date,storey_range,_id,block
0,ANG MO KIO,2 ROOM,Improved,44,ANG MO KIO AVE 10,232000,2017-01,61 years 04 months,1979,10 TO 12,1,406
1,ANG MO KIO,3 ROOM,New Generation,67,ANG MO KIO AVE 4,250000,2017-01,60 years 07 months,1978,01 TO 03,2,108
2,ANG MO KIO,3 ROOM,New Generation,67,ANG MO KIO AVE 5,262000,2017-01,62 years 05 months,1980,01 TO 03,3,602
3,ANG MO KIO,3 ROOM,New Generation,68,ANG MO KIO AVE 10,265000,2017-01,62 years 01 month,1980,04 TO 06,4,465
4,ANG MO KIO,3 ROOM,New Generation,67,ANG MO KIO AVE 5,265000,2017-01,62 years 05 months,1980,01 TO 03,5,601


In [None]:
# Helper that casts a single DataFrame column to another dtype, in place
def transform_data_to_int_float(dataframe,column,type):
    """Cast ``dataframe[column]`` to ``type`` and write it back to the same column.

    The frame passed in is modified in place and nothing is returned.
    ``type`` is forwarded unchanged to ``Series.astype``.
    """
    converted = dataframe[column].astype(type)
    dataframe[column] = converted

In [None]:
# Reduce flat_type to digit strings so that it can be cast to an integer:
# the word ROOM is removed ('3 ROOM' -> '3 '), while EXECUTIVE and
# MULTI-GENERATION are both replaced by '6'.
hdb_data['flat_type'] = (
    hdb_data['flat_type']
    .replace('ROOM', '', regex=True)
    .replace('EXECUTIVE', '6', regex=True)
    .replace('MULTI-GENERATION', '6', regex=True)
)

In [None]:
# Inspect the distinct block values; the recorded output includes entries with
# letter suffixes (e.g. '449B'), so this column cannot simply be cast to int.
hdb_data['block'].unique()

array(['406', '108', '602', ..., '449B', '694B', '694D'], dtype=object)

In [None]:
# Cast the numeric-looking columns, one (column, target dtype) pair at a time
for column_name, target_dtype in (
    ('lease_commence_date', 'int'),
    ('flat_type', 'int'),
    ('floor_area_sqm', 'float'),
    ('resale_price', 'float'),
):
    transform_data_to_int_float(hdb_data, column_name, target_dtype)


In [None]:
# Keep only the columns that are still stored with the generic object dtype
hdb_data_datatypes_as_objects = hdb_data.select_dtypes(include='object')

In [None]:
# Identify which columns are still object-typed (i.e. not yet numeric)
hdb_data_datatypes_as_objects.columns

Index(['town', 'flat_model', 'street_name', 'month', 'remaining_lease',
       'storey_range', 'block'],
      dtype='object')

In [None]:
# Summarise how many columns the dataframe holds of each dtype
hdb_data.dtypes.value_counts()

object     7
int64      3
float64    2
dtype: int64

In [None]:
# Convert categorical columns to integer codes with LabelEncoder.
# Each column gets its own encoder: refitting one shared encoder on the second
# column would overwrite the town mapping, so town_cat could no longer be
# decoded (inverse_transform) or applied consistently to new data.
town_encoder = LabelEncoder()
floor_encoder = LabelEncoder()

hdb_data['town_cat'] = town_encoder.fit_transform(hdb_data['town'])
# LabelEncoder numbers classes in sorted order. The sample rows show zero-padded
# ranges ('01 TO 03', '04 TO 06'), so the codes presumably follow storey height
# -- verify that every storey_range value uses the same format.
hdb_data['floor'] = floor_encoder.fit_transform(hdb_data['storey_range'])

# Backward-compatible alias: as before, `labelencoder` ends up fitted on storey_range.
labelencoder = floor_encoder


In [None]:
# Assemble the modelling inputs: feature matrix X and target vector y
feature_columns = ['town_cat','floor','flat_type','floor_area_sqm','lease_commence_date']
training_data = hdb_data[feature_columns]
X = training_data
y = hdb_data['resale_price']

# Show the dtypes of the features, then of the target
print(training_data.dtypes, y.dtypes, sep='\n')



town_cat                 int64
floor                    int64
flat_type                int64
floor_area_sqm         float64
lease_commence_date      int64
dtype: object
float64


In [None]:
# Train/test split to allow for model validation: 30% of the rows are held out
# as the test set, and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=44)

In [None]:
# Define the hyperparameter grid and run a 5-fold cross-validated grid search.
# random_state pins the forest's own randomness so reruns give the same result.
estimator = RandomForestRegressor(random_state=44)
param_grid = {
            "n_estimators"      : [10,20,30], # number of trees in the forest
            # Number of features considered when looking for the best split.
            # 1.0 means "all features"; it replaces the "auto" alias, which meant
            # the same for regressors but was removed in scikit-learn 1.3.
            "max_features"      : [1.0, "sqrt", "log2"],
            "min_samples_split" : [2,4,8], # min samples required to split an internal node
            "bootstrap": [True, False], # whether each tree is fit on a bootstrap sample
            }

# The score is R^2 (the metric GridSearchCV falls back to for a regressor when
# `scoring` is omitted), not MAE -- it is now spelled out and labelled as such.
grid = GridSearchCV(estimator, param_grid, scoring="r2", n_jobs=-1, cv=5)

results=grid.fit(X_train, y_train)
print('Best CV R^2: %.3f' % results.best_score_)
print('Config: %s' % results.best_params_)


MAE: 0.900
Config: {'bootstrap': True, 'max_features': 'auto', 'min_samples_split': 8, 'n_estimators': 30}
