## Basic Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set() # set the default Seaborn style for graphics

## Plotly Imports

In [None]:
import matplotlib.ticker as ticker
import json
import plotly.express as px

## Import the Dataset

Note: the original `resale-flat-prices-based-on-approval-date-2000-feb-2012` file has been split into two files because it exceeds GitHub's file-size limit:
- resale-flat-prices-based-on-approval-date-2000-feb-2005.csv
- resale-flat-prices-based-on-approval-date-2006-2012.csv

In [None]:
# Load every resale-price extract: one DataFrame per source CSV, in chronological order.
df_1990, df_2000, df_2006, df_2012, df_2015, df_2017 = [
    pd.read_csv(csv_path)
    for csv_path in (
        'Resale Flat Prices/resale-flat-prices-based-on-approval-date-1990-1999.csv',
        'Resale Flat Prices/resale-flat-prices-based-on-approval-date-2000-feb-2005.csv',
        'Resale Flat Prices/resale-flat-prices-based-on-approval-date-2006-2012.csv',
        'Resale Flat Prices/resale-flat-prices-based-on-registration-date-from-mar-2012-to-dec-2014.csv',
        'Resale Flat Prices/resale-flat-prices-based-on-registration-date-from-jan-2015-to-dec-2016.csv',
        'Resale Flat Prices/resale-flat-prices-based-on-registration-date-from-jan-2017-onwards.csv',
    )
]

In [None]:
df_main = pd.concat([df_1990, df_2000, df_2006, df_2012, df_2015, df_2017])
df_main = df_main.reset_index(drop=True)

## Check out the Data

In [None]:
df_main.head()

In [None]:
df_main.info()

In [None]:
df_main.nunique()

In [None]:
df_main.month.unique()

In [None]:
df_main.town.unique()

In [None]:
df_main.flat_type.unique()

In [None]:
df_main.storey_range.unique()

In [None]:
df_main.flat_model.unique()

In [None]:
df_main.remaining_lease.unique()

# Data Cleaning

In [None]:
df_main

In [None]:
# Region
#
# Overwrite each town name in the `town` column with the planning region it
# belongs to. Values that are not listed below are left unchanged.
df_main['town'] = df_main['town'].replace({
    # Central
    'BISHAN': 'CENTRAL REGION',
    'BUKIT MERAH': 'CENTRAL REGION',
    'BUKIT TIMAH': 'CENTRAL REGION',
    'CENTRAL AREA': 'CENTRAL REGION',
    'GEYLANG': 'CENTRAL REGION',
    'KALLANG/WHAMPOA': 'CENTRAL REGION',
    'MARINE PARADE': 'CENTRAL REGION',
    'QUEENSTOWN': 'CENTRAL REGION',
    'TOA PAYOH': 'CENTRAL REGION',
    # East
    'BEDOK': 'EAST REGION',
    'PASIR RIS': 'EAST REGION',
    'TAMPINES': 'EAST REGION',
    # North
    'LIM CHU KANG': 'NORTH REGION',
    'SEMBAWANG': 'NORTH REGION',
    'WOODLANDS': 'NORTH REGION',
    'YISHUN': 'NORTH REGION',
    # North-East
    'ANG MO KIO': 'NORTH-EAST REGION',
    'HOUGANG': 'NORTH-EAST REGION',
    'PUNGGOL': 'NORTH-EAST REGION',
    'SENGKANG': 'NORTH-EAST REGION',
    'SERANGOON': 'NORTH-EAST REGION',
    # West
    'BUKIT BATOK': 'WEST REGION',
    'BUKIT PANJANG': 'WEST REGION',
    'CHOA CHU KANG': 'WEST REGION',
    'CLEMENTI': 'WEST REGION',
    'JURONG EAST': 'WEST REGION',
    'JURONG WEST': 'WEST REGION',
})

In [None]:
# Storey Range
#
# Collapse the fine-grained storey bands (the data contains both 3-storey and
# 5-storey bands) into three coarse buckets. Values not listed are left as-is.
df_main['storey_range'] = df_main['storey_range'].replace({
    # Floors 1-15
    '01 TO 03': '01 TO 15',
    '04 TO 06': '01 TO 15',
    '07 TO 09': '01 TO 15',
    '10 TO 12': '01 TO 15',
    '13 TO 15': '01 TO 15',
    '01 TO 05': '01 TO 15',
    '06 TO 10': '01 TO 15',
    '11 TO 15': '01 TO 15',
    # Floors 16-30
    '16 TO 18': '16 TO 30',
    '19 TO 21': '16 TO 30',
    '22 TO 24': '16 TO 30',
    '25 TO 27': '16 TO 30',
    '28 TO 30': '16 TO 30',
    '16 TO 20': '16 TO 30',
    '21 TO 25': '16 TO 30',
    '26 TO 30': '16 TO 30',
    # Floors 31 and above
    '31 TO 33': '31 TO 45',  # was wrongly bucketed as '16 TO 30'
    '34 TO 36': '31 TO 45',
    '37 TO 39': '31 TO 45',
    '40 TO 42': '31 TO 45',
    '43 TO 45': '31 TO 45',
    '31 TO 35': '31 TO 45',
    '36 TO 40': '31 TO 45',
    # Bands above the 45th floor are folded into the top bucket as well.
    '46 TO 48': '31 TO 45',
    '49 TO 51': '31 TO 45',
})

In [None]:
# Splitting month into Month and Year

df_main[['Year', 'Month']] = df_main['month'].str.split('-', expand=True) 

In [None]:
# Lease Commence Date

df_main['Year'] = df_main['Year'].astype(str).astype(int) 

df_main['lease_left']=(99 - (df_main['Year'] - df_main['lease_commence_date'])) 

Flat differences

Multi generation flat:
- Current - 3 rooms (2 master room, 1 normal room) [excluding living room]
- Previous (1980s) - 4-room or 5-room flat with an adjoining studio apartment that had a separate entrance

In [None]:
# Replacing MULTI-GENERATION TO MULTI GENERATION

df_main['flat_type'] = df_main['flat_type'].replace(['MULTI-GENERATION'],'MULTI GENERATION')

Type S1 and S2: (i.e. The Pinnacle@Duxton)
- Special types, different unit variations – with dissimilar combinations of features such as extended bays, balconies, bay windows and planter areas.

All the different models (Improved, Simplified etc):
- These are the different models within each flat type; the floor plan usually differs between models, but the flat type itself is the same.

An adjoined flat is two individual flats purchased side by side, which the owners have linked together into one unit.

A loft apartment is a flat with a high ceiling, one staircase and one room on the upper level, normally located at the top of the building (similar to a penthouse, but the HDB version).

Comparison between following models:
- Apartment (1-level)
- Maisonette (2-level, normally multiple units in 1 HDB building)
- Terrace (2-levels, but a stand-alone building)

In [None]:
# Flat Model

# Normalise the mixed-case / variant spellings of `flat_model` into one
# upper-case label per model family. Values not listed are left unchanged.
df_main['flat_model'] = df_main['flat_model'].replace({
    # Maisonette variants
    'MODEL A-MAISONETTE': 'MAISONETTE',
    'IMPROVED-MAISONETTE': 'MAISONETTE',
    'Model A-Maisonette': 'MAISONETTE',
    'Improved-Maisonette': 'MAISONETTE',
    'Premium Maisonette': 'MAISONETTE',
    'Maisonette': 'MAISONETTE',
    # Apartment variants
    'PREMIUM APARTMENT': 'APARTMENT',
    'Apartment': 'APARTMENT',
    'Premium Apartment': 'APARTMENT',
    # Case normalisation only
    'Improved': 'IMPROVED',
    'New Generation': 'NEW GENERATION',
    'Model A': 'MODEL A',
    'Standard': 'STANDARD',
    'Simplified': 'SIMPLIFIED',
    'Terrace': 'TERRACE',
    'Multi Generation': 'MULTI GENERATION',
    'Adjoined flat': 'ADJOINED FLAT',
    # Merged categories
    '2-room': '2-ROOM',
    'Model A2': '2-ROOM',
    'Type S1': 'TYPE S',
    'Type S2': 'TYPE S',
    'Premium Apartment Loft': 'LOFT APARTMENT',
})

In [None]:
df_main

In [None]:
df_cleaned = df_main[['flat_type','block','street_name','storey_range','floor_area_sqm','flat_model',
                      'resale_price','Year','lease_left','town']]
df_cleaned = df_cleaned.rename(columns={"Year": "year", 'town' : 'region'})
df_cleaned.head()

In [None]:
df_cleaned.info()

In [None]:
df_cleaned.nunique()

Separating the data into decades.
- 1990 to 1999
- 2000 to 2009
- 2010 to 2019
- 2020 onwards

In [None]:
df_90 = df_cleaned[df_cleaned['year'] < 2000]
df_90

In [None]:
df_00 = df_cleaned[(df_cleaned['year'] >= 2000) & (df_cleaned['year'] < 2010)]
df_00

In [None]:
df_10 = df_cleaned[(df_cleaned['year'] >= 2010) & (df_cleaned['year'] < 2020)]
df_10

In [None]:
df_20 = df_cleaned[df_cleaned['year'] >= 2020]
df_20

## Map Visualisation

In [None]:
# Load the region-boundary GeoJSON. The context manager closes the file handle
# as soon as parsing finishes (the original left it open for the kernel's lifetime).
with open('Geojson/master-plan-2019-region-boundary-no-sea-geojson.geojson') as f:
    data = json.load(f)

In [None]:
data

In [None]:
# Attach a `region` property to each GeoJSON feature so it can be matched
# against the `region` column of the resale data.
# NOTE(review): this relies on the features appearing in exactly this order in
# the file - confirm against the GeoJSON before changing the source file.
for feature_idx, region_label in enumerate(['WEST REGION',
                                            'NORTH REGION',
                                            'NORTH-EAST REGION',
                                            'EAST REGION',
                                            'CENTRAL REGION']):
    data['features'][feature_idx]['properties']['region'] = region_label

In [None]:
print(df_cleaned["region"][0])
print(data['features'][0]['properties'])

# Exploratory Data Analysis (EDA)

## Overall across the years from 1990 to 2022

In [None]:
numeric_main = pd.DataFrame(df_cleaned[["floor_area_sqm","resale_price","lease_left"]])
numeric_main

In [None]:
def numeric_dist(df):
    """Draw a box plot, histogram and violin plot for every column of `df`.

    One row of three panels is created per column, so the grid grows with the
    frame instead of being fixed at three rows (which raised IndexError for a
    fourth column).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are all plotted; columns are expected to be numeric.
    """
    columns = list(df.columns)
    if not columns:
        # Nothing to plot; plt.subplots(0, 3) would raise.
        return

    # squeeze=False keeps `axes` two-dimensional even for a single column.
    f, axes = plt.subplots(len(columns), 3, figsize=(18, 6 * len(columns)), squeeze=False)
    for row, var in enumerate(columns):
        sns.boxplot(data = df[var], orient = "h", ax = axes[row, 0])
        sns.histplot(data = df[var], ax = axes[row, 1])
        sns.violinplot(data = df[var], orient = "h", ax = axes[row, 2])

In [None]:
# Draw the Distributions of All Variables
numeric_dist(numeric_main)

In [None]:
def numeric_corr(df):
    """Print the pairwise correlation matrix of `df` and draw it as a heatmap.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame passed to `DataFrame.corr()`.
    """
    # Compute the matrix once and reuse it for both the printout and the plot.
    corr = df.corr()
    print(corr)

    plt.figure(figsize=(8, 8))
    # vmin/vmax pin the colour scale to the full correlation range [-1, 1].
    sns.heatmap(corr, vmin = -1, vmax = 1, linewidths = 1,
               annot = True, fmt = ".2f", annot_kws = {"size": 18}, cmap = "RdBu")

In [None]:
# Heatmap of the Correlation Matrix
numeric_corr(numeric_main)

In [None]:
sns.pairplot(data = numeric_main)

In [None]:
def flat_type_flat_model(df):
    """Heatmap of row counts for every (flat_type, flat_model) combination in `df`."""
    plt.figure(figsize=(20, 15))
    # Rows: flat_type, columns: flat_model, cells: number of rows in the group.
    pair_counts = df.groupby(['flat_type', 'flat_model']).size().unstack()
    sns.heatmap(
        pair_counts,
        linewidths=1,
        annot=True,
        annot_kws={"size": 18},
        cmap="Blues",
        fmt=".0f",
    )


In [None]:
# Distribution Over Flat Type vs Flat Model
flat_type_flat_model(df_cleaned)

In [None]:
def flat_type_region(df):
    """Heatmap of row counts for every (flat_type, region) combination in `df`."""
    plt.figure(figsize=(15, 10))
    # Rows: flat_type, columns: region, cells: number of rows in the group.
    pair_counts = df.groupby(['flat_type', 'region']).size().unstack()
    sns.heatmap(
        pair_counts,
        linewidths=1,
        annot=True,
        annot_kws={"size": 18},
        cmap="Blues",
        fmt=".0f",
    )

In [None]:
# Distribution Over Flat Type vs Region
flat_type_region(df_cleaned)

In [None]:
def storey_range_region(df):
    """Heatmap of row counts for every (storey_range, region) combination in `df`."""
    plt.figure(figsize=(10, 5))
    # Rows: storey_range, columns: region, cells: number of rows in the group.
    pair_counts = df.groupby(['storey_range', 'region']).size().unstack()
    sns.heatmap(
        pair_counts,
        linewidths=1,
        annot=True,
        annot_kws={"size": 18},
        cmap="Blues",
        fmt=".0f",
    )

In [None]:
# Distribution Over Storey Range vs Region
storey_range_region(df_cleaned)

In [None]:
def resale_price_region(df):
    """Horizontal box plot of `resale_price` for each `region` in `df`."""
    fig, ax = plt.subplots(figsize=(15, 10))
    sns.boxplot(data=df, x='resale_price', y='region', ax=ax)
    # Engineering notation keeps the price axis compact (e.g. "500 k").
    ax.xaxis.set_major_formatter(ticker.EngFormatter())
    plt.show()

In [None]:
# Distribution Over Resale Price vs Region
resale_price_region(df_cleaned)

In [None]:
def resale_price_flat_model(df):
    """Horizontal box plot of `resale_price` for each `flat_model` in `df`."""
    fig, ax = plt.subplots(figsize=(10, 10))
    sns.boxplot(data=df, x='resale_price', y='flat_model', ax=ax)
    # Engineering notation keeps the price axis compact (e.g. "500 k").
    ax.xaxis.set_major_formatter(ticker.EngFormatter())
    plt.show()

In [None]:
# Distribution Over Resale Price vs Flat Model
resale_price_flat_model(df_cleaned)

In [None]:
# Per-region averages over the whole dataset. numeric_only=True restricts the
# mean to numeric columns; without it pandas >= 2.0 raises TypeError on the
# string columns of df_cleaned (e.g. flat_type).
df_overall = df_cleaned.groupby(['region']).mean(numeric_only=True).reset_index()

In [None]:
def map_visual(df):
    """Show a choropleth map coloured by `resale_price` per region.

    `df` must provide a "region" column, matched against `properties.region`
    of each GeoJSON feature, and a "resale_price" column used for the colour.
    The GeoJSON is read from the module-level `data` at call time, so `data`
    must still hold the boundary GeoJSON when this function is called.
    """
    region_map = px.choropleth_mapbox(
        df,
        geojson=data,
        featureidkey="properties.region",
        locations="region",
        color="resale_price",
        color_continuous_scale='blues',
        mapbox_style="carto-positron",
        center={"lat": 1.3302, "lon": 103.8519},
        zoom=10,
    )
    # Remove the default margins so the map fills the output area.
    region_map.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
    region_map.show()

In [None]:
# Map Visualisation for Overall Average Resale Price per Region
map_visual(df_overall)

## 1990 to 1999

In [None]:
numeric_90 = pd.DataFrame(df_90[["floor_area_sqm","resale_price","lease_left"]])
numeric_90

In [None]:
# Draw the Distributions of All Variables
numeric_dist(numeric_90)

In [None]:
# Heatmap of the Correlation Matrix
numeric_corr(numeric_90)

In [None]:
sns.pairplot(data = numeric_90)

In [None]:
# Distribution Over Flat Type vs Flat Model
flat_type_flat_model(df_90)

In [None]:
# Distribution Over Flat Type vs Region
flat_type_region(df_90)

In [None]:
# Distribution Over Storey Range vs Region
storey_range_region(df_90)

In [None]:
# Distribution Over Resale Price vs Region
resale_price_region(df_90)

In [None]:
# Distribution Over Resale Price vs Flat Model
resale_price_flat_model(df_90)

In [None]:
# Per-region averages for 1990-1999. numeric_only=True avoids the TypeError
# pandas >= 2.0 raises when asked to average the string columns.
df_90overall = df_90.groupby(['region']).mean(numeric_only=True).reset_index()

In [None]:
# Map Visualisation for Overall Average Resale Price per Region
map_visual(df_90overall)

## 2000 to 2009

In [None]:
numeric_00= pd.DataFrame(df_00[["floor_area_sqm","resale_price","lease_left"]])
numeric_00.reset_index(drop=True, inplace=True)
numeric_00

In [None]:
# Draw the Distributions of All Variables
numeric_dist(numeric_00)

In [None]:
# Heatmap of the Correlation Matrix
numeric_corr(numeric_00)

In [None]:
sns.pairplot(data = numeric_00)

In [None]:
# Distribution Over Flat Type vs Flat Model
flat_type_flat_model(df_00)

In [None]:
# Distribution Over Flat Type vs Region
flat_type_region(df_00)

In [None]:
# Distribution Over Storey Range vs Region
storey_range_region(df_00)

In [None]:
# Distribution Over Resale Price vs Region
resale_price_region(df_00)

In [None]:
# Distribution Over Resale Price vs Flat Model
resale_price_flat_model(df_00)

In [None]:
# Per-region averages for 2000-2009. numeric_only=True avoids the TypeError
# pandas >= 2.0 raises when asked to average the string columns.
df_00overall = df_00.groupby(['region']).mean(numeric_only=True).reset_index()

In [None]:
# Map Visualisation for Overall Average Resale Price per Region
map_visual(df_00overall)

## 2010 to 2019

In [None]:
numeric_10= pd.DataFrame(df_10[["floor_area_sqm","resale_price","lease_left"]])
numeric_10.reset_index(drop=True, inplace=True)
numeric_10

In [None]:
# Draw the Distributions of All Variables
numeric_dist(numeric_10)

In [None]:
# Heatmap of the Correlation Matrix
numeric_corr(numeric_10)

In [None]:
sns.pairplot(data = numeric_10)

In [None]:
# Distribution Over Flat Type vs Flat Model
flat_type_flat_model(df_10)

In [None]:
# Distribution Over Flat Type vs Region
flat_type_region(df_10)

In [None]:
# Distribution Over Storey Range vs Region
storey_range_region(df_10)

In [None]:
# Distribution Over Resale Price vs Region
resale_price_region(df_10)

In [None]:
# Distribution Over Resale Price vs Flat Model
resale_price_flat_model(df_10)

In [None]:
# Per-region averages for 2010-2019. numeric_only=True avoids the TypeError
# pandas >= 2.0 raises when asked to average the string columns.
df_10overall = df_10.groupby(['region']).mean(numeric_only=True).reset_index()

In [None]:
# Map Visualisation for Overall Average Resale Price per Region
map_visual(df_10overall)

## 2020 onwards

In [None]:
numeric_20= pd.DataFrame(df_20[["floor_area_sqm","resale_price","lease_left"]])
numeric_20.reset_index(drop=True, inplace=True)
numeric_20

In [None]:
# Draw the Distributions of All Variables
numeric_dist(numeric_20)

In [None]:
# Heatmap of the Correlation Matrix
numeric_corr(numeric_20)

In [None]:
sns.pairplot(data = numeric_20)

In [None]:
# Distribution Over Flat Type vs Flat Model (2020 onwards subset, not the full dataset)
flat_type_flat_model(df_20)

In [None]:
# Distribution Over Flat Type vs Region (2020 onwards subset, not the full dataset)
flat_type_region(df_20)

In [None]:
# Distribution Over Storey Range vs Region (2020 onwards subset, not the full dataset)
storey_range_region(df_20)

In [None]:
# Distribution Over Resale Price vs Region (2020 onwards subset, not the full dataset)
resale_price_region(df_20)

In [None]:
# Distribution Over Resale Price vs Flat Model (2020 onwards subset, not the full dataset)
resale_price_flat_model(df_20)

In [None]:
# Per-region averages for 2020 onwards. numeric_only=True avoids the TypeError
# pandas >= 2.0 raises when asked to average the string columns.
df_20overall = df_20.groupby(['region']).mean(numeric_only=True).reset_index()

In [None]:
# Map Visualisation for Overall Average Resale Price per Region
map_visual(df_20overall)

FOR EACH YEAR:

region, 3 areas in singapore (north-east, central, west)

resale price of flat models (3 room, 4 room, 5 room, EA, EM)

average resale price/number of units to see the trend

FOR EACH GEN:

show graph of average and see resale price up or down

In [None]:
df_flat_overall_price = df_cleaned.groupby(['year'])['resale_price'].mean().reset_index()
df_flat_type_price = df_cleaned.groupby(['year','flat_type'])['resale_price'].mean().reset_index()
df_flat_model_price = df_cleaned.groupby(['year','flat_model'])['resale_price'].mean().reset_index()

In [None]:
# Yearly mean price for the three most common room counts.
df_flat_type_price_3room = df_flat_type_price.loc[df_flat_type_price['flat_type'] == '3 ROOM']
df_flat_type_price_4room = df_flat_type_price.loc[df_flat_type_price['flat_type'] == '4 ROOM']
df_flat_type_price_5room = df_flat_type_price.loc[df_flat_type_price['flat_type'] == '5 ROOM']

# The executive variants are selected by flat_model and given a `flat_type`
# label so they can share the hue of the combined plot. `.assign` returns a new
# frame, which avoids the SettingWithCopyWarning raised by writing a column
# into a `.loc` slice.
df_flat_model_price_ea = (
    df_flat_model_price
    .loc[df_flat_model_price['flat_model'] == 'APARTMENT']
    .assign(flat_type="EXECUTIVE APARTMENT")
)
df_flat_model_price_ma = (
    df_flat_model_price
    .loc[df_flat_model_price['flat_model'] == 'MAISONETTE']
    .assign(flat_type="EXECUTIVE MAISONETTE")
)

df_flat_multiple = pd.concat([df_flat_type_price_3room, df_flat_type_price_4room, df_flat_type_price_5room,
                             df_flat_model_price_ea, df_flat_model_price_ma]).reset_index()

In [None]:
def flat_lineplot(df, title):
    """Line plot of `resale_price` against `year` for `df`, titled `title`."""
    fig, ax = plt.subplots(figsize=(15, 5))
    sns.lineplot(data=df, x="year", y="resale_price", ax=ax)
    # Replace the column names seaborn would use with readable axis labels.
    ax.set_xlabel("Years")
    ax.set_ylabel("Resale Price")
    ax.set_title(title)

In [None]:
# Yearly mean resale price, one line per flat type / executive variant.
fig, ax = plt.subplots(figsize=(15, 10))
sns.lineplot(data=df_flat_multiple, x="year", y="resale_price", hue="flat_type", ax=ax)
ax.set_xlabel("Years")
ax.set_ylabel("Resale Price")
# Anchor the legend just outside the right edge so it does not cover the lines.
ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0)
ax.set_title("Overall")

In [None]:
flat_lineplot(df_flat_type_price_3room, "3 ROOM")

In [None]:
flat_lineplot(df_flat_type_price_4room, "4 ROOM")

In [None]:
flat_lineplot(df_flat_type_price_5room, "5 ROOM")

In [None]:
flat_lineplot(df_flat_model_price_ea, "EXECUTIVE APARTMENT")

In [None]:
flat_lineplot(df_flat_model_price_ma, "EXECUTIVE MAISONETTE")

# Machine Learning

## Import Libraries

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn import metrics
lm = LinearRegression()

## Training and Testing Data

## Testing Data: 2020 onwards

In [None]:
y_test_price = numeric_20['resale_price']
X_test_area = numeric_20[['floor_area_sqm']]
X_test_ll = numeric_20[['lease_left']]

## Training Data: 1990 to 1999

In [None]:
y_train90_price = numeric_90['resale_price']
X_train90_area = numeric_90[['floor_area_sqm']]
X_train90_ll = numeric_90[['lease_left']]

## Training the model: Area vs Price

In [None]:
lm.fit(X_train90_area,y_train90_price)

## Model Evaluation 

In [None]:
print('Intercept \t a: ', + lm.intercept_)
print('Coefficient \t b: ', + lm.coef_)

In [None]:
# Formula for the Regression line
regline_x = X_train90_area
regline_y = lm.intercept_ + lm.coef_ * X_train90_area

# Plot the Linear Regression line
plt.figure(figsize=(16, 8))
plt.scatter(X_train90_area, y_train90_price)
plt.plot(regline_x, regline_y, 'r-', linewidth = 3)
plt.show()

## Predictions from the model

In [None]:
predictions = lm.predict(X_test_area)

In [None]:
plt.figure(figsize=(16, 8))
plt.scatter(X_test_area,y_test_price)
plt.scatter(X_test_area, predictions, color = "r")

In [None]:
print('floor_area_sqm')
print('Explained Variance (R^2):', lm.score(X_train90_area, y_train90_price))
print('Mean Squared Error (MSE):', metrics.mean_squared_error(y_test_price, predictions))
print('Root Mean Squared Error (RMSE):', np.sqrt(metrics.mean_squared_error(y_test_price, predictions)))

In [None]:
# Summary row for the 1990s floor-area model. Built without rebinding `data`,
# which holds the GeoJSON that map_visual reads at call time.
# NOTE(review): R2 is scored on the training data while MSE/RMSE use the test
# set (numeric_20) - confirm this mix is intended.
test_mse = metrics.mean_squared_error(y_test_price, predictions)
df_lr_90_area = pd.DataFrame([{'R2': lm.score(X_train90_area, y_train90_price),
                               'MSE': test_mse,
                               'RMSE': np.sqrt(test_mse)}])
df_lr_90_area['Period'] = '90'

## Training the model: Lease vs Price

In [None]:
lm.fit(X_train90_ll,y_train90_price)

## Model Evaluation 

In [None]:
print('Intercept \t a: ', + lm.intercept_)
print('Coefficient \t b: ', + lm.coef_)

In [None]:
# Formula for the Regression line
regline_x = X_train90_ll
regline_y = lm.intercept_ + lm.coef_ * X_train90_ll

# Plot the Linear Regression line
plt.figure(figsize=(16, 8))
plt.scatter(X_train90_ll, y_train90_price)
plt.plot(regline_x, regline_y, 'r-', linewidth = 3)
plt.show()

## Predictions from the model

In [None]:
predictions = lm.predict(X_test_ll)

In [None]:
plt.figure(figsize=(16, 8))
plt.scatter(X_test_ll,y_test_price)
plt.scatter(X_test_ll, predictions, color = "r")

In [None]:
# Header names the predictor of the model just fitted (lease_left, not floor area).
print('lease_left')
print('Explained Variance (R^2):', lm.score(X_train90_ll, y_train90_price))
print('Mean Squared Error (MSE):', metrics.mean_squared_error(y_test_price, predictions))
print('Root Mean Squared Error (RMSE):', np.sqrt(metrics.mean_squared_error(y_test_price, predictions)))

In [None]:
# Summary row for the 1990s lease-left model. Built without rebinding `data`,
# which holds the GeoJSON that map_visual reads at call time.
# NOTE(review): R2 is scored on the training data while MSE/RMSE use the test
# set (numeric_20) - confirm this mix is intended.
test_mse = metrics.mean_squared_error(y_test_price, predictions)
df_lr_90_ll = pd.DataFrame([{'R2': lm.score(X_train90_ll, y_train90_price),
                             'MSE': test_mse,
                             'RMSE': np.sqrt(test_mse)}])
df_lr_90_ll['Period'] = '90'

## Training Data: 2000 to 2009

In [None]:
y_train00_price = numeric_00['resale_price']
X_train00_area = numeric_00[['floor_area_sqm']]
X_train00_ll = numeric_00[['lease_left']]

## Training the model: Area vs Price

In [None]:
lm.fit(X_train00_area,y_train00_price)

## Model Evaluation 

In [None]:
print('Intercept \t a: ', + lm.intercept_)
print('Coefficient \t b: ', + lm.coef_)

In [None]:
# Formula for the Regression line
regline_x = X_train00_area
regline_y = lm.intercept_ + lm.coef_ * X_train00_area

# Plot the Linear Regression line
plt.figure(figsize=(16, 8))
plt.scatter(X_train00_area, y_train00_price)
plt.plot(regline_x, regline_y, 'r-', linewidth = 3)
plt.show()

## Predictions from the model

In [None]:
predictions = lm.predict(X_test_area)

In [None]:
plt.figure(figsize=(16, 8))
plt.scatter(X_test_area,y_test_price)
plt.scatter(X_test_area, predictions, color = "r")

In [None]:
print('floor_area_sqm')
print('Explained Variance (R^2):', lm.score(X_train00_area, y_train00_price))
print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test_price, predictions)))
print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test_price, predictions))))

In [None]:
# Summary row for the 2000s floor-area model. Built without rebinding `data`,
# which holds the GeoJSON that map_visual reads at call time.
# NOTE(review): R2 is scored on the training data while MSE/RMSE use the test
# set (numeric_20) - confirm this mix is intended.
test_mse = metrics.mean_squared_error(y_test_price, predictions)
df_lr_00_area = pd.DataFrame([{'R2': lm.score(X_train00_area, y_train00_price),
                               'MSE': test_mse,
                               'RMSE': np.sqrt(test_mse)}])
df_lr_00_area['Period'] = '00'

## Training the model: Lease vs Price

In [None]:
lm.fit(X_train00_ll,y_train00_price)

## Model Evaluation 

In [None]:
print('Intercept \t a: ', + lm.intercept_)
print('Coefficient \t b: ', + lm.coef_)

In [None]:
# Formula for the Regression line
regline_x = X_train00_ll
regline_y = lm.intercept_ + lm.coef_ * X_train00_ll

# Plot the Linear Regression line
plt.figure(figsize=(16, 8))
plt.scatter(X_train00_ll, y_train00_price)
plt.plot(regline_x, regline_y, 'r-', linewidth = 3)
plt.show()

## Predictions from the model

In [None]:
predictions = lm.predict(X_test_ll)

In [None]:
plt.figure(figsize=(16, 8))
plt.scatter(X_test_ll,y_test_price)
plt.scatter(X_test_ll, predictions, color = "r")

In [None]:
# Header names the predictor of the model just fitted (lease_left, not floor area).
print('lease_left')
print('Explained Variance (R^2):', lm.score(X_train00_ll, y_train00_price))
print('Mean Squared Error (MSE):', metrics.mean_squared_error(y_test_price, predictions))
print('Root Mean Squared Error (RMSE):', np.sqrt(metrics.mean_squared_error(y_test_price, predictions)))

In [None]:
# Summary row for the 2000s lease-left model. Built without rebinding `data`,
# which holds the GeoJSON that map_visual reads at call time.
# NOTE(review): R2 is scored on the training data while MSE/RMSE use the test
# set (numeric_20) - confirm this mix is intended.
test_mse = metrics.mean_squared_error(y_test_price, predictions)
df_lr_00_ll = pd.DataFrame([{'R2': lm.score(X_train00_ll, y_train00_price),
                             'MSE': test_mse,
                             'RMSE': np.sqrt(test_mse)}])
df_lr_00_ll['Period'] = '00'

## Training Data: 2010 to 2019

In [None]:
y_train10_price = numeric_10['resale_price']
X_train10_area = numeric_10[['floor_area_sqm']]
X_train10_ll = numeric_10[['lease_left']]

## Training the model: Area vs Price

In [None]:
lm.fit(X_train10_area,y_train10_price)

## Model Evaluation 

In [None]:
print('Intercept \t a: ', + lm.intercept_)
print('Coefficient \t b: ', + lm.coef_)

In [None]:
# Formula for the Regression line
regline_x = X_train10_area
regline_y = lm.intercept_ + lm.coef_ * X_train10_area

# Plot the Linear Regression line
plt.figure(figsize=(16, 8))
plt.scatter(X_train10_area, y_train10_price)
plt.plot(regline_x, regline_y, 'r-', linewidth = 3)
plt.show()

## Predictions from the model

In [None]:
predictions = lm.predict(X_test_area)

In [None]:
plt.figure(figsize=(16, 8))
plt.scatter(X_test_area,y_test_price)
plt.scatter(X_test_area, predictions, color = "r")

In [None]:
print('floor_area_sqm')
print('Explained Variance (R^2):', lm.score(X_train10_area, y_train10_price))
print('Mean Squared Error (MSE):', metrics.mean_squared_error(y_test_price, predictions))
print('Root Mean Squared Error (RMSE):', np.sqrt(metrics.mean_squared_error(y_test_price, predictions)))

In [None]:
# Summary row for the 2010s floor-area model. Built without rebinding `data`,
# which holds the GeoJSON that map_visual reads at call time.
# NOTE(review): R2 is scored on the training data while MSE/RMSE use the test
# set (numeric_20) - confirm this mix is intended.
test_mse = metrics.mean_squared_error(y_test_price, predictions)
df_lr_10_area = pd.DataFrame([{'R2': lm.score(X_train10_area, y_train10_price),
                               'MSE': test_mse,
                               'RMSE': np.sqrt(test_mse)}])
df_lr_10_area['Period'] = '10'

## Training the model: Lease vs Price

In [None]:
lm.fit(X_train10_ll,y_train10_price)

## Model Evaluation 

In [None]:
print('Intercept \t a: ', + lm.intercept_)
print('Coefficient \t b: ', + lm.coef_)

In [None]:
# Formula for the Regression line
regline_x = X_train10_ll
regline_y = lm.intercept_ + lm.coef_ * X_train10_ll

# Plot the Linear Regression line
plt.figure(figsize=(16, 8))
plt.scatter(X_train10_ll, y_train10_price)
plt.plot(regline_x, regline_y, 'r-', linewidth = 3)
plt.show()

## Predictions from the model

In [None]:
predictions = lm.predict(X_test_ll)

In [None]:
plt.figure(figsize=(16, 8))
plt.scatter(X_test_ll,y_test_price)
plt.scatter(X_test_ll, predictions, color = "r")

In [None]:
# Header names the predictor of the model just fitted (lease_left, not floor area).
print('lease_left')
print('Explained Variance (R^2):', lm.score(X_train10_ll, y_train10_price))
print('Mean Squared Error (MSE):', metrics.mean_squared_error(y_test_price, predictions))
print('Root Mean Squared Error (RMSE):', np.sqrt(metrics.mean_squared_error(y_test_price, predictions)))

In [None]:
# Summary row for the 2010s lease-left model. Built without rebinding `data`,
# which holds the GeoJSON that map_visual reads at call time.
# NOTE(review): R2 is scored on the training data while MSE/RMSE use the test
# set (numeric_20) - confirm this mix is intended.
test_mse = metrics.mean_squared_error(y_test_price, predictions)
df_lr_10_ll = pd.DataFrame([{'R2': lm.score(X_train10_ll, y_train10_price),
                             'MSE': test_mse,
                             'RMSE': np.sqrt(test_mse)}])
df_lr_10_ll['Period'] = '10'

In [None]:
# Stack the per-decade floor-area results. drop=True discards the old row labels
# (all 0) instead of keeping them as a meaningless `index` column.
df_lr_area = pd.concat([df_lr_90_area, df_lr_00_area, df_lr_10_area]).reset_index(drop=True)

In [None]:
# Stack the per-decade lease-left results. drop=True discards the old row labels
# (all 0) instead of keeping them as a meaningless `index` column.
df_lr_ll = pd.concat([df_lr_90_ll, df_lr_00_ll, df_lr_10_ll]).reset_index(drop=True)

## Linear Regression Model Comparison

In [None]:
df_lr_ll

In [None]:
df_lr_area

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error
from scipy.stats import spearmanr, pearsonr

In [None]:
# 1990s  (section label; a bare `1990s` in a code cell is a SyntaxError)

In [None]:
df_90.info()

In [None]:
y_90 = pd.DataFrame(df_90[['resale_price']])
X_90 = pd.DataFrame(df_90[['flat_type', 'storey_range', 'flat_model', 'region']])
X_90.nunique()

In [None]:
# Ordinal codes for flat_type; also reused by the later decade cells.
replace_values = {'1 ROOM':0,'2 ROOM':1, '3 ROOM':2, '4 ROOM':3, '5 ROOM':4, 'EXECUTIVE':5, 'MULTI GENERATION':6}

# Ordinal-encode flat_type, then one-hot encode the remaining categorical
# columns in a single pass (each dummy column is prefixed with its source column).
X_90 = pd.get_dummies(
    X_90.replace({'flat_type': replace_values}),
    columns=['storey_range', 'flat_model', 'region'],
)

In [None]:
# Validation using out-of-bag method
rf = RandomForestRegressor(n_estimators=100,oob_score=True, random_state=0)
rf.fit(X_90,np.ravel(y_90))
predicted_train = rf.predict(X_90)

print(f'Out-of-bag R\u00b2 score estimate: {rf.oob_score_:>5.3}')

In [None]:
# rf was fitted on X_90, so these predictions and metrics are in-sample
# (training) figures; they are labelled accordingly rather than as "test" data.
# Variable names are kept because later cells read them.
predicted_test = rf.predict(X_90)
oob_test_score = r2_score(y_90['resale_price'], predicted_test)
spearman = spearmanr(y_90['resale_price'], predicted_test)
pearson = pearsonr(y_90['resale_price'], predicted_test)
oob_mae = mean_absolute_error(y_90['resale_price'], predicted_test)

print(f'Training data R\u00b2 score: {oob_test_score:>5.3}')
print(f'Training data Spearman correlation: {spearman[0]:.3}')
print(f'Training data Pearson correlation: {pearson[0]:.3}')
print(f'Training data Mean Absolute Error: {round(oob_mae)}')

In [None]:
from sklearn.model_selection import GridSearchCV

# validation by k-fold cross validation with grid search for best hyperparameters
# hyperparameter values shown below are the tuned final values
param_grid = {
    # Fraction of features considered for splitting a node. 1.0 (all features) is
    # what 'auto' meant for regressors; 'auto' was removed in scikit-learn 1.3.
    'max_features': [1.0],
    'max_depth': [20], # max number of levels in each decision tree
    'min_samples_split': [15], # min number of data points placed in a node before the node is split
    'min_samples_leaf': [2]} # min number of data points allowed in a leaf node
rfr =GridSearchCV(RandomForestRegressor(n_estimators = 500, n_jobs=-1, random_state=0),
                        param_grid, cv=10, scoring='r2', return_train_score=True)
rfr.fit(X_90,np.ravel(y_90))
print("Best parameters set found on Cross Validation:\n\n", rfr.best_params_)
print("\nCross Validation R\u00b2 score:\n\n", rfr.best_score_.round(3))

In [None]:
# rfr was fitted on X_90, so these predictions and metrics are in-sample
# (training) figures; they are labelled accordingly rather than as "test" data.
# Variable names are kept because later cells read them.
cv_predicted_test = rfr.predict(X_90)
cv_test_score = r2_score(y_90['resale_price'], cv_predicted_test)
spearman = spearmanr(y_90['resale_price'], cv_predicted_test)
pearson = pearsonr(y_90['resale_price'], cv_predicted_test)
cv_mae = mean_absolute_error(y_90['resale_price'], cv_predicted_test)
print(f'Training data R\u00b2 score: {cv_test_score:>5.3}')
print(f'Training data Spearman correlation: {spearman[0]:.3}')
print(f'Training data Pearson correlation: {pearson[0]:.3}')
print(f'Training data Mean Absolute Error: {round(cv_mae)}')

In [None]:
def _as_thousands(value, _pos):
    """Tick formatter: render an axis value in thousands, e.g. 250000 -> '250K'."""
    return '{:,.0f}'.format(value / 1000) + 'K'


fig = plt.figure(figsize=(13,4))

# Left panel: predictions of the default forest (`rf`).
ax1 = plt.subplot(121)
ax1 = sns.scatterplot(x=y_90['resale_price'], y=predicted_test, edgecolors='w', alpha=0.9, s=8)
ax1.set_xlabel('Observed')
ax1.set_ylabel('Predicted')
# A formatter (one instance per axis) keeps labels in sync with the tick
# positions; set_xticklabels froze the label text while the ticks could still
# move, e.g. after tight_layout.
ax1.xaxis.set_major_formatter(ticker.FuncFormatter(_as_thousands))
ax1.yaxis.set_major_formatter(ticker.FuncFormatter(_as_thousands))
# The scores come from predictions on the training matrix X_90, hence "Train".
ax1.annotate('Train R\u00b2: ' + str(round(oob_test_score,3)) + '\nTrain MAE: ' + str(round(oob_mae)), xy=(0, 1), xytext=(25, -35),
    xycoords='axes fraction', textcoords='offset points', fontsize=12)
ax1.set_title('Tuned Using Out-Of-Bag')

# Right panel: predictions of the grid-searched forest (`rfr`).
ax2 = plt.subplot(122)
ax2 = sns.scatterplot(x=y_90['resale_price'], y=cv_predicted_test, edgecolors='w', alpha=0.9, s=8)
ax2.set_xlabel('Observed')
ax2.set_ylabel('Predicted')
ax2.xaxis.set_major_formatter(ticker.FuncFormatter(_as_thousands))
ax2.yaxis.set_major_formatter(ticker.FuncFormatter(_as_thousands))
ax2.annotate('Train R\u00b2: ' + str(round(cv_test_score,3)) + '\nTrain MAE: ' + str(round(cv_mae)), xy=(0, 1), xytext=(25, -35),
    xycoords='axes fraction', textcoords='offset points', fontsize=12)
ax2.set_title('Tuned Using Cross Validation')
plt.tight_layout(pad=0, rect=[0, 0, 0.9, 0.9])
plt.show()

In [None]:
# Side-by-side feature importances: default forest (left) vs grid-searched forest (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 5))

# Left panel: importances of `rf`, most important feature first.
feat_imp = (
    pd.DataFrame({'Features': X_90.columns, 'Feature Importance': rf.feature_importances_})
    .sort_values('Feature Importance', ascending=False)
)
sns.barplot(data=feat_imp, x='Feature Importance', y='Features', ax=ax1)
ax1.set_title('OOB Feature Importance', size=10)

# Right panel: importances of the best estimator found by the grid search.
feat_imp = (
    pd.DataFrame({'Features': X_90.columns, 'Feature Importance': rfr.best_estimator_.feature_importances_})
    .sort_values('Feature Importance', ascending=False)
)
sns.barplot(data=feat_imp, x='Feature Importance', y='Features', ax=ax2)
ax2.set_title('CV Feature Importance', size=10)

In [None]:
# 2000s

In [None]:
# Print a summary of df_00 (columns, non-null counts, dtypes) before modelling.
# NOTE(review): df_00 is built earlier in the notebook — presumably the 2000s transactions; confirm.
df_00.info()

In [None]:
# Target frame (resale price only) and the four categorical predictors taken from df_00.
y_00 = df_00[['resale_price']].copy()
X_00 = df_00[['flat_type', 'storey_range', 'flat_model', 'region']].copy()
# Number of distinct values per predictor (displayed as the cell output).
X_00.nunique()

In [None]:
# Encode the predictors: recode flat_type through the replace_values mapping, then
# one-hot encode the remaining three columns. A single get_dummies call keeps the
# resulting column order (flat_type, storey_range_*, flat_model_*, region_*) and
# prefixes each dummy with its source column name.
X_00 = pd.get_dummies(
    X_00.replace({'flat_type': replace_values}),
    columns=['storey_range', 'flat_model', 'region'],
)

In [None]:
# Forest of 100 trees with out-of-bag scoring enabled, fitted on every row of X_00.
rf = RandomForestRegressor(
    n_estimators=100,
    oob_score=True,
    random_state=0,
)
rf.fit(X_00, y_00['resale_price'].to_numpy())
# Predictions on the fitting data itself.
predicted_train = rf.predict(X_00)

print(f'Out-of-bag R\u00b2 score estimate: {rf.oob_score_:>5.3}')

In [None]:
# Score the `rf` forest on X_00: R-squared, Spearman/Pearson correlation and MAE.
# NOTE(review): X_00 is the same frame `rf` was fitted on, so despite the
# "Test data" wording these are in-sample figures, not held-out performance.
predicted_test = rf.predict(X_00)
oob_test_score = r2_score(y_true=y_00.resale_price, y_pred=predicted_test)
spearman = spearmanr(y_00.resale_price, predicted_test)
pearson = pearsonr(y_00.resale_price, predicted_test)
oob_mae = mean_absolute_error(y_true=y_00.resale_price, y_pred=predicted_test)

print(
    f'Test data R\u00b2 score: {oob_test_score:>5.3}',
    f'Test data Spearman correlation: {spearman[0]:.3}',
    f'Test data Pearson correlation: {pearson[0]:.3}',
    f'Test data Mean Absolute Error: {round(oob_mae)}',
    sep='\n',
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Validate the forest with 10-fold cross validation through GridSearchCV.
# Every hyperparameter list holds a single, already tuned value, so exactly one
# configuration is evaluated.
param_grid = {
    # Consider every feature at each split. Written as 1.0 instead of 'auto':
    # for regressors 'auto' meant "all features", it was deprecated in
    # scikit-learn 1.1 and is rejected from 1.3 on, which makes every CV fit fail.
    'max_features': [1.0],
    'max_depth': [20], # max number of levels in each decision tree
    'min_samples_split': [15], # min number of data points placed in a node before the node is split
    'min_samples_leaf': [2]} # min number of data points allowed in a leaf node
rfr = GridSearchCV(RandomForestRegressor(n_estimators=500, n_jobs=-1, random_state=0),
                   param_grid, cv=10, scoring='r2', return_train_score=True)
rfr.fit(X_00, np.ravel(y_00))
print("Best parameters set found on Cross Validation:\n\n", rfr.best_params_)
print("\nCross Validation R\u00b2 score:\n\n", rfr.best_score_.round(3))

In [None]:
# Score the grid-searched model `rfr` on X_00: R-squared, Spearman/Pearson correlation and MAE.
# NOTE(review): X_00 is the same data `rfr` was fitted on, so the "Test data"
# figures printed here are in-sample, not held-out performance.
cv_predicted_test = rfr.predict(X_00)
cv_test_score = r2_score(y_true=y_00.resale_price, y_pred=cv_predicted_test)
spearman = spearmanr(y_00.resale_price, cv_predicted_test)
pearson = pearsonr(y_00.resale_price, cv_predicted_test)
cv_mae = mean_absolute_error(y_true=y_00.resale_price, y_pred=cv_predicted_test)
print(
    f'Test data R\u00b2 score: {cv_test_score:>5.3}',
    f'Test data Spearman correlation: {spearman[0]:.3}',
    f'Test data Pearson correlation: {pearson[0]:.3}',
    f'Test data Mean Absolute Error: {round(cv_mae)}',
    sep='\n',
)

In [None]:
# Observed vs. predicted resale price for the two random forests, side by side.
# Reads predicted_test/oob_test_score/oob_mae and cv_predicted_test/cv_test_score/cv_mae
# from the cells above; both panels share y_00['resale_price'] as the observed values.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13, 4))

# Render ticks as thousands ("250K") through a formatter instead of set_*ticklabels():
# fixed label strings are frozen against the tick positions that exist when they are
# set, so they go stale (and Matplotlib warns) as soon as the locator picks new ticks,
# e.g. after tight_layout() resizes the axes.
to_thousands = lambda value, _pos: '{:,.0f}'.format(value / 1000) + 'K'

sns.scatterplot(x=y_00['resale_price'], y=predicted_test, edgecolors='w', alpha=0.9, s=8, ax=ax1)
ax1.set_xlabel('Observed')
ax1.set_ylabel('Predicted')
# One formatter instance per axis: a Formatter is bound to the axis it is attached to.
ax1.xaxis.set_major_formatter(ticker.FuncFormatter(to_thousands))
ax1.yaxis.set_major_formatter(ticker.FuncFormatter(to_thousands))
# Anchor the metrics text at the top-left corner of the axes, nudged inwards in points.
ax1.annotate('Test R\u00b2: ' + str(round(oob_test_score,3)) + '\nTest MAE: ' + str(round(oob_mae)), xy=(0, 1), xytext=(25, -35),
    xycoords='axes fraction', textcoords='offset points', fontsize=12)
ax1.set_title('Tuned Using Out-Of-Bag')

sns.scatterplot(x=y_00['resale_price'], y=cv_predicted_test, edgecolors='w', alpha=0.9, s=8, ax=ax2)
ax2.set_xlabel('Observed')
ax2.set_ylabel('Predicted')
ax2.xaxis.set_major_formatter(ticker.FuncFormatter(to_thousands))
ax2.yaxis.set_major_formatter(ticker.FuncFormatter(to_thousands))
ax2.annotate('Test R\u00b2: ' + str(round(cv_test_score,3)) + '\nTest MAE: ' + str(round(cv_mae)), xy=(0, 1), xytext=(25, -35),
    xycoords='axes fraction', textcoords='offset points', fontsize=12)
ax2.set_title('Tuned Using Cross Validation')

plt.tight_layout(pad=0, rect=[0, 0, 0.9, 0.9])
plt.show()

In [None]:
# Feature importances of the two forests as horizontal bar charts, most important first.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20, 5))

# Left panel: importances reported by the `rf` forest.
feat_imp = pd.DataFrame({
    'Features': X_00.columns,
    'Feature Importance': rf.feature_importances_,
})
feat_imp = feat_imp.sort_values(by='Feature Importance', ascending=False)
sns.barplot(data=feat_imp, x='Feature Importance', y='Features', ax=ax1)
ax1.set_title('OOB Feature Importance', size=10)

# Right panel: importances of the estimator selected by the grid search `rfr`.
feat_imp = pd.DataFrame({
    'Features': X_00.columns,
    'Feature Importance': rfr.best_estimator_.feature_importances_,
})
feat_imp = feat_imp.sort_values(by='Feature Importance', ascending=False)
sns.barplot(data=feat_imp, x='Feature Importance', y='Features', ax=ax2)
ax2.set_title('CV Feature Importance', size=10)

In [None]:
# 2010s

In [None]:
# Print a summary of df_10 (columns, non-null counts, dtypes) before modelling.
# NOTE(review): df_10 is built earlier in the notebook — presumably the 2010s transactions; confirm.
df_10.info()

In [None]:
# Target frame (resale price only) and the four categorical predictors taken from df_10.
y_10 = df_10[['resale_price']].copy()
X_10 = df_10[['flat_type', 'storey_range', 'flat_model', 'region']].copy()
# Number of distinct values per predictor (displayed as the cell output).
X_10.nunique()

In [None]:
# Encode the predictors: recode flat_type through the replace_values mapping, then
# one-hot encode the remaining three columns. A single get_dummies call keeps the
# resulting column order (flat_type, storey_range_*, flat_model_*, region_*) and
# prefixes each dummy with its source column name.
X_10 = pd.get_dummies(
    X_10.replace({'flat_type': replace_values}),
    columns=['storey_range', 'flat_model', 'region'],
)

In [None]:
# Forest of 100 trees with out-of-bag scoring enabled, fitted on every row of X_10.
rf = RandomForestRegressor(
    n_estimators=100,
    oob_score=True,
    random_state=0,
)
rf.fit(X_10, y_10['resale_price'].to_numpy())
# Predictions on the fitting data itself.
predicted_train = rf.predict(X_10)

print(f'Out-of-bag R\u00b2 score estimate: {rf.oob_score_:>5.3}')

In [None]:
# Score the `rf` forest on X_10: R-squared, Spearman/Pearson correlation and MAE.
# NOTE(review): X_10 is the same frame `rf` was fitted on, so despite the
# "Test data" wording these are in-sample figures, not held-out performance.
predicted_test = rf.predict(X_10)
oob_test_score = r2_score(y_true=y_10.resale_price, y_pred=predicted_test)
spearman = spearmanr(y_10.resale_price, predicted_test)
pearson = pearsonr(y_10.resale_price, predicted_test)
oob_mae = mean_absolute_error(y_true=y_10.resale_price, y_pred=predicted_test)

print(
    f'Test data R\u00b2 score: {oob_test_score:>5.3}',
    f'Test data Spearman correlation: {spearman[0]:.3}',
    f'Test data Pearson correlation: {pearson[0]:.3}',
    f'Test data Mean Absolute Error: {round(oob_mae)}',
    sep='\n',
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Validate the forest with 10-fold cross validation through GridSearchCV.
# Every hyperparameter list holds a single, already tuned value, so exactly one
# configuration is evaluated.
param_grid = {
    # Consider every feature at each split. Written as 1.0 instead of 'auto':
    # for regressors 'auto' meant "all features", it was deprecated in
    # scikit-learn 1.1 and is rejected from 1.3 on, which makes every CV fit fail.
    'max_features': [1.0],
    'max_depth': [20], # max number of levels in each decision tree
    'min_samples_split': [15], # min number of data points placed in a node before the node is split
    'min_samples_leaf': [2]} # min number of data points allowed in a leaf node
rfr = GridSearchCV(RandomForestRegressor(n_estimators=500, n_jobs=-1, random_state=0),
                   param_grid, cv=10, scoring='r2', return_train_score=True)
rfr.fit(X_10, np.ravel(y_10))
print("Best parameters set found on Cross Validation:\n\n", rfr.best_params_)
print("\nCross Validation R\u00b2 score:\n\n", rfr.best_score_.round(3))

In [None]:
# Score the grid-searched model `rfr` on the 2010s data.
# This cell previously repeated the grid search verbatim, so cv_predicted_test,
# cv_test_score and cv_mae were never recomputed for X_10 and the plots below
# reused the values left over from the previous decade.
# NOTE(review): X_10 is the same data `rfr` was fitted on, so the "Test data"
# figures printed here are in-sample, not held-out performance.
cv_predicted_test = rfr.predict(X_10)
cv_test_score = r2_score(y_10['resale_price'], cv_predicted_test)
spearman = spearmanr(y_10['resale_price'], cv_predicted_test)
pearson = pearsonr(y_10['resale_price'], cv_predicted_test)
cv_mae = mean_absolute_error(y_10['resale_price'], cv_predicted_test)
print(f'Test data R\u00b2 score: {cv_test_score:>5.3}')
print(f'Test data Spearman correlation: {spearman[0]:.3}')
print(f'Test data Pearson correlation: {pearson[0]:.3}')
print(f'Test data Mean Absolute Error: {round(cv_mae)}')

In [None]:
# Observed vs. predicted resale price for the two random forests, side by side.
# Reads predicted_test/oob_test_score/oob_mae and cv_predicted_test/cv_test_score/cv_mae
# from the notebook state; both panels share y_10['resale_price'] as the observed values.
# NOTE(review): make sure the cv_* values were computed for X_10 before running this cell.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13, 4))

# Render ticks as thousands ("250K") through a formatter instead of set_*ticklabels():
# fixed label strings are frozen against the tick positions that exist when they are
# set, so they go stale (and Matplotlib warns) as soon as the locator picks new ticks,
# e.g. after tight_layout() resizes the axes.
to_thousands = lambda value, _pos: '{:,.0f}'.format(value / 1000) + 'K'

sns.scatterplot(x=y_10['resale_price'], y=predicted_test, edgecolors='w', alpha=0.9, s=8, ax=ax1)
ax1.set_xlabel('Observed')
ax1.set_ylabel('Predicted')
# One formatter instance per axis: a Formatter is bound to the axis it is attached to.
ax1.xaxis.set_major_formatter(ticker.FuncFormatter(to_thousands))
ax1.yaxis.set_major_formatter(ticker.FuncFormatter(to_thousands))
# Anchor the metrics text at the top-left corner of the axes, nudged inwards in points.
ax1.annotate('Test R\u00b2: ' + str(round(oob_test_score,3)) + '\nTest MAE: ' + str(round(oob_mae)), xy=(0, 1), xytext=(25, -35),
    xycoords='axes fraction', textcoords='offset points', fontsize=12)
ax1.set_title('Tuned Using Out-Of-Bag')

sns.scatterplot(x=y_10['resale_price'], y=cv_predicted_test, edgecolors='w', alpha=0.9, s=8, ax=ax2)
ax2.set_xlabel('Observed')
ax2.set_ylabel('Predicted')
ax2.xaxis.set_major_formatter(ticker.FuncFormatter(to_thousands))
ax2.yaxis.set_major_formatter(ticker.FuncFormatter(to_thousands))
ax2.annotate('Test R\u00b2: ' + str(round(cv_test_score,3)) + '\nTest MAE: ' + str(round(cv_mae)), xy=(0, 1), xytext=(25, -35),
    xycoords='axes fraction', textcoords='offset points', fontsize=12)
ax2.set_title('Tuned Using Cross Validation')

plt.tight_layout(pad=0, rect=[0, 0, 0.9, 0.9])
plt.show()

In [None]:
# Feature importances of the two forests as horizontal bar charts, most important first.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20, 5))

# Left panel: importances reported by the `rf` forest.
feat_imp = pd.DataFrame({
    'Features': X_10.columns,
    'Feature Importance': rf.feature_importances_,
})
feat_imp = feat_imp.sort_values(by='Feature Importance', ascending=False)
sns.barplot(data=feat_imp, x='Feature Importance', y='Features', ax=ax1)
ax1.set_title('OOB Feature Importance', size=10)

# Right panel: importances of the estimator selected by the grid search `rfr`.
feat_imp = pd.DataFrame({
    'Features': X_10.columns,
    'Feature Importance': rfr.best_estimator_.feature_importances_,
})
feat_imp = feat_imp.sort_values(by='Feature Importance', ascending=False)
sns.barplot(data=feat_imp, x='Feature Importance', y='Features', ax=ax2)
ax2.set_title('CV Feature Importance', size=10)

In [None]:
# 2020s

In [None]:
# Print a summary of df_20 (columns, non-null counts, dtypes) before modelling.
# NOTE(review): df_20 is built earlier in the notebook — presumably the 2020s transactions; confirm.
df_20.info()

In [None]:
# Target frame (resale price only) and the four categorical predictors taken from df_20.
y_20 = df_20[['resale_price']].copy()
X_20 = df_20[['flat_type', 'storey_range', 'flat_model', 'region']].copy()
# Number of distinct values per predictor (displayed as the cell output).
X_20.nunique()

In [None]:
# Encode the predictors: recode flat_type through the replace_values mapping, then
# one-hot encode the remaining three columns. A single get_dummies call keeps the
# resulting column order (flat_type, storey_range_*, flat_model_*, region_*) and
# prefixes each dummy with its source column name.
X_20 = pd.get_dummies(
    X_20.replace({'flat_type': replace_values}),
    columns=['storey_range', 'flat_model', 'region'],
)

In [None]:
# Forest of 100 trees with out-of-bag scoring enabled, fitted on every row of X_20.
rf = RandomForestRegressor(
    n_estimators=100,
    oob_score=True,
    random_state=0,
)
rf.fit(X_20, y_20['resale_price'].to_numpy())
# Predictions on the fitting data itself.
predicted_train = rf.predict(X_20)

print(f'Out-of-bag R\u00b2 score estimate: {rf.oob_score_:>5.3}')

In [None]:
# Score the `rf` forest on X_20: R-squared, Spearman/Pearson correlation and MAE.
# NOTE(review): X_20 is the same frame `rf` was fitted on, so despite the
# "Test data" wording these are in-sample figures, not held-out performance.
predicted_test = rf.predict(X_20)
oob_test_score = r2_score(y_true=y_20.resale_price, y_pred=predicted_test)
spearman = spearmanr(y_20.resale_price, predicted_test)
pearson = pearsonr(y_20.resale_price, predicted_test)
oob_mae = mean_absolute_error(y_true=y_20.resale_price, y_pred=predicted_test)

print(
    f'Test data R\u00b2 score: {oob_test_score:>5.3}',
    f'Test data Spearman correlation: {spearman[0]:.3}',
    f'Test data Pearson correlation: {pearson[0]:.3}',
    f'Test data Mean Absolute Error: {round(oob_mae)}',
    sep='\n',
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Validate the forest with 10-fold cross validation through GridSearchCV.
# Every hyperparameter list holds a single, already tuned value, so exactly one
# configuration is evaluated.
param_grid = {
    # Consider every feature at each split. Written as 1.0 instead of 'auto':
    # for regressors 'auto' meant "all features", it was deprecated in
    # scikit-learn 1.1 and is rejected from 1.3 on, which makes every CV fit fail.
    'max_features': [1.0],
    'max_depth': [20], # max number of levels in each decision tree
    'min_samples_split': [15], # min number of data points placed in a node before the node is split
    'min_samples_leaf': [2]} # min number of data points allowed in a leaf node
rfr = GridSearchCV(RandomForestRegressor(n_estimators=500, n_jobs=-1, random_state=0),
                   param_grid, cv=10, scoring='r2', return_train_score=True)
rfr.fit(X_20, np.ravel(y_20))
print("Best parameters set found on Cross Validation:\n\n", rfr.best_params_)
print("\nCross Validation R\u00b2 score:\n\n", rfr.best_score_.round(3))

In [None]:
# Score the grid-searched model `rfr` on the 2020s data.
# This cell previously repeated the grid search verbatim, so cv_predicted_test,
# cv_test_score and cv_mae were never recomputed for X_20 and the plots below
# reused the values left over from an earlier decade.
# NOTE(review): X_20 is the same data `rfr` was fitted on, so the "Test data"
# figures printed here are in-sample, not held-out performance.
cv_predicted_test = rfr.predict(X_20)
cv_test_score = r2_score(y_20['resale_price'], cv_predicted_test)
spearman = spearmanr(y_20['resale_price'], cv_predicted_test)
pearson = pearsonr(y_20['resale_price'], cv_predicted_test)
cv_mae = mean_absolute_error(y_20['resale_price'], cv_predicted_test)
print(f'Test data R\u00b2 score: {cv_test_score:>5.3}')
print(f'Test data Spearman correlation: {spearman[0]:.3}')
print(f'Test data Pearson correlation: {pearson[0]:.3}')
print(f'Test data Mean Absolute Error: {round(cv_mae)}')

In [None]:
# Observed vs. predicted resale price for the two random forests, side by side.
# Reads predicted_test/oob_test_score/oob_mae and cv_predicted_test/cv_test_score/cv_mae
# from the notebook state; both panels share y_20['resale_price'] as the observed values.
# NOTE(review): make sure the cv_* values were computed for X_20 before running this cell.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13, 4))

# Render ticks as thousands ("250K") through a formatter instead of set_*ticklabels():
# fixed label strings are frozen against the tick positions that exist when they are
# set, so they go stale (and Matplotlib warns) as soon as the locator picks new ticks,
# e.g. after tight_layout() resizes the axes.
to_thousands = lambda value, _pos: '{:,.0f}'.format(value / 1000) + 'K'

sns.scatterplot(x=y_20['resale_price'], y=predicted_test, edgecolors='w', alpha=0.9, s=8, ax=ax1)
ax1.set_xlabel('Observed')
ax1.set_ylabel('Predicted')
# One formatter instance per axis: a Formatter is bound to the axis it is attached to.
ax1.xaxis.set_major_formatter(ticker.FuncFormatter(to_thousands))
ax1.yaxis.set_major_formatter(ticker.FuncFormatter(to_thousands))
# Anchor the metrics text at the top-left corner of the axes, nudged inwards in points.
ax1.annotate('Test R\u00b2: ' + str(round(oob_test_score,3)) + '\nTest MAE: ' + str(round(oob_mae)), xy=(0, 1), xytext=(25, -35),
    xycoords='axes fraction', textcoords='offset points', fontsize=12)
ax1.set_title('Tuned Using Out-Of-Bag')

sns.scatterplot(x=y_20['resale_price'], y=cv_predicted_test, edgecolors='w', alpha=0.9, s=8, ax=ax2)
ax2.set_xlabel('Observed')
ax2.set_ylabel('Predicted')
ax2.xaxis.set_major_formatter(ticker.FuncFormatter(to_thousands))
ax2.yaxis.set_major_formatter(ticker.FuncFormatter(to_thousands))
ax2.annotate('Test R\u00b2: ' + str(round(cv_test_score,3)) + '\nTest MAE: ' + str(round(cv_mae)), xy=(0, 1), xytext=(25, -35),
    xycoords='axes fraction', textcoords='offset points', fontsize=12)
ax2.set_title('Tuned Using Cross Validation')

plt.tight_layout(pad=0, rect=[0, 0, 0.9, 0.9])
plt.show()

In [None]:
# Feature importances of the two forests as horizontal bar charts, most important first.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20, 5))

# Left panel: importances reported by the `rf` forest.
feat_imp = pd.DataFrame({
    'Features': X_20.columns,
    'Feature Importance': rf.feature_importances_,
})
feat_imp = feat_imp.sort_values(by='Feature Importance', ascending=False)
sns.barplot(data=feat_imp, x='Feature Importance', y='Features', ax=ax1)
ax1.set_title('OOB Feature Importance', size=10)

# Right panel: importances of the estimator selected by the grid search `rfr`.
feat_imp = pd.DataFrame({
    'Features': X_20.columns,
    'Feature Importance': rfr.best_estimator_.feature_importances_,
})
feat_imp = feat_imp.sort_values(by='Feature Importance', ascending=False)
sns.barplot(data=feat_imp, x='Feature Importance', y='Features', ax=ax2)
ax2.set_title('CV Feature Importance', size=10)