# EDA and Data Pre-processing

## Importing Packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pprint
import csv
import datetime as dt
from sklearn.metrics import classification_report, confusion_matrix, roc_curve
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import math

## Importing Dataset

In [2]:
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

## Pre-processing on Train Dataset

## Feature Engineering to Create "dist_mrt" from Auxiliary Data

In [3]:
# do not change order of this .dropna() function
train.dropna()
train = train.reset_index(drop=True)
train

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,resale_price
0,2001-08,pasir ris,4 room,440,pasir ris drive 4,01 to 03,118.0,model a,uncategorized,1989,1.369008,103.958697,0.0,pasir ris drive,pasir ris,east region,209700.0
1,2014-10,punggol,5-room,196B,punggol field,10 to 12,110.0,improved,uncategorized,2003,1.399007,103.906991,0.0,punggol field,punggol,north-east region,402300.0
2,2020-09,sengkang,5 room,404A,fernvale lane,01 to 03,112.0,premium apartment,uncategorized,2004,1.388348,103.873815,0.0,fernvale,sengkang,north-east region,351000.0
3,2000-10,clementi,3 room,375,clementi avenue 4,07 to 09,67.0,new generation,uncategorized,1980,1.318493,103.766702,0.0,clementi north,clementi,west region,151200.0
4,2013-01,bukit batok,3-room,163,bukit batok street 11,07 to 09,73.0,model a,uncategorized,1985,1.348149,103.742658,0.0,bukit batok west,bukit batok,west region,318600.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
431727,2005-03,woodlands,4 room,537,Woodlands Drive 16,01 to 03,101.0,model a,uncategorized,2000,1.429658,103.792583,0.0,woodlands south,woodlands,north region,238500.0
431728,2016-04,sengkang,4 room,410A,fernvale road,13 to 15,95.0,premium apartment,uncategorized,2012,1.390053,103.875941,0.0,fernvale,sengkang,north-east region,376200.0
431729,2011-01,tampines,3-room,829,tampines street 81,01 to 03,67.0,new generation,uncategorized,1986,1.349224,103.934913,0.0,tampines west,tampines,east region,255600.0
431730,2013-05,sengkang,5-room,233,compassvale walk,16 to 18,123.0,improved,uncategorized,1999,1.389941,103.900721,0.0,sengkang town centre,sengkang,north-east region,508500.0


In [4]:
with open('./auxiliary-data/distance-to-mrt.csv', newline='') as f:
    reader = csv.reader(f)
    mrt_list = list(reader)

In [5]:
train['dist_mrt'] = np.array(mrt_list)
train.head()

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,resale_price,dist_mrt
0,2001-08,pasir ris,4 room,440,pasir ris drive 4,01 to 03,118.0,model a,uncategorized,1989,1.369008,103.958697,0.0,pasir ris drive,pasir ris,east region,209700.0,1.137651115
1,2014-10,punggol,5-room,196B,punggol field,10 to 12,110.0,improved,uncategorized,2003,1.399007,103.906991,0.0,punggol field,punggol,north-east region,402300.0,0.118453389
2,2020-09,sengkang,5 room,404A,fernvale lane,01 to 03,112.0,premium apartment,uncategorized,2004,1.388348,103.873815,0.0,fernvale,sengkang,north-east region,351000.0,0.479542021
3,2000-10,clementi,3 room,375,clementi avenue 4,07 to 09,67.0,new generation,uncategorized,1980,1.318493,103.766702,0.0,clementi north,clementi,west region,151200.0,0.421345802
4,2013-01,bukit batok,3-room,163,bukit batok street 11,07 to 09,73.0,model a,uncategorized,1985,1.348149,103.742658,0.0,bukit batok west,bukit batok,west region,318600.0,0.775005151


## Convert "storey_range" into Numerical Data Type "storey_average"

In [6]:
# first number in "storey_range"
train['lower'] = train['storey_range'].str.extract('(\d+)')
train['upper'] = train['storey_range'].str.extract('(\d+$)')
# calculate average storey from the range
train['storey_average'] = train['upper'].astype(int) + train['lower'].astype(int)
train['storey_average'] = train['storey_average'] / 2.0
train = train.drop(columns=['storey_range'])
train.head()

Unnamed: 0,month,town,flat_type,block,street_name,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,resale_price,dist_mrt,lower,upper,storey_average
0,2001-08,pasir ris,4 room,440,pasir ris drive 4,118.0,model a,uncategorized,1989,1.369008,103.958697,0.0,pasir ris drive,pasir ris,east region,209700.0,1.137651115,1,3,2.0
1,2014-10,punggol,5-room,196B,punggol field,110.0,improved,uncategorized,2003,1.399007,103.906991,0.0,punggol field,punggol,north-east region,402300.0,0.118453389,10,12,11.0
2,2020-09,sengkang,5 room,404A,fernvale lane,112.0,premium apartment,uncategorized,2004,1.388348,103.873815,0.0,fernvale,sengkang,north-east region,351000.0,0.479542021,1,3,2.0
3,2000-10,clementi,3 room,375,clementi avenue 4,67.0,new generation,uncategorized,1980,1.318493,103.766702,0.0,clementi north,clementi,west region,151200.0,0.421345802,7,9,8.0
4,2013-01,bukit batok,3-room,163,bukit batok street 11,73.0,model a,uncategorized,1985,1.348149,103.742658,0.0,bukit batok west,bukit batok,west region,318600.0,0.775005151,7,9,8.0


## Convert "flat_type" to Ordinal Data Type "flattype_mapping"

In [7]:
train = train.replace(to_replace ="5 room", value ="5-room")
train = train.replace(to_replace ="4 room", value ="4-room")
train = train.replace(to_replace ="3 room", value ="3-room")
train = train.replace(to_replace ="2 room", value ="2-room")
train = train.replace(to_replace ="1 room", value ="1-room")
flattype_mapping={"1-room": 1,
                   "2-room": 2,
                   "3-room": 3,
                   "4-room": 4,
                   "5-room": 5,
                   "multi generation": 6,
                   "executive": 7,}
train["flattype_mapping"] = train['flat_type'].map(flattype_mapping)
train = train.drop(columns=['flat_type'])
train.head()

Unnamed: 0,month,town,block,street_name,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,resale_price,dist_mrt,lower,upper,storey_average,flattype_mapping
0,2001-08,pasir ris,440,pasir ris drive 4,118.0,model a,uncategorized,1989,1.369008,103.958697,0.0,pasir ris drive,pasir ris,east region,209700.0,1.137651115,1,3,2.0,4
1,2014-10,punggol,196B,punggol field,110.0,improved,uncategorized,2003,1.399007,103.906991,0.0,punggol field,punggol,north-east region,402300.0,0.118453389,10,12,11.0,5
2,2020-09,sengkang,404A,fernvale lane,112.0,premium apartment,uncategorized,2004,1.388348,103.873815,0.0,fernvale,sengkang,north-east region,351000.0,0.479542021,1,3,2.0,5
3,2000-10,clementi,375,clementi avenue 4,67.0,new generation,uncategorized,1980,1.318493,103.766702,0.0,clementi north,clementi,west region,151200.0,0.421345802,7,9,8.0,3
4,2013-01,bukit batok,163,bukit batok street 11,73.0,model a,uncategorized,1985,1.348149,103.742658,0.0,bukit batok west,bukit batok,west region,318600.0,0.775005151,7,9,8.0,3


## Convert "region" to Binary Data Type

In [8]:
one_hot_cols = pd.get_dummies(train, columns=['region'], prefix=['region'])
train = train.merge(one_hot_cols)
train = train.drop(columns=['region'])
train.head()

Unnamed: 0,month,town,block,street_name,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,longitude,...,dist_mrt,lower,upper,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region
0,2001-08,pasir ris,440,pasir ris drive 4,118.0,model a,uncategorized,1989,1.369008,103.958697,...,1.137651115,1,3,2.0,4,0,1,0,0,0
1,2014-10,punggol,196B,punggol field,110.0,improved,uncategorized,2003,1.399007,103.906991,...,0.118453389,10,12,11.0,5,0,0,0,1,0
2,2020-09,sengkang,404A,fernvale lane,112.0,premium apartment,uncategorized,2004,1.388348,103.873815,...,0.479542021,1,3,2.0,5,0,0,0,1,0
3,2000-10,clementi,375,clementi avenue 4,67.0,new generation,uncategorized,1980,1.318493,103.766702,...,0.421345802,7,9,8.0,3,0,0,0,0,1
4,2013-01,bukit batok,163,bukit batok street 11,73.0,model a,uncategorized,1985,1.348149,103.742658,...,0.775005151,7,9,8.0,3,0,0,0,0,1


## Feature engineering to create "remaining_lease" feature from "lease_commencement_date"

In [9]:
lease_commence_date = train['lease_commence_date'].to_numpy()
#assuming 99-year lease, we calculate the remaining years on the lease
remaining_lease = lease_commence_date + 99 - pd.DatetimeIndex(train['month']).year.to_numpy()
train['remaining_lease'] = remaining_lease
train = train.drop(columns=['lease_commence_date'])
train.head()

Unnamed: 0,month,town,block,street_name,floor_area_sqm,flat_model,eco_category,latitude,longitude,elevation,...,lower,upper,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease
0,2001-08,pasir ris,440,pasir ris drive 4,118.0,model a,uncategorized,1.369008,103.958697,0.0,...,1,3,2.0,4,0,1,0,0,0,87
1,2014-10,punggol,196B,punggol field,110.0,improved,uncategorized,1.399007,103.906991,0.0,...,10,12,11.0,5,0,0,0,1,0,88
2,2020-09,sengkang,404A,fernvale lane,112.0,premium apartment,uncategorized,1.388348,103.873815,0.0,...,1,3,2.0,5,0,0,0,1,0,83
3,2000-10,clementi,375,clementi avenue 4,67.0,new generation,uncategorized,1.318493,103.766702,0.0,...,7,9,8.0,3,0,0,0,0,1,79
4,2013-01,bukit batok,163,bukit batok street 11,73.0,model a,uncategorized,1.348149,103.742658,0.0,...,7,9,8.0,3,0,0,0,0,1,71


## Convert "flat_model" to Ordinal Data Type

In [10]:
flat_models = train.groupby('flat_model')
keys = flat_models.groups.keys()

flat_type_dict = {}

# store the average "resale_price" of each "flat_model" in a dictionary with
#    key : "flat_model"
#    value : average "resale_price"
for key in keys:
    df_i = flat_models.get_group(key)
    flat_type_dict.update({key : df_i["resale_price"].mean()})

# ordering the different "flat_model" by average "resale_price" (ascending order)
{k: v for k, v in sorted(flat_type_dict.items(), key=lambda item: item[1])}
# print(flat_type_dict)
# print(flat_type_dict.keys())

# save the ordered list of keys from dictionary as a list for easier index finding
flat_model_list = list(flat_type_dict.keys())

for flat_model in flat_model_list:
    ordinal_value = flat_model_list.index(flat_model) + 1
    train['flat_model'] = train['flat_model'].str.replace(rf'^{flat_model}$', str(ordinal_value), regex=True)
    
train.head()

Unnamed: 0,month,town,block,street_name,floor_area_sqm,flat_model,eco_category,latitude,longitude,elevation,...,lower,upper,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease
0,2001-08,pasir ris,440,pasir ris drive 4,118.0,8,uncategorized,1.369008,103.958697,0.0,...,1,3,2.0,4,0,1,0,0,0,87
1,2014-10,punggol,196B,punggol field,110.0,5,uncategorized,1.399007,103.906991,0.0,...,10,12,11.0,5,0,0,0,1,0,88
2,2020-09,sengkang,404A,fernvale lane,112.0,13,uncategorized,1.388348,103.873815,0.0,...,1,3,2.0,5,0,0,0,1,0,83
3,2000-10,clementi,375,clementi avenue 4,67.0,12,uncategorized,1.318493,103.766702,0.0,...,7,9,8.0,3,0,0,0,0,1,79
4,2013-01,bukit batok,163,bukit batok street 11,73.0,8,uncategorized,1.348149,103.742658,0.0,...,7,9,8.0,3,0,0,0,0,1,71


## Convert Date to DateTime Format

In [11]:
train_with_data = train.copy()
train['month'] = pd.to_datetime(train['month'])
train['month']=train['month'].map(dt.datetime.toordinal)

## Drop Columns "block", "street_name", and "eco_category"

In [12]:
train = train.drop(columns=['block', 'street_name', 'eco_category'])
train.head()

Unnamed: 0,month,town,floor_area_sqm,flat_model,latitude,longitude,elevation,subzone,planning_area,resale_price,...,lower,upper,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease
0,730698,pasir ris,118.0,8,1.369008,103.958697,0.0,pasir ris drive,pasir ris,209700.0,...,1,3,2.0,4,0,1,0,0,0,87
1,735507,punggol,110.0,5,1.399007,103.906991,0.0,punggol field,punggol,402300.0,...,10,12,11.0,5,0,0,0,1,0,88
2,737669,sengkang,112.0,13,1.388348,103.873815,0.0,fernvale,sengkang,351000.0,...,1,3,2.0,5,0,0,0,1,0,83
3,730394,clementi,67.0,12,1.318493,103.766702,0.0,clementi north,clementi,151200.0,...,7,9,8.0,3,0,0,0,0,1,79
4,734869,bukit batok,73.0,8,1.348149,103.742658,0.0,bukit batok west,bukit batok,318600.0,...,7,9,8.0,3,0,0,0,0,1,71


In [13]:
train = train.drop(columns=['elevation','town','subzone','planning_area'])
train.head()

Unnamed: 0,month,floor_area_sqm,flat_model,latitude,longitude,resale_price,dist_mrt,lower,upper,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease
0,730698,118.0,8,1.369008,103.958697,209700.0,1.137651115,1,3,2.0,4,0,1,0,0,0,87
1,735507,110.0,5,1.399007,103.906991,402300.0,0.118453389,10,12,11.0,5,0,0,0,1,0,88
2,737669,112.0,13,1.388348,103.873815,351000.0,0.479542021,1,3,2.0,5,0,0,0,1,0,83
3,730394,67.0,12,1.318493,103.766702,151200.0,0.421345802,7,9,8.0,3,0,0,0,0,1,79
4,734869,73.0,8,1.348149,103.742658,318600.0,0.775005151,7,9,8.0,3,0,0,0,0,1,71


## Pre-processing on Test Dataset

## Feature Engineering to Create "dist_mrt" from Auxiliary Data

In [14]:
test.dropna()
test = test.reset_index(drop=True)
test

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region
0,2004-01,bukit batok,4 room,186,bukit batok west avenue 6,04 to 06,94.0,new generation,uncategorized,1989,1.346581,103.744085,0.0,bukit batok west,bukit batok,west region
1,2001-11,tampines,5 room,366,tampines street 34,04 to 06,122.0,improved,uncategorized,1997,1.357618,103.961379,0.0,tampines east,tampines,east region
2,2002-07,jurong east,3 room,206,jurong east street 21,01 to 03,67.0,new generation,uncategorized,1982,1.337804,103.741998,0.0,toh guan,jurong east,west region
3,2015-04,ang mo kio,3 room,180,Ang Mo Kio Avenue 5,04 to 06,82.0,new generation,uncategorized,1981,1.380084,103.849574,0.0,yio chu kang east,ang mo kio,north-east region
4,2004-04,clementi,5 room,356,clementi avenue 2,01 to 03,117.0,standard,uncategorized,1978,1.313960,103.769831,0.0,clementi north,clementi,west region
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107929,2008-04,hougang,5 room,981D,buangkok crescent,10 to 12,110.0,improved,uncategorized,2003,1.380452,103.879333,0.0,trafalgar,hougang,north-east region
107930,2006-01,kallang/whampoa,4 room,13,upper boon keng road,13 to 15,102.0,model a,uncategorized,1999,1.314481,103.870458,0.0,boon keng,kallang,central region
107931,2000-01,kallang/whampoa,3 room,1,beach road,07 to 09,68.0,improved,uncategorized,1979,1.294924,103.854315,0.0,city hall,downtown core,central region
107932,2009-07,jurong west,4 room,919,jurong west street 91,10 to 12,104.0,model a,uncategorized,1988,1.339927,103.687354,0.0,yunnan,jurong west,west region


In [15]:
with open('./auxiliary-data/distance-to-mrt-test-edited.csv', newline='') as f:
    reader = csv.reader(f)
    mrt_list = list(reader)

In [16]:
test['dist_mrt'] = np.array(mrt_list)
test.head()

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,dist_mrt
0,2004-01,bukit batok,4 room,186,bukit batok west avenue 6,04 to 06,94.0,new generation,uncategorized,1989,1.346581,103.744085,0.0,bukit batok west,bukit batok,west region,0.667553928
1,2001-11,tampines,5 room,366,tampines street 34,04 to 06,122.0,improved,uncategorized,1997,1.357618,103.961379,0.0,tampines east,tampines,east region,0.767025394
2,2002-07,jurong east,3 room,206,jurong east street 21,01 to 03,67.0,new generation,uncategorized,1982,1.337804,103.741998,0.0,toh guan,jurong east,west region,0.515380197
3,2015-04,ang mo kio,3 room,180,Ang Mo Kio Avenue 5,04 to 06,82.0,new generation,uncategorized,1981,1.380084,103.849574,0.0,yio chu kang east,ang mo kio,north-east region,0.547114337
4,2004-04,clementi,5 room,356,clementi avenue 2,01 to 03,117.0,standard,uncategorized,1978,1.31396,103.769831,0.0,clementi north,clementi,west region,0.516298405


## Convert Date to DateTime Format

In [17]:
test['month'] = pd.to_datetime(test['month'])
test['month']=test['month'].map(dt.datetime.toordinal)

## Convert "storey_range" into Numerical Data Type "storey_average" 

In [18]:
test['lower'] = test['storey_range'].str.extract('(\d+)')
test['upper'] = test['storey_range'].str.extract('(\d+$)')
# calculate average storey from the range
test['storey_average'] = test['upper'].astype(int) + test['lower'].astype(int)
test['storey_average'] = test['storey_average'] / 2.0
test = test.drop(columns=['storey_range'])
test.head()

Unnamed: 0,month,town,flat_type,block,street_name,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,dist_mrt,lower,upper,storey_average
0,731581,bukit batok,4 room,186,bukit batok west avenue 6,94.0,new generation,uncategorized,1989,1.346581,103.744085,0.0,bukit batok west,bukit batok,west region,0.667553928,4,6,5.0
1,730790,tampines,5 room,366,tampines street 34,122.0,improved,uncategorized,1997,1.357618,103.961379,0.0,tampines east,tampines,east region,0.767025394,4,6,5.0
2,731032,jurong east,3 room,206,jurong east street 21,67.0,new generation,uncategorized,1982,1.337804,103.741998,0.0,toh guan,jurong east,west region,0.515380197,1,3,2.0
3,735689,ang mo kio,3 room,180,Ang Mo Kio Avenue 5,82.0,new generation,uncategorized,1981,1.380084,103.849574,0.0,yio chu kang east,ang mo kio,north-east region,0.547114337,4,6,5.0
4,731672,clementi,5 room,356,clementi avenue 2,117.0,standard,uncategorized,1978,1.31396,103.769831,0.0,clementi north,clementi,west region,0.516298405,1,3,2.0


## Convert "flat_type" to Ordinal Data Type "flattype_mapping"

In [19]:
test = test.replace(to_replace ="5 room", value ="5-room")
test = test.replace(to_replace ="4 room", value ="4-room")
test = test.replace(to_replace ="3 room", value ="3-room")
test = test.replace(to_replace ="2 room", value ="2-room")
test = test.replace(to_replace ="1 room", value ="1-room")
flattype_mapping={"1-room": 1,
                   "2-room": 2,
                   "3-room": 3,
                   "4-room": 4,
                   "5-room": 5,
                   "multi generation": 6,
                   "executive": 7,}
test["flattype_mapping"] = test['flat_type'].map(flattype_mapping)
test = test.drop(columns=['flat_type'])
test.head()

Unnamed: 0,month,town,block,street_name,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,dist_mrt,lower,upper,storey_average,flattype_mapping
0,731581,bukit batok,186,bukit batok west avenue 6,94.0,new generation,uncategorized,1989,1.346581,103.744085,0.0,bukit batok west,bukit batok,west region,0.667553928,4,6,5.0,4
1,730790,tampines,366,tampines street 34,122.0,improved,uncategorized,1997,1.357618,103.961379,0.0,tampines east,tampines,east region,0.767025394,4,6,5.0,5
2,731032,jurong east,206,jurong east street 21,67.0,new generation,uncategorized,1982,1.337804,103.741998,0.0,toh guan,jurong east,west region,0.515380197,1,3,2.0,3
3,735689,ang mo kio,180,Ang Mo Kio Avenue 5,82.0,new generation,uncategorized,1981,1.380084,103.849574,0.0,yio chu kang east,ang mo kio,north-east region,0.547114337,4,6,5.0,3
4,731672,clementi,356,clementi avenue 2,117.0,standard,uncategorized,1978,1.31396,103.769831,0.0,clementi north,clementi,west region,0.516298405,1,3,2.0,5


## Convert "region" to Binary Data Type

In [20]:
one_hot_cols = pd.get_dummies(test, columns=['region'], prefix=['region'])
test = test.merge(one_hot_cols)
test = test.drop(columns=['region'])
test.head()

Unnamed: 0,month,town,block,street_name,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,longitude,...,dist_mrt,lower,upper,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region
0,731581,bukit batok,186,bukit batok west avenue 6,94.0,new generation,uncategorized,1989,1.346581,103.744085,...,0.667553928,4,6,5.0,4,0,0,0,0,1
1,730790,tampines,366,tampines street 34,122.0,improved,uncategorized,1997,1.357618,103.961379,...,0.767025394,4,6,5.0,5,0,1,0,0,0
2,731032,jurong east,206,jurong east street 21,67.0,new generation,uncategorized,1982,1.337804,103.741998,...,0.515380197,1,3,2.0,3,0,0,0,0,1
3,735689,ang mo kio,180,Ang Mo Kio Avenue 5,82.0,new generation,uncategorized,1981,1.380084,103.849574,...,0.547114337,4,6,5.0,3,0,0,0,1,0
4,731672,clementi,356,clementi avenue 2,117.0,standard,uncategorized,1978,1.31396,103.769831,...,0.516298405,1,3,2.0,5,0,0,0,0,1


## Feature engineering to create "remaining_lease" feature from "lease_commencement_date"

In [21]:
lease_commence_date = test['lease_commence_date'].to_numpy()
#assuming 99-year lease, we calculate the remaining years on the lease
remaining_lease = lease_commence_date + 99 - pd.DatetimeIndex(test['month']).year.to_numpy()
test['remaining_lease'] = remaining_lease
test = test.drop(columns=['lease_commence_date'])
test.head()

Unnamed: 0,month,town,block,street_name,floor_area_sqm,flat_model,eco_category,latitude,longitude,elevation,...,lower,upper,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease
0,731581,bukit batok,186,bukit batok west avenue 6,94.0,new generation,uncategorized,1.346581,103.744085,0.0,...,4,6,5.0,4,0,0,0,0,1,118
1,730790,tampines,366,tampines street 34,122.0,improved,uncategorized,1.357618,103.961379,0.0,...,4,6,5.0,5,0,1,0,0,0,126
2,731032,jurong east,206,jurong east street 21,67.0,new generation,uncategorized,1.337804,103.741998,0.0,...,1,3,2.0,3,0,0,0,0,1,111
3,735689,ang mo kio,180,Ang Mo Kio Avenue 5,82.0,new generation,uncategorized,1.380084,103.849574,0.0,...,4,6,5.0,3,0,0,0,1,0,110
4,731672,clementi,356,clementi avenue 2,117.0,standard,uncategorized,1.31396,103.769831,0.0,...,1,3,2.0,5,0,0,0,0,1,107


## Convert "flat_model" to Ordinal Data Type

In [22]:
# uses the same "flat_model_list" extracted from the "flat_model" dictionary calculated from train dataset
for flat_model in flat_model_list:
    ordinal_value = flat_model_list.index(flat_model) + 1
    test['flat_model'] = test['flat_model'].str.replace(rf'^{flat_model}$', str(ordinal_value), regex=True)

test.head()

Unnamed: 0,month,town,block,street_name,floor_area_sqm,flat_model,eco_category,latitude,longitude,elevation,...,lower,upper,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease
0,731581,bukit batok,186,bukit batok west avenue 6,94.0,12,uncategorized,1.346581,103.744085,0.0,...,4,6,5.0,4,0,0,0,0,1,118
1,730790,tampines,366,tampines street 34,122.0,5,uncategorized,1.357618,103.961379,0.0,...,4,6,5.0,5,0,1,0,0,0,126
2,731032,jurong east,206,jurong east street 21,67.0,12,uncategorized,1.337804,103.741998,0.0,...,1,3,2.0,3,0,0,0,0,1,111
3,735689,ang mo kio,180,Ang Mo Kio Avenue 5,82.0,12,uncategorized,1.380084,103.849574,0.0,...,4,6,5.0,3,0,0,0,1,0,110
4,731672,clementi,356,clementi avenue 2,117.0,17,uncategorized,1.31396,103.769831,0.0,...,1,3,2.0,5,0,0,0,0,1,107


## Drop Columns "block", "street_name", and "eco_category"

In [23]:
test = test.drop(columns=['block', 'street_name', 'eco_category'])
test.head()

Unnamed: 0,month,town,floor_area_sqm,flat_model,latitude,longitude,elevation,subzone,planning_area,dist_mrt,lower,upper,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease
0,731581,bukit batok,94.0,12,1.346581,103.744085,0.0,bukit batok west,bukit batok,0.667553928,4,6,5.0,4,0,0,0,0,1,118
1,730790,tampines,122.0,5,1.357618,103.961379,0.0,tampines east,tampines,0.767025394,4,6,5.0,5,0,1,0,0,0,126
2,731032,jurong east,67.0,12,1.337804,103.741998,0.0,toh guan,jurong east,0.515380197,1,3,2.0,3,0,0,0,0,1,111
3,735689,ang mo kio,82.0,12,1.380084,103.849574,0.0,yio chu kang east,ang mo kio,0.547114337,4,6,5.0,3,0,0,0,1,0,110
4,731672,clementi,117.0,17,1.31396,103.769831,0.0,clementi north,clementi,0.516298405,1,3,2.0,5,0,0,0,0,1,107


In [24]:
test = test.drop(columns=['elevation','town','subzone','planning_area'])
test.head()

Unnamed: 0,month,floor_area_sqm,flat_model,latitude,longitude,dist_mrt,lower,upper,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease
0,731581,94.0,12,1.346581,103.744085,0.667553928,4,6,5.0,4,0,0,0,0,1,118
1,730790,122.0,5,1.357618,103.961379,0.767025394,4,6,5.0,5,0,1,0,0,0,126
2,731032,67.0,12,1.337804,103.741998,0.515380197,1,3,2.0,3,0,0,0,0,1,111
3,735689,82.0,12,1.380084,103.849574,0.547114337,4,6,5.0,3,0,0,0,1,0,110
4,731672,117.0,17,1.31396,103.769831,0.516298405,1,3,2.0,5,0,0,0,0,1,107


In [25]:
train.head()

Unnamed: 0,month,floor_area_sqm,flat_model,latitude,longitude,resale_price,dist_mrt,lower,upper,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease
0,730698,118.0,8,1.369008,103.958697,209700.0,1.137651115,1,3,2.0,4,0,1,0,0,0,87
1,735507,110.0,5,1.399007,103.906991,402300.0,0.118453389,10,12,11.0,5,0,0,0,1,0,88
2,737669,112.0,13,1.388348,103.873815,351000.0,0.479542021,1,3,2.0,5,0,0,0,1,0,83
3,730394,67.0,12,1.318493,103.766702,151200.0,0.421345802,7,9,8.0,3,0,0,0,0,1,79
4,734869,73.0,8,1.348149,103.742658,318600.0,0.775005151,7,9,8.0,3,0,0,0,0,1,71


# Linear Regression implementation for full dataset

In [26]:
X = train.drop (["resale_price"],axis = 1)
y = train["resale_price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [27]:
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression()

In [28]:
print(regressor.intercept_)
print(regressor.coef_)
print('Training score: {}'.format(regressor.score(X_train, y_train)))
print('Test score: {}'.format(regressor.score(X_test, y_test)))

-29578542.444176175
[ 3.64634067e+01  2.41284647e+03  9.88004200e+02 -9.88322329e+05
  3.76200389e+04 -1.56009811e+04 -3.60901041e+04  3.83321475e+04
  1.12102170e+03  1.64102607e+04  2.36717556e+04 -1.29708516e+04
  2.06437277e+04  3.70706495e+03 -3.50516967e+04  2.22881528e+03]
Training score: 0.7915196159422424
Test score: 0.7896683181682163


In [29]:
y_pred = regressor.predict(X_test)
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(df)
print('RMSE for Linear Regression => ',np.sqrt(mean_squared_error(y_test,y_pred)))

          Actual      Predicted
205085  414000.0  321251.675587
394509  328500.0  377537.550936
283069   90000.0  160895.924973
69767   225000.0  292097.153898
384613  202500.0  176501.151111
...          ...            ...
413904  279000.0  275790.558759
399764  328500.0  313847.765426
142745  297000.0  288016.756321
8845    310500.0  328663.581610
383030  137700.0  157372.434380

[86476 rows x 2 columns]
RMSE for Linear Regression =>  59064.35155993195


In [30]:
regressor = LinearRegression()
regressor.fit(X, y)

LinearRegression()

In [31]:
y_pred = regressor.predict(X)
df = pd.DataFrame({'Actual': y, 'Predicted': y_pred})
print(df)

          Actual      Predicted
0       209700.0  226048.044937
1       402300.0  428609.102053
2       351000.0  482433.949383
3       151200.0  113213.935480
4       318600.0  233385.832714
...          ...            ...
432371  238500.0  228483.682232
432372  376200.0  435768.424505
432373  255600.0  214448.282281
432374  508500.0  457083.305809
432375  162000.0  235424.226545

[432376 rows x 2 columns]


# Polinomial regression for full dataset

In [32]:
poly_reg = PolynomialFeatures(degree = 4)
X_poly = poly_reg.fit_transform(X)
print(len(X_poly))
print(len(y))
lin_reg = LinearRegression()
lin_reg.fit(X_poly,y)

432376
432376


LinearRegression()

In [33]:
print('Training score: {}'.format(lin_reg.score(X_poly, y)))
# print('Training score: {}'.format(lin_reg.score(X_test, y_test)))

Training score: 0.8686771561473176


In [34]:
y_pred = lin_reg.predict(X_poly)
df = pd.DataFrame({'Real Values':y, 'Predicted Values':y_pred})
df

Unnamed: 0,Real Values,Predicted Values
0,209700.0,219467.939266
1,402300.0,396791.174320
2,351000.0,488037.831478
3,151200.0,128535.053570
4,318600.0,252649.576096
...,...,...
432371,238500.0,215668.427620
432372,376200.0,461025.722519
432373,255600.0,239826.167221
432374,508500.0,424534.370373


In [35]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
steps = [
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=3)),
    ('model', LinearRegression())
]
pipeline = Pipeline(steps)
pipeline.fit(X_train, y_train)
print('Training score: {}'.format(pipeline.score(X_train, y_train)))
print('Test score: {}'.format(pipeline.score(X_test, y_test)))

Training score: 0.9322848214660738
Test score: 0.9315496441035966


In [36]:
y_pred_pipeline = pipeline.predict(X_test)
df = pd.DataFrame({'Real Values':y_test, 'Predicted Values':y_pred_pipeline})
print(df)
mse = mean_squared_error(y_test, y_pred_pipeline)
rmse = math.sqrt(mse)
print('RMSE: {}'.format(rmse))

        Real Values  Predicted Values
205085     414000.0         349761.25
394509     328500.0         320681.25
283069      90000.0          89868.00
69767      225000.0         198203.75
384613     202500.0         204339.50
...             ...               ...
413904     279000.0         299682.25
399764     328500.0         308660.00
142745     297000.0         293021.00
8845       310500.0         355029.75
383030     137700.0         143899.75

[86476 rows x 2 columns]
RMSE: 33694.649912894405


In [37]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
steps = [
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=4)),
    ('model', LinearRegression())
]
pipeline = Pipeline(steps)
pipeline.fit(X_train, y_train)
print('Training score: {}'.format(pipeline.score(X_train, y_train)))
print('Test score: {}'.format(pipeline.score(X_test, y_test)))

Training score: 0.9511139066340778
Test score: 0.9499397968548


In [38]:
y_pred_pipeline = pipeline.predict(X_test)
df = pd.DataFrame({'Real Values':y_test, 'Predicted Values':y_pred_pipeline})
print(df)
mse = mean_squared_error(y_test, y_pred_pipeline)
rmse = math.sqrt(mse)
print('RMSE: {}'.format(rmse))

        Real Values  Predicted Values
205085     414000.0     342087.164062
394509     328500.0     325444.664062
283069      90000.0      59516.429688
69767      225000.0     216292.710938
384613     202500.0     224053.562500
...             ...               ...
413904     279000.0     294682.476562
399764     328500.0     333414.617188
142745     297000.0     294229.578125
8845       310500.0     343485.046875
383030     137700.0     142557.882812

[86476 rows x 2 columns]
RMSE: 28815.051292056713


# Split for 3 periods 

In [39]:
train_with_data = train_with_data.drop(columns=['elevation','town','subzone','planning_area','block', 'street_name', 'eco_category'])
train_with_data.head()

Unnamed: 0,month,floor_area_sqm,flat_model,latitude,longitude,resale_price,dist_mrt,lower,upper,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease
0,2001-08,118.0,8,1.369008,103.958697,209700.0,1.137651115,1,3,2.0,4,0,1,0,0,0,87
1,2014-10,110.0,5,1.399007,103.906991,402300.0,0.118453389,10,12,11.0,5,0,0,0,1,0,88
2,2020-09,112.0,13,1.388348,103.873815,351000.0,0.479542021,1,3,2.0,5,0,0,0,1,0,83
3,2000-10,67.0,12,1.318493,103.766702,151200.0,0.421345802,7,9,8.0,3,0,0,0,0,1,79
4,2013-01,73.0,8,1.348149,103.742658,318600.0,0.775005151,7,9,8.0,3,0,0,0,0,1,71


Splitting data 

In [40]:
train_first = train_with_data[train_with_data['month']< '2007-01']
train_second = train_with_data[train_with_data['month'] > '2007-01']
train_second = train_second[train_second['month']< '2013-01']
train_third = train_with_data[train_with_data['month']>'2013-01']

convert date to dataframe format

In [41]:
train_first['month'] = pd.to_datetime(train_first['month'])
train_first['month']=train_first['month'].map(dt.datetime.toordinal)
train_second['month'] = pd.to_datetime(train_second['month'])
train_second['month']=train_second['month'].map(dt.datetime.toordinal)
train_third['month'] = pd.to_datetime(train_third['month'])
train_third['month']=train_third['month'].map(dt.datetime.toordinal)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docum

train first dataset

In [42]:
X = train_first.drop (["resale_price"],axis = 1)
y = train_first["resale_price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [43]:
regressor = LinearRegression()
regressor.fit(X_train, y_train)
print('Training score: {}'.format(regressor.score(X_train, y_train)))
print('Test score: {}'.format(regressor.score(X_test, y_test)))
y_pred = regressor.predict(X_test)
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(df)
print('RMSE for Linear Regression=>',np.sqrt(mean_squared_error(y_test,y_pred)))

Training score: 0.857404945395325
Test score: 0.8586169601696162
          Actual      Predicted
49377   135000.0  136039.430663
236602  184500.0  193108.222995
174747  272700.0  272322.936568
137429  249300.0  253741.885109
419928  319500.0  350455.396843
...          ...            ...
200987  142200.0  136341.138528
427881  260100.0  319945.159026
306968  349200.0  290370.900380
179720  135000.0  102860.515114
414925  238500.0  257188.267741

[35449 rows x 2 columns]
RMSE for Linear Regression=> 30318.04828804393


In [44]:
steps = [
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=4)),
    ('model', LinearRegression())
]

pipeline = Pipeline(steps)

pipeline.fit(X_train, y_train)

print('Training score: {}'.format(pipeline.score(X_train, y_train)))
print('Test score: {}'.format(pipeline.score(X_test, y_test)))

Training score: 0.9653919080055593
Test score: 0.9623866530390116


In [45]:
y_pred_pipeline = pipeline.predict(X_test)
df = pd.DataFrame({'Real Values':y_test, 'Predicted Values':y_pred_pipeline})
print(df)
mse = mean_squared_error(y_test, y_pred_pipeline)
rmse = math.sqrt(mse)
print('RMSE: {}'.format(rmse))

        Real Values  Predicted Values
49377      135000.0      141388.28125
236602     184500.0      196992.68750
174747     272700.0      264190.78125
137429     249300.0      245778.87500
419928     319500.0      325419.62500
...             ...               ...
200987     142200.0      143307.03125
427881     260100.0      305751.03125
306968     349200.0      335730.21875
179720     135000.0      140682.93750
414925     238500.0      226197.37500

[35449 rows x 2 columns]
RMSE: 15637.725049115412


train second dataset

In [46]:
X = train_second.drop (["resale_price"],axis = 1)
y = train_second["resale_price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [47]:
regressor = LinearRegression()
regressor.fit(X_train, y_train)
print('Training score: {}'.format(regressor.score(X_train, y_train)))
print('Test score: {}'.format(regressor.score(X_test, y_test)))
y_pred = regressor.predict(X_test)
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(df)
print('RMSE for Linear Regression=>',np.sqrt(mean_squared_error(y_test,y_pred)))

Training score: 0.8631737131355272
Test score: 0.8614856930566235
          Actual      Predicted
46956   461700.0  425711.283471
357492  229500.0  303833.513203
307424  450000.0  463831.255797
432363  279000.0  306407.577567
324661  400500.0  380242.397495
...          ...            ...
340007  314100.0  314727.395942
205710  218700.0  220642.784355
295833  483300.0  478559.175982
357376  270000.0  267128.879834
421925  310500.0  332123.298757

[25954 rows x 2 columns]
RMSE for Linear Regression=> 39840.96453325132


In [48]:
steps = [
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=4)),
    ('model', LinearRegression())
]

pipeline = Pipeline(steps)

pipeline.fit(X_train, y_train)

print('Training score: {}'.format(pipeline.score(X_train, y_train)))
print('Test score: {}'.format(pipeline.score(X_test, y_test)))

Training score: 0.9651625203016343
Test score: 0.9628405808184206


In [49]:
y_pred_pipeline = pipeline.predict(X_test)
df = pd.DataFrame({'Real Values':y_test, 'Predicted Values':y_pred_pipeline})
print(df)
mse = mean_squared_error(y_test, y_pred_pipeline)
rmse = math.sqrt(mse)
print('RMSE: {}'.format(rmse))

        Real Values  Predicted Values
46956      461700.0      422229.93750
357492     229500.0      259350.78125
307424     450000.0      510604.25000
432363     279000.0      289783.90625
324661     400500.0      404331.06250
...             ...               ...
340007     314100.0      287772.50000
205710     218700.0      208836.96875
295833     483300.0      485509.12500
357376     270000.0      260361.37500
421925     310500.0      301294.96875

[25954 rows x 2 columns]
RMSE: 20635.594532602005


In [50]:
X = train_second.drop (["resale_price"],axis = 1)
y = train_second["resale_price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [51]:
regressor = LinearRegression()
regressor.fit(X_train, y_train)
print('Training score: {}'.format(regressor.score(X_train, y_train)))
print('Test score: {}'.format(regressor.score(X_test, y_test)))
y_pred = regressor.predict(X_test)
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(df)
print('RMSE for Linear Regression=>',np.sqrt(mean_squared_error(y_test,y_pred)))

Training score: 0.8631737131355272
Test score: 0.8614856930566235
          Actual      Predicted
46956   461700.0  425711.283471
357492  229500.0  303833.513203
307424  450000.0  463831.255797
432363  279000.0  306407.577567
324661  400500.0  380242.397495
...          ...            ...
340007  314100.0  314727.395942
205710  218700.0  220642.784355
295833  483300.0  478559.175982
357376  270000.0  267128.879834
421925  310500.0  332123.298757

[25954 rows x 2 columns]
RMSE for Linear Regression=> 39840.96453325132


In [52]:
steps = [
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=4)),
    ('model', LinearRegression())
]

pipeline = Pipeline(steps)

pipeline.fit(X_train, y_train)

print('Training score: {}'.format(pipeline.score(X_train, y_train)))
print('Test score: {}'.format(pipeline.score(X_test, y_test)))

Training score: 0.9651625203016343
Test score: 0.9628405808184206


In [53]:
y_pred_pipeline = pipeline.predict(X_test)
df = pd.DataFrame({'Real Values':y_test, 'Predicted Values':y_pred_pipeline})
print(df)
mse = mean_squared_error(y_test, y_pred_pipeline)
rmse = math.sqrt(mse)
print('RMSE: {}'.format(rmse))

        Real Values  Predicted Values
46956      461700.0      422229.93750
357492     229500.0      259350.78125
307424     450000.0      510604.25000
432363     279000.0      289783.90625
324661     400500.0      404331.06250
...             ...               ...
340007     314100.0      287772.50000
205710     218700.0      208836.96875
295833     483300.0      485509.12500
357376     270000.0      260361.37500
421925     310500.0      301294.96875

[25954 rows x 2 columns]
RMSE: 20635.594532602005
