# EDA and Data Pre-processing

## Importing Packages

In [501]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pprint
import csv
import datetime as dt
from sklearn.metrics import classification_report, confusion_matrix, roc_curve
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
import math
import xgboost

## Importing Dataset

In [502]:
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

## Pre-processing on Train Dataset

## Repeat pre-cleaning below before training the model

## Feature Engineering to Create "dist_mrt" from Auxiliary Data

In [503]:
# do not change order of this .dropna() function
train.dropna()
train = train.reset_index(drop=True)
train

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,resale_price
0,2001-08,pasir ris,4 room,440,pasir ris drive 4,01 to 03,118.00,model a,uncategorized,1989,1.37,103.96,0.00,pasir ris drive,pasir ris,east region,209700.00
1,2014-10,punggol,5-room,196B,punggol field,10 to 12,110.00,improved,uncategorized,2003,1.40,103.91,0.00,punggol field,punggol,north-east region,402300.00
2,2020-09,sengkang,5 room,404A,fernvale lane,01 to 03,112.00,premium apartment,uncategorized,2004,1.39,103.87,0.00,fernvale,sengkang,north-east region,351000.00
3,2000-10,clementi,3 room,375,clementi avenue 4,07 to 09,67.00,new generation,uncategorized,1980,1.32,103.77,0.00,clementi north,clementi,west region,151200.00
4,2013-01,bukit batok,3-room,163,bukit batok street 11,07 to 09,73.00,model a,uncategorized,1985,1.35,103.74,0.00,bukit batok west,bukit batok,west region,318600.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
431727,2005-03,woodlands,4 room,537,Woodlands Drive 16,01 to 03,101.00,model a,uncategorized,2000,1.43,103.79,0.00,woodlands south,woodlands,north region,238500.00
431728,2016-04,sengkang,4 room,410A,fernvale road,13 to 15,95.00,premium apartment,uncategorized,2012,1.39,103.88,0.00,fernvale,sengkang,north-east region,376200.00
431729,2011-01,tampines,3-room,829,tampines street 81,01 to 03,67.00,new generation,uncategorized,1986,1.35,103.93,0.00,tampines west,tampines,east region,255600.00
431730,2013-05,sengkang,5-room,233,compassvale walk,16 to 18,123.00,improved,uncategorized,1999,1.39,103.90,0.00,sengkang town centre,sengkang,north-east region,508500.00


In [504]:
with open('./auxiliary-data/distance-to-mrt.csv', newline='') as f:
    reader = csv.reader(f)
    mrt_list = list(reader)

In [505]:
train['dist_mrt'] = np.array(mrt_list).astype(float)
train.head()

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,resale_price,dist_mrt
0,2001-08,pasir ris,4 room,440,pasir ris drive 4,01 to 03,118.0,model a,uncategorized,1989,1.37,103.96,0.0,pasir ris drive,pasir ris,east region,209700.0,1.14
1,2014-10,punggol,5-room,196B,punggol field,10 to 12,110.0,improved,uncategorized,2003,1.4,103.91,0.0,punggol field,punggol,north-east region,402300.0,0.12
2,2020-09,sengkang,5 room,404A,fernvale lane,01 to 03,112.0,premium apartment,uncategorized,2004,1.39,103.87,0.0,fernvale,sengkang,north-east region,351000.0,0.48
3,2000-10,clementi,3 room,375,clementi avenue 4,07 to 09,67.0,new generation,uncategorized,1980,1.32,103.77,0.0,clementi north,clementi,west region,151200.0,0.42
4,2013-01,bukit batok,3-room,163,bukit batok street 11,07 to 09,73.0,model a,uncategorized,1985,1.35,103.74,0.0,bukit batok west,bukit batok,west region,318600.0,0.78


## Convert "storey_range" into Numerical Data Type "storey_average"

In [506]:
# first number in "storey_range"
train['lower'] = train['storey_range'].str.extract('(\d+)').astype(str).astype(int)
train['upper'] = train['storey_range'].str.extract('(\d+$)').astype(str).astype(int)
# calculate average storey from the range
train['storey_average'] = train['upper'] + train['lower']
train['storey_average'] = train['storey_average'] / 2.0
train = train.drop(columns=['storey_range', 'lower', 'upper'])
train.head()

Unnamed: 0,month,town,flat_type,block,street_name,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,resale_price,dist_mrt,storey_average
0,2001-08,pasir ris,4 room,440,pasir ris drive 4,118.0,model a,uncategorized,1989,1.37,103.96,0.0,pasir ris drive,pasir ris,east region,209700.0,1.14,2.0
1,2014-10,punggol,5-room,196B,punggol field,110.0,improved,uncategorized,2003,1.4,103.91,0.0,punggol field,punggol,north-east region,402300.0,0.12,11.0
2,2020-09,sengkang,5 room,404A,fernvale lane,112.0,premium apartment,uncategorized,2004,1.39,103.87,0.0,fernvale,sengkang,north-east region,351000.0,0.48,2.0
3,2000-10,clementi,3 room,375,clementi avenue 4,67.0,new generation,uncategorized,1980,1.32,103.77,0.0,clementi north,clementi,west region,151200.0,0.42,8.0
4,2013-01,bukit batok,3-room,163,bukit batok street 11,73.0,model a,uncategorized,1985,1.35,103.74,0.0,bukit batok west,bukit batok,west region,318600.0,0.78,8.0


## Convert "flat_type" to Ordinal Data Type "flattype_mapping"

In [507]:
train = train.replace(to_replace ="5 room", value ="5-room")
train = train.replace(to_replace ="4 room", value ="4-room")
train = train.replace(to_replace ="3 room", value ="3-room")
train = train.replace(to_replace ="2 room", value ="2-room")
train = train.replace(to_replace ="1 room", value ="1-room")
flattype_mapping={"1-room": 1,
                   "2-room": 2,
                   "3-room": 3,
                   "4-room": 4,
                   "5-room": 5,
                   "multi generation": 6,
                   "executive": 7,}
train["flattype_mapping"] = train['flat_type'].map(flattype_mapping)
train = train.drop(columns=['flat_type'])
train.head()

Unnamed: 0,month,town,block,street_name,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,resale_price,dist_mrt,storey_average,flattype_mapping
0,2001-08,pasir ris,440,pasir ris drive 4,118.0,model a,uncategorized,1989,1.37,103.96,0.0,pasir ris drive,pasir ris,east region,209700.0,1.14,2.0,4
1,2014-10,punggol,196B,punggol field,110.0,improved,uncategorized,2003,1.4,103.91,0.0,punggol field,punggol,north-east region,402300.0,0.12,11.0,5
2,2020-09,sengkang,404A,fernvale lane,112.0,premium apartment,uncategorized,2004,1.39,103.87,0.0,fernvale,sengkang,north-east region,351000.0,0.48,2.0,5
3,2000-10,clementi,375,clementi avenue 4,67.0,new generation,uncategorized,1980,1.32,103.77,0.0,clementi north,clementi,west region,151200.0,0.42,8.0,3
4,2013-01,bukit batok,163,bukit batok street 11,73.0,model a,uncategorized,1985,1.35,103.74,0.0,bukit batok west,bukit batok,west region,318600.0,0.78,8.0,3


## Convert "region" to one-hot encoding

In [508]:
train = pd.get_dummies(train, columns=['region'], prefix=['region'])
train

Unnamed: 0,month,town,block,street_name,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,longitude,...,planning_area,resale_price,dist_mrt,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region
0,2001-08,pasir ris,440,pasir ris drive 4,118.00,model a,uncategorized,1989,1.37,103.96,...,pasir ris,209700.00,1.14,2.00,4,0,1,0,0,0
1,2014-10,punggol,196B,punggol field,110.00,improved,uncategorized,2003,1.40,103.91,...,punggol,402300.00,0.12,11.00,5,0,0,0,1,0
2,2020-09,sengkang,404A,fernvale lane,112.00,premium apartment,uncategorized,2004,1.39,103.87,...,sengkang,351000.00,0.48,2.00,5,0,0,0,1,0
3,2000-10,clementi,375,clementi avenue 4,67.00,new generation,uncategorized,1980,1.32,103.77,...,clementi,151200.00,0.42,8.00,3,0,0,0,0,1
4,2013-01,bukit batok,163,bukit batok street 11,73.00,model a,uncategorized,1985,1.35,103.74,...,bukit batok,318600.00,0.78,8.00,3,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
431727,2005-03,woodlands,537,Woodlands Drive 16,101.00,model a,uncategorized,2000,1.43,103.79,...,woodlands,238500.00,0.30,2.00,4,0,0,1,0,0
431728,2016-04,sengkang,410A,fernvale road,95.00,premium apartment,uncategorized,2012,1.39,103.88,...,sengkang,376200.00,0.21,14.00,4,0,0,0,1,0
431729,2011-01,tampines,829,tampines street 81,67.00,new generation,uncategorized,1986,1.35,103.93,...,tampines,255600.00,0.57,2.00,3,0,1,0,0,0
431730,2013-05,sengkang,233,compassvale walk,123.00,improved,uncategorized,1999,1.39,103.90,...,sengkang,508500.00,0.50,17.00,5,0,0,0,1,0


## Feature engineering to create "remaining_lease" feature from "lease_commencement_date"

In [509]:
lease_commence_date = train['lease_commence_date'].to_numpy()
#assuming 99-year lease, we calculate the remaining years on the lease
remaining_lease = lease_commence_date + 99 - pd.DatetimeIndex(train['month']).year.to_numpy()
train['remaining_lease'] = remaining_lease
train = train.drop(columns=['lease_commence_date'])
train.head()

Unnamed: 0,month,town,block,street_name,floor_area_sqm,flat_model,eco_category,latitude,longitude,elevation,...,resale_price,dist_mrt,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease
0,2001-08,pasir ris,440,pasir ris drive 4,118.0,model a,uncategorized,1.37,103.96,0.0,...,209700.0,1.14,2.0,4,0,1,0,0,0,87
1,2014-10,punggol,196B,punggol field,110.0,improved,uncategorized,1.4,103.91,0.0,...,402300.0,0.12,11.0,5,0,0,0,1,0,88
2,2020-09,sengkang,404A,fernvale lane,112.0,premium apartment,uncategorized,1.39,103.87,0.0,...,351000.0,0.48,2.0,5,0,0,0,1,0,83
3,2000-10,clementi,375,clementi avenue 4,67.0,new generation,uncategorized,1.32,103.77,0.0,...,151200.0,0.42,8.0,3,0,0,0,0,1,79
4,2013-01,bukit batok,163,bukit batok street 11,73.0,model a,uncategorized,1.35,103.74,0.0,...,318600.0,0.78,8.0,3,0,0,0,0,1,71


## Convert "flat_model" to Ordinal Data Type

In [510]:
flat_models = train.groupby('flat_model')
keys = flat_models.groups.keys()

flat_type_dict = {}

# store the average "resale_price" of each "flat_model" in a dictionary with
#    key : "flat_model"
#    value : average "resale_price"
for key in keys:
    df_i = flat_models.get_group(key)
    flat_type_dict.update({key : df_i["resale_price"].mean()})

# ordering the different "flat_model" by average "resale_price" (ascending order)
{k: v for k, v in sorted(flat_type_dict.items(), key=lambda item: item[1])}
# print(flat_type_dict)
# print(flat_type_dict.keys())

# save the ordered list of keys from dictionary as a list for easier index finding
flat_model_list = list(flat_type_dict.keys())
print(flat_model_list)

for flat_model in flat_model_list:
    ordinal_value = flat_model_list.index(flat_model) + 1
    train['flat_model'] = train['flat_model'].str.replace(rf'^{flat_model}$', str(ordinal_value), regex=True)
    
train['flat_model'] = np.array(train['flat_model']).astype(str).astype(int)
train.head()


['2-room', 'adjoined flat', 'apartment', 'dbss', 'improved', 'improved maisonette', 'maisonette', 'model a', 'model a maisonette', 'model a2', 'multi generation', 'new generation', 'premium apartment', 'premium apartment loft', 'premium maisonette', 'simplified', 'standard', 'terrace', 'type s1', 'type s2']


Unnamed: 0,month,town,block,street_name,floor_area_sqm,flat_model,eco_category,latitude,longitude,elevation,...,resale_price,dist_mrt,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease
0,2001-08,pasir ris,440,pasir ris drive 4,118.0,8,uncategorized,1.37,103.96,0.0,...,209700.0,1.14,2.0,4,0,1,0,0,0,87
1,2014-10,punggol,196B,punggol field,110.0,5,uncategorized,1.4,103.91,0.0,...,402300.0,0.12,11.0,5,0,0,0,1,0,88
2,2020-09,sengkang,404A,fernvale lane,112.0,13,uncategorized,1.39,103.87,0.0,...,351000.0,0.48,2.0,5,0,0,0,1,0,83
3,2000-10,clementi,375,clementi avenue 4,67.0,12,uncategorized,1.32,103.77,0.0,...,151200.0,0.42,8.0,3,0,0,0,0,1,79
4,2013-01,bukit batok,163,bukit batok street 11,73.0,8,uncategorized,1.35,103.74,0.0,...,318600.0,0.78,8.0,3,0,0,0,0,1,71


## Drop Columns "block", "street_name", and "eco_category"

In [511]:
train = train.drop(columns=['block', 'street_name', 'eco_category'])
train.head()

Unnamed: 0,month,town,floor_area_sqm,flat_model,latitude,longitude,elevation,subzone,planning_area,resale_price,dist_mrt,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease
0,2001-08,pasir ris,118.0,8,1.37,103.96,0.0,pasir ris drive,pasir ris,209700.0,1.14,2.0,4,0,1,0,0,0,87
1,2014-10,punggol,110.0,5,1.4,103.91,0.0,punggol field,punggol,402300.0,0.12,11.0,5,0,0,0,1,0,88
2,2020-09,sengkang,112.0,13,1.39,103.87,0.0,fernvale,sengkang,351000.0,0.48,2.0,5,0,0,0,1,0,83
3,2000-10,clementi,67.0,12,1.32,103.77,0.0,clementi north,clementi,151200.0,0.42,8.0,3,0,0,0,0,1,79
4,2013-01,bukit batok,73.0,8,1.35,103.74,0.0,bukit batok west,bukit batok,318600.0,0.78,8.0,3,0,0,0,0,1,71


## Drop Columns "elevation", "town", "subzone" and "planning_area"

In [512]:
train = train.drop(columns=['elevation','town','subzone','planning_area'])
train.head()

Unnamed: 0,month,floor_area_sqm,flat_model,latitude,longitude,resale_price,dist_mrt,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease
0,2001-08,118.0,8,1.37,103.96,209700.0,1.14,2.0,4,0,1,0,0,0,87
1,2014-10,110.0,5,1.4,103.91,402300.0,0.12,11.0,5,0,0,0,1,0,88
2,2020-09,112.0,13,1.39,103.87,351000.0,0.48,2.0,5,0,0,0,1,0,83
3,2000-10,67.0,12,1.32,103.77,151200.0,0.42,8.0,3,0,0,0,0,1,79
4,2013-01,73.0,8,1.35,103.74,318600.0,0.78,8.0,3,0,0,0,0,1,71


## Convert "month" to datetime

In [513]:
train['month'] = pd.to_datetime(train['month'])
train['datetime_month']= train['month'].map(dt.datetime.toordinal)

In [514]:
train_with_data = train.copy()
train_with_data.head()

Unnamed: 0,month,floor_area_sqm,flat_model,latitude,longitude,resale_price,dist_mrt,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease,datetime_month
0,2001-08-01,118.0,8,1.37,103.96,209700.0,1.14,2.0,4,0,1,0,0,0,87,730698
1,2014-10-01,110.0,5,1.4,103.91,402300.0,0.12,11.0,5,0,0,0,1,0,88,735507
2,2020-09-01,112.0,13,1.39,103.87,351000.0,0.48,2.0,5,0,0,0,1,0,83,737669
3,2000-10-01,67.0,12,1.32,103.77,151200.0,0.42,8.0,3,0,0,0,0,1,79,730394
4,2013-01-01,73.0,8,1.35,103.74,318600.0,0.78,8.0,3,0,0,0,0,1,71,734869


## Pre-processing on Test Dataset

## Feature Engineering to Create "dist_mrt" from Auxiliary Data

In [515]:
test.dropna()
test = test.reset_index(drop=True)
test

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region
0,2004-01,bukit batok,4 room,186,bukit batok west avenue 6,04 to 06,94.00,new generation,uncategorized,1989,1.35,103.74,0.00,bukit batok west,bukit batok,west region
1,2001-11,tampines,5 room,366,tampines street 34,04 to 06,122.00,improved,uncategorized,1997,1.36,103.96,0.00,tampines east,tampines,east region
2,2002-07,jurong east,3 room,206,jurong east street 21,01 to 03,67.00,new generation,uncategorized,1982,1.34,103.74,0.00,toh guan,jurong east,west region
3,2015-04,ang mo kio,3 room,180,Ang Mo Kio Avenue 5,04 to 06,82.00,new generation,uncategorized,1981,1.38,103.85,0.00,yio chu kang east,ang mo kio,north-east region
4,2004-04,clementi,5 room,356,clementi avenue 2,01 to 03,117.00,standard,uncategorized,1978,1.31,103.77,0.00,clementi north,clementi,west region
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107929,2008-04,hougang,5 room,981D,buangkok crescent,10 to 12,110.00,improved,uncategorized,2003,1.38,103.88,0.00,trafalgar,hougang,north-east region
107930,2006-01,kallang/whampoa,4 room,13,upper boon keng road,13 to 15,102.00,model a,uncategorized,1999,1.31,103.87,0.00,boon keng,kallang,central region
107931,2000-01,kallang/whampoa,3 room,1,beach road,07 to 09,68.00,improved,uncategorized,1979,1.29,103.85,0.00,city hall,downtown core,central region
107932,2009-07,jurong west,4 room,919,jurong west street 91,10 to 12,104.00,model a,uncategorized,1988,1.34,103.69,0.00,yunnan,jurong west,west region


In [516]:
with open('./auxiliary-data/distance-to-mrt-test-edited.csv', newline='') as f:
    reader = csv.reader(f)
    mrt_list = list(reader)

In [517]:
test['dist_mrt'] = np.array(mrt_list).astype(float)
test

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,dist_mrt
0,2004-01,bukit batok,4 room,186,bukit batok west avenue 6,04 to 06,94.00,new generation,uncategorized,1989,1.35,103.74,0.00,bukit batok west,bukit batok,west region,0.67
1,2001-11,tampines,5 room,366,tampines street 34,04 to 06,122.00,improved,uncategorized,1997,1.36,103.96,0.00,tampines east,tampines,east region,0.77
2,2002-07,jurong east,3 room,206,jurong east street 21,01 to 03,67.00,new generation,uncategorized,1982,1.34,103.74,0.00,toh guan,jurong east,west region,0.52
3,2015-04,ang mo kio,3 room,180,Ang Mo Kio Avenue 5,04 to 06,82.00,new generation,uncategorized,1981,1.38,103.85,0.00,yio chu kang east,ang mo kio,north-east region,0.55
4,2004-04,clementi,5 room,356,clementi avenue 2,01 to 03,117.00,standard,uncategorized,1978,1.31,103.77,0.00,clementi north,clementi,west region,0.52
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107929,2008-04,hougang,5 room,981D,buangkok crescent,10 to 12,110.00,improved,uncategorized,2003,1.38,103.88,0.00,trafalgar,hougang,north-east region,1.22
107930,2006-01,kallang/whampoa,4 room,13,upper boon keng road,13 to 15,102.00,model a,uncategorized,1999,1.31,103.87,0.00,boon keng,kallang,central region,0.35
107931,2000-01,kallang/whampoa,3 room,1,beach road,07 to 09,68.00,improved,uncategorized,1979,1.29,103.85,0.00,city hall,downtown core,central region,0.22
107932,2009-07,jurong west,4 room,919,jurong west street 91,10 to 12,104.00,model a,uncategorized,1988,1.34,103.69,0.00,yunnan,jurong west,west region,1.14


## Convert "storey_range" into Numerical Data Type "storey_average" 

In [518]:
test['lower'] = test['storey_range'].str.extract('(\d+)').astype(str).astype(int)
test['upper'] = test['storey_range'].str.extract('(\d+$)').astype(str).astype(int)
# calculate average storey from the range
test['storey_average'] = test['upper'] + test['lower']
test['storey_average'] = test['storey_average'] / 2.0
test = test.drop(columns=['storey_range', 'lower', 'upper'])
test

Unnamed: 0,month,town,flat_type,block,street_name,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,dist_mrt,storey_average
0,2004-01,bukit batok,4 room,186,bukit batok west avenue 6,94.00,new generation,uncategorized,1989,1.35,103.74,0.00,bukit batok west,bukit batok,west region,0.67,5.00
1,2001-11,tampines,5 room,366,tampines street 34,122.00,improved,uncategorized,1997,1.36,103.96,0.00,tampines east,tampines,east region,0.77,5.00
2,2002-07,jurong east,3 room,206,jurong east street 21,67.00,new generation,uncategorized,1982,1.34,103.74,0.00,toh guan,jurong east,west region,0.52,2.00
3,2015-04,ang mo kio,3 room,180,Ang Mo Kio Avenue 5,82.00,new generation,uncategorized,1981,1.38,103.85,0.00,yio chu kang east,ang mo kio,north-east region,0.55,5.00
4,2004-04,clementi,5 room,356,clementi avenue 2,117.00,standard,uncategorized,1978,1.31,103.77,0.00,clementi north,clementi,west region,0.52,2.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107929,2008-04,hougang,5 room,981D,buangkok crescent,110.00,improved,uncategorized,2003,1.38,103.88,0.00,trafalgar,hougang,north-east region,1.22,11.00
107930,2006-01,kallang/whampoa,4 room,13,upper boon keng road,102.00,model a,uncategorized,1999,1.31,103.87,0.00,boon keng,kallang,central region,0.35,14.00
107931,2000-01,kallang/whampoa,3 room,1,beach road,68.00,improved,uncategorized,1979,1.29,103.85,0.00,city hall,downtown core,central region,0.22,8.00
107932,2009-07,jurong west,4 room,919,jurong west street 91,104.00,model a,uncategorized,1988,1.34,103.69,0.00,yunnan,jurong west,west region,1.14,11.00


## Convert "flat_type" to Ordinal Data Type "flattype_mapping"

In [519]:
test = test.replace(to_replace ="5 room", value ="5-room")
test = test.replace(to_replace ="4 room", value ="4-room")
test = test.replace(to_replace ="3 room", value ="3-room")
test = test.replace(to_replace ="2 room", value ="2-room")
test = test.replace(to_replace ="1 room", value ="1-room")
flattype_mapping={"1-room": 1,
                   "2-room": 2,
                   "3-room": 3,
                   "4-room": 4,
                   "5-room": 5,
                   "multi generation": 6,
                   "executive": 7,}
test["flattype_mapping"] = test['flat_type'].map(flattype_mapping)
test = test.drop(columns=['flat_type'])
test

Unnamed: 0,month,town,block,street_name,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,dist_mrt,storey_average,flattype_mapping
0,2004-01,bukit batok,186,bukit batok west avenue 6,94.00,new generation,uncategorized,1989,1.35,103.74,0.00,bukit batok west,bukit batok,west region,0.67,5.00,4
1,2001-11,tampines,366,tampines street 34,122.00,improved,uncategorized,1997,1.36,103.96,0.00,tampines east,tampines,east region,0.77,5.00,5
2,2002-07,jurong east,206,jurong east street 21,67.00,new generation,uncategorized,1982,1.34,103.74,0.00,toh guan,jurong east,west region,0.52,2.00,3
3,2015-04,ang mo kio,180,Ang Mo Kio Avenue 5,82.00,new generation,uncategorized,1981,1.38,103.85,0.00,yio chu kang east,ang mo kio,north-east region,0.55,5.00,3
4,2004-04,clementi,356,clementi avenue 2,117.00,standard,uncategorized,1978,1.31,103.77,0.00,clementi north,clementi,west region,0.52,2.00,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107929,2008-04,hougang,981D,buangkok crescent,110.00,improved,uncategorized,2003,1.38,103.88,0.00,trafalgar,hougang,north-east region,1.22,11.00,5
107930,2006-01,kallang/whampoa,13,upper boon keng road,102.00,model a,uncategorized,1999,1.31,103.87,0.00,boon keng,kallang,central region,0.35,14.00,4
107931,2000-01,kallang/whampoa,1,beach road,68.00,improved,uncategorized,1979,1.29,103.85,0.00,city hall,downtown core,central region,0.22,8.00,3
107932,2009-07,jurong west,919,jurong west street 91,104.00,model a,uncategorized,1988,1.34,103.69,0.00,yunnan,jurong west,west region,1.14,11.00,4


## Convert "region" to one hot encoding

In [520]:
test = pd.get_dummies(test, columns=['region'], prefix=['region'])
test

Unnamed: 0,month,town,block,street_name,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,longitude,...,subzone,planning_area,dist_mrt,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region
0,2004-01,bukit batok,186,bukit batok west avenue 6,94.00,new generation,uncategorized,1989,1.35,103.74,...,bukit batok west,bukit batok,0.67,5.00,4,0,0,0,0,1
1,2001-11,tampines,366,tampines street 34,122.00,improved,uncategorized,1997,1.36,103.96,...,tampines east,tampines,0.77,5.00,5,0,1,0,0,0
2,2002-07,jurong east,206,jurong east street 21,67.00,new generation,uncategorized,1982,1.34,103.74,...,toh guan,jurong east,0.52,2.00,3,0,0,0,0,1
3,2015-04,ang mo kio,180,Ang Mo Kio Avenue 5,82.00,new generation,uncategorized,1981,1.38,103.85,...,yio chu kang east,ang mo kio,0.55,5.00,3,0,0,0,1,0
4,2004-04,clementi,356,clementi avenue 2,117.00,standard,uncategorized,1978,1.31,103.77,...,clementi north,clementi,0.52,2.00,5,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107929,2008-04,hougang,981D,buangkok crescent,110.00,improved,uncategorized,2003,1.38,103.88,...,trafalgar,hougang,1.22,11.00,5,0,0,0,1,0
107930,2006-01,kallang/whampoa,13,upper boon keng road,102.00,model a,uncategorized,1999,1.31,103.87,...,boon keng,kallang,0.35,14.00,4,1,0,0,0,0
107931,2000-01,kallang/whampoa,1,beach road,68.00,improved,uncategorized,1979,1.29,103.85,...,city hall,downtown core,0.22,8.00,3,1,0,0,0,0
107932,2009-07,jurong west,919,jurong west street 91,104.00,model a,uncategorized,1988,1.34,103.69,...,yunnan,jurong west,1.14,11.00,4,0,0,0,0,1


## Feature engineering to create "remaining_lease" feature from "lease_commencement_date"

In [521]:
lease_commence_date = test['lease_commence_date'].to_numpy()
#assuming 99-year lease, we calculate the remaining years on the lease
remaining_lease = lease_commence_date + 99 - pd.DatetimeIndex(test['month']).year.to_numpy()
test['remaining_lease'] = remaining_lease
test = test.drop(columns=['lease_commence_date'])
test.head()

Unnamed: 0,month,town,block,street_name,floor_area_sqm,flat_model,eco_category,latitude,longitude,elevation,...,planning_area,dist_mrt,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease
0,2004-01,bukit batok,186,bukit batok west avenue 6,94.0,new generation,uncategorized,1.35,103.74,0.0,...,bukit batok,0.67,5.0,4,0,0,0,0,1,84
1,2001-11,tampines,366,tampines street 34,122.0,improved,uncategorized,1.36,103.96,0.0,...,tampines,0.77,5.0,5,0,1,0,0,0,95
2,2002-07,jurong east,206,jurong east street 21,67.0,new generation,uncategorized,1.34,103.74,0.0,...,jurong east,0.52,2.0,3,0,0,0,0,1,79
3,2015-04,ang mo kio,180,Ang Mo Kio Avenue 5,82.0,new generation,uncategorized,1.38,103.85,0.0,...,ang mo kio,0.55,5.0,3,0,0,0,1,0,65
4,2004-04,clementi,356,clementi avenue 2,117.0,standard,uncategorized,1.31,103.77,0.0,...,clementi,0.52,2.0,5,0,0,0,0,1,73


## Convert "flat_model" to Ordinal Data Type

In [522]:
# uses the same "flat_model_list" extracted from the "flat_model" dictionary calculated from train dataset
for flat_model in flat_model_list:
    ordinal_value = flat_model_list.index(flat_model) + 1
    test['flat_model'] = test['flat_model'].str.replace(rf'^{flat_model}$', str(ordinal_value), regex=True)
test['flat_model'] = np.array(test['flat_model']).astype(str).astype(int)
test.head()

Unnamed: 0,month,town,block,street_name,floor_area_sqm,flat_model,eco_category,latitude,longitude,elevation,...,planning_area,dist_mrt,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease
0,2004-01,bukit batok,186,bukit batok west avenue 6,94.0,12,uncategorized,1.35,103.74,0.0,...,bukit batok,0.67,5.0,4,0,0,0,0,1,84
1,2001-11,tampines,366,tampines street 34,122.0,5,uncategorized,1.36,103.96,0.0,...,tampines,0.77,5.0,5,0,1,0,0,0,95
2,2002-07,jurong east,206,jurong east street 21,67.0,12,uncategorized,1.34,103.74,0.0,...,jurong east,0.52,2.0,3,0,0,0,0,1,79
3,2015-04,ang mo kio,180,Ang Mo Kio Avenue 5,82.0,12,uncategorized,1.38,103.85,0.0,...,ang mo kio,0.55,5.0,3,0,0,0,1,0,65
4,2004-04,clementi,356,clementi avenue 2,117.0,17,uncategorized,1.31,103.77,0.0,...,clementi,0.52,2.0,5,0,0,0,0,1,73


## Drop Columns "block", "street_name", and "eco_category"

In [523]:
test = test.drop(columns=['block', 'street_name', 'eco_category'])
test.head()

Unnamed: 0,month,town,floor_area_sqm,flat_model,latitude,longitude,elevation,subzone,planning_area,dist_mrt,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease
0,2004-01,bukit batok,94.0,12,1.35,103.74,0.0,bukit batok west,bukit batok,0.67,5.0,4,0,0,0,0,1,84
1,2001-11,tampines,122.0,5,1.36,103.96,0.0,tampines east,tampines,0.77,5.0,5,0,1,0,0,0,95
2,2002-07,jurong east,67.0,12,1.34,103.74,0.0,toh guan,jurong east,0.52,2.0,3,0,0,0,0,1,79
3,2015-04,ang mo kio,82.0,12,1.38,103.85,0.0,yio chu kang east,ang mo kio,0.55,5.0,3,0,0,0,1,0,65
4,2004-04,clementi,117.0,17,1.31,103.77,0.0,clementi north,clementi,0.52,2.0,5,0,0,0,0,1,73


## Drop Columns "elevation", "town", "subzone" and "planning_area"

In [524]:
test = test.drop(columns=['elevation','town','subzone','planning_area'])
test.head()

Unnamed: 0,month,floor_area_sqm,flat_model,latitude,longitude,dist_mrt,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease
0,2004-01,94.0,12,1.35,103.74,0.67,5.0,4,0,0,0,0,1,84
1,2001-11,122.0,5,1.36,103.96,0.77,5.0,5,0,1,0,0,0,95
2,2002-07,67.0,12,1.34,103.74,0.52,2.0,3,0,0,0,0,1,79
3,2015-04,82.0,12,1.38,103.85,0.55,5.0,3,0,0,0,1,0,65
4,2004-04,117.0,17,1.31,103.77,0.52,2.0,5,0,0,0,0,1,73


## Convert "month" to datetime

In [525]:
test['month'] = pd.to_datetime(test['month'])
test['datetime_month']= test['month'].map(dt.datetime.toordinal)

In [526]:
test

Unnamed: 0,month,floor_area_sqm,flat_model,latitude,longitude,dist_mrt,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease,datetime_month
0,2004-01-01,94.00,12,1.35,103.74,0.67,5.00,4,0,0,0,0,1,84,731581
1,2001-11-01,122.00,5,1.36,103.96,0.77,5.00,5,0,1,0,0,0,95,730790
2,2002-07-01,67.00,12,1.34,103.74,0.52,2.00,3,0,0,0,0,1,79,731032
3,2015-04-01,82.00,12,1.38,103.85,0.55,5.00,3,0,0,0,1,0,65,735689
4,2004-04-01,117.00,17,1.31,103.77,0.52,2.00,5,0,0,0,0,1,73,731672
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107929,2008-04-01,110.00,5,1.38,103.88,1.22,11.00,5,0,0,0,1,0,94,733133
107930,2006-01-01,102.00,8,1.31,103.87,0.35,14.00,4,1,0,0,0,0,92,732312
107931,2000-01-01,68.00,5,1.29,103.85,0.22,8.00,3,1,0,0,0,0,78,730120
107932,2009-07-01,104.00,8,1.34,103.69,1.14,11.00,4,0,0,0,0,1,78,733589


# Experimenting with various ML models

## Implement linear regression

In [259]:
X = train.drop (["resale_price", "month"],axis = 1)
y = train["resale_price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [445]:
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression()

In [446]:
print(regressor.intercept_)
print(regressor.coef_)
print('Training score: {}'.format(regressor.score(X_train, y_train)))
print('Test score: {}'.format(regressor.score(X_test, y_test)))

-29945027.554563683
[ 2.41134868e+03  9.94147956e+02 -9.82572853e+05  4.10440727e+04
 -1.58815763e+04 -3.62493235e+04  3.84796687e+04  1.11517261e+03
  1.63507887e+04  2.38692243e+04 -1.31577842e+04  2.02620750e+04
  3.50291243e+03 -3.44764276e+04  2.23769216e+03  3.64668263e+01]
Training score: 0.7915924665460026
Test score: 0.7891979333601642


In [447]:
y_pred = regressor.predict(X_test)
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(df)
print('RMSE for Linear Regression => ',np.sqrt(mean_squared_error(y_test,y_pred)))

          Actual      Predicted
76998   319950.0  331083.083027
397501  315000.0  339260.343548
347486  396000.0  460663.919879
242624  243000.0  319792.689107
64696   194400.0  259165.994044
...          ...            ...
400889  189000.0  180928.703309
287118  292500.0  362791.924591
391252  238500.0  284130.059303
401219  159559.2  150656.625197
201018  220500.0  228062.297442

[86347 rows x 2 columns]
RMSE for Linear Regression =>  59489.00883760603


## Implement polynomial predictor

In [448]:
poly_reg = PolynomialFeatures(degree = 3)
X_poly = poly_reg.fit_transform(X)
print(len(X_poly))
print(len(y))
lin_reg = LinearRegression()
lin_reg.fit(X_poly,y)

431732
431732


LinearRegression()

In [449]:
print('Training score: {}'.format(lin_reg.score(X_poly, y)))
# print('Training score: {}'.format(lin_reg.score(X_test, y_test)))

Training score: 0.8687702493355828


In [450]:
y_pred = lin_reg.predict(X_poly)
df = pd.DataFrame({'Real Values':y, 'Predicted Values':y_pred})
df

Unnamed: 0,Real Values,Predicted Values
0,209700.0,214691.397038
1,402300.0,396429.510319
2,351000.0,500642.959538
3,151200.0,126869.674381
4,318600.0,252936.869694
...,...,...
431727,238500.0,217542.434147
431728,376200.0,459234.783756
431729,255600.0,239938.055241
431730,508500.0,415864.221256


## Implement pipeline predictor

In [451]:
steps = [
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=3)),
    ('model', LinearRegression())
]
pipeline = Pipeline(steps)
pipeline.fit(X_train, y_train)
print('Training score: {}'.format(pipeline.score(X_train, y_train)))
print('Test score: {}'.format(pipeline.score(X_test, y_test)))

Training score: 0.9321259466567403
Test score: 0.9320959686445839


In [452]:
y_pred_pipeline = pipeline.predict(X_test)
df = pd.DataFrame({'Real Values':y_test, 'Predicted Values':y_pred_pipeline})
print(df)
mse = mean_squared_error(y_test, y_pred_pipeline)
rmse = math.sqrt(mse)
print('RMSE: {}'.format(rmse))

        Real Values  Predicted Values
76998      319950.0          304468.0
397501     315000.0          355534.0
347486     396000.0          394536.0
242624     243000.0          280289.0
64696      194400.0          215075.0
...             ...               ...
400889     189000.0          203515.0
287118     292500.0          346456.0
391252     238500.0          262195.0
401219     159559.2          152140.0
201018     220500.0          204774.0

[86347 rows x 2 columns]
RMSE: 33763.470453538845


## Tuning number of features for pipeline predictor

In [501]:
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

for p in range(1,5):
    print(p)
    poly = PolynomialFeatures(p)
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.fit_transform(X_test)

    lin_reg = LinearRegression().fit(X_train_poly, y_train)

    y_train_pred = lin_reg.predict(X_train_poly)
    y_test_pred = lin_reg.predict(X_test_poly)
    mse = mean_squared_error(y_test, y_test_pred)
    rmse = math.sqrt(mse)
    print('RMSE: {}'.format(rmse))

1
RMSE: 59488.525022365335
2
RMSE: 50185.54311819719
3
RMSE: 33763.470453538845
4


MemoryError: Unable to allocate 12.5 GiB for an array with shape (345385, 4845) and data type float64

## Implement XG boost

In [295]:
xgb_regressor = xgboost.XGBRegressor(objective ='reg:squarederror', learning_rate = 0.1, n_estimators = 1000)
xgb_regressor.fit(X_train, y_train)

y_pred_rf = xgb_regressor.predict(X_test)
df = pd.DataFrame({'Real Values':y_test, 'Predicted Values':y_pred_rf})
print(df)

mse = mean_squared_error(y_test, y_pred_rf)
rmse = math.sqrt(mse)
print('RMSE for random forest regression for dataset: {}'.format(rmse))

        Real Values  Predicted Values
76998     319950.00         326137.97
397501    315000.00         329924.44
347486    396000.00         406524.56
242624    243000.00         241392.64
64696     194400.00         196085.81
...             ...               ...
400889    189000.00         189676.05
287118    292500.00         298393.28
391252    238500.00         236991.81
401219    159559.20         152026.72
201018    220500.00         216943.16

[86347 rows x 2 columns]
RMSE for random forest regression for dataset: 16957.765615197197


In [298]:
pred = xgb_regressor.predict(test.drop(columns=['month']))
df_out = pd.DataFrame({'Predicted':pred})
df_out.reset_index(inplace=True)
df_out = df_out.rename(columns = {'index':'Id'})
print(df_out)
df_out.to_csv('./submission_xgb_full.csv', index = False, header=True)

            Id  Predicted
0            0  189723.30
1            1  303970.97
2            2  119944.59
3            3  321093.69
4            4  314263.31
...        ...        ...
107929  107929  331986.97
107930  107930  298446.34
107931  107931  159000.69
107932  107932  244500.81
107933  107933  332849.59

[107934 rows x 2 columns]


In [488]:
train_dmatrix = xgboost.DMatrix(data = X_train, label = y_train)
test_dmatrix = xgboost.DMatrix(data = X_test, label = y_test)

xgb_opt_regressor = xgb_r = xgboost.train(params = {'objective' :'reg:squarederror', 'lambda': 0.0001}, 
                                     dtrain = train_dmatrix, num_boost_round = 1000)

y_pred_xgb = xgb_opt_regressor.predict(test_dmatrix)
df = pd.DataFrame({'Real Values':y_test, 'Predicted Values':y_pred_xgb})
print(df)

mse = mean_squared_error(y_test, y_pred_xgb)
rmse = math.sqrt(mse)
print('RMSE for optimized xgboost regression for dataset: {}'.format(rmse))

        Real Values  Predicted Values
392819    324000.00         321568.41
44810     324000.00         351305.72
300884    205200.00         196237.23
267541    400500.00         395827.25
331384    567000.00         566471.31
...             ...               ...
32016     529200.00         543206.50
195448    360000.00         361442.78
357612    326700.00         309989.94
416007    322200.00         304690.09
408379    765000.00         713991.31

[24390 rows x 2 columns]
RMSE for optimized xgboost regression for dataset: 21584.923573248358


## Compare other regression models offered by scikit-learn

In [498]:
from sklearn import linear_model

scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

models = [
    linear_model.SGDRegressor(),
    linear_model.BayesianRidge(),
    linear_model.LassoLars(),
    linear_model.ARDRegression(),
    linear_model.PassiveAggressiveRegressor(),
    linear_model.TheilSenRegressor(),
    linear_model.LinearRegression()]


for item in models:
    print(item)
    model = item
    model.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = math.sqrt(mse)
    print('RMSE: {}'.format(rmse))

SGDRegressor()
RMSE: 59105.260520049065
BayesianRidge()
RMSE: 59012.74630527616
LassoLars()
RMSE: 70899.45039972338
ARDRegression()




RMSE: 59012.82754357305
PassiveAggressiveRegressor()
RMSE: 60430.22332118529
TheilSenRegressor(max_subpopulation=10000)
RMSE: 61061.90972446903
LinearRegression()
RMSE: 59010.264842596014


None of these models perform as well as the regression and xgboost models

## Try few classifier models

In [500]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression


X = train.drop (["resale_price", "month"],axis = 1)
y = train["resale_price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

models = [KNeighborsClassifier(),
          DecisionTreeClassifier(),
          RandomForestClassifier(),
          AdaBoostClassifier(),
          GradientBoostingClassifier(),
          LogisticRegression()]

for item in models:
    print(item)
    model = item
    # since classifiers dont accept float input
    model.fit(X_train, y_train.astype(int))
    y_pred = clf.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = math.sqrt(mse)
    print('RMSE: {}'.format(rmse))

KNeighborsClassifier()
RMSE: 1.4045529694596254e+17
DecisionTreeClassifier()
RMSE: 1.4045529694596254e+17
RandomForestClassifier()


MemoryError: could not allocate 23655874560 bytes

## Trying lazy predict library to understand variety of regression models

In [28]:
import lazypredict
from lazypredict.Supervised import LazyRegressor

X = train.drop (["resale_price", "month"],axis = 1)
y = train["resale_price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

reg = LazyRegressor(verbose=0, ignore_warnings=False, custom_metric=None)
models, predictions = reg.fit(X_train, X_test, y_train, y_test)

print(models)

 26%|█████████████████████▍                                                            | 11/42 [03:51<13:33, 26.24s/it]

GaussianProcessRegressor model failed to execute
Unable to allocate 889. GiB for an array with shape (345385, 345385) and data type float64


 40%|█████████████████████████████████▏                                                | 17/42 [05:20<05:30, 13.24s/it]

KernelRidge model failed to execute
Unable to allocate 889. GiB for an array with shape (345385, 345385) and data type float64


100%|███████████████████████████████████████████████████████████████████████████████| 42/42 [9:06:50<00:00, 781.21s/it]

                               Adjusted R-Squared  R-Squared      RMSE  \
Model                                                                    
RandomForestRegressor                        0.98       0.98  17586.35   
ExtraTreesRegressor                          0.98       0.98  17943.71   
BaggingRegressor                             0.98       0.98  18511.93   
XGBRegressor                                 0.98       0.98  19750.51   
LGBMRegressor                                0.97       0.97  22795.51   
HistGradientBoostingRegressor                0.97       0.97  22814.32   
KNeighborsRegressor                          0.96       0.96  24539.40   
DecisionTreeRegressor                        0.96       0.96  24720.59   
ExtraTreeRegressor                           0.96       0.96  26710.78   
GradientBoostingRegressor                    0.93       0.93  33438.68   
MLPRegressor                                 0.86       0.86  49227.71   
PoissonRegressor                      




From this series of tests, we decided to train:  
    - basic linear regression  
    - pipeline polynomial regression  
    - random forest regressor  
    - xgboost regressor

# Split for 3 periods

### During EDA, we observed 3 distinct timeframes over the 20 years of data, so we think it will be beneficial to train 3 different models; one for each distinct timeframe

In [527]:
train.head()

Unnamed: 0,month,floor_area_sqm,flat_model,latitude,longitude,resale_price,dist_mrt,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease,datetime_month
0,2001-08-01,118.0,8,1.37,103.96,209700.0,1.14,2.0,4,0,1,0,0,0,87,730698
1,2014-10-01,110.0,5,1.4,103.91,402300.0,0.12,11.0,5,0,0,0,1,0,88,735507
2,2020-09-01,112.0,13,1.39,103.87,351000.0,0.48,2.0,5,0,0,0,1,0,83,737669
3,2000-10-01,67.0,12,1.32,103.77,151200.0,0.42,8.0,3,0,0,0,0,1,79,730394
4,2013-01-01,73.0,8,1.35,103.74,318600.0,0.78,8.0,3,0,0,0,0,1,71,734869


In [528]:
test.head()

Unnamed: 0,month,floor_area_sqm,flat_model,latitude,longitude,dist_mrt,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease,datetime_month
0,2004-01-01,94.0,12,1.35,103.74,0.67,5.0,4,0,0,0,0,1,84,731581
1,2001-11-01,122.0,5,1.36,103.96,0.77,5.0,5,0,1,0,0,0,95,730790
2,2002-07-01,67.0,12,1.34,103.74,0.52,2.0,3,0,0,0,0,1,79,731032
3,2015-04-01,82.0,12,1.38,103.85,0.55,5.0,3,0,0,0,1,0,65,735689
4,2004-04-01,117.0,17,1.31,103.77,0.52,2.0,5,0,0,0,0,1,73,731672


In [529]:
train.dtypes , test.dtypes

(month                       datetime64[ns]
 floor_area_sqm                     float64
 flat_model                           int32
 latitude                           float64
 longitude                          float64
 resale_price                       float64
 dist_mrt                           float64
 storey_average                     float64
 flattype_mapping                     int64
 region_central region                uint8
 region_east region                   uint8
 region_north region                  uint8
 region_north-east region             uint8
 region_west region                   uint8
 remaining_lease                      int64
 datetime_month                       int64
 dtype: object,
 month                       datetime64[ns]
 floor_area_sqm                     float64
 flat_model                           int32
 latitude                           float64
 longitude                          float64
 dist_mrt                           float64
 storey_average 

## Splitting data 

In [530]:
train_first = train[train['month'] <= dt.datetime(2007,1,1)]
train_second = train[train['month'] > dt.datetime(2007,1,1)]
train_second = train_second[train_second['month'] <= dt.datetime(2013,1,1)]
train_third = train[train['month'] > dt.datetime(2013,1,1)]

In [531]:
train['month']

0        2001-08-01
1        2014-10-01
2        2020-09-01
3        2000-10-01
4        2013-01-01
            ...    
431727   2005-03-01
431728   2016-04-01
431729   2011-01-01
431730   2013-05-01
431731   2007-08-01
Name: month, Length: 431732, dtype: datetime64[ns]

In [532]:
def run_linear_regression(X_train, X_test, y_train, y_test):    
    lin_regressor = LinearRegression()
    lin_regressor.fit(X_train, y_train)

    print('Training score: {}'.format(lin_regressor.score(X_train, y_train)))
    print('Test score: {} \n'.format(lin_regressor.score(X_test, y_test)))
    y_pred_lin = lin_regressor.predict(X_test)

    df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_lin})
    print(df)

    mse = mean_squared_error(y_test, y_pred_lin)
    rmse = math.sqrt(mse)
    print('RMSE for Linear Regression for dataset: {}'.format(rmse))
    return lin_regressor

In [533]:
def run_pipeline_regression(X_train, X_test, y_train, y_test):
    steps = [
        ('scalar', StandardScaler()),
        ('poly', PolynomialFeatures(degree=3)),
        ('model', LinearRegression())
    ]

    pipeline = Pipeline(steps)
    pipeline.fit(X_train, y_train)

    print('Training score: {}'.format(pipeline.score(X_train, y_train)))
    print('Test score: {} \n'.format(pipeline.score(X_test, y_test)))

    y_pred_pipeline = pipeline.predict(X_test)
    df = pd.DataFrame({'Real Values':y_test, 'Predicted Values':y_pred_pipeline})
    print(df)

    mse = mean_squared_error(y_test, y_pred_pipeline)
    rmse = math.sqrt(mse)
    print('RMSE for pipeline regression for dataset: {}'.format(rmse))
    
    return pipeline

In [534]:
def run_rf_regression(X_train, X_test, y_train, y_test):
    rf_regressor = RandomForestRegressor()
    rf_regressor.fit(X_train, y_train)

    print('Training score: {}'.format(rf_regressor.score(X_train, y_train)))
    print('Test score: {} \n'.format(rf_regressor.score(X_test, y_test)))

    y_pred_rf = rf_regressor.predict(X_test)
    df = pd.DataFrame({'Real Values':y_test, 'Predicted Values':y_pred_rf})
    print(df)

    mse = mean_squared_error(y_test, y_pred_rf)
    rmse = math.sqrt(mse)
    print('RMSE for random forest regression for dataset: {}'.format(rmse))
    
    return rf_regressor

In [535]:
def run_xgboost_regression(X_train, X_test, y_train, y_test):
    xgb_regressor = xgboost.XGBRegressor(objective ='reg:squarederror', learning_rate = 0.1, n_estimators = 1000)
    xgb_regressor.fit(X_train, y_train)

    print('Training score: {}'.format(xgb_regressor.score(X_train, y_train)))
    print('Test score: {} \n'.format(xgb_regressor.score(X_test, y_test)))

    y_pred_xgb = xgb_regressor.predict(X_test)
    df = pd.DataFrame({'Real Values':y_test, 'Predicted Values':y_pred_xgb})
    print(df)

    mse = mean_squared_error(y_test, y_pred_xgb)
    rmse = math.sqrt(mse)
    print('RMSE for xgboost regression for dataset: {}'.format(rmse))
    
    return xgb_regressor

In [536]:
def run_xgboost_optimize(X_train, X_test, y_train, y_test):
    #run xgboost using dmatrix, an optimized data structure for xgboost
    
    train_dmatrix = xgboost.DMatrix(data = X_train, label = y_train)
    test_dmatrix = xgboost.DMatrix(data = X_test, label = y_test)

    xgb_opt_regressor = xgb_r = xgboost.train(params = {'objective' :'reg:squarederror', 'lambda': 0.001}, 
                                     dtrain = train_dmatrix, num_boost_round = 1000)

    y_pred_xgb = xgb_opt_regressor.predict(test_dmatrix)
    df = pd.DataFrame({'Real Values':y_test, 'Predicted Values':y_pred_xgb})
    print(df)

    mse = mean_squared_error(y_test, y_pred_xgb)
    rmse = math.sqrt(mse)
    print('RMSE for optimized xgboost regression for dataset: {}'.format(rmse))
    
    return xgb_opt_regressor

## train first dataset

In [537]:
X = train_first.drop (["resale_price","month"],axis = 1)
y = train_first["resale_price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [538]:
lin_regressor1 = run_linear_regression(X_train, X_test, y_train, y_test)

Training score: 0.8568633829084817
Test score: 0.8581647757476782 

          Actual  Predicted
26265  255600.00  259894.48
106610 162450.00  166585.95
43527  324000.00  365671.16
309467 190800.00  200449.52
404446 173700.00  203287.22
...          ...        ...
98780  486000.00  390869.40
426912 283500.00  311487.07
364642 321120.00  345280.23
340955 114300.00  144005.07
199410 186300.00  203308.13

[35773 rows x 2 columns]
RMSE for Linear Regression for dataset: 30384.174533407597


In [539]:
pipeline1 = run_pipeline_regression(X_train, X_test, y_train, y_test)

Training score: 0.9544833531425638
Test score: 0.9544926327223215 

        Real Values  Predicted Values
26265     255600.00         229120.44
106610    162450.00         155704.44
43527     324000.00         321064.44
309467    190800.00         199596.44
404446    173700.00         161230.44
...             ...               ...
98780     486000.00         481990.44
426912    283500.00         285976.44
364642    321120.00         298236.44
340955    114300.00         137072.44
199410    186300.00         201228.44

[35773 rows x 2 columns]
RMSE for pipeline regression for dataset: 17210.609382233222


In [540]:
rf_regressor1 = run_rf_regression(X_train, X_test, y_train, y_test)

Training score: 0.9956396494177446
Test score: 0.9703310938679007 

        Real Values  Predicted Values
26265     255600.00         263213.10
106610    162450.00         164637.00
43527     324000.00         319617.00
309467    190800.00         189791.93
404446    173700.00         178767.00
...             ...               ...
98780     486000.00         504760.57
426912    283500.00         271979.10
364642    321120.00         302220.00
340955    114300.00         132791.40
199410    186300.00         205201.80

[35773 rows x 2 columns]
RMSE for random forest regression for dataset: 13896.523286768552


In [541]:
xgb_regressor1 = run_xgboost_regression(X_train, X_test, y_train, y_test)

Training score: 0.9808051017693984
Test score: 0.9736341192813595 

        Real Values  Predicted Values
26265     255600.00         251320.75
106610    162450.00         164911.95
43527     324000.00         318371.94
309467    190800.00         194815.97
404446    173700.00         177386.30
...             ...               ...
98780     486000.00         487798.66
426912    283500.00         276395.59
364642    321120.00         316545.31
340955    114300.00         123513.48
199410    186300.00         201515.17

[35773 rows x 2 columns]
RMSE for xgboost regression for dataset: 13100.158001564827


In [542]:
xgb_opt_regressor1 = run_xgboost_optimize(X_train, X_test, y_train, y_test)

        Real Values  Predicted Values
26265     255600.00         254483.83
106610    162450.00         166758.11
43527     324000.00         324832.28
309467    190800.00         195473.17
404446    173700.00         172648.67
...             ...               ...
98780     486000.00         487505.69
426912    283500.00         282018.50
364642    321120.00         306527.41
340955    114300.00         120644.75
199410    186300.00         201057.16

[35773 rows x 2 columns]
RMSE for optimized xgboost regression for dataset: 13406.944292466209


## train second dataset

In [543]:
X = train_second.drop (["resale_price", "month"],axis = 1)
y = train_second["resale_price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [544]:
lin_regressor2 = run_linear_regression(X_train, X_test, y_train, y_test)

Training score: 0.8636108345595086
Test score: 0.8625072826642828 

          Actual  Predicted
337221 241200.00  194139.56
398947 369000.00  449409.88
250384 340200.00  358302.48
377764 328500.00  363829.15
85992  421999.20  500251.49
...          ...        ...
355414 252000.00  233628.94
299224 436500.00  421558.60
198805 292500.00  304855.16
56695  418500.00  452908.89
11061  388800.00  376876.20

[26185 rows x 2 columns]
RMSE for Linear Regression for dataset: 40110.537040586154


In [545]:
pipeline2 = run_pipeline_regression(X_train, X_test, y_train, y_test)

Training score: 0.9517704022559752
Test score: 0.9507346081496559 

        Real Values  Predicted Values
337221    241200.00         247868.00
398947    369000.00         455541.00
250384    340200.00         360524.00
377764    328500.00         349898.00
85992     421999.20         467700.00
...             ...               ...
355414    252000.00         278623.00
299224    436500.00         413818.00
198805    292500.00         291012.00
56695     418500.00         430204.00
11061     388800.00         374818.00

[26185 rows x 2 columns]
RMSE for pipeline regression for dataset: 24009.858462748467


In [546]:
rf_regressor2 = run_rf_regression(X_train, X_test, y_train, y_test)

Training score: 0.995521453783386
Test score: 0.9688719329056752 

        Real Values  Predicted Values
337221    241200.00         230007.49
398947    369000.00         412628.40
250384    340200.00         341034.46
377764    328500.00         342401.83
85992     421999.20         439701.19
...             ...               ...
355414    252000.00         255202.99
299224    436500.00         409964.08
198805    292500.00         283637.70
56695     418500.00         395433.00
11061     388800.00         348067.48

[26185 rows x 2 columns]
RMSE for random forest regression for dataset: 19085.111701199512


In [547]:
xgb_regressor2 = run_xgboost_regression(X_train, X_test, y_train, y_test)

Training score: 0.9846831130822897
Test score: 0.976872742855545 

        Real Values  Predicted Values
337221    241200.00         248201.38
398947    369000.00         412456.94
250384    340200.00         341171.31
377764    328500.00         337375.41
85992     421999.20         456313.12
...             ...               ...
355414    252000.00         279576.78
299224    436500.00         422030.94
198805    292500.00         288482.44
56695     418500.00         405688.19
11061     388800.00         381117.62

[26185 rows x 2 columns]
RMSE for xgboost regression for dataset: 16450.56027482583


In [548]:
xgb_opt_regressor2 = run_xgboost_optimize(X_train, X_test, y_train, y_test)

        Real Values  Predicted Values
337221    241200.00         240212.94
398947    369000.00         403225.94
250384    340200.00         344460.44
377764    328500.00         341131.03
85992     421999.20         453635.91
...             ...               ...
355414    252000.00         276544.00
299224    436500.00         425224.50
198805    292500.00         287610.44
56695     418500.00         408932.31
11061     388800.00         372754.22

[26185 rows x 2 columns]
RMSE for optimized xgboost regression for dataset: 17236.017447938262


## train third dataset

In [549]:
X = train_third.drop (["resale_price", "month"],axis = 1)
y = train_third["resale_price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [550]:
lin_regressor3 = run_linear_regression(X_train, X_test, y_train, y_test)

Training score: 0.7954090357322795
Test score: 0.7918299736638449 

          Actual  Predicted
392819 324000.00  357515.32
44810  324000.00  373945.22
300884 205200.00  222638.59
267541 400500.00  329885.83
331384 567000.00  595964.11
...          ...        ...
32016  529200.00  620556.87
195448 360000.00  314848.09
357612 326700.00  310907.48
416007 322200.00  307142.06
408379 765000.00  594561.17

[24390 rows x 2 columns]
RMSE for Linear Regression for dataset: 59012.693116244816


In [551]:
pipeline3 = run_pipeline_regression(X_train, X_test, y_train, y_test)

Training score: 0.939470244964489
Test score: 0.9378752570602247 

        Real Values  Predicted Values
392819    324000.00         341601.25
44810     324000.00         350429.00
300884    205200.00         209323.00
267541    400500.00         383040.75
331384    567000.00         573780.00
...             ...               ...
32016     529200.00         578531.75
195448    360000.00         374364.50
357612    326700.00         343928.50
416007    322200.00         293005.50
408379    765000.00         715218.25

[24390 rows x 2 columns]
RMSE for pipeline regression for dataset: 32238.040639844392


In [552]:
rf_regressor3 = run_rf_regression(X_train, X_test, y_train, y_test)

Training score: 0.9960110397397989
Test score: 0.9715352651257693 

        Real Values  Predicted Values
392819    324000.00         327349.79
44810     324000.00         336295.29
300884    205200.00         208116.00
267541    400500.00         406839.96
331384    567000.00         608137.99
...             ...               ...
32016     529200.00         547209.00
195448    360000.00         375568.20
357612    326700.00         315100.40
416007    322200.00         307861.99
408379    765000.00         718006.50

[24390 rows x 2 columns]
RMSE for random forest regression for dataset: 21821.766662589238


In [553]:
xgb_regressor3 = run_xgboost_regression(X_train, X_test, y_train, y_test)

Training score: 0.9828138251878352
Test score: 0.9738996339622793 

        Real Values  Predicted Values
392819    324000.00         331076.09
44810     324000.00         344971.38
300884    205200.00         204474.78
267541    400500.00         392625.66
331384    567000.00         571179.38
...             ...               ...
32016     529200.00         577943.75
195448    360000.00         370496.19
357612    326700.00         309086.09
416007    322200.00         301197.47
408379    765000.00         714590.62

[24390 rows x 2 columns]
RMSE for xgboost regression for dataset: 20895.830419665726


In [554]:
xgb_opt_regressor3 = run_xgboost_optimize(X_train, X_test, y_train, y_test)

        Real Values  Predicted Values
392819    324000.00         322838.50
44810     324000.00         347435.00
300884    205200.00         197855.45
267541    400500.00         390649.59
331384    567000.00         566283.19
...             ...               ...
32016     529200.00         546985.69
195448    360000.00         368188.16
357612    326700.00         320253.69
416007    322200.00         305956.38
408379    765000.00         713969.69

[24390 rows x 2 columns]
RMSE for optimized xgboost regression for dataset: 21597.884356066486


# Generate output

In [555]:
def rules_lin(row):
    if row['month'] <= dt.datetime(2007,1,1):
        return lin_regressor1.predict([row.drop('month')])[0]
    elif row['month'] > dt.datetime(2007,1,1) and row['month'] <= dt.datetime(2013,1,1):
        return lin_regressor2.predict([row.drop('month')])[0]
    else:
        return lin_regressor3.predict([row.drop('month')])[0]

pred = test.apply(rules_lin, axis=1)
df_out = pd.DataFrame({'Predicted':pred})
df_out.reset_index(inplace=True)
df_out = df_out.rename(columns = {'index':'Id'})
print(df_out)
df_out.to_csv('./submission_lin.csv', index = False, header=True)

            Id  Predicted
0            0  191576.83
1            1  306157.76
2            2  113239.93
3            3  281367.01
4            4  248409.89
...        ...        ...
107929  107929  326266.50
107930  107930  294203.29
107931  107931  190622.28
107932  107932  312136.06
107933  107933  327974.60

[107934 rows x 2 columns]


In [556]:
def rules_pipeline(row):
    if row['month'] <= dt.datetime(2007,1,1):
        return pipeline1.predict([row.drop('month')])[0]
    elif row['month'] > dt.datetime(2007,1,1) and row['month'] <= dt.datetime(2013,1,1):
        return pipeline2.predict([row.drop('month')])[0]
    else:
        return pipeline3.predict([row.drop('month')])[0]

pred = test.apply(rules_pipeline, axis=1)
df_out = pd.DataFrame({'Predicted':pred})
df_out.reset_index(inplace=True)
df_out = df_out.rename(columns = {'index':'Id'})
print(df_out)
df_out.to_csv('./submission_pipeline.csv', index = False, header=True)

            Id  Predicted
0            0  195136.31
1            1  308314.00
2            2  119349.88
3            3  334305.50
4            4  279042.31
...        ...        ...
107929  107929  324264.00
107930  107930  313861.50
107931  107931  175762.62
107932  107932  240219.00
107933  107933  321498.00

[107934 rows x 2 columns]


In [557]:
def rules_rf(row):
    if row['month'] <= dt.datetime(2007,1,1):
        return rf_regressor1.predict([row.drop('month')])[0]
    elif row['month'] > dt.datetime(2007,1,1) and row['month'] <= dt.datetime(2013,1,1):
        return rf_regressor2.predict([row.drop('month')])[0]
    else:
        return rf_regressor3.predict([row.drop('month')])[0]

pred = test.apply(rules_rf, axis=1)
df_out = pd.DataFrame({'Predicted':pred})
df_out.reset_index(inplace=True)
df_out = df_out.rename(columns = {'index':'Id'})
print(df_out)
df_out.to_csv('./submission_rf.csv', index = False, header=True)

            Id  Predicted
0            0  176018.98
1            1  314343.00
2            2  126184.50
3            3  311240.48
4            4  311571.00
...        ...        ...
107929  107929  332284.18
107930  107930  304680.08
107931  107931  166207.50
107932  107932  244417.50
107933  107933  319997.59

[107934 rows x 2 columns]


In [558]:
test_first = test[test['month'] <= dt.datetime(2007,1,1)]
test_second = test[test['month'] > dt.datetime(2007,1,1)]
test_second = test_second[test_second['month'] <= dt.datetime(2013,1,1)]
test_third = test[test['month'] > dt.datetime(2013,1,1)]

pred1 = xgb_regressor1.predict(test_first.drop(columns=['month']))
test_first['Prediction1'] = pred1
pred2 = xgb_regressor2.predict(test_second.drop(columns=['month']))
test_second['Prediction2'] = pred2
pred3 = xgb_regressor3.predict(test_third.drop(columns=['month']))
test_third['Prediction3'] = pred3

result = pd.concat([test_first, test_second,test_third], axis=1).fillna(0)
# addition doesnt matter because 2 out of 3 values are zero 
result['Prediction'] = result['Prediction1'] + result['Prediction2'] + result['Prediction3']
df_out = pd.DataFrame({'Predicted':np.array(result['Prediction'])})
df_out.reset_index(inplace=True)
df_out = df_out.rename(columns = {'index':'Id'})
print(df_out)
df_out.to_csv('./submission_xgb_3.csv', index = False, header=True)

            Id  Predicted
0            0  186831.06
1            1  303162.16
2            2  125100.49
3            3  320570.78
4            4  313520.53
...        ...        ...
107929  107929  328184.25
107930  107930  300959.53
107931  107931  157323.61
107932  107932  242723.14
107933  107933  334300.03

[107934 rows x 2 columns]


In [559]:
test_first = test[test['month'] <= dt.datetime(2007,1,1)]
test_second = test[test['month'] > dt.datetime(2007,1,1)]
test_second = test_second[test_second['month'] <= dt.datetime(2013,1,1)]
test_third = test[test['month'] > dt.datetime(2013,1,1)]


pred1 = xgb_opt_regressor1.predict(xgboost.DMatrix(test_first.drop(columns=['month'])))
test_first['Prediction1'] = pred1
pred2 = xgb_opt_regressor2.predict(xgboost.DMatrix(test_second.drop(columns=['month'])))
test_second['Prediction2'] = pred2
pred3 = xgb_opt_regressor3.predict(xgboost.DMatrix(test_third.drop(columns=['month'])))
test_third['Prediction3'] = pred3

result = pd.concat([test_first, test_second,test_third], axis=1).fillna(0)
# addition doesnt matter because 2 out of 3 values are zero 
result['Prediction'] = result['Prediction1'] + result['Prediction2'] + result['Prediction3']
df_out = pd.DataFrame({'Predicted':np.array(result['Prediction'])})
df_out.reset_index(inplace=True)
df_out = df_out.rename(columns = {'index':'Id'})
print(df_out)
df_out.to_csv('./submission_xgb_opt.csv', index = False, header=True)

            Id  Predicted
0            0  187770.92
1            1  305504.41
2            2  125773.39
3            3  320201.75
4            4  321521.72
...        ...        ...
107929  107929  322456.44
107930  107930  299521.31
107931  107931  156950.77
107932  107932  247035.30
107933  107933  337873.34

[107934 rows x 2 columns]
