# EDA and Data Pre-processing

## Importing Packages

In [162]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pprint
import csv
import datetime as dt
from sklearn.metrics import classification_report, confusion_matrix, roc_curve
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import math

## Importing Dataset

In [163]:
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

## Pre-processing on Train Dataset

## Feature Engineering to Create "dist_mrt" from Auxiliary Data

In [164]:
# do not change order of this .dropna() function
train.dropna()
train = train.reset_index(drop=True)
train

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,resale_price
0,2001-08,pasir ris,4 room,440,pasir ris drive 4,01 to 03,118.0,model a,uncategorized,1989,1.369008,103.958697,0.0,pasir ris drive,pasir ris,east region,209700.0
1,2014-10,punggol,5-room,196B,punggol field,10 to 12,110.0,improved,uncategorized,2003,1.399007,103.906991,0.0,punggol field,punggol,north-east region,402300.0
2,2020-09,sengkang,5 room,404A,fernvale lane,01 to 03,112.0,premium apartment,uncategorized,2004,1.388348,103.873815,0.0,fernvale,sengkang,north-east region,351000.0
3,2000-10,clementi,3 room,375,clementi avenue 4,07 to 09,67.0,new generation,uncategorized,1980,1.318493,103.766702,0.0,clementi north,clementi,west region,151200.0
4,2013-01,bukit batok,3-room,163,bukit batok street 11,07 to 09,73.0,model a,uncategorized,1985,1.348149,103.742658,0.0,bukit batok west,bukit batok,west region,318600.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
431727,2005-03,woodlands,4 room,537,Woodlands Drive 16,01 to 03,101.0,model a,uncategorized,2000,1.429658,103.792583,0.0,woodlands south,woodlands,north region,238500.0
431728,2016-04,sengkang,4 room,410A,fernvale road,13 to 15,95.0,premium apartment,uncategorized,2012,1.390053,103.875941,0.0,fernvale,sengkang,north-east region,376200.0
431729,2011-01,tampines,3-room,829,tampines street 81,01 to 03,67.0,new generation,uncategorized,1986,1.349224,103.934913,0.0,tampines west,tampines,east region,255600.0
431730,2013-05,sengkang,5-room,233,compassvale walk,16 to 18,123.0,improved,uncategorized,1999,1.389941,103.900721,0.0,sengkang town centre,sengkang,north-east region,508500.0


In [165]:
with open('./auxiliary-data/distance-to-mrt.csv', newline='') as f:
    reader = csv.reader(f)
    mrt_list = list(reader)

In [166]:
train['dist_mrt'] = np.array(mrt_list)
train.head()

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,resale_price,dist_mrt
0,2001-08,pasir ris,4 room,440,pasir ris drive 4,01 to 03,118.0,model a,uncategorized,1989,1.369008,103.958697,0.0,pasir ris drive,pasir ris,east region,209700.0,1.137651115
1,2014-10,punggol,5-room,196B,punggol field,10 to 12,110.0,improved,uncategorized,2003,1.399007,103.906991,0.0,punggol field,punggol,north-east region,402300.0,0.118453389
2,2020-09,sengkang,5 room,404A,fernvale lane,01 to 03,112.0,premium apartment,uncategorized,2004,1.388348,103.873815,0.0,fernvale,sengkang,north-east region,351000.0,0.479542021
3,2000-10,clementi,3 room,375,clementi avenue 4,07 to 09,67.0,new generation,uncategorized,1980,1.318493,103.766702,0.0,clementi north,clementi,west region,151200.0,0.421345802
4,2013-01,bukit batok,3-room,163,bukit batok street 11,07 to 09,73.0,model a,uncategorized,1985,1.348149,103.742658,0.0,bukit batok west,bukit batok,west region,318600.0,0.775005151


## Convert "storey_range" into Numerical Data Type "storey_average"

In [167]:
# first number in "storey_range"
train['lower'] = train['storey_range'].str.extract('(\d+)')
train['upper'] = train['storey_range'].str.extract('(\d+$)')
# calculate average storey from the range
train['storey_average'] = train['upper'].astype(int) + train['lower'].astype(int)
train['storey_average'] = train['storey_average'] / 2.0
train = train.drop(columns=['storey_range'])
train.head()

Unnamed: 0,month,town,flat_type,block,street_name,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,resale_price,dist_mrt,lower,upper,storey_average
0,2001-08,pasir ris,4 room,440,pasir ris drive 4,118.0,model a,uncategorized,1989,1.369008,103.958697,0.0,pasir ris drive,pasir ris,east region,209700.0,1.137651115,1,3,2.0
1,2014-10,punggol,5-room,196B,punggol field,110.0,improved,uncategorized,2003,1.399007,103.906991,0.0,punggol field,punggol,north-east region,402300.0,0.118453389,10,12,11.0
2,2020-09,sengkang,5 room,404A,fernvale lane,112.0,premium apartment,uncategorized,2004,1.388348,103.873815,0.0,fernvale,sengkang,north-east region,351000.0,0.479542021,1,3,2.0
3,2000-10,clementi,3 room,375,clementi avenue 4,67.0,new generation,uncategorized,1980,1.318493,103.766702,0.0,clementi north,clementi,west region,151200.0,0.421345802,7,9,8.0
4,2013-01,bukit batok,3-room,163,bukit batok street 11,73.0,model a,uncategorized,1985,1.348149,103.742658,0.0,bukit batok west,bukit batok,west region,318600.0,0.775005151,7,9,8.0


## Convert "flat_type" to Ordinal Data Type "flattype_mapping"

In [168]:
train = train.replace(to_replace ="5 room", value ="5-room")
train = train.replace(to_replace ="4 room", value ="4-room")
train = train.replace(to_replace ="3 room", value ="3-room")
train = train.replace(to_replace ="2 room", value ="2-room")
train = train.replace(to_replace ="1 room", value ="1-room")
flattype_mapping={"1-room": 1,
                   "2-room": 2,
                   "3-room": 3,
                   "4-room": 4,
                   "5-room": 5,
                   "multi generation": 6,
                   "executive": 7,}
train["flattype_mapping"] = train['flat_type'].map(flattype_mapping)
train = train.drop(columns=['flat_type'])
train.head()

Unnamed: 0,month,town,block,street_name,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,resale_price,dist_mrt,lower,upper,storey_average,flattype_mapping
0,2001-08,pasir ris,440,pasir ris drive 4,118.0,model a,uncategorized,1989,1.369008,103.958697,0.0,pasir ris drive,pasir ris,east region,209700.0,1.137651115,1,3,2.0,4
1,2014-10,punggol,196B,punggol field,110.0,improved,uncategorized,2003,1.399007,103.906991,0.0,punggol field,punggol,north-east region,402300.0,0.118453389,10,12,11.0,5
2,2020-09,sengkang,404A,fernvale lane,112.0,premium apartment,uncategorized,2004,1.388348,103.873815,0.0,fernvale,sengkang,north-east region,351000.0,0.479542021,1,3,2.0,5
3,2000-10,clementi,375,clementi avenue 4,67.0,new generation,uncategorized,1980,1.318493,103.766702,0.0,clementi north,clementi,west region,151200.0,0.421345802,7,9,8.0,3
4,2013-01,bukit batok,163,bukit batok street 11,73.0,model a,uncategorized,1985,1.348149,103.742658,0.0,bukit batok west,bukit batok,west region,318600.0,0.775005151,7,9,8.0,3


## Convert "region" to Binary Data Type

In [169]:
one_hot_cols = pd.get_dummies(train, columns=['region'], prefix=['region'])
train = train.merge(one_hot_cols)
train = train.drop(columns=['region'])
train.head()

Unnamed: 0,month,town,block,street_name,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,longitude,...,dist_mrt,lower,upper,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region
0,2001-08,pasir ris,440,pasir ris drive 4,118.0,model a,uncategorized,1989,1.369008,103.958697,...,1.137651115,1,3,2.0,4,0,1,0,0,0
1,2014-10,punggol,196B,punggol field,110.0,improved,uncategorized,2003,1.399007,103.906991,...,0.118453389,10,12,11.0,5,0,0,0,1,0
2,2020-09,sengkang,404A,fernvale lane,112.0,premium apartment,uncategorized,2004,1.388348,103.873815,...,0.479542021,1,3,2.0,5,0,0,0,1,0
3,2000-10,clementi,375,clementi avenue 4,67.0,new generation,uncategorized,1980,1.318493,103.766702,...,0.421345802,7,9,8.0,3,0,0,0,0,1
4,2013-01,bukit batok,163,bukit batok street 11,73.0,model a,uncategorized,1985,1.348149,103.742658,...,0.775005151,7,9,8.0,3,0,0,0,0,1


## Feature engineering to create "remaining_lease" feature from "lease_commencement_date"

In [170]:
lease_commence_date = train['lease_commence_date'].to_numpy()
#assuming 99-year lease, we calculate the remaining years on the lease
remaining_lease = lease_commence_date + 99 - pd.DatetimeIndex(train['month']).year.to_numpy()
train['remaining_lease'] = remaining_lease
train = train.drop(columns=['lease_commence_date'])
train.head()

Unnamed: 0,month,town,block,street_name,floor_area_sqm,flat_model,eco_category,latitude,longitude,elevation,...,lower,upper,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease
0,2001-08,pasir ris,440,pasir ris drive 4,118.0,model a,uncategorized,1.369008,103.958697,0.0,...,1,3,2.0,4,0,1,0,0,0,87
1,2014-10,punggol,196B,punggol field,110.0,improved,uncategorized,1.399007,103.906991,0.0,...,10,12,11.0,5,0,0,0,1,0,88
2,2020-09,sengkang,404A,fernvale lane,112.0,premium apartment,uncategorized,1.388348,103.873815,0.0,...,1,3,2.0,5,0,0,0,1,0,83
3,2000-10,clementi,375,clementi avenue 4,67.0,new generation,uncategorized,1.318493,103.766702,0.0,...,7,9,8.0,3,0,0,0,0,1,79
4,2013-01,bukit batok,163,bukit batok street 11,73.0,model a,uncategorized,1.348149,103.742658,0.0,...,7,9,8.0,3,0,0,0,0,1,71


## Convert "flat_model" to Ordinal Data Type

In [171]:
flat_models = train.groupby('flat_model')
keys = flat_models.groups.keys()

flat_type_dict = {}

# store the average "resale_price" of each "flat_model" in a dictionary with
#    key : "flat_model"
#    value : average "resale_price"
for key in keys:
    df_i = flat_models.get_group(key)
    flat_type_dict.update({key : df_i["resale_price"].mean()})

# ordering the different "flat_model" by average "resale_price" (ascending order)
{k: v for k, v in sorted(flat_type_dict.items(), key=lambda item: item[1])}
# print(flat_type_dict)
# print(flat_type_dict.keys())

# save the ordered list of keys from dictionary as a list for easier index finding
flat_model_list = list(flat_type_dict.keys())

for flat_model in flat_model_list:
    ordinal_value = flat_model_list.index(flat_model) + 1
    train['flat_model'] = train['flat_model'].str.replace(rf'^{flat_model}$', str(ordinal_value), regex=True)
    
train.head()

Unnamed: 0,month,town,block,street_name,floor_area_sqm,flat_model,eco_category,latitude,longitude,elevation,...,lower,upper,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease
0,2001-08,pasir ris,440,pasir ris drive 4,118.0,8,uncategorized,1.369008,103.958697,0.0,...,1,3,2.0,4,0,1,0,0,0,87
1,2014-10,punggol,196B,punggol field,110.0,5,uncategorized,1.399007,103.906991,0.0,...,10,12,11.0,5,0,0,0,1,0,88
2,2020-09,sengkang,404A,fernvale lane,112.0,13,uncategorized,1.388348,103.873815,0.0,...,1,3,2.0,5,0,0,0,1,0,83
3,2000-10,clementi,375,clementi avenue 4,67.0,12,uncategorized,1.318493,103.766702,0.0,...,7,9,8.0,3,0,0,0,0,1,79
4,2013-01,bukit batok,163,bukit batok street 11,73.0,8,uncategorized,1.348149,103.742658,0.0,...,7,9,8.0,3,0,0,0,0,1,71


## Drop Columns "block", "street_name", and "eco_category"

In [172]:
train = train.drop(columns=['block', 'street_name', 'eco_category'])
train.head()

Unnamed: 0,month,town,floor_area_sqm,flat_model,latitude,longitude,elevation,subzone,planning_area,resale_price,...,lower,upper,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease
0,2001-08,pasir ris,118.0,8,1.369008,103.958697,0.0,pasir ris drive,pasir ris,209700.0,...,1,3,2.0,4,0,1,0,0,0,87
1,2014-10,punggol,110.0,5,1.399007,103.906991,0.0,punggol field,punggol,402300.0,...,10,12,11.0,5,0,0,0,1,0,88
2,2020-09,sengkang,112.0,13,1.388348,103.873815,0.0,fernvale,sengkang,351000.0,...,1,3,2.0,5,0,0,0,1,0,83
3,2000-10,clementi,67.0,12,1.318493,103.766702,0.0,clementi north,clementi,151200.0,...,7,9,8.0,3,0,0,0,0,1,79
4,2013-01,bukit batok,73.0,8,1.348149,103.742658,0.0,bukit batok west,bukit batok,318600.0,...,7,9,8.0,3,0,0,0,0,1,71


In [173]:
train = train.drop(columns=['elevation','town','subzone','planning_area'])
train.head()

Unnamed: 0,month,floor_area_sqm,flat_model,latitude,longitude,resale_price,dist_mrt,lower,upper,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease
0,2001-08,118.0,8,1.369008,103.958697,209700.0,1.137651115,1,3,2.0,4,0,1,0,0,0,87
1,2014-10,110.0,5,1.399007,103.906991,402300.0,0.118453389,10,12,11.0,5,0,0,0,1,0,88
2,2020-09,112.0,13,1.388348,103.873815,351000.0,0.479542021,1,3,2.0,5,0,0,0,1,0,83
3,2000-10,67.0,12,1.318493,103.766702,151200.0,0.421345802,7,9,8.0,3,0,0,0,0,1,79
4,2013-01,73.0,8,1.348149,103.742658,318600.0,0.775005151,7,9,8.0,3,0,0,0,0,1,71


## Convert "month" to datetime

In [174]:
train['datetime_month'] = pd.to_datetime(train['month'])
train['datetime_month']= train['datetime_month'].map(dt.datetime.toordinal)

In [175]:
train_with_data = train.copy()
train_with_data.head()

Unnamed: 0,month,floor_area_sqm,flat_model,latitude,longitude,resale_price,dist_mrt,lower,upper,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease,datetime_month
0,2001-08,118.0,8,1.369008,103.958697,209700.0,1.137651115,1,3,2.0,4,0,1,0,0,0,87,730698
1,2014-10,110.0,5,1.399007,103.906991,402300.0,0.118453389,10,12,11.0,5,0,0,0,1,0,88,735507
2,2020-09,112.0,13,1.388348,103.873815,351000.0,0.479542021,1,3,2.0,5,0,0,0,1,0,83,737669
3,2000-10,67.0,12,1.318493,103.766702,151200.0,0.421345802,7,9,8.0,3,0,0,0,0,1,79,730394
4,2013-01,73.0,8,1.348149,103.742658,318600.0,0.775005151,7,9,8.0,3,0,0,0,0,1,71,734869


## Pre-processing on Test Dataset

## Feature Engineering to Create "dist_mrt" from Auxiliary Data

In [176]:
test.dropna()
test = test.reset_index(drop=True)
test

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region
0,2004-01,bukit batok,4 room,186,bukit batok west avenue 6,04 to 06,94.0,new generation,uncategorized,1989,1.346581,103.744085,0.0,bukit batok west,bukit batok,west region
1,2001-11,tampines,5 room,366,tampines street 34,04 to 06,122.0,improved,uncategorized,1997,1.357618,103.961379,0.0,tampines east,tampines,east region
2,2002-07,jurong east,3 room,206,jurong east street 21,01 to 03,67.0,new generation,uncategorized,1982,1.337804,103.741998,0.0,toh guan,jurong east,west region
3,2015-04,ang mo kio,3 room,180,Ang Mo Kio Avenue 5,04 to 06,82.0,new generation,uncategorized,1981,1.380084,103.849574,0.0,yio chu kang east,ang mo kio,north-east region
4,2004-04,clementi,5 room,356,clementi avenue 2,01 to 03,117.0,standard,uncategorized,1978,1.313960,103.769831,0.0,clementi north,clementi,west region
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107929,2008-04,hougang,5 room,981D,buangkok crescent,10 to 12,110.0,improved,uncategorized,2003,1.380452,103.879333,0.0,trafalgar,hougang,north-east region
107930,2006-01,kallang/whampoa,4 room,13,upper boon keng road,13 to 15,102.0,model a,uncategorized,1999,1.314481,103.870458,0.0,boon keng,kallang,central region
107931,2000-01,kallang/whampoa,3 room,1,beach road,07 to 09,68.0,improved,uncategorized,1979,1.294924,103.854315,0.0,city hall,downtown core,central region
107932,2009-07,jurong west,4 room,919,jurong west street 91,10 to 12,104.0,model a,uncategorized,1988,1.339927,103.687354,0.0,yunnan,jurong west,west region


In [177]:
with open('./auxiliary-data/distance-to-mrt-test-edited.csv', newline='') as f:
    reader = csv.reader(f)
    mrt_list = list(reader)

In [178]:
test['dist_mrt'] = np.array(mrt_list)
test.head()

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,dist_mrt
0,2004-01,bukit batok,4 room,186,bukit batok west avenue 6,04 to 06,94.0,new generation,uncategorized,1989,1.346581,103.744085,0.0,bukit batok west,bukit batok,west region,0.667553928
1,2001-11,tampines,5 room,366,tampines street 34,04 to 06,122.0,improved,uncategorized,1997,1.357618,103.961379,0.0,tampines east,tampines,east region,0.767025394
2,2002-07,jurong east,3 room,206,jurong east street 21,01 to 03,67.0,new generation,uncategorized,1982,1.337804,103.741998,0.0,toh guan,jurong east,west region,0.515380197
3,2015-04,ang mo kio,3 room,180,Ang Mo Kio Avenue 5,04 to 06,82.0,new generation,uncategorized,1981,1.380084,103.849574,0.0,yio chu kang east,ang mo kio,north-east region,0.547114337
4,2004-04,clementi,5 room,356,clementi avenue 2,01 to 03,117.0,standard,uncategorized,1978,1.31396,103.769831,0.0,clementi north,clementi,west region,0.516298405


## Convert "storey_range" into Numerical Data Type "storey_average" 

In [179]:
test['lower'] = test['storey_range'].str.extract('(\d+)')
test['upper'] = test['storey_range'].str.extract('(\d+$)')
# calculate average storey from the range
test['storey_average'] = test['upper'].astype(int) + test['lower'].astype(int)
test['storey_average'] = test['storey_average'] / 2.0
test = test.drop(columns=['storey_range'])
test.head()

Unnamed: 0,month,town,flat_type,block,street_name,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,dist_mrt,lower,upper,storey_average
0,2004-01,bukit batok,4 room,186,bukit batok west avenue 6,94.0,new generation,uncategorized,1989,1.346581,103.744085,0.0,bukit batok west,bukit batok,west region,0.667553928,4,6,5.0
1,2001-11,tampines,5 room,366,tampines street 34,122.0,improved,uncategorized,1997,1.357618,103.961379,0.0,tampines east,tampines,east region,0.767025394,4,6,5.0
2,2002-07,jurong east,3 room,206,jurong east street 21,67.0,new generation,uncategorized,1982,1.337804,103.741998,0.0,toh guan,jurong east,west region,0.515380197,1,3,2.0
3,2015-04,ang mo kio,3 room,180,Ang Mo Kio Avenue 5,82.0,new generation,uncategorized,1981,1.380084,103.849574,0.0,yio chu kang east,ang mo kio,north-east region,0.547114337,4,6,5.0
4,2004-04,clementi,5 room,356,clementi avenue 2,117.0,standard,uncategorized,1978,1.31396,103.769831,0.0,clementi north,clementi,west region,0.516298405,1,3,2.0


## Convert "flat_type" to Ordinal Data Type "flattype_mapping"

In [180]:
test = test.replace(to_replace ="5 room", value ="5-room")
test = test.replace(to_replace ="4 room", value ="4-room")
test = test.replace(to_replace ="3 room", value ="3-room")
test = test.replace(to_replace ="2 room", value ="2-room")
test = test.replace(to_replace ="1 room", value ="1-room")
flattype_mapping={"1-room": 1,
                   "2-room": 2,
                   "3-room": 3,
                   "4-room": 4,
                   "5-room": 5,
                   "multi generation": 6,
                   "executive": 7,}
test["flattype_mapping"] = test['flat_type'].map(flattype_mapping)
test = test.drop(columns=['flat_type'])
test.head()

Unnamed: 0,month,town,block,street_name,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,dist_mrt,lower,upper,storey_average,flattype_mapping
0,2004-01,bukit batok,186,bukit batok west avenue 6,94.0,new generation,uncategorized,1989,1.346581,103.744085,0.0,bukit batok west,bukit batok,west region,0.667553928,4,6,5.0,4
1,2001-11,tampines,366,tampines street 34,122.0,improved,uncategorized,1997,1.357618,103.961379,0.0,tampines east,tampines,east region,0.767025394,4,6,5.0,5
2,2002-07,jurong east,206,jurong east street 21,67.0,new generation,uncategorized,1982,1.337804,103.741998,0.0,toh guan,jurong east,west region,0.515380197,1,3,2.0,3
3,2015-04,ang mo kio,180,Ang Mo Kio Avenue 5,82.0,new generation,uncategorized,1981,1.380084,103.849574,0.0,yio chu kang east,ang mo kio,north-east region,0.547114337,4,6,5.0,3
4,2004-04,clementi,356,clementi avenue 2,117.0,standard,uncategorized,1978,1.31396,103.769831,0.0,clementi north,clementi,west region,0.516298405,1,3,2.0,5


## Convert "region" to Binary Data Type

In [181]:
one_hot_cols = pd.get_dummies(test, columns=['region'], prefix=['region'])
test = test.merge(one_hot_cols)
test = test.drop(columns=['region'])
test.head()

Unnamed: 0,month,town,block,street_name,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,longitude,...,dist_mrt,lower,upper,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region
0,2004-01,bukit batok,186,bukit batok west avenue 6,94.0,new generation,uncategorized,1989,1.346581,103.744085,...,0.667553928,4,6,5.0,4,0,0,0,0,1
1,2001-11,tampines,366,tampines street 34,122.0,improved,uncategorized,1997,1.357618,103.961379,...,0.767025394,4,6,5.0,5,0,1,0,0,0
2,2002-07,jurong east,206,jurong east street 21,67.0,new generation,uncategorized,1982,1.337804,103.741998,...,0.515380197,1,3,2.0,3,0,0,0,0,1
3,2015-04,ang mo kio,180,Ang Mo Kio Avenue 5,82.0,new generation,uncategorized,1981,1.380084,103.849574,...,0.547114337,4,6,5.0,3,0,0,0,1,0
4,2004-04,clementi,356,clementi avenue 2,117.0,standard,uncategorized,1978,1.31396,103.769831,...,0.516298405,1,3,2.0,5,0,0,0,0,1


## Feature engineering to create "remaining_lease" feature from "lease_commencement_date"

In [182]:
lease_commence_date = test['lease_commence_date'].to_numpy()
#assuming 99-year lease, we calculate the remaining years on the lease
remaining_lease = lease_commence_date + 99 - pd.DatetimeIndex(test['month']).year.to_numpy()
test['remaining_lease'] = remaining_lease
test = test.drop(columns=['lease_commence_date'])
test.head()

Unnamed: 0,month,town,block,street_name,floor_area_sqm,flat_model,eco_category,latitude,longitude,elevation,...,lower,upper,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease
0,2004-01,bukit batok,186,bukit batok west avenue 6,94.0,new generation,uncategorized,1.346581,103.744085,0.0,...,4,6,5.0,4,0,0,0,0,1,84
1,2001-11,tampines,366,tampines street 34,122.0,improved,uncategorized,1.357618,103.961379,0.0,...,4,6,5.0,5,0,1,0,0,0,95
2,2002-07,jurong east,206,jurong east street 21,67.0,new generation,uncategorized,1.337804,103.741998,0.0,...,1,3,2.0,3,0,0,0,0,1,79
3,2015-04,ang mo kio,180,Ang Mo Kio Avenue 5,82.0,new generation,uncategorized,1.380084,103.849574,0.0,...,4,6,5.0,3,0,0,0,1,0,65
4,2004-04,clementi,356,clementi avenue 2,117.0,standard,uncategorized,1.31396,103.769831,0.0,...,1,3,2.0,5,0,0,0,0,1,73


## Convert "flat_model" to Ordinal Data Type

In [183]:
# uses the same "flat_model_list" extracted from the "flat_model" dictionary calculated from train dataset
for flat_model in flat_model_list:
    ordinal_value = flat_model_list.index(flat_model) + 1
    test['flat_model'] = test['flat_model'].str.replace(rf'^{flat_model}$', str(ordinal_value), regex=True)

test.head()

Unnamed: 0,month,town,block,street_name,floor_area_sqm,flat_model,eco_category,latitude,longitude,elevation,...,lower,upper,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease
0,2004-01,bukit batok,186,bukit batok west avenue 6,94.0,12,uncategorized,1.346581,103.744085,0.0,...,4,6,5.0,4,0,0,0,0,1,84
1,2001-11,tampines,366,tampines street 34,122.0,5,uncategorized,1.357618,103.961379,0.0,...,4,6,5.0,5,0,1,0,0,0,95
2,2002-07,jurong east,206,jurong east street 21,67.0,12,uncategorized,1.337804,103.741998,0.0,...,1,3,2.0,3,0,0,0,0,1,79
3,2015-04,ang mo kio,180,Ang Mo Kio Avenue 5,82.0,12,uncategorized,1.380084,103.849574,0.0,...,4,6,5.0,3,0,0,0,1,0,65
4,2004-04,clementi,356,clementi avenue 2,117.0,17,uncategorized,1.31396,103.769831,0.0,...,1,3,2.0,5,0,0,0,0,1,73


## Drop Columns "block", "street_name", and "eco_category"

In [184]:
test = test.drop(columns=['block', 'street_name', 'eco_category'])
test.head()

Unnamed: 0,month,town,floor_area_sqm,flat_model,latitude,longitude,elevation,subzone,planning_area,dist_mrt,lower,upper,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease
0,2004-01,bukit batok,94.0,12,1.346581,103.744085,0.0,bukit batok west,bukit batok,0.667553928,4,6,5.0,4,0,0,0,0,1,84
1,2001-11,tampines,122.0,5,1.357618,103.961379,0.0,tampines east,tampines,0.767025394,4,6,5.0,5,0,1,0,0,0,95
2,2002-07,jurong east,67.0,12,1.337804,103.741998,0.0,toh guan,jurong east,0.515380197,1,3,2.0,3,0,0,0,0,1,79
3,2015-04,ang mo kio,82.0,12,1.380084,103.849574,0.0,yio chu kang east,ang mo kio,0.547114337,4,6,5.0,3,0,0,0,1,0,65
4,2004-04,clementi,117.0,17,1.31396,103.769831,0.0,clementi north,clementi,0.516298405,1,3,2.0,5,0,0,0,0,1,73


In [185]:
test = test.drop(columns=['elevation','town','subzone','planning_area'])
test.head()

Unnamed: 0,month,floor_area_sqm,flat_model,latitude,longitude,dist_mrt,lower,upper,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease
0,2004-01,94.0,12,1.346581,103.744085,0.667553928,4,6,5.0,4,0,0,0,0,1,84
1,2001-11,122.0,5,1.357618,103.961379,0.767025394,4,6,5.0,5,0,1,0,0,0,95
2,2002-07,67.0,12,1.337804,103.741998,0.515380197,1,3,2.0,3,0,0,0,0,1,79
3,2015-04,82.0,12,1.380084,103.849574,0.547114337,4,6,5.0,3,0,0,0,1,0,65
4,2004-04,117.0,17,1.31396,103.769831,0.516298405,1,3,2.0,5,0,0,0,0,1,73


## Convert "month" to datetime

In [186]:
test['datetime_month'] = pd.to_datetime(test['month'])
test['datetime_month']= test['datetime_month'].map(dt.datetime.toordinal)

In [187]:
test.head()

Unnamed: 0,month,floor_area_sqm,flat_model,latitude,longitude,dist_mrt,lower,upper,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease,datetime_month
0,2004-01,94.0,12,1.346581,103.744085,0.667553928,4,6,5.0,4,0,0,0,0,1,84,731581
1,2001-11,122.0,5,1.357618,103.961379,0.767025394,4,6,5.0,5,0,1,0,0,0,95,730790
2,2002-07,67.0,12,1.337804,103.741998,0.515380197,1,3,2.0,3,0,0,0,0,1,79,731032
3,2015-04,82.0,12,1.380084,103.849574,0.547114337,4,6,5.0,3,0,0,0,1,0,65,735689
4,2004-04,117.0,17,1.31396,103.769831,0.516298405,1,3,2.0,5,0,0,0,0,1,73,731672


# Linear Regression implementation for full dataset

In [26]:
X = train.drop (["resale_price"],axis = 1)
y = train["resale_price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [27]:
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression()

In [28]:
print(regressor.intercept_)
print(regressor.coef_)
print('Training score: {}'.format(regressor.score(X_train, y_train)))
print('Test score: {}'.format(regressor.score(X_test, y_test)))

-29578542.44417625
[ 3.64634067e+01  2.41284647e+03  9.88004200e+02 -9.88322329e+05
  3.76200389e+04 -1.56009811e+04 -3.60901041e+04  3.83321475e+04
  1.12102170e+03  1.64102607e+04  2.36717556e+04 -1.29708516e+04
  2.06437277e+04  3.70706495e+03 -3.50516967e+04  2.22881528e+03]
Training score: 0.7915196159422416
Test score: 0.7896683181682156


In [29]:
y_pred = regressor.predict(X_test)
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(df)
print('RMSE for Linear Regression => ',np.sqrt(mean_squared_error(y_test,y_pred)))

          Actual      Predicted
205085  414000.0  321251.675587
394509  328500.0  377537.550936
283069   90000.0  160895.924973
69767   225000.0  292097.153898
384613  202500.0  176501.151111
...          ...            ...
413904  279000.0  275790.558759
399764  328500.0  313847.765426
142745  297000.0  288016.756321
8845    310500.0  328663.581610
383030  137700.0  157372.434380

[86476 rows x 2 columns]
RMSE for Linear Regression =>  59064.351559932045


In [30]:
regressor = LinearRegression()
regressor.fit(X, y)

LinearRegression()

In [31]:
y_pred = regressor.predict(X)
df = pd.DataFrame({'Actual': y, 'Predicted': y_pred})
print(df)

          Actual      Predicted
0       209700.0  226048.044937
1       402300.0  428609.102053
2       351000.0  482433.949383
3       151200.0  113213.935480
4       318600.0  233385.832714
...          ...            ...
432371  238500.0  228483.682232
432372  376200.0  435768.424505
432373  255600.0  214448.282281
432374  508500.0  457083.305809
432375  162000.0  235424.226545

[432376 rows x 2 columns]


# Polinomial regression for full dataset

In [32]:
poly_reg = PolynomialFeatures(degree = 3)
X_poly = poly_reg.fit_transform(X)
print(len(X_poly))
print(len(y))
lin_reg = LinearRegression()
lin_reg.fit(X_poly,y)

432376
432376


LinearRegression()

In [33]:
print('Training score: {}'.format(lin_reg.score(X_poly, y)))
# print('Training score: {}'.format(lin_reg.score(X_test, y_test)))

Training score: 0.8689064390831351


In [34]:
y_pred = lin_reg.predict(X_poly)
df = pd.DataFrame({'Real Values':y, 'Predicted Values':y_pred})
df

Unnamed: 0,Real Values,Predicted Values
0,209700.0,214498.762833
1,402300.0,396804.755020
2,351000.0,497829.905411
3,151200.0,125826.903458
4,318600.0,253205.075333
...,...,...
432371,238500.0,218000.959122
432372,376200.0,459742.926895
432373,255600.0,240746.594864
432374,508500.0,417157.145645


In [35]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
steps = [
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=3)),
    ('model', LinearRegression())
]
pipeline = Pipeline(steps)
pipeline.fit(X_train, y_train)
print('Training score: {}'.format(pipeline.score(X_train, y_train)))
print('Test score: {}'.format(pipeline.score(X_test, y_test)))

Training score: 0.9322869851053108
Test score: 0.9315502824110583


In [36]:
y_pred_pipeline = pipeline.predict(X_test)
df = pd.DataFrame({'Real Values':y_test, 'Predicted Values':y_pred_pipeline})
print(df)
mse = mean_squared_error(y_test, y_pred_pipeline)
rmse = math.sqrt(mse)
print('RMSE: {}'.format(rmse))

        Real Values  Predicted Values
205085     414000.0        349745.750
394509     328500.0        320587.500
283069      90000.0         89865.500
69767      225000.0        198110.500
384613     202500.0        204470.125
...             ...               ...
413904     279000.0        299480.500
399764     328500.0        308677.750
142745     297000.0        293083.125
8845       310500.0        354967.000
383030     137700.0        144179.000

[86476 rows x 2 columns]
RMSE: 33694.49280928028


In [37]:
from sklearn.pipeline import Pipeline

steps = [
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=4)),
    ('model', LinearRegression())
]
pipeline = Pipeline(steps)
pipeline.fit(X_train, y_train)
print('Training score: {}'.format(pipeline.score(X_train, y_train)))
print('Test score: {}'.format(pipeline.score(X_test, y_test)))

KeyboardInterrupt: 

In [None]:
y_pred_pipeline = pipeline.predict(X_test)
df = pd.DataFrame({'Real Values':y_test, 'Predicted Values':y_pred_pipeline})
print(df)
mse = mean_squared_error(y_test, y_pred_pipeline)
rmse = math.sqrt(mse)
print('RMSE: {}'.format(rmse))

In [84]:
from sklearn import linear_model
from sklearn import svm

classifiers = [
    linear_model.SGDRegressor(),
    linear_model.BayesianRidge(),
    linear_model.LassoLars(),
    linear_model.ARDRegression(),
    linear_model.PassiveAggressiveRegressor(),
    linear_model.TheilSenRegressor()]


for item in classifiers:
    print(item)
    clf = item
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = math.sqrt(mse)
    print('RMSE: {}'.format(rmse))

SGDRegressor()
RMSE: 1.3537643017128187e+20
BayesianRidge()
RMSE: 39840.85915827851
LassoLars()
RMSE: 39889.99849207766
ARDRegression()
RMSE: 39841.43144699237
PassiveAggressiveRegressor()
RMSE: 127451.10320480415
TheilSenRegressor(max_subpopulation=10000)
RMSE: 5345571.168435565


# Split for 3 periods 

In [188]:
train.head()


Unnamed: 0,month,floor_area_sqm,flat_model,latitude,longitude,resale_price,dist_mrt,lower,upper,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease,datetime_month
0,2001-08,118.0,8,1.369008,103.958697,209700.0,1.137651115,1,3,2.0,4,0,1,0,0,0,87,730698
1,2014-10,110.0,5,1.399007,103.906991,402300.0,0.118453389,10,12,11.0,5,0,0,0,1,0,88,735507
2,2020-09,112.0,13,1.388348,103.873815,351000.0,0.479542021,1,3,2.0,5,0,0,0,1,0,83,737669
3,2000-10,67.0,12,1.318493,103.766702,151200.0,0.421345802,7,9,8.0,3,0,0,0,0,1,79,730394
4,2013-01,73.0,8,1.348149,103.742658,318600.0,0.775005151,7,9,8.0,3,0,0,0,0,1,71,734869


In [189]:
test.head()

Unnamed: 0,month,floor_area_sqm,flat_model,latitude,longitude,dist_mrt,lower,upper,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease,datetime_month
0,2004-01,94.0,12,1.346581,103.744085,0.667553928,4,6,5.0,4,0,0,0,0,1,84,731581
1,2001-11,122.0,5,1.357618,103.961379,0.767025394,4,6,5.0,5,0,1,0,0,0,95,730790
2,2002-07,67.0,12,1.337804,103.741998,0.515380197,1,3,2.0,3,0,0,0,0,1,79,731032
3,2015-04,82.0,12,1.380084,103.849574,0.547114337,4,6,5.0,3,0,0,0,1,0,65,735689
4,2004-04,117.0,17,1.31396,103.769831,0.516298405,1,3,2.0,5,0,0,0,0,1,73,731672


Splitting data 

In [86]:
train_first = train_with_data[train_with_data['month']< '2007-01']
train_second = train_with_data[train_with_data['month'] > '2007-01']
train_second = train_second[train_second['month']< '2013-01']
train_third = train_with_data[train_with_data['month']>'2013-01']

convert date to dataframe format

In [87]:
train_first['datetime_month'] = pd.to_datetime(train_first['month'])
train_first['datetime_month']=train_first['datetime_month'].map(dt.datetime.toordinal)
train_second['datetime_month'] = pd.to_datetime(train_second['month'])
train_second['datetime_month']=train_second['datetime_month'].map(dt.datetime.toordinal)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_first['month'] = pd.to_datetime(train_first['month'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_first['month']=train_first['month'].map(dt.datetime.toordinal)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_third['month'] = pd.to_datetime(train_third['month'])
A value is tryin

train first dataset

In [88]:
X = train_first.drop (["resale_price"],axis = 1)
y = train_first["resale_price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [89]:
regressor1 = LinearRegression()
regressor1.fit(X_train, y_train)
print('Training score: {}'.format(regressor1.score(X_train, y_train)))
print('Test score: {}'.format(regressor1.score(X_test, y_test)))
y_pred = regressor1.predict(X_test)
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(df)
print('RMSE for Linear Regression=>',np.sqrt(mean_squared_error(y_test,y_pred)))

Training score: 0.8574049453953243
Test score: 0.8586169601696155
          Actual      Predicted
49377   135000.0  136039.430663
236602  184500.0  193108.222995
174747  272700.0  272322.936568
137429  249300.0  253741.885109
419928  319500.0  350455.396843
...          ...            ...
200987  142200.0  136341.138528
427881  260100.0  319945.159026
306968  349200.0  290370.900380
179720  135000.0  102860.515114
414925  238500.0  257188.267741

[35449 rows x 2 columns]
RMSE for Linear Regression=> 30318.048288043996


In [90]:
steps = [
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=3)),
    ('model', LinearRegression())
]

pipeline1 = Pipeline(steps)

pipeline1.fit(X_train, y_train)

print('Training score: {}'.format(pipeline1.score(X_train, y_train)))
print('Test score: {}'.format(pipeline1.score(X_test, y_test)))

Training score: 0.9549281535197599
Test score: 0.9546939056325854


In [91]:
y_pred_pipeline = pipeline1.predict(X_test)
df = pd.DataFrame({'Real Values':y_test, 'Predicted Values':y_pred_pipeline})
print(df)
mse = mean_squared_error(y_test, y_pred_pipeline)
rmse = math.sqrt(mse)
print('RMSE: {}'.format(rmse))

        Real Values  Predicted Values
49377      135000.0     135271.261719
236602     184500.0     189958.980469
174747     272700.0     267893.011719
137429     249300.0     247772.292969
419928     319500.0     324410.074219
...             ...               ...
200987     142200.0     142541.542969
427881     260100.0     317374.167969
306968     349200.0     330728.574219
179720     135000.0     143479.949219
414925     238500.0     230326.765625

[35449 rows x 2 columns]
RMSE: 17162.51364087483


train second dataset

In [92]:
X = train_second.drop (["resale_price"],axis = 1)
y = train_second["resale_price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [93]:
regressor2 = LinearRegression()
regressor2.fit(X_train, y_train)
print('Training score: {}'.format(regressor2.score(X_train, y_train)))
print('Test score: {}'.format(regressor2.score(X_test, y_test)))
y_pred = regressor2.predict(X_test)
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(df)
print('RMSE for Linear Regression=>',np.sqrt(mean_squared_error(y_test,y_pred)))

Training score: 0.8631737131355315
Test score: 0.8614856930566268
          Actual      Predicted
46956   461700.0  425711.283471
357492  229500.0  303833.513203
307424  450000.0  463831.255797
432363  279000.0  306407.577567
324661  400500.0  380242.397495
...          ...            ...
340007  314100.0  314727.395942
205710  218700.0  220642.784355
295833  483300.0  478559.175982
357376  270000.0  267128.879834
421925  310500.0  332123.298757

[25954 rows x 2 columns]
RMSE for Linear Regression=> 39840.964533250844


In [94]:
steps = [
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=3)),
    ('model', LinearRegression())
]

pipeline2 = Pipeline(steps)

pipeline2.fit(X_train, y_train)

print('Training score: {}'.format(pipeline2.score(X_train, y_train)))
print('Test score: {}'.format(pipeline2.score(X_test, y_test)))

Training score: 0.9516397470663902
Test score: 0.9502719677394591


In [95]:
y_pred_pipeline = pipeline2.predict(X_test)
df = pd.DataFrame({'Real Values':y_test, 'Predicted Values':y_pred_pipeline})
print(df)
mse = mean_squared_error(y_test, y_pred_pipeline)
rmse = math.sqrt(mse)
print('RMSE: {}'.format(rmse))

        Real Values  Predicted Values
46956      461700.0     430157.734375
357492     229500.0     260555.140625
307424     450000.0     501767.640625
432363     279000.0     295517.640625
324661     400500.0     383984.953125
...             ...               ...
340007     314100.0     285853.500000
205710     218700.0     176210.890625
295833     483300.0     488551.109375
357376     270000.0     266268.765625
421925     310500.0     312396.046875

[25954 rows x 2 columns]
RMSE: 23871.689917986412


train third dataset

In [96]:
X = train_second.drop (["resale_price"],axis = 1)
y = train_second["resale_price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [97]:
regressor3 = LinearRegression()
regressor3.fit(X_train, y_train)
print('Training score: {}'.format(regressor3.score(X_train, y_train)))
print('Test score: {}'.format(regressor3.score(X_test, y_test)))
y_pred = regressor3.predict(X_test)
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(df)
print('RMSE for Linear Regression=>',np.sqrt(mean_squared_error(y_test,y_pred)))

Training score: 0.8631737131355315
Test score: 0.8614856930566268
          Actual      Predicted
46956   461700.0  425711.283471
357492  229500.0  303833.513203
307424  450000.0  463831.255797
432363  279000.0  306407.577567
324661  400500.0  380242.397495
...          ...            ...
340007  314100.0  314727.395942
205710  218700.0  220642.784355
295833  483300.0  478559.175982
357376  270000.0  267128.879834
421925  310500.0  332123.298757

[25954 rows x 2 columns]
RMSE for Linear Regression=> 39840.964533250844


In [98]:
steps = [
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=3)),
    ('model', LinearRegression())
]

pipeline3 = Pipeline(steps)

pipeline3.fit(X_train, y_train)

print('Training score: {}'.format(pipeline3.score(X_train, y_train)))
print('Test score: {}'.format(pipeline3.score(X_test, y_test)))

Training score: 0.9516397470663902
Test score: 0.9502719677394591


In [99]:
y_pred_pipeline = pipeline3.predict(X_test)
df = pd.DataFrame({'Real Values':y_test, 'Predicted Values':y_pred_pipeline})
print(df)
mse = mean_squared_error(y_test, y_pred_pipeline)
rmse = math.sqrt(mse)
print('RMSE: {}'.format(rmse))

        Real Values  Predicted Values
46956      461700.0     430157.734375
357492     229500.0     260555.140625
307424     450000.0     501767.640625
432363     279000.0     295517.640625
324661     400500.0     383984.953125
...             ...               ...
340007     314100.0     285853.500000
205710     218700.0     176210.890625
295833     483300.0     488551.109375
357376     270000.0     266268.765625
421925     310500.0     312396.046875

[25954 rows x 2 columns]
RMSE: 23871.689917986412


In [158]:
test.head()

Unnamed: 0,month,floor_area_sqm,flat_model,latitude,longitude,dist_mrt,lower,upper,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease
0,2004-01,94.0,12,1.346581,103.744085,0.667553928,4,6,5.0,4,0,0,0,0,1,84
1,2001-11,122.0,5,1.357618,103.961379,0.767025394,4,6,5.0,5,0,1,0,0,0,95
2,2002-07,67.0,12,1.337804,103.741998,0.515380197,1,3,2.0,3,0,0,0,0,1,79
3,2015-04,82.0,12,1.380084,103.849574,0.547114337,4,6,5.0,3,0,0,0,1,0,65
4,2004-04,117.0,17,1.31396,103.769831,0.516298405,1,3,2.0,5,0,0,0,0,1,73


In [159]:
def rules(row):
    if row['month']< '2007-01':
        return regressor1.predict(row)
    elif row['month'] > '2007-01' and row['month']< '2013-01':
        return regressor2.predict(row)
    else:
        return regressor3.predict(row)

pred = test.apply(rules, axis=1)
print(pred)

AttributeError: 'Timestamp' object has no attribute 'map'