# EDA and Data Pre-processing

## Importing Packages

In [418]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pprint
import csv
import datetime as dt
from sklearn.metrics import classification_report, confusion_matrix, roc_curve
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import math

## Importing Dataset

In [419]:
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

## Pre-processing on Train Dataset

## Feature Engineering to Create "dist_mrt" from Auxiliary Data

In [420]:
# do not change order of this .dropna() function
train.dropna()
train = train.reset_index(drop=True)
train

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,resale_price
0,2001-08,pasir ris,4 room,440,pasir ris drive 4,01 to 03,118.0,model a,uncategorized,1989,1.369008,103.958697,0.0,pasir ris drive,pasir ris,east region,209700.0
1,2014-10,punggol,5-room,196B,punggol field,10 to 12,110.0,improved,uncategorized,2003,1.399007,103.906991,0.0,punggol field,punggol,north-east region,402300.0
2,2020-09,sengkang,5 room,404A,fernvale lane,01 to 03,112.0,premium apartment,uncategorized,2004,1.388348,103.873815,0.0,fernvale,sengkang,north-east region,351000.0
3,2000-10,clementi,3 room,375,clementi avenue 4,07 to 09,67.0,new generation,uncategorized,1980,1.318493,103.766702,0.0,clementi north,clementi,west region,151200.0
4,2013-01,bukit batok,3-room,163,bukit batok street 11,07 to 09,73.0,model a,uncategorized,1985,1.348149,103.742658,0.0,bukit batok west,bukit batok,west region,318600.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
431727,2005-03,woodlands,4 room,537,Woodlands Drive 16,01 to 03,101.0,model a,uncategorized,2000,1.429658,103.792583,0.0,woodlands south,woodlands,north region,238500.0
431728,2016-04,sengkang,4 room,410A,fernvale road,13 to 15,95.0,premium apartment,uncategorized,2012,1.390053,103.875941,0.0,fernvale,sengkang,north-east region,376200.0
431729,2011-01,tampines,3-room,829,tampines street 81,01 to 03,67.0,new generation,uncategorized,1986,1.349224,103.934913,0.0,tampines west,tampines,east region,255600.0
431730,2013-05,sengkang,5-room,233,compassvale walk,16 to 18,123.0,improved,uncategorized,1999,1.389941,103.900721,0.0,sengkang town centre,sengkang,north-east region,508500.0


In [421]:
with open('./auxiliary-data/distance-to-mrt.csv', newline='') as f:
    reader = csv.reader(f)
    mrt_list = list(reader)

In [422]:
train['dist_mrt'] = np.array(mrt_list)
train.head()

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,resale_price,dist_mrt
0,2001-08,pasir ris,4 room,440,pasir ris drive 4,01 to 03,118.0,model a,uncategorized,1989,1.369008,103.958697,0.0,pasir ris drive,pasir ris,east region,209700.0,1.137651115
1,2014-10,punggol,5-room,196B,punggol field,10 to 12,110.0,improved,uncategorized,2003,1.399007,103.906991,0.0,punggol field,punggol,north-east region,402300.0,0.118453389
2,2020-09,sengkang,5 room,404A,fernvale lane,01 to 03,112.0,premium apartment,uncategorized,2004,1.388348,103.873815,0.0,fernvale,sengkang,north-east region,351000.0,0.479542021
3,2000-10,clementi,3 room,375,clementi avenue 4,07 to 09,67.0,new generation,uncategorized,1980,1.318493,103.766702,0.0,clementi north,clementi,west region,151200.0,0.421345802
4,2013-01,bukit batok,3-room,163,bukit batok street 11,07 to 09,73.0,model a,uncategorized,1985,1.348149,103.742658,0.0,bukit batok west,bukit batok,west region,318600.0,0.775005151


## Convert "storey_range" into Numerical Data Type "storey_average"

In [423]:
# first number in "storey_range"
train['lower'] = train['storey_range'].str.extract('(\d+)')
train['upper'] = train['storey_range'].str.extract('(\d+$)')
# calculate average storey from the range
train['storey_average'] = train['upper'].astype(int) + train['lower'].astype(int)
train['storey_average'] = train['storey_average'] / 2.0
train = train.drop(columns=['storey_range'])
train.head()

Unnamed: 0,month,town,flat_type,block,street_name,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,resale_price,dist_mrt,lower,upper,storey_average
0,2001-08,pasir ris,4 room,440,pasir ris drive 4,118.0,model a,uncategorized,1989,1.369008,103.958697,0.0,pasir ris drive,pasir ris,east region,209700.0,1.137651115,1,3,2.0
1,2014-10,punggol,5-room,196B,punggol field,110.0,improved,uncategorized,2003,1.399007,103.906991,0.0,punggol field,punggol,north-east region,402300.0,0.118453389,10,12,11.0
2,2020-09,sengkang,5 room,404A,fernvale lane,112.0,premium apartment,uncategorized,2004,1.388348,103.873815,0.0,fernvale,sengkang,north-east region,351000.0,0.479542021,1,3,2.0
3,2000-10,clementi,3 room,375,clementi avenue 4,67.0,new generation,uncategorized,1980,1.318493,103.766702,0.0,clementi north,clementi,west region,151200.0,0.421345802,7,9,8.0
4,2013-01,bukit batok,3-room,163,bukit batok street 11,73.0,model a,uncategorized,1985,1.348149,103.742658,0.0,bukit batok west,bukit batok,west region,318600.0,0.775005151,7,9,8.0


## Convert "flat_type" to Ordinal Data Type "flattype_mapping"

In [424]:
train = train.replace(to_replace ="5 room", value ="5-room")
train = train.replace(to_replace ="4 room", value ="4-room")
train = train.replace(to_replace ="3 room", value ="3-room")
train = train.replace(to_replace ="2 room", value ="2-room")
train = train.replace(to_replace ="1 room", value ="1-room")
flattype_mapping={"1-room": 1,
                   "2-room": 2,
                   "3-room": 3,
                   "4-room": 4,
                   "5-room": 5,
                   "multi generation": 6,
                   "executive": 7,}
train["flattype_mapping"] = train['flat_type'].map(flattype_mapping)
train = train.drop(columns=['flat_type'])
train.head()

Unnamed: 0,month,town,block,street_name,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,resale_price,dist_mrt,lower,upper,storey_average,flattype_mapping
0,2001-08,pasir ris,440,pasir ris drive 4,118.0,model a,uncategorized,1989,1.369008,103.958697,0.0,pasir ris drive,pasir ris,east region,209700.0,1.137651115,1,3,2.0,4
1,2014-10,punggol,196B,punggol field,110.0,improved,uncategorized,2003,1.399007,103.906991,0.0,punggol field,punggol,north-east region,402300.0,0.118453389,10,12,11.0,5
2,2020-09,sengkang,404A,fernvale lane,112.0,premium apartment,uncategorized,2004,1.388348,103.873815,0.0,fernvale,sengkang,north-east region,351000.0,0.479542021,1,3,2.0,5
3,2000-10,clementi,375,clementi avenue 4,67.0,new generation,uncategorized,1980,1.318493,103.766702,0.0,clementi north,clementi,west region,151200.0,0.421345802,7,9,8.0,3
4,2013-01,bukit batok,163,bukit batok street 11,73.0,model a,uncategorized,1985,1.348149,103.742658,0.0,bukit batok west,bukit batok,west region,318600.0,0.775005151,7,9,8.0,3


## Convert "region" to one-hot encoding

In [425]:
train = pd.get_dummies(train, columns=['region'], prefix=['region'])
train

Unnamed: 0,month,town,block,street_name,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,longitude,...,dist_mrt,lower,upper,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region
0,2001-08,pasir ris,440,pasir ris drive 4,118.0,model a,uncategorized,1989,1.369008,103.958697,...,1.137651115,01,03,2.0,4,0,1,0,0,0
1,2014-10,punggol,196B,punggol field,110.0,improved,uncategorized,2003,1.399007,103.906991,...,0.118453389,10,12,11.0,5,0,0,0,1,0
2,2020-09,sengkang,404A,fernvale lane,112.0,premium apartment,uncategorized,2004,1.388348,103.873815,...,0.479542021,01,03,2.0,5,0,0,0,1,0
3,2000-10,clementi,375,clementi avenue 4,67.0,new generation,uncategorized,1980,1.318493,103.766702,...,0.421345802,07,09,8.0,3,0,0,0,0,1
4,2013-01,bukit batok,163,bukit batok street 11,73.0,model a,uncategorized,1985,1.348149,103.742658,...,0.775005151,07,09,8.0,3,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
431727,2005-03,woodlands,537,Woodlands Drive 16,101.0,model a,uncategorized,2000,1.429658,103.792583,...,0.300402068,01,03,2.0,4,0,0,1,0,0
431728,2016-04,sengkang,410A,fernvale road,95.0,premium apartment,uncategorized,2012,1.390053,103.875941,...,0.206723436,13,15,14.0,4,0,0,0,1,0
431729,2011-01,tampines,829,tampines street 81,67.0,new generation,uncategorized,1986,1.349224,103.934913,...,0.567510655,01,03,2.0,3,0,1,0,0,0
431730,2013-05,sengkang,233,compassvale walk,123.0,improved,uncategorized,1999,1.389941,103.900721,...,0.503935104,16,18,17.0,5,0,0,0,1,0


## Feature engineering to create "remaining_lease" feature from "lease_commencement_date"

In [426]:
lease_commence_date = train['lease_commence_date'].to_numpy()
#assuming 99-year lease, we calculate the remaining years on the lease
remaining_lease = lease_commence_date + 99 - pd.DatetimeIndex(train['month']).year.to_numpy()
train['remaining_lease'] = remaining_lease
train = train.drop(columns=['lease_commence_date'])
train.head()

Unnamed: 0,month,town,block,street_name,floor_area_sqm,flat_model,eco_category,latitude,longitude,elevation,...,lower,upper,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease
0,2001-08,pasir ris,440,pasir ris drive 4,118.0,model a,uncategorized,1.369008,103.958697,0.0,...,1,3,2.0,4,0,1,0,0,0,87
1,2014-10,punggol,196B,punggol field,110.0,improved,uncategorized,1.399007,103.906991,0.0,...,10,12,11.0,5,0,0,0,1,0,88
2,2020-09,sengkang,404A,fernvale lane,112.0,premium apartment,uncategorized,1.388348,103.873815,0.0,...,1,3,2.0,5,0,0,0,1,0,83
3,2000-10,clementi,375,clementi avenue 4,67.0,new generation,uncategorized,1.318493,103.766702,0.0,...,7,9,8.0,3,0,0,0,0,1,79
4,2013-01,bukit batok,163,bukit batok street 11,73.0,model a,uncategorized,1.348149,103.742658,0.0,...,7,9,8.0,3,0,0,0,0,1,71


## Convert "flat_model" to Ordinal Data Type

In [427]:
flat_models = train.groupby('flat_model')
keys = flat_models.groups.keys()

flat_type_dict = {}

# store the average "resale_price" of each "flat_model" in a dictionary with
#    key : "flat_model"
#    value : average "resale_price"
for key in keys:
    df_i = flat_models.get_group(key)
    flat_type_dict.update({key : df_i["resale_price"].mean()})

# ordering the different "flat_model" by average "resale_price" (ascending order)
{k: v for k, v in sorted(flat_type_dict.items(), key=lambda item: item[1])}
# print(flat_type_dict)
# print(flat_type_dict.keys())

# save the ordered list of keys from dictionary as a list for easier index finding
flat_model_list = list(flat_type_dict.keys())

for flat_model in flat_model_list:
    ordinal_value = flat_model_list.index(flat_model) + 1
    train['flat_model'] = train['flat_model'].str.replace(rf'^{flat_model}$', str(ordinal_value), regex=True)
    
train.head()

Unnamed: 0,month,town,block,street_name,floor_area_sqm,flat_model,eco_category,latitude,longitude,elevation,...,lower,upper,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease
0,2001-08,pasir ris,440,pasir ris drive 4,118.0,8,uncategorized,1.369008,103.958697,0.0,...,1,3,2.0,4,0,1,0,0,0,87
1,2014-10,punggol,196B,punggol field,110.0,5,uncategorized,1.399007,103.906991,0.0,...,10,12,11.0,5,0,0,0,1,0,88
2,2020-09,sengkang,404A,fernvale lane,112.0,13,uncategorized,1.388348,103.873815,0.0,...,1,3,2.0,5,0,0,0,1,0,83
3,2000-10,clementi,375,clementi avenue 4,67.0,12,uncategorized,1.318493,103.766702,0.0,...,7,9,8.0,3,0,0,0,0,1,79
4,2013-01,bukit batok,163,bukit batok street 11,73.0,8,uncategorized,1.348149,103.742658,0.0,...,7,9,8.0,3,0,0,0,0,1,71


## Drop Columns "block", "street_name", and "eco_category"

In [428]:
train = train.drop(columns=['block', 'street_name', 'eco_category'])
train.head()

Unnamed: 0,month,town,floor_area_sqm,flat_model,latitude,longitude,elevation,subzone,planning_area,resale_price,...,lower,upper,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease
0,2001-08,pasir ris,118.0,8,1.369008,103.958697,0.0,pasir ris drive,pasir ris,209700.0,...,1,3,2.0,4,0,1,0,0,0,87
1,2014-10,punggol,110.0,5,1.399007,103.906991,0.0,punggol field,punggol,402300.0,...,10,12,11.0,5,0,0,0,1,0,88
2,2020-09,sengkang,112.0,13,1.388348,103.873815,0.0,fernvale,sengkang,351000.0,...,1,3,2.0,5,0,0,0,1,0,83
3,2000-10,clementi,67.0,12,1.318493,103.766702,0.0,clementi north,clementi,151200.0,...,7,9,8.0,3,0,0,0,0,1,79
4,2013-01,bukit batok,73.0,8,1.348149,103.742658,0.0,bukit batok west,bukit batok,318600.0,...,7,9,8.0,3,0,0,0,0,1,71


In [429]:
train = train.drop(columns=['elevation','town','subzone','planning_area'])
train.head()

Unnamed: 0,month,floor_area_sqm,flat_model,latitude,longitude,resale_price,dist_mrt,lower,upper,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease
0,2001-08,118.0,8,1.369008,103.958697,209700.0,1.137651115,1,3,2.0,4,0,1,0,0,0,87
1,2014-10,110.0,5,1.399007,103.906991,402300.0,0.118453389,10,12,11.0,5,0,0,0,1,0,88
2,2020-09,112.0,13,1.388348,103.873815,351000.0,0.479542021,1,3,2.0,5,0,0,0,1,0,83
3,2000-10,67.0,12,1.318493,103.766702,151200.0,0.421345802,7,9,8.0,3,0,0,0,0,1,79
4,2013-01,73.0,8,1.348149,103.742658,318600.0,0.775005151,7,9,8.0,3,0,0,0,0,1,71


## Convert "month" to datetime

In [430]:
train['datetime_month'] = pd.to_datetime(train['month'])
train['datetime_month']= train['datetime_month'].map(dt.datetime.toordinal)

In [431]:
train_with_data = train.copy()
train_with_data.head()

Unnamed: 0,month,floor_area_sqm,flat_model,latitude,longitude,resale_price,dist_mrt,lower,upper,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease,datetime_month
0,2001-08,118.0,8,1.369008,103.958697,209700.0,1.137651115,1,3,2.0,4,0,1,0,0,0,87,730698
1,2014-10,110.0,5,1.399007,103.906991,402300.0,0.118453389,10,12,11.0,5,0,0,0,1,0,88,735507
2,2020-09,112.0,13,1.388348,103.873815,351000.0,0.479542021,1,3,2.0,5,0,0,0,1,0,83,737669
3,2000-10,67.0,12,1.318493,103.766702,151200.0,0.421345802,7,9,8.0,3,0,0,0,0,1,79,730394
4,2013-01,73.0,8,1.348149,103.742658,318600.0,0.775005151,7,9,8.0,3,0,0,0,0,1,71,734869


## Pre-processing on Test Dataset

## Feature Engineering to Create "dist_mrt" from Auxiliary Data

In [432]:
test.dropna()
test = test.reset_index(drop=True)
test

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region
0,2004-01,bukit batok,4 room,186,bukit batok west avenue 6,04 to 06,94.0,new generation,uncategorized,1989,1.346581,103.744085,0.0,bukit batok west,bukit batok,west region
1,2001-11,tampines,5 room,366,tampines street 34,04 to 06,122.0,improved,uncategorized,1997,1.357618,103.961379,0.0,tampines east,tampines,east region
2,2002-07,jurong east,3 room,206,jurong east street 21,01 to 03,67.0,new generation,uncategorized,1982,1.337804,103.741998,0.0,toh guan,jurong east,west region
3,2015-04,ang mo kio,3 room,180,Ang Mo Kio Avenue 5,04 to 06,82.0,new generation,uncategorized,1981,1.380084,103.849574,0.0,yio chu kang east,ang mo kio,north-east region
4,2004-04,clementi,5 room,356,clementi avenue 2,01 to 03,117.0,standard,uncategorized,1978,1.313960,103.769831,0.0,clementi north,clementi,west region
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107929,2008-04,hougang,5 room,981D,buangkok crescent,10 to 12,110.0,improved,uncategorized,2003,1.380452,103.879333,0.0,trafalgar,hougang,north-east region
107930,2006-01,kallang/whampoa,4 room,13,upper boon keng road,13 to 15,102.0,model a,uncategorized,1999,1.314481,103.870458,0.0,boon keng,kallang,central region
107931,2000-01,kallang/whampoa,3 room,1,beach road,07 to 09,68.0,improved,uncategorized,1979,1.294924,103.854315,0.0,city hall,downtown core,central region
107932,2009-07,jurong west,4 room,919,jurong west street 91,10 to 12,104.0,model a,uncategorized,1988,1.339927,103.687354,0.0,yunnan,jurong west,west region


In [433]:
with open('./auxiliary-data/distance-to-mrt-test-edited.csv', newline='') as f:
    reader = csv.reader(f)
    mrt_list = list(reader)

In [434]:
test['dist_mrt'] = np.array(mrt_list)
test

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,dist_mrt
0,2004-01,bukit batok,4 room,186,bukit batok west avenue 6,04 to 06,94.0,new generation,uncategorized,1989,1.346581,103.744085,0.0,bukit batok west,bukit batok,west region,0.667553928
1,2001-11,tampines,5 room,366,tampines street 34,04 to 06,122.0,improved,uncategorized,1997,1.357618,103.961379,0.0,tampines east,tampines,east region,0.767025394
2,2002-07,jurong east,3 room,206,jurong east street 21,01 to 03,67.0,new generation,uncategorized,1982,1.337804,103.741998,0.0,toh guan,jurong east,west region,0.515380197
3,2015-04,ang mo kio,3 room,180,Ang Mo Kio Avenue 5,04 to 06,82.0,new generation,uncategorized,1981,1.380084,103.849574,0.0,yio chu kang east,ang mo kio,north-east region,0.547114337
4,2004-04,clementi,5 room,356,clementi avenue 2,01 to 03,117.0,standard,uncategorized,1978,1.313960,103.769831,0.0,clementi north,clementi,west region,0.516298405
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107929,2008-04,hougang,5 room,981D,buangkok crescent,10 to 12,110.0,improved,uncategorized,2003,1.380452,103.879333,0.0,trafalgar,hougang,north-east region,1.221623645
107930,2006-01,kallang/whampoa,4 room,13,upper boon keng road,13 to 15,102.0,model a,uncategorized,1999,1.314481,103.870458,0.0,boon keng,kallang,central region,0.346681968
107931,2000-01,kallang/whampoa,3 room,1,beach road,07 to 09,68.0,improved,uncategorized,1979,1.294924,103.854315,0.0,city hall,downtown core,central region,0.221180677
107932,2009-07,jurong west,4 room,919,jurong west street 91,10 to 12,104.0,model a,uncategorized,1988,1.339927,103.687354,0.0,yunnan,jurong west,west region,1.138997242


## Convert "storey_range" into Numerical Data Type "storey_average" 

In [435]:
test['lower'] = test['storey_range'].str.extract('(\d+)')
test['upper'] = test['storey_range'].str.extract('(\d+$)')
# calculate average storey from the range
test['storey_average'] = test['upper'].astype(int) + test['lower'].astype(int)
test['storey_average'] = test['storey_average'] / 2.0
test = test.drop(columns=['storey_range'])
test

Unnamed: 0,month,town,flat_type,block,street_name,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,dist_mrt,lower,upper,storey_average
0,2004-01,bukit batok,4 room,186,bukit batok west avenue 6,94.0,new generation,uncategorized,1989,1.346581,103.744085,0.0,bukit batok west,bukit batok,west region,0.667553928,04,06,5.0
1,2001-11,tampines,5 room,366,tampines street 34,122.0,improved,uncategorized,1997,1.357618,103.961379,0.0,tampines east,tampines,east region,0.767025394,04,06,5.0
2,2002-07,jurong east,3 room,206,jurong east street 21,67.0,new generation,uncategorized,1982,1.337804,103.741998,0.0,toh guan,jurong east,west region,0.515380197,01,03,2.0
3,2015-04,ang mo kio,3 room,180,Ang Mo Kio Avenue 5,82.0,new generation,uncategorized,1981,1.380084,103.849574,0.0,yio chu kang east,ang mo kio,north-east region,0.547114337,04,06,5.0
4,2004-04,clementi,5 room,356,clementi avenue 2,117.0,standard,uncategorized,1978,1.313960,103.769831,0.0,clementi north,clementi,west region,0.516298405,01,03,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107929,2008-04,hougang,5 room,981D,buangkok crescent,110.0,improved,uncategorized,2003,1.380452,103.879333,0.0,trafalgar,hougang,north-east region,1.221623645,10,12,11.0
107930,2006-01,kallang/whampoa,4 room,13,upper boon keng road,102.0,model a,uncategorized,1999,1.314481,103.870458,0.0,boon keng,kallang,central region,0.346681968,13,15,14.0
107931,2000-01,kallang/whampoa,3 room,1,beach road,68.0,improved,uncategorized,1979,1.294924,103.854315,0.0,city hall,downtown core,central region,0.221180677,07,09,8.0
107932,2009-07,jurong west,4 room,919,jurong west street 91,104.0,model a,uncategorized,1988,1.339927,103.687354,0.0,yunnan,jurong west,west region,1.138997242,10,12,11.0


## Convert "flat_type" to Ordinal Data Type "flattype_mapping"

In [436]:
test = test.replace(to_replace ="5 room", value ="5-room")
test = test.replace(to_replace ="4 room", value ="4-room")
test = test.replace(to_replace ="3 room", value ="3-room")
test = test.replace(to_replace ="2 room", value ="2-room")
test = test.replace(to_replace ="1 room", value ="1-room")
flattype_mapping={"1-room": 1,
                   "2-room": 2,
                   "3-room": 3,
                   "4-room": 4,
                   "5-room": 5,
                   "multi generation": 6,
                   "executive": 7,}
test["flattype_mapping"] = test['flat_type'].map(flattype_mapping)
test = test.drop(columns=['flat_type'])
test

Unnamed: 0,month,town,block,street_name,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,dist_mrt,lower,upper,storey_average,flattype_mapping
0,2004-01,bukit batok,186,bukit batok west avenue 6,94.0,new generation,uncategorized,1989,1.346581,103.744085,0.0,bukit batok west,bukit batok,west region,0.667553928,04,06,5.0,4
1,2001-11,tampines,366,tampines street 34,122.0,improved,uncategorized,1997,1.357618,103.961379,0.0,tampines east,tampines,east region,0.767025394,04,06,5.0,5
2,2002-07,jurong east,206,jurong east street 21,67.0,new generation,uncategorized,1982,1.337804,103.741998,0.0,toh guan,jurong east,west region,0.515380197,01,03,2.0,3
3,2015-04,ang mo kio,180,Ang Mo Kio Avenue 5,82.0,new generation,uncategorized,1981,1.380084,103.849574,0.0,yio chu kang east,ang mo kio,north-east region,0.547114337,04,06,5.0,3
4,2004-04,clementi,356,clementi avenue 2,117.0,standard,uncategorized,1978,1.313960,103.769831,0.0,clementi north,clementi,west region,0.516298405,01,03,2.0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107929,2008-04,hougang,981D,buangkok crescent,110.0,improved,uncategorized,2003,1.380452,103.879333,0.0,trafalgar,hougang,north-east region,1.221623645,10,12,11.0,5
107930,2006-01,kallang/whampoa,13,upper boon keng road,102.0,model a,uncategorized,1999,1.314481,103.870458,0.0,boon keng,kallang,central region,0.346681968,13,15,14.0,4
107931,2000-01,kallang/whampoa,1,beach road,68.0,improved,uncategorized,1979,1.294924,103.854315,0.0,city hall,downtown core,central region,0.221180677,07,09,8.0,3
107932,2009-07,jurong west,919,jurong west street 91,104.0,model a,uncategorized,1988,1.339927,103.687354,0.0,yunnan,jurong west,west region,1.138997242,10,12,11.0,4


## Convert "region" to one hot encoding

In [437]:
test = pd.get_dummies(test, columns=['region'], prefix=['region'])
test

Unnamed: 0,month,town,block,street_name,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,longitude,...,dist_mrt,lower,upper,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region
0,2004-01,bukit batok,186,bukit batok west avenue 6,94.0,new generation,uncategorized,1989,1.346581,103.744085,...,0.667553928,04,06,5.0,4,0,0,0,0,1
1,2001-11,tampines,366,tampines street 34,122.0,improved,uncategorized,1997,1.357618,103.961379,...,0.767025394,04,06,5.0,5,0,1,0,0,0
2,2002-07,jurong east,206,jurong east street 21,67.0,new generation,uncategorized,1982,1.337804,103.741998,...,0.515380197,01,03,2.0,3,0,0,0,0,1
3,2015-04,ang mo kio,180,Ang Mo Kio Avenue 5,82.0,new generation,uncategorized,1981,1.380084,103.849574,...,0.547114337,04,06,5.0,3,0,0,0,1,0
4,2004-04,clementi,356,clementi avenue 2,117.0,standard,uncategorized,1978,1.313960,103.769831,...,0.516298405,01,03,2.0,5,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107929,2008-04,hougang,981D,buangkok crescent,110.0,improved,uncategorized,2003,1.380452,103.879333,...,1.221623645,10,12,11.0,5,0,0,0,1,0
107930,2006-01,kallang/whampoa,13,upper boon keng road,102.0,model a,uncategorized,1999,1.314481,103.870458,...,0.346681968,13,15,14.0,4,1,0,0,0,0
107931,2000-01,kallang/whampoa,1,beach road,68.0,improved,uncategorized,1979,1.294924,103.854315,...,0.221180677,07,09,8.0,3,1,0,0,0,0
107932,2009-07,jurong west,919,jurong west street 91,104.0,model a,uncategorized,1988,1.339927,103.687354,...,1.138997242,10,12,11.0,4,0,0,0,0,1


## Feature engineering to create "remaining_lease" feature from "lease_commencement_date"

In [438]:
lease_commence_date = test['lease_commence_date'].to_numpy()
#assuming 99-year lease, we calculate the remaining years on the lease
remaining_lease = lease_commence_date + 99 - pd.DatetimeIndex(test['month']).year.to_numpy()
test['remaining_lease'] = remaining_lease
test = test.drop(columns=['lease_commence_date'])
test.head()

Unnamed: 0,month,town,block,street_name,floor_area_sqm,flat_model,eco_category,latitude,longitude,elevation,...,lower,upper,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease
0,2004-01,bukit batok,186,bukit batok west avenue 6,94.0,new generation,uncategorized,1.346581,103.744085,0.0,...,4,6,5.0,4,0,0,0,0,1,84
1,2001-11,tampines,366,tampines street 34,122.0,improved,uncategorized,1.357618,103.961379,0.0,...,4,6,5.0,5,0,1,0,0,0,95
2,2002-07,jurong east,206,jurong east street 21,67.0,new generation,uncategorized,1.337804,103.741998,0.0,...,1,3,2.0,3,0,0,0,0,1,79
3,2015-04,ang mo kio,180,Ang Mo Kio Avenue 5,82.0,new generation,uncategorized,1.380084,103.849574,0.0,...,4,6,5.0,3,0,0,0,1,0,65
4,2004-04,clementi,356,clementi avenue 2,117.0,standard,uncategorized,1.31396,103.769831,0.0,...,1,3,2.0,5,0,0,0,0,1,73


## Convert "flat_model" to Ordinal Data Type

In [439]:
# uses the same "flat_model_list" extracted from the "flat_model" dictionary calculated from train dataset
for flat_model in flat_model_list:
    ordinal_value = flat_model_list.index(flat_model) + 1
    test['flat_model'] = test['flat_model'].str.replace(rf'^{flat_model}$', str(ordinal_value), regex=True)

test.head()

Unnamed: 0,month,town,block,street_name,floor_area_sqm,flat_model,eco_category,latitude,longitude,elevation,...,lower,upper,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease
0,2004-01,bukit batok,186,bukit batok west avenue 6,94.0,12,uncategorized,1.346581,103.744085,0.0,...,4,6,5.0,4,0,0,0,0,1,84
1,2001-11,tampines,366,tampines street 34,122.0,5,uncategorized,1.357618,103.961379,0.0,...,4,6,5.0,5,0,1,0,0,0,95
2,2002-07,jurong east,206,jurong east street 21,67.0,12,uncategorized,1.337804,103.741998,0.0,...,1,3,2.0,3,0,0,0,0,1,79
3,2015-04,ang mo kio,180,Ang Mo Kio Avenue 5,82.0,12,uncategorized,1.380084,103.849574,0.0,...,4,6,5.0,3,0,0,0,1,0,65
4,2004-04,clementi,356,clementi avenue 2,117.0,17,uncategorized,1.31396,103.769831,0.0,...,1,3,2.0,5,0,0,0,0,1,73


## Drop Columns "block", "street_name", and "eco_category"

In [440]:
test = test.drop(columns=['block', 'street_name', 'eco_category'])
test.head()

Unnamed: 0,month,town,floor_area_sqm,flat_model,latitude,longitude,elevation,subzone,planning_area,dist_mrt,lower,upper,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease
0,2004-01,bukit batok,94.0,12,1.346581,103.744085,0.0,bukit batok west,bukit batok,0.667553928,4,6,5.0,4,0,0,0,0,1,84
1,2001-11,tampines,122.0,5,1.357618,103.961379,0.0,tampines east,tampines,0.767025394,4,6,5.0,5,0,1,0,0,0,95
2,2002-07,jurong east,67.0,12,1.337804,103.741998,0.0,toh guan,jurong east,0.515380197,1,3,2.0,3,0,0,0,0,1,79
3,2015-04,ang mo kio,82.0,12,1.380084,103.849574,0.0,yio chu kang east,ang mo kio,0.547114337,4,6,5.0,3,0,0,0,1,0,65
4,2004-04,clementi,117.0,17,1.31396,103.769831,0.0,clementi north,clementi,0.516298405,1,3,2.0,5,0,0,0,0,1,73


In [441]:
test = test.drop(columns=['elevation','town','subzone','planning_area'])
test.head()

Unnamed: 0,month,floor_area_sqm,flat_model,latitude,longitude,dist_mrt,lower,upper,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease
0,2004-01,94.0,12,1.346581,103.744085,0.667553928,4,6,5.0,4,0,0,0,0,1,84
1,2001-11,122.0,5,1.357618,103.961379,0.767025394,4,6,5.0,5,0,1,0,0,0,95
2,2002-07,67.0,12,1.337804,103.741998,0.515380197,1,3,2.0,3,0,0,0,0,1,79
3,2015-04,82.0,12,1.380084,103.849574,0.547114337,4,6,5.0,3,0,0,0,1,0,65
4,2004-04,117.0,17,1.31396,103.769831,0.516298405,1,3,2.0,5,0,0,0,0,1,73


## Convert "month" to datetime

In [442]:
test['datetime_month'] = pd.to_datetime(test['month'])
test['datetime_month']= test['datetime_month'].map(dt.datetime.toordinal)

In [443]:
test

Unnamed: 0,month,floor_area_sqm,flat_model,latitude,longitude,dist_mrt,lower,upper,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease,datetime_month
0,2004-01,94.0,12,1.346581,103.744085,0.667553928,04,06,5.0,4,0,0,0,0,1,84,731581
1,2001-11,122.0,5,1.357618,103.961379,0.767025394,04,06,5.0,5,0,1,0,0,0,95,730790
2,2002-07,67.0,12,1.337804,103.741998,0.515380197,01,03,2.0,3,0,0,0,0,1,79,731032
3,2015-04,82.0,12,1.380084,103.849574,0.547114337,04,06,5.0,3,0,0,0,1,0,65,735689
4,2004-04,117.0,17,1.313960,103.769831,0.516298405,01,03,2.0,5,0,0,0,0,1,73,731672
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107929,2008-04,110.0,5,1.380452,103.879333,1.221623645,10,12,11.0,5,0,0,0,1,0,94,733133
107930,2006-01,102.0,8,1.314481,103.870458,0.346681968,13,15,14.0,4,1,0,0,0,0,92,732312
107931,2000-01,68.0,5,1.294924,103.854315,0.221180677,07,09,8.0,3,1,0,0,0,0,78,730120
107932,2009-07,104.0,8,1.339927,103.687354,1.138997242,10,12,11.0,4,0,0,0,0,1,78,733589


# Linear Regression implementation for full dataset

In [444]:
X = train.drop (["resale_price", "month"],axis = 1)
y = train["resale_price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [445]:
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression()

In [446]:
print(regressor.intercept_)
print(regressor.coef_)
print('Training score: {}'.format(regressor.score(X_train, y_train)))
print('Test score: {}'.format(regressor.score(X_test, y_test)))

-29945027.554563683
[ 2.41134868e+03  9.94147956e+02 -9.82572853e+05  4.10440727e+04
 -1.58815763e+04 -3.62493235e+04  3.84796687e+04  1.11517261e+03
  1.63507887e+04  2.38692243e+04 -1.31577842e+04  2.02620750e+04
  3.50291243e+03 -3.44764276e+04  2.23769216e+03  3.64668263e+01]
Training score: 0.7915924665460026
Test score: 0.7891979333601642


In [447]:
y_pred = regressor.predict(X_test)
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(df)
print('RMSE for Linear Regression => ',np.sqrt(mean_squared_error(y_test,y_pred)))

          Actual      Predicted
76998   319950.0  331083.083027
397501  315000.0  339260.343548
347486  396000.0  460663.919879
242624  243000.0  319792.689107
64696   194400.0  259165.994044
...          ...            ...
400889  189000.0  180928.703309
287118  292500.0  362791.924591
391252  238500.0  284130.059303
401219  159559.2  150656.625197
201018  220500.0  228062.297442

[86347 rows x 2 columns]
RMSE for Linear Regression =>  59489.00883760603


# Polynomial regression for full dataset

## Implement polynomial predictor

In [448]:
poly_reg = PolynomialFeatures(degree = 3)
X_poly = poly_reg.fit_transform(X)
print(len(X_poly))
print(len(y))
lin_reg = LinearRegression()
lin_reg.fit(X_poly,y)

431732
431732


LinearRegression()

In [449]:
print('Training score: {}'.format(lin_reg.score(X_poly, y)))
# print('Training score: {}'.format(lin_reg.score(X_test, y_test)))

Training score: 0.8687702493355828


In [450]:
y_pred = lin_reg.predict(X_poly)
df = pd.DataFrame({'Real Values':y, 'Predicted Values':y_pred})
df

Unnamed: 0,Real Values,Predicted Values
0,209700.0,214691.397038
1,402300.0,396429.510319
2,351000.0,500642.959538
3,151200.0,126869.674381
4,318600.0,252936.869694
...,...,...
431727,238500.0,217542.434147
431728,376200.0,459234.783756
431729,255600.0,239938.055241
431730,508500.0,415864.221256


## Implement pipeline predictor

In [451]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
steps = [
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=3)),
    ('model', LinearRegression())
]
pipeline = Pipeline(steps)
pipeline.fit(X_train, y_train)
print('Training score: {}'.format(pipeline.score(X_train, y_train)))
print('Test score: {}'.format(pipeline.score(X_test, y_test)))

Training score: 0.9321259466567403
Test score: 0.9320959686445839


In [452]:
y_pred_pipeline = pipeline.predict(X_test)
df = pd.DataFrame({'Real Values':y_test, 'Predicted Values':y_pred_pipeline})
print(df)
mse = mean_squared_error(y_test, y_pred_pipeline)
rmse = math.sqrt(mse)
print('RMSE: {}'.format(rmse))

        Real Values  Predicted Values
76998      319950.0          304468.0
397501     315000.0          355534.0
347486     396000.0          394536.0
242624     243000.0          280289.0
64696      194400.0          215075.0
...             ...               ...
400889     189000.0          203515.0
287118     292500.0          346456.0
391252     238500.0          262195.0
401219     159559.2          152140.0
201018     220500.0          204774.0

[86347 rows x 2 columns]
RMSE: 33763.470453538845


## Tuning number of features

In [501]:
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

for p in range(1,5):
    print(p)
    poly = PolynomialFeatures(p)
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.fit_transform(X_test)

    lin_reg = LinearRegression().fit(X_train_poly, y_train)

    y_train_pred = lin_reg.predict(X_train_poly)
    y_test_pred = lin_reg.predict(X_test_poly)
    mse = mean_squared_error(y_test, y_test_pred)
    rmse = math.sqrt(mse)
    print('RMSE: {}'.format(rmse))

1
RMSE: 59488.525022365335
2
RMSE: 50185.54311819719
3
RMSE: 33763.470453538845
4


MemoryError: Unable to allocate 12.5 GiB for an array with shape (345385, 4845) and data type float64

# Compare other regression models offered by scikit-learn

In [498]:
from sklearn import linear_model
from sklearn import svm

scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

models = [
    linear_model.SGDRegressor(),
    linear_model.BayesianRidge(),
    linear_model.LassoLars(),
    linear_model.ARDRegression(),
    linear_model.PassiveAggressiveRegressor(),
    linear_model.TheilSenRegressor(),
    linear_model.LinearRegression()]


for item in models:
    print(item)
    model = item
    model.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = math.sqrt(mse)
    print('RMSE: {}'.format(rmse))

SGDRegressor()
RMSE: 59105.260520049065
BayesianRidge()
RMSE: 59012.74630527616
LassoLars()
RMSE: 70899.45039972338
ARDRegression()




RMSE: 59012.82754357305
PassiveAggressiveRegressor()
RMSE: 60430.22332118529
TheilSenRegressor(max_subpopulation=10000)
RMSE: 61061.90972446903
LinearRegression()
RMSE: 59010.264842596014


# Split for 3 periods 

In [477]:
train.head()

Unnamed: 0,month,floor_area_sqm,flat_model,latitude,longitude,resale_price,dist_mrt,lower,upper,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease,datetime_month
0,2001-08,118.0,8,1.369008,103.958697,209700.0,1.137651115,1,3,2.0,4,0,1,0,0,0,87,730698
1,2014-10,110.0,5,1.399007,103.906991,402300.0,0.118453389,10,12,11.0,5,0,0,0,1,0,88,735507
2,2020-09,112.0,13,1.388348,103.873815,351000.0,0.479542021,1,3,2.0,5,0,0,0,1,0,83,737669
3,2000-10,67.0,12,1.318493,103.766702,151200.0,0.421345802,7,9,8.0,3,0,0,0,0,1,79,730394
4,2013-01,73.0,8,1.348149,103.742658,318600.0,0.775005151,7,9,8.0,3,0,0,0,0,1,71,734869


In [478]:
test.head()

Unnamed: 0,month,floor_area_sqm,flat_model,latitude,longitude,dist_mrt,lower,upper,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease,datetime_month
0,2004-01,94.0,12,1.346581,103.744085,0.667553928,4,6,5.0,4,0,0,0,0,1,84,731581
1,2001-11,122.0,5,1.357618,103.961379,0.767025394,4,6,5.0,5,0,1,0,0,0,95,730790
2,2002-07,67.0,12,1.337804,103.741998,0.515380197,1,3,2.0,3,0,0,0,0,1,79,731032
3,2015-04,82.0,12,1.380084,103.849574,0.547114337,4,6,5.0,3,0,0,0,1,0,65,735689
4,2004-04,117.0,17,1.31396,103.769831,0.516298405,1,3,2.0,5,0,0,0,0,1,73,731672


## Splitting data 

In [479]:
train_first = train[train['month']< '2007-01']
train_second = train[train['month'] > '2007-01']
train_second = train_second[train_second['month']< '2013-01']
train_third = train[train['month']>'2013-01']

## train first dataset

In [480]:
X = train_first.drop (["resale_price","month"],axis = 1)
y = train_first["resale_price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [481]:
regressor1 = LinearRegression()
regressor1.fit(X_train, y_train)
print('Training score: {}'.format(regressor1.score(X_train, y_train)))
print('Test score: {}'.format(regressor1.score(X_test, y_test)))
y_pred = regressor1.predict(X_test)
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(df)
print('RMSE for Linear Regression=>',np.sqrt(mean_squared_error(y_test,y_pred)))

Training score: 0.8579610683070618
Test score: 0.8564495969892346
          Actual      Predicted
330255  126000.0  151087.211957
37994   342000.0  380521.461957
217287  154800.0  130165.711957
110125  243000.0  263894.461957
96699   158400.0  142664.461957
...          ...            ...
103429  254700.0  267242.461957
321658  231300.0  237640.711957
195065  283500.0  242458.461957
146770  162000.0  129083.461957
354028  134100.0  164830.461957

[35407 rows x 2 columns]
RMSE for Linear Regression=> 30454.095789086245


In [482]:
steps = [
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=3)),
    ('model', LinearRegression())
]

pipeline1 = Pipeline(steps)

pipeline1.fit(X_train, y_train)

print('Training score: {}'.format(pipeline1.score(X_train, y_train)))
print('Test score: {}'.format(pipeline1.score(X_test, y_test)))

Training score: 0.9550618428126569
Test score: 0.9542898288837451


In [483]:
y_pred_pipeline = pipeline1.predict(X_test)
df = pd.DataFrame({'Real Values':y_test, 'Predicted Values':y_pred_pipeline})
print(df)
mse = mean_squared_error(y_test, y_pred_pipeline)
rmse = math.sqrt(mse)
print('RMSE: {}'.format(rmse))

        Real Values  Predicted Values
330255     126000.0     124993.796875
37994      342000.0     365080.796875
217287     154800.0     157265.796875
110125     243000.0     261651.796875
96699      158400.0     152669.171875
...             ...               ...
103429     254700.0     257690.296875
321658     231300.0     247639.546875
195065     283500.0     248122.140625
146770     162000.0     173605.796875
354028     134100.0     140092.796875

[35407 rows x 2 columns]
RMSE: 17185.0153702381


## train second dataset

In [484]:
X = train_second.drop (["resale_price", "month"],axis = 1)
y = train_second["resale_price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [485]:
regressor2 = LinearRegression()
regressor2.fit(X_train, y_train)
print('Training score: {}'.format(regressor2.score(X_train, y_train)))
print('Test score: {}'.format(regressor2.score(X_test, y_test)))
y_pred = regressor2.predict(X_test)
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(df)
print('RMSE for Linear Regression=>',np.sqrt(mean_squared_error(y_test,y_pred)))

Training score: 0.8630487060809752
Test score: 0.8619319359713424
          Actual      Predicted
339243  310500.0  323214.891176
368870  337500.0  295550.891176
40330   237600.0  213102.891176
39786   288000.0  269934.891176
402363  358200.0  366878.891176
...          ...            ...
136718  310500.0  354926.891176
243813  232200.0  218974.891176
104288  181800.0  133166.891176
143191  630000.0  560238.891176
17301   279900.0  306814.891176

[25929 rows x 2 columns]
RMSE for Linear Regression=> 40121.52627563762


In [486]:
steps = [
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=3)),
    ('model', LinearRegression())
]

pipeline2 = Pipeline(steps)

pipeline2.fit(X_train, y_train)

print('Training score: {}'.format(pipeline2.score(X_train, y_train)))
print('Test score: {}'.format(pipeline2.score(X_test, y_test)))

Training score: 0.9515574619453658
Test score: 0.9503212705479392


In [487]:
y_pred_pipeline = pipeline2.predict(X_test)
df = pd.DataFrame({'Real Values':y_test, 'Predicted Values':y_pred_pipeline})
print(df)
mse = mean_squared_error(y_test, y_pred_pipeline)
rmse = math.sqrt(mse)
print('RMSE: {}'.format(rmse))

        Real Values  Predicted Values
339243     310500.0       319673.9375
368870     337500.0       328081.4375
40330      237600.0       229336.4375
39786      288000.0       289243.9375
402363     358200.0       361460.9375
...             ...               ...
136718     310500.0       308806.9375
243813     232200.0       239932.4375
104288     181800.0       186071.9375
143191     630000.0       591196.9375
17301      279900.0       280705.9375

[25929 rows x 2 columns]
RMSE: 24066.673613530187


## train third dataset

In [488]:
X = train_third.drop (["resale_price", "month"],axis = 1)
y = train_third["resale_price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [489]:
regressor3 = LinearRegression()
regressor3.fit(X_train, y_train)
print('Training score: {}'.format(regressor3.score(X_train, y_train)))
print('Test score: {}'.format(regressor3.score(X_test, y_test)))
y_pred = regressor3.predict(X_test)
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(df)
print('RMSE for Linear Regression=>',np.sqrt(mean_squared_error(y_test,y_pred)))

Training score: 0.7954072341561418
Test score: 0.7918267619743851
          Actual      Predicted
392819  324000.0  357721.699529
44810   324000.0  374029.699529
300884  205200.0  222885.699529
267541  400500.0  330081.699529
331384  567000.0  595965.699529
...          ...            ...
32016   529200.0  620737.699529
195448  360000.0  315013.699529
357612  326700.0  311077.699529
416007  322200.0  307341.699529
408379  765000.0  594685.699529

[24390 rows x 2 columns]
RMSE for Linear Regression=> 59013.14834439843


In [490]:
steps = [
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=3)),
    ('model', LinearRegression())
]

pipeline3 = Pipeline(steps)

pipeline3.fit(X_train, y_train)

print('Training score: {}'.format(pipeline3.score(X_train, y_train)))
print('Test score: {}'.format(pipeline3.score(X_test, y_test)))

Training score: 0.9394704344736985
Test score: 0.9378730822396448


In [491]:
y_pred_pipeline = pipeline3.predict(X_test)
df = pd.DataFrame({'Real Values':y_test, 'Predicted Values':y_pred_pipeline})
print(df)
mse = mean_squared_error(y_test, y_pred_pipeline)
rmse = math.sqrt(mse)
print('RMSE: {}'.format(rmse))

        Real Values  Predicted Values
392819     324000.0     341607.146661
44810      324000.0     350482.684905
300884     205200.0     209263.653876
267541     400500.0     383087.091164
331384     567000.0     573832.551334
...             ...               ...
32016      529200.0     578500.440036
195448     360000.0     374342.089478
357612     326700.0     343852.983538
416007     322200.0     293006.663186
408379     765000.0     715161.742445

[24390 rows x 2 columns]
RMSE: 32238.60491856271


# Generate output

In [492]:
def rules(row):
    if row['month']< '2007-01':
        return pipeline1.predict([row.drop('month')])[0]
    elif row['month'] > '2007-01' and row['month']< '2013-01':
        return pipeline2.predict([row.drop('month')])[0]
    else:
        return pipeline3.predict([row.drop('month')])[0]

pred = test.apply(rules, axis=1)
df_out = pd.DataFrame({'Predicted':pred})
df_out.reset_index(inplace=True)
df_out = df_out.rename(columns = {'index':'Id'})
print(df_out)


            Id     Predicted
0            0 -3.037420e+25
1            1 -2.937077e+25
2            2 -3.032822e+25
3            3 -1.030027e+21
4            4 -3.038200e+25
...        ...           ...
107929  107929 -8.150356e+25
107930  107930 -2.925018e+25
107931  107931 -2.907477e+25
107932  107932 -8.804880e+25
107933  107933 -8.214382e+25

[107934 rows x 2 columns]


In [493]:
test.shape

(107934, 17)

In [494]:
df_out.to_csv('./submission.csv', index = False, header=True)

## Trying Classifier models

In [500]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression


X = train.drop (["resale_price", "month"],axis = 1)
y = train["resale_price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

models = [KNeighborsClassifier(),
          DecisionTreeClassifier(),
          RandomForestClassifier(),
          AdaBoostClassifier(),
          GradientBoostingClassifier(),
          LogisticRegression()]

for item in models:
    print(item)
    model = item
    # since classifiers dont accept float input
    model.fit(X_train, y_train.astype(int))
    y_pred = clf.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = math.sqrt(mse)
    print('RMSE: {}'.format(rmse))

KNeighborsClassifier()
RMSE: 1.4045529694596254e+17
DecisionTreeClassifier()
RMSE: 1.4045529694596254e+17
RandomForestClassifier()


MemoryError: could not allocate 23655874560 bytes