In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pprint

In [14]:
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

In [15]:
train['month'] = pd.to_datetime(train['month'])
train = train.drop(columns=['block', 'street_name'])

In [16]:

# first number in "storey_range"
train['lower'] = train['storey_range'].str.extract('(\d+)')
train['upper'] = train['storey_range'].str.extract('(\d+$)')
# calculate average storey from the range
train['storey_average'] = train['upper'].astype(int) + train['lower'].astype(int)
train['storey_average'] = train['storey_average'] / 2.0
train = train.drop(columns=['storey_range'])

In [17]:
train = train.replace(to_replace ="5 room", value ="5-room")
train = train.replace(to_replace ="4 room", value ="4-room")
train = train.replace(to_replace ="3 room", value ="3-room")
train = train.replace(to_replace ="2 room", value ="2-room")
train = train.replace(to_replace ="1 room", value ="1-room")
flattype_mapping={"1-room": 1,
                   "2-room": 2,
                   "3-room": 3,
                   "4-room": 4,
                   "5-room": 5,
                   "multi generation": 6,
                   "executive": 7,}
train["flattype_mapping"] = train['flat_type'].map(flattype_mapping)
train = train.drop(columns=['flat_type'])

In [18]:
train = train.drop(columns=['eco_category'])

In [19]:
one_hot_cols = pd.get_dummies(train, columns=['region'], prefix=['region'])
train = train.merge(one_hot_cols)
train = train.drop(columns=['region'])

In [20]:
lease_commence_date = train['lease_commence_date'].to_numpy()
#assuming 99-year lease, we calculate the remaining years on the lease
remaining_lease = lease_commence_date + 99 - pd.DatetimeIndex(train['month']).year.to_numpy()
train['remaining_lease'] = remaining_lease
train = train.drop(columns=['lease_commence_date'])

In [21]:
flat_models = train.groupby('flat_model')
keys = flat_models.groups.keys()

flat_type_dict = {}

# store the average "resale_price" of each "flat_model" in a dictionary with
#    key : "flat_model"
#    value : average "resale_price"
for key in keys:
    df_i = flat_models.get_group(key)
    flat_type_dict.update({key : df_i["resale_price"].mean()})

# ordering the different "flat_model" by average "resale_price" (ascending order)
{k: v for k, v in sorted(flat_type_dict.items(), key=lambda item: item[1])}
print(flat_type_dict)
print(flat_type_dict.keys())

# save the ordered list of keys from dictionary as a list for easier index finding
flat_model_list = list(flat_type_dict.keys())

for flat_model in flat_model_list:
    ordinal_value = flat_model_list.index(flat_model) + 1
    train['flat_model'] = train['flat_model'].str.replace(rf'^{flat_model}$', str(ordinal_value), regex=True)

{'2-room': 178910.52631578947, 'adjoined flat': 427049.8733009707, 'apartment': 447635.116593347, 'dbss': 673878.5657327594, 'improved': 316790.9768707737, 'improved maisonette': 455688.8470588235, 'maisonette': 470788.2769389391, 'model a': 296613.1202751046, 'model a maisonette': 457330.20206896536, 'model a2': 255978.02648366682, 'multi generation': 516351.54162162164, 'new generation': 228892.24870720712, 'premium apartment': 371717.8819421246, 'premium apartment loft': 776785.2923076922, 'premium maisonette': 545664.4656716419, 'simplified': 228073.0308484395, 'standard': 255991.59854674697, 'terrace': 547609.9727307692, 'type s1': 810214.2859090908, 'type s2': 915213.532075472}
dict_keys(['2-room', 'adjoined flat', 'apartment', 'dbss', 'improved', 'improved maisonette', 'maisonette', 'model a', 'model a maisonette', 'model a2', 'multi generation', 'new generation', 'premium apartment', 'premium apartment loft', 'premium maisonette', 'simplified', 'standard', 'terrace', 'type s1',

In [22]:
train = train.drop(columns=['elevation','town','subzone','planning_area'])


In [23]:
train.head()

Unnamed: 0,month,floor_area_sqm,flat_model,latitude,longitude,resale_price,lower,upper,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease
0,2001-08-01,118.0,8,1.369008,103.958697,209700.0,1,3,2.0,4,0,1,0,0,0,87
1,2014-10-01,110.0,5,1.399007,103.906991,402300.0,10,12,11.0,5,0,0,0,1,0,88
2,2020-09-01,112.0,13,1.388348,103.873815,351000.0,1,3,2.0,5,0,0,0,1,0,83
3,2000-10-01,67.0,12,1.318493,103.766702,151200.0,7,9,8.0,3,0,0,0,0,1,79
4,2013-01-01,73.0,8,1.348149,103.742658,318600.0,7,9,8.0,3,0,0,0,0,1,71


In [24]:
train.dropna()

Unnamed: 0,month,floor_area_sqm,flat_model,latitude,longitude,resale_price,lower,upper,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease
0,2001-08-01,118.0,8,1.369008,103.958697,209700.0,01,03,2.0,4,0,1,0,0,0,87
1,2014-10-01,110.0,5,1.399007,103.906991,402300.0,10,12,11.0,5,0,0,0,1,0,88
2,2020-09-01,112.0,13,1.388348,103.873815,351000.0,01,03,2.0,5,0,0,0,1,0,83
3,2000-10-01,67.0,12,1.318493,103.766702,151200.0,07,09,8.0,3,0,0,0,0,1,79
4,2013-01-01,73.0,8,1.348149,103.742658,318600.0,07,09,8.0,3,0,0,0,0,1,71
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
433013,2005-03-01,101.0,8,1.429658,103.792583,238500.0,01,03,2.0,4,0,0,1,0,0,94
433014,2016-04-01,95.0,13,1.390053,103.875941,376200.0,13,15,14.0,4,0,0,0,1,0,95
433015,2011-01-01,67.0,12,1.349224,103.934913,255600.0,01,03,2.0,3,0,1,0,0,0,74
433016,2013-05-01,123.0,5,1.389941,103.900721,508500.0,16,18,17.0,5,0,0,0,1,0,85


In [25]:
test['month'] = pd.to_datetime(test['month'])
test = test.drop(columns=['block', 'street_name'])

In [26]:
test['lower'] = test['storey_range'].str.extract('(\d+)')
test['upper'] = test['storey_range'].str.extract('(\d+$)')
# calculate average storey from the range
test['storey_average'] = test['upper'].astype(int) + test['lower'].astype(int)
test['storey_average'] = test['storey_average'] / 2.0
test = test.drop(columns=['storey_range'])

In [27]:
test = test.replace(to_replace ="5 room", value ="5-room")
test = test.replace(to_replace ="4 room", value ="4-room")
test = test.replace(to_replace ="3 room", value ="3-room")
test = test.replace(to_replace ="2 room", value ="2-room")
test = test.replace(to_replace ="1 room", value ="1-room")
flattype_mapping={"1-room": 1,
                   "2-room": 2,
                   "3-room": 3,
                   "4-room": 4,
                   "5-room": 5,
                   "multi generation": 6,
                   "executive": 7,}
test["flattype_mapping"] = test['flat_type'].map(flattype_mapping)
test = test.drop(columns=['flat_type'])

In [28]:
test = test.drop(columns=['eco_category'])

In [29]:
one_hot_cols = pd.get_dummies(test, columns=['region'], prefix=['region'])
test = test.merge(one_hot_cols)
test = test.drop(columns=['region'])

In [30]:
lease_commence_date = test['lease_commence_date'].to_numpy()
#assuming 99-year lease, we calculate the remaining years on the lease
remaining_lease = lease_commence_date + 99 - pd.DatetimeIndex(test['month']).year.to_numpy()
test['remaining_lease'] = remaining_lease
test = test.drop(columns=['lease_commence_date'])

In [32]:
test.head()

Unnamed: 0,month,town,floor_area_sqm,flat_model,latitude,longitude,elevation,subzone,planning_area,lower,upper,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease
0,2004-01-01,bukit batok,94.0,new generation,1.346581,103.744085,0.0,bukit batok west,bukit batok,4,6,5.0,4,0,0,0,0,1,84
1,2001-11-01,tampines,122.0,improved,1.357618,103.961379,0.0,tampines east,tampines,4,6,5.0,5,0,1,0,0,0,95
2,2002-07-01,jurong east,67.0,new generation,1.337804,103.741998,0.0,toh guan,jurong east,1,3,2.0,3,0,0,0,0,1,79
3,2015-04-01,ang mo kio,82.0,new generation,1.380084,103.849574,0.0,yio chu kang east,ang mo kio,4,6,5.0,3,0,0,0,1,0,65
4,2004-04-01,clementi,117.0,standard,1.31396,103.769831,0.0,clementi north,clementi,1,3,2.0,5,0,0,0,0,1,73


In [36]:
test = test.drop(columns=['elevation','town','subzone','planning_area'])
test.dropna()

Unnamed: 0,month,floor_area_sqm,flat_model,latitude,longitude,lower,upper,storey_average,flattype_mapping,region_central region,region_east region,region_north region,region_north-east region,region_west region,remaining_lease
0,2004-01-01,94.0,new generation,1.346581,103.744085,04,06,5.0,4,0,0,0,0,1,84
1,2001-11-01,122.0,improved,1.357618,103.961379,04,06,5.0,5,0,1,0,0,0,95
2,2002-07-01,67.0,new generation,1.337804,103.741998,01,03,2.0,3,0,0,0,0,1,79
3,2015-04-01,82.0,new generation,1.380084,103.849574,04,06,5.0,3,0,0,0,1,0,65
4,2004-04-01,117.0,standard,1.313960,103.769831,01,03,2.0,5,0,0,0,0,1,73
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109105,2008-04-01,110.0,improved,1.380452,103.879333,10,12,11.0,5,0,0,0,1,0,94
109106,2006-01-01,102.0,model a,1.314481,103.870458,13,15,14.0,4,1,0,0,0,0,92
109107,2000-01-01,68.0,improved,1.294924,103.854315,07,09,8.0,3,1,0,0,0,0,78
109108,2009-07-01,104.0,model a,1.339927,103.687354,10,12,11.0,4,0,0,0,0,1,78


In [35]:
flat_models = test.groupby('flat_model')
keys = flat_models.groups.keys()
print(keys)
flat_type_dict = {}

# store the average "resale_price" of each "flat_model" in a dictionary with
#    key : "flat_model"
#    value : average "resale_price"
for key in keys:
    df_i = flat_models.get_group(key)
    flat_type_dict.update({key : df_i["resale_price"].mean()})

# ordering the different "flat_model" by average "resale_price" (ascending order)
{k: v for k, v in sorted(flat_type_dict.items(), key=lambda item: item[1])}
print(flat_type_dict)
print(flat_type_dict.keys())

# save the ordered list of keys from dictionary as a list for easier index finding
flat_model_list = list(flat_type_dict.keys())

for flat_model in flat_model_list:
    ordinal_value = flat_model_list.index(flat_model) + 1
    test['flat_model'] = test['flat_model'].str.replace(rf'^{flat_model}$', str(ordinal_value), regex=True)

dict_keys(['2-room', 'adjoined flat', 'apartment', 'dbss', 'improved', 'improved maisonette', 'maisonette', 'model a', 'model a maisonette', 'model a2', 'multi generation', 'new generation', 'premium apartment', 'premium apartment loft', 'premium maisonette', 'simplified', 'standard', 'terrace', 'type s1', 'type s2'])


KeyError: 'resale_price'