In [59]:
import numpy as np
import pandas as pd
from utils import * # Functions written by team ghostbusters
import seaborn as sns
from category_encoders import TargetEncoder
import networkx as nx

# Render floats in displayed frames with thousands separators and two decimals.
pd.options.display.float_format = '{:,.2f}'.format

In [60]:
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including pandas chained-assignment and deprecation warnings that later
# cells may trigger; consider narrowing the filter.
warnings.filterwarnings('ignore')

In [61]:
# Some more magic so that the notebook will reload external python modules
# (e.g. the project's utils module imported at the top of the notebook);
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [62]:
# Load the raw training listings; the cleaning cells below work on a copy of this frame.
df_train = pd.read_csv('./data/train.csv')

In [63]:
# Show the raw column names that the feature-engineering steps can draw on.
df_train.columns

Index(['listing_id', 'title', 'address', 'property_name', 'property_type',
       'tenure', 'built_year', 'num_beds', 'num_baths', 'size_sqft',
       'floor_level', 'furnishing', 'available_unit_types', 'total_num_units',
       'property_details_url', 'lat', 'lng', 'elevation', 'subzone',
       'planning_area', 'price'],
      dtype='object')

<h3> Property type </h3>
Ordinal encoding property_type based on EDA result

In [64]:
"""
Generalize property type
"""
# Start the working frame from a copy of the raw listings and derive both
# property-type columns from the original `property_type` text:
#   - general_property_type: 3 general types (landed / hdb / condo)
#   - property_type: standardized (lower case) label
df = df_train.assign(
    general_property_type=df_train['property_type'].apply(generalize_property_type),
    property_type=df_train['property_type'].apply(standardize_property_type),
)


In [65]:
"""
Property categori ordinal encoding
The order of encoding follows the EDA results from small to large
"""
# Property types that share an ordinal rank, listed from the smallest rank to
# the largest (order taken from the EDA results).
ppt_rank_groups = (
    (1, ("hdb",)),
    (2, ("executive condo", "walk-up", "shophouse")),
    (3, ("condo", "apartment")),
    (4, ("townhouse", "terraced house", "landed", "cluster house")),
    (5, ("corner terrace",)),
    (6, ("conservation house",)),
    (7, ("semi-detached house",)),
    (8, ("bungalow",)),
    (9, ("land only",)),
)

# Flatten the groups into a {property type: rank} lookup.
ordered_ppt_type = {
    ppt_name: rank
    for rank, ppt_names in ppt_rank_groups
    for ppt_name in ppt_names
}

# Values without an entry in the lookup are left unchanged by replace().
df["property_type_ordinal"] = df["property_type"].replace(ordered_ppt_type)

<h3>Price</h3>
Clean price outliers

In [66]:
"""
Price data cleaning
"""
print("Data Num before clean:", len(df))

#   1. Remove large price (rows flagged by get_outlier on the price column)
price_outlier = get_outlier(df, 'price')
df = df.drop(price_outlier)
print('Data Num after clean 1:', len(df))

#   2. Remove 0 price (only strictly positive prices are kept)
has_positive_price = df['price'] > 0
df = df.loc[has_positive_price]
print('Data Num after clean 2:', len(df))

#   3. Remove property type related noise price
#   Threshold based on EDA: hdb listings priced above 2,000,000 are dropped
is_hdb_listing = df['general_property_type'].eq('hdb')
is_above_hdb_cap = df['price'].gt(2000000)
hdb_outlier = df.index[is_hdb_listing & is_above_hdb_cap]
df = df.drop(hdb_outlier)
print('Data Num after clea 3:', len(df))


Data Num before clean: 20254
Data Num after clean 1: 20252
Data Num after clean 2: 20151
Data Num after clea 3: 20149


<h3>Size Sqft</h3>
Clean outlier, fill missing value

In [67]:
"""
Clean size_sqft
"""
#   Remove large outliers; transform small outliers to square feet when
#   corresponding data can be found in the same property.
#   Each pass handles one general property type with its own upper limit
#   (400 is presumably the shared lower limit - confirm in standardize_size).
#   The passes run in this order and each one sees the previous pass's result.
size_limits_by_type = (
    (6000, 'hdb'),
    (1000000, 'condo'),
    (60000, 'landed'),
)
for upper_limit, general_type in size_limits_by_type:
    df['size_sqft'] = df.apply(
        lambda listing: standardize_size(listing, df, 400, upper_limit, general_type),
        axis=1,
    )

# Large outliers were set to NaN by the passes above, so drop them here.
df = df.dropna(subset=['size_sqft'])

print('Number after clean', df.shape[0])

Number after clean 20073


<h3> Tenure </h3>
Group tenure, ordinal encoding based on EDA results

In [68]:
"""
Tenure
"""
# Generalize tenure into the groups 'Nan', '99~110 year', '900+ year', 'freehold'
# via standardize_tenure (the earlier note said "3 groups" but listed these
# four names - TODO confirm against the helper).
# The groups are ordinally encoded following the EDA results.
df['tenure_group'] = df['tenure'].apply(standardize_tenure)

<h3>Number of beds</h3>
Fill missing value

In [69]:
"""
Bed
"""
print('Null in num_beds:',len(df[df.num_beds.isnull()]))

#   1. Set studio bed: a small listing whose title mentions "studio" and whose
#      bed count is missing gets one bed.
#      na=False keeps listings without a title out of the mask (instead of
#      putting NaN into it), and case=False matches "Studio"/"STUDIO" as well,
#      consistent with the hdb check in step 3.
is_studio = df.title.str.contains('studio', na=False, case=False) # is studio
is_null = df.num_beds.isnull() # num_beds is null
is_small = df.size_sqft <=900 # is studio but not studio house
df.loc[is_studio&is_null&is_small,"num_beds"] = 1

#   2. Fill with information of the same property: the most common bed count
#      among listings sharing property_name and size_sqft (NaN if none known).
#      Only missing values are filled. Assigning the transform result directly
#      would also blank out num_beds on rows whose group key is NaN, because
#      groupby excludes those rows and transform yields NaN for them.
beds_from_same_property = df.groupby(['property_name','size_sqft'])['num_beds'].transform(lambda x: x.fillna(next(iter(x.mode()), np.nan)))
df['num_beds'] = df['num_beds'].fillna(beds_from_same_property)

#   3. Fill num_beds with hdb info; in comparison with other types of house,
#      the floor plan and size of hdb are generally fixed
is_null = df.num_beds.isnull() # num_beds is null (recomputed after steps 1-2)
is_4rm = df.size_sqft > 1290 # Observation from EDA over 85% of 4 room larger than 1290 sqft
is_2rm = df.size_sqft < 900 # Observation from EDA over 95% of 2 room smaller than 900 sqft
is_hdb = df.general_property_type.str.contains("hdb", na=False, case=False) # is hdb
df.loc[is_hdb&is_null&is_4rm,'num_beds'] = 4
df.loc[is_hdb&is_null&(~is_2rm)&(~is_4rm),'num_beds'] = 3
df.loc[is_hdb&is_null&is_2rm,'num_beds'] = 2

#   4. Drop invalid data (listings whose bed count could not be filled)
df = df.dropna(subset=['num_beds'])
print('data num after cleaning:',df.shape[0])
print('Null in num_beds after cleaning:',len(df[df.num_beds.isnull()]))

Null in num_beds: 68
data num after cleaning: 20065
Null in num_beds after cleaning: 0


<h3>Num_baths</h3>
Fill missing values, drop outlier and other missing data

In [70]:
"""
baths
"""
print('Null in num_baths:',len(df[df.num_baths.isnull()]))

#   1. Fill with information of the same property: the most common bath count
#      among listings sharing property_name, num_beds and size_sqft (NaN if
#      none known).
#      Only missing values are filled. Assigning the transform result directly
#      would also blank out num_baths on rows whose group key is NaN, because
#      groupby excludes those rows and transform yields NaN for them.
baths_from_same_property = df.groupby(['property_name','num_beds','size_sqft'])['num_baths'].transform(lambda x: x.fillna(next(iter(x.mode()), np.nan)))
df['num_baths'] = df['num_baths'].fillna(baths_from_same_property)

#  2. Drop data that cannot be filled
df = df.dropna(subset=['num_baths'])
print('data num after cleaning:',df.shape[0])
print('Null in num_baths after cleaning:',len(df[df.num_baths.isnull()]))

Null in num_baths: 432
data num after cleaning: 20012
Null in num_baths after cleaning: 0


<h3> Remove outliers regarding num_beds and num_baths</h3>
Find outlier based on bed2bath ratio, fill outlier with reference value from same property, drop other outliers.

In [71]:
##  1. Try filling noisy data with information of the same property.
##     Outliers are defined from the boxplot result; 0.4 and 3 are presumably
##     the lower/upper bed-to-bath ratio limits (confirm in standardize_bednbath).
df_noise = df.assign(bed2bath=df['num_beds'] / df['num_baths'])
df_noise_cleaned = df_noise.apply(
    lambda listing: standardize_bednbath(listing, df_noise, 0.4, 3),
    axis=1,
)

In [72]:
print('data num before cleaning:', len(df_noise))

# Each element of df_noise_cleaned holds the cleaned (num_beds, num_baths) pair
# for one listing; stack them into a 2-column array and write the columns back.
df_noise_arr = np.array(list(df_noise_cleaned))
for position, column in enumerate(['num_beds', 'num_baths']):
    df_noise[column] = df_noise_arr[:, position]

# Listings whose pair still holds a missing value are dropped.
df_noise = df_noise.dropna(subset=['num_beds', 'num_baths'])
print('data num after cleaning:', len(df_noise))

print('Null in num_beds after cleaning:', df_noise['num_beds'].isnull().sum())
print('Null in num_baths after cleaning:', df_noise['num_baths'].isnull().sum())


data num before cleaning: 20012
data num after cleaning: 20007
Null in num_beds after cleaning: 0
Null in num_baths after cleaning: 0


<h3>Built Year</h3>
Fill missing value

In [73]:
"""
Built year 
"""
##  Fill missing built_year with the mean built year (truncated to an integer)
##  of the listing's general property type: 'hdb', 'condo' or 'landed'
df_year = df_noise.copy()
year_group = (
    df_year
    .groupby('general_property_type')['built_year']
    .transform(lambda years: int(years.mean()))
)
df_year['built_year'] = df_year['built_year'].fillna(year_group)

<h3>price per sqft</h3>
Calculate price every square feet using cleaned data, remove noise

In [74]:
#   1. Calculate price per sqft (listing price divided by its size in sqft)
df_per_price = df_year.assign(
    per_price=df_year["price"] / df_year["size_sqft"]
)

In [75]:
#   2. Remove noise: keep only listings priced below 30,000 per sqft.
#      .copy() makes `df` an independent frame, so the encoded column that the
#      next cell adds is not written into a slice of df_per_price.
df = df_per_price[df_per_price.per_price<30000].copy()

<h3>Target encoding per price based on subzone</h3>
Target encoding per_price attributes based on subzone category of data entries

In [76]:
"""
Subzone name encode
"""
# Encode each listing's subzone from the per_price target; the fitted encoder
# stays available in `encoder`.
# NOTE(review): the encoder is fitted on the same rows it transforms, and
# per_price is derived from price - confirm this is acceptable for validation.
encoder = TargetEncoder()
df['subzone_per_price_encoded'] = encoder.fit_transform(df['subzone'], df['per_price'])

In [77]:
# Number of listings left after all cleaning steps above.
print("number of entries", len(df))

number of entries 20003


<h1>Auxiliary Data</h1>

<h3>MRT</h3>
Find the 50 MRT stations with the highest degree centrality (the connection graph is undirected, so in- and out-degree coincide)

In [78]:
# Load data
# df_mrt_connections: station-to-station links (its 'to' / 'from' columns are read below)
# df_mrt: station table (its 'name' column is used to select stations)
df_mrt_connections = pd.read_csv('data/auxiliary-data/sg-mrt-connections.csv')
df_mrt = pd.read_csv('data/auxiliary-data/sg-mrt-stations.csv')

In [79]:
# Find important stations
# Every listed connection becomes one undirected edge between two stations.
G_undirected = nx.Graph()
G_undirected.add_edges_from(
    zip(df_mrt_connections['to'], df_mrt_connections['from'])
)

# Use degree centrality, ordered from the highest score to the lowest
nx_degree_scores = nx.degree_centrality(G_undirected)
ordered_degree_scores = sorted(
    nx_degree_scores.items(),
    key=lambda name_and_score: name_and_score[1],
    reverse=True,
)

# Show the five highest-scoring stations
for station, score in ordered_degree_scores[:5]:
    print(f'{station} ({score:.5f})')

# Keep the names of the 50 highest-scoring stations
important_mrt_stations = [
    station_name for station_name, _ in ordered_degree_scores[:50]
]
print(important_mrt_stations)

# Rows of the station table that belong to those stations
is_important_station = df_mrt['name'].isin(important_mrt_stations)
df_important_mrt = df_mrt[is_important_station]

dhoby ghaut (0.03968)
macpherson (0.03175)
little india (0.03175)
buona vista (0.03175)
chinatown (0.03175)
['dhoby ghaut', 'macpherson', 'little india', 'buona vista', 'chinatown', 'botanic gardens', 'newton', 'serangoon', 'bugis', 'bishan', 'outram park', 'woodlands', 'promenade', 'paya lebar', 'tampines', 'raffles place', 'caldecott', 'expo', 'tanah merah', 'jurong east', 'bayfront', 'marina bay', 'city hall', 'tanjong pagar', 'bright hill', 'mayflower', 'mattar', 'rochor', 'one-north', 'kent ridge', 'downtown', 'telok ayer', 'ubi', 'dover', 'farrer park', 'boon keng', 'clarke quay', 'kaki bukit', 'farrer road', 'holland village', 'harbourfront', 'telok blangah', 'marsiling', 'kranji', 'sengkang', 'buangkok', 'stevens', 'bedok', 'yew tee', 'bukit batok']


In [80]:
# Distance calculated based on lat and lng, to the nearest important MRT station
mrt_distance_col = 'dist_to_nearest_important_mrt'
df = add_distance_to_nearest_mrt(df_important_mrt, df, mrt_distance_col)

# Round to integer
df[mrt_distance_col + '_rounded'] = df[mrt_distance_col].round().astype(int)

<h3>Shopping Mall</h3>
Find number of shopping malls within 300 m of the property

In [81]:
#   Load Data: auxiliary shopping mall table, passed to the mall-count helper below
df_shopping_mall = pd.read_csv('./data/auxiliary-data/sg-shopping-malls.csv')

In [82]:
#   Add number of nearby shopping malls
#   Within 300 m of the property
#   (presumably adds the 'number_of_nearby_shopping_malls' column that the
#   final feature selection reads - confirm in the helper)
df = add_number_of_nearby_shopping_malls(df_shopping_mall,df)

<h3>Schools</h3>
Find number of nearby schools within 1 km of the property

In [83]:
# Auxiliary school tables, one per school level; each feeds its own count helper below.
df_primary_schools = pd.read_csv('./data/auxiliary-data/sg-primary-schools.csv')
df_secondary_schools = pd.read_csv('./data/auxiliary-data/sg-secondary-schools.csv')

In [84]:
#   Count primary schools within 1km of the property
#   (presumably adds the 'number_of_nearby_primary_schools' column - confirm in the helper)
df = add_number_of_nearby_primary_schools(df_primary_schools, df)

In [85]:
#   Count secondary schools within 1km of the property
#   (presumably adds the 'number_of_nearby_secondary_schools' column - confirm in the helper)
df = add_number_of_nearby_secondary_schools(df_secondary_schools, df)

<h3>Commercial Centre</h3>
Find the name of the nearest commercial centre (types: BN and CR) within 10 km of the property.
Ordinally encode the name.

In [86]:
# Auxiliary commercial centre table; note the file name on disk is spelled "commerical".
df_cc = pd.read_csv('./data/auxiliary-data/sg-commerical-centres.csv')

In [87]:
# Name of the nearest commercial centre within 10 km, once per centre type
# (first 'BN', then 'CR'); the second call receives the frame returned by the first.
for centre_type in ('BN', 'CR'):
    df = add_name_of_nearest_commercial_centre_by_type(df, df_cc, centre_type, 10)

In [88]:
#   Categorical encoding BN
#   Centre names listed from the highest rank (9) down to the lowest (1);
#   the string 'None' takes the lowest rank.
#   NOTE(review): 'Tao Payoh' may be a misspelling of 'Toa Payoh' - the key must
#   match the spelling used in the commercial centre data, so verify before changing.
bn_high_to_low = [
    'Novena',
    'Alexandra',
    'Buona Vista',
    'Paya Lebar Central',
    'Serangoon',
    'Bishan',
    'Changi East Urban District',
    'Tao Payoh',
    'None',
]
ordered_BN = {
    bn_name: len(bn_high_to_low) - position
    for position, bn_name in enumerate(bn_high_to_low)
}

# Names without an entry in the lookup are left unchanged by replace().
df["name_of_nearest_BN_ordinal"] = df["name_of_nearest_BN"].replace(ordered_BN)

In [89]:
#   Categorical encoding CR
#   Centre names listed from the highest rank (5) down to the lowest (1).
cr_high_to_low = [
    'Central Business District',
    'Jurong Lake District',
    'Tampines Regional Centre',
    'Seletar Regional Centre',
    'Woodlands Regional Centre',
]
ordered_CR = {
    cr_name: len(cr_high_to_low) - position
    for position, cr_name in enumerate(cr_high_to_low)
}

# Names without an entry in the lookup are left unchanged by replace().
df["name_of_nearest_CR_ordinal"] = df["name_of_nearest_CR"].replace(ordered_CR)

<h2> Drop features</h2>
Only contains features that will be useful for training

In [90]:
# Columns kept for model training, in output order.
final_columns = [
    # listing attributes
    'built_year', 'num_beds', 'num_baths', 'lat', 'lng', 'size_sqft',
    'tenure_group', 'subzone_per_price_encoded',
    'property_type_ordinal',
    # mrt
    'dist_to_nearest_important_mrt_rounded',
    # schools
    'number_of_nearby_primary_schools',
    'number_of_nearby_secondary_schools',
    # shopping mall
    'number_of_nearby_shopping_malls',
    # commercial centres (BN and CR)
    'name_of_nearest_BN_ordinal',
    'name_of_nearest_CR_ordinal',
    # dependent variables
    'price',
    'per_price',
]
df_final = df[final_columns]


In [91]:
# Check whether the selected features still contain missing values:
# print the name of every column that has at least one NaN.
missing_per_column = df_final.isna().sum()
for col in missing_per_column[missing_per_column > 0].index:
    print(col)

<h2> Save to CSV </h2>

In [92]:
# CSV without dropping columns, will be useful for target encoding of test set
df.to_csv('./data/train_final_complete_nodrop.csv',index=False)
# CSV for model training (selected features only).
# Written to its own file: this call previously reused the "nodrop" path and
# overwrote the full frame saved just above. Anything that read the training
# features from the "nodrop" file must now read this path instead.
df_final.to_csv('./data/train_final_complete.csv',index=False)