# Task 1: Housing Price Regression

Motivation here.

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor, BaggingRegressor
from sklearn.model_selection import train_test_split, PredefinedSplit, GridSearchCV
from sklearn.metrics import mean_squared_error

#import torch
#import torch.nn as nn
#import torch.optim as optim
#from torch.utils.data import Dataset,DataLoader

import re

In [2]:
# Hyperparameters

RANDOM_CONTROL = 42 # For reproducibility of notebook
TRAIN_SIZE = 0.8

# Random Forest: Fill in based on GridSearch results
RF_NUM_ESTIMATORS = 100
RF_MAX_DEPTH = 50
RF_MAX_FEATURES = 1
RF_MIN_SPLIT = 2
RF_MIN_LEAF = 1
RF_BOOTSTRAP = True
RF_CRITERION = "squared_error"

# Gradient Boosting: Fill in based on GridSearch results
GB_NUM_ESTIMATORS = 100
GB_MAX_DEPTH = 50
GB_CRITERION = "squared_error"
GB_LEARNING_RATE = 0.1

# AdaBoost: Fill in based on GridSearch results
AB_NUM_ESTIMATORS = 100
AB_MAX_DEPTH = 50
AB_LEARNING_RATE = 0.1

# Neural Net: Fill in based on test iterations
NN_NUM_EPOCHS = 10
NN_BATCH_SIZE = 32
NN_LEARNING_RATE = 0.1

In [3]:
# Read training data
df = pd.read_csv('data/train.csv') 

df.head(3)

df.shape

(20254, 21)

# Improving our Dataset with Aux Data

Since we have auxilliary data, we can add them to our training data to see if our model improves. We can find the number of infrasructures close to our listings. The infrastructures we have include:
* commercial centers
* mrt stations
* primary schools
* secondary schools
* shopping malls

We also have data for 
* subzones

In [5]:
# helper funcitons
# first we define functions we need
# haversine distance function
from math import radians, cos, sin, asin, sqrt
def haversine(lat1, long1, lat2, long2):
    """
Replicating the same formula as mentioned in Wiki
    """
    # convert decimal degrees to radians 
    lat1, long1, lat2, long2 = map(radians, [lat1, long1, lat2, long2])
    # haversine formula 
    dlon = long2 - long1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    # Radius of earth in kilometers is 6371
    km = 6371* c
    return km

# find the nmber of nearest infrastructure to our property listing
# km is the maximum distance we want to calculate from. 
# calculate the number of infrastructure within x km
def count_nearest(lat, long, infrastructure, km):
    distances = infrastructure.apply(
        lambda row: haversine(lat, long, row['lat'], row['lng']), 
        axis=1)
    
    return sum(i <= km for i in distances)


# find the distance of the nearest infrastructure to our property listing
def find_nearest_distance(lat, long, infrastructure):
    distances = infrastructure.apply(
        lambda row: haversine(lat, long, row['lat'], row['lng']), 
        axis=1)
    return distances.min()


# get the population of the subzone the listing is in
def find_subzone_population(subzone, subzones_df):
    sz = subzones_df
    if subzone == "":
        sz_pop = None
    else:
        sz_pop = sz.loc[sz['name'] == subzone, 'population'].item()
    return sz_pop

# get the population density of the subzone the listing is in
def find_subzone_population_density(subzone, subzones_df):
    sz = subzones_df
    if subzone == "":
        sz_pop_density = None
    else:
        sz_pop_density = sz.loc[sz['name'] == subzone, 'population'].item()/sz.loc[sz['name'] == subzone, 'area_size'].item()
    return sz_pop_density

def improve_dataset(df):
    df = df.copy()
    # get the files containing the infrastructures data
    cc = pd.read_csv("data/auxiliary-data/sg-commerical-centres.csv")
    mrt = pd.read_csv("data/auxiliary-data/sg-mrt-stations.csv")
    ps = pd.read_csv("data/auxiliary-data/sg-primary-schools.csv")
    ss = pd.read_csv("data/auxiliary-data/sg-secondary-schools.csv")
    sm = pd.read_csv("data/auxiliary-data/sg-shopping-malls.csv")
    sz = pd.read_csv("data/auxiliary-data/sg-subzones.csv")
    
    df.subzone = df.subzone.fillna('') 
    
    # get the population of the subzone the listing is in, and add to data
    df['subzone_pop'] = df.apply(
        lambda row: find_subzone_population(row['subzone'], sz), 
        axis=1)
    df['subzone_pop'] = df['subzone_pop'].round(decimals=3)
    
    # get the population of the subzone the listing is in, and add to data
    df['subzone_pop_density'] = df.apply(
        lambda row: find_subzone_population(row['subzone'], sz), 
        axis=1)
    df['subzone_pop_density'] = df['subzone_pop_density'].round(decimals=3)
    
    # calculate distance to nearest commercial center
    df['dist_2_nearest_cc'] = df.apply(
        lambda row: find_nearest_distance(row['lat'], row['lng'], cc), 
        axis=1)
    df['dist_2_nearest_cc'] = df['dist_2_nearest_cc'].round(decimals=3)

    # calculate the number of commercial centers within x km
    df['nearest_cc_count'] = df.apply(
        lambda row: count_nearest(row['lat'], row['lng'], cc, 1), 
        axis=1)
    
    # calculate distance to nearest mrt station
    df['dist_2_nearest_mrt'] = df.apply(
        lambda row: find_nearest_distance(row['lat'], row['lng'], mrt), 
        axis=1)
    df['dist_2_nearest_mrt'] = df['dist_2_nearest_mrt'].round(decimals=3)

    # calculate the number of mrt stations within x km
    df['nearest_mrt_count'] = df.apply(
        lambda row: count_nearest(row['lat'], row['lng'], mrt, 1), 
        axis=1)
    
    # calculate distance to nearest primary school
    df['dist_2_nearest_ps'] = df.apply(
        lambda row: find_nearest_distance(row['lat'], row['lng'], ps), 
        axis=1)
    df['dist_2_nearest_ps'] = df['dist_2_nearest_ps'].round(decimals=3)

    # calculate the number of primary schools within x km
    df['nearest_ps_count'] = df.apply(
        lambda row: count_nearest(row['lat'], row['lng'], ps, 1), 
        axis=1)
    # calculate distance to nearest secondary school
    df['dist_2_nearest_ss'] = df.apply(
        lambda row: find_nearest_distance(row['lat'], row['lng'], ss), 
        axis=1)
    df['dist_2_nearest_ss'] = df['dist_2_nearest_ss'].round(decimals=3)

    # calculate the number of secondary schools within x km
    df['nearest_ss_count'] = df.apply(
        lambda row: count_nearest(row['lat'], row['lng'], ss, 1), 
        axis=1)
    
    # calculate distance to nearest shopping mall
    df['dist_2_nearest_sm'] = df.apply(
        lambda row: find_nearest_distance(row['lat'], row['lng'], sm), 
        axis=1)
    df['dist_2_nearest_sm'] = df['dist_2_nearest_sm'].round(decimals=3)

    # calculate the number of secondary schools within x km
    df['nearest_sm_count'] = df.apply(
        lambda row: count_nearest(row['lat'], row['lng'], sm, 1), 
        axis=1)

    return df

In [6]:
df_ = improve_dataset(df)

In [7]:
print(df_.shape)
df_.head(3)

(20254, 33)


Unnamed: 0,listing_id,title,address,property_name,property_type,tenure,built_year,num_beds,num_baths,size_sqft,...,dist_2_nearest_cc,nearest_cc_count,dist_2_nearest_mrt,nearest_mrt_count,dist_2_nearest_ps,nearest_ps_count,dist_2_nearest_ss,nearest_ss_count,dist_2_nearest_sm,nearest_sm_count
0,122881,hdb flat for sale in 866 yishun street 81,sembawang / yishun (d27),866 yishun street 81,hdb 4 rooms,,1988.0,3.0,2.0,1115,...,3.339,0,0.574,1,0.276,3,0.183,3,0.621,1
1,259374,hdb flat for sale in 506b serangoon north aven...,hougang / punggol / sengkang (d19),hdb-serangoon estate,hdb,99-year leasehold,1992.0,4.0,2.0,1575,...,2.402,0,1.734,0,0.123,3,0.291,4,0.553,1
2,665422,4 bed condo for sale in meyerhouse,128 meyer road,meyerhouse,condo,freehold,2022.0,4.0,6.0,3070,...,2.171,0,1.32,0,0.891,1,0.895,1,0.824,1


# EDA

Talk about pre-processing here.
Visualize plots of original data.

For HDB,

Assume that num_beds refers only to the bedrooms excluding the living rooms
* Hdb 2-room = 1 bedroom , 1 bathroom
* Hdb 3-room = 2 bedroom , 2 bathroom
* Hdb 4-room = 3 bedroom , 2 bathroom
* Hdb 5-room = 3 bedroom , 2 bathroom
* Hdb 3-gen = 4 bedroom , 3 bathroom
* Hdb Executive = 3/4 bedroom, 2/3 bathroom
* Hdb Masionette = 3/4 bedroom, 2/3 bathroom
* Hdb Jumbo = 4 bedroom, 4 bathroom

References:
* http://www.data.com.sg/template-m.jsp?p=my/1.html 
* https://www.hdb.gov.sg/residential/buying-a-flat/finding-a-flat/types-of-flats


In [8]:
def visualize():
    pass

# IMPORTANT: Trivial modifications only. Do not aggregate/standardize/impute here!!

def ignore_attributes(df) -> pd.DataFrame:
    # Drop listing id; nominal identifier with no meaning
    df.drop('listing_id', axis=1, inplace=True)

    # Drop elevation; all the values are 0, spurious attribute
    df.drop('elevation', axis=1, inplace=True)

    # Drop url; nominal identifier with no meaning; useful for manual lookups or scraping
    df.drop('property_details_url', axis=1, inplace=True)

    # Drop floor level as 83% missing and sparse with the rest of the values. 
    # Not enough data available to get the model trained.
    df.drop('floor_level', axis=1, inplace=True)

    # Drop column property_type, tenure and furnishing as it is now encoded
    df.drop('property_type',axis = 1, inplace=True)
    df.drop('tenure',axis=1, inplace=True)
    df.drop('furnishing', axis=1, inplace=True)

    # BELOW CODE IN THIS SECTION IS ONLY MEANT TO GET THE SKELETON WORKING; RE-EVALUATE EACH ATTRIBUTE ONE BY ONE
    df.drop('title', axis=1, inplace=True)
    df.drop('address', axis=1, inplace=True)
    df.drop('property_name', axis=1, inplace=True)
    df.drop('built_year', axis=1, inplace=True)
    df.drop('available_unit_types', axis=1, inplace=True)
    df.drop('total_num_units', axis=1, inplace=True) # Bernard verified dropping it. 27.9% missing.
    df.drop('lat', axis=1, inplace=True)
    df.drop('lng', axis=1, inplace=True)
    df.drop('subzone', axis=1, inplace=True)
    df.drop('planning_area', axis=1, inplace=True)

    return df

def handle_missing_values(df) -> pd.DataFrame:
    # Treat missing year data as new.
    # Semantically, we define this attribute as the depreciation factor for pricing.
    # A new house or one with missing data denotes the depreciation factor is 0 or unknown.
    # The depreciation factor is assumed to be the difference between construction and current year.
    # TODO: Maybe do not treat future years as current! Inflation factor might be one to look out for.
    df['built_year'] = df['built_year'].fillna(2022)
    
    # TODO: 80 are missing. Should we remove them or should we keep it as 0?
    # Verify assumption if studio qualifies as 1 bed. 
    # 75 of missing are studio, we replace the Nan as 1
    filter_beds_studio = ((df.num_beds.isna()) & ((df.title.str.contains('studio','Studio', flags=re.IGNORECASE, regex=True))))
    df.loc[filter_beds_studio, "num_beds"] = 1
    # 5 of missing, we do not have much info. Use 0 to denote absence of attribute
    df['num_beds'] = df['num_beds'].fillna(0)

    # TODO: 400 are missing. Cannot remove so many data. Use 0 to denote absence of attribute.
    df['num_baths'] = df['num_baths'].fillna(0)
    
    
    return df
    
def handle_invalid_values(df) -> pd.DataFrame:
    # Price is the target regression variable. If negative or 0, treat that row as invalid.
    if 'price' in df:
        df = df[df.price > 0]

    # TODO: Verify the steps below for HDB - bed/bath ratio, price checks
    # Filtering those with number of bathrooms more than number of bedrooms for HDB
    filter_bath_beds_hdb = ((df.num_baths > df.num_beds) & ((df.property_type.str.contains('hdb','Hdb', flags=re.IGNORECASE, regex=True)) | (df.title.str.contains('hdb','Hdb', regex=False))))
    df = df.drop(df[filter_bath_beds_hdb].index)

    #Filtering those with number of bathrooms more than 4, number of bedrooms more than 4 for HDB
    filter_bath_beds_4_hdb = (((df.num_baths > 4) | (df.num_beds > 5)) & ((df.property_type.str.contains('hdb','Hdb', flags=re.IGNORECASE, regex=True)) | (df.title.str.contains('hdb','Hdb', regex=False))))
    df = df.drop(df[filter_bath_beds_4_hdb].index)
    
    # price ; filtering for HDB price more than 2 million
    if 'price' in df:
        filter_price_hdb = ((df.price > 2000000) & ((df.property_type.str.contains('hdb','Hdb', flags=re.IGNORECASE, regex=True)) | (df.title.str.contains('hdb','Hdb', regex=False))))
        df = df.drop(df[filter_price_hdb].index)
        
    ## Outliers
    
    # Filtering those hdb with more than 2000 size_sqft
    filter_size_hdb = ((df.size_sqft > 2000) & ((df.property_type.str.contains('hdb','Hdb', flags=re.IGNORECASE, regex=True)) | (df.title.str.contains('hdb','Hdb', regex=False))))
    df[filter_size_hdb][['property_type','num_baths','num_beds','size_sqft','price']]
    df = df.drop(df[filter_size_hdb].index)
    
    
    if 'price' in df:
    # Filtering those data with less than $200/square feet
        df['price per sq ft'] = df['price']/df['size_sqft']
        filter_price_sqft_200 = ((df['price per sq ft'] < 200) & (df['price per sq ft'] > 0))
        df = df.drop(df[filter_price_sqft_200].index)
    # Filtering those data with less than 500 square feet, and more than $5000 per square feet
        df['price per sq ft'] = df['price']/df['size_sqft']
        filter_size = ((df['price per sq ft'] > 5000) & (df['price per sq ft'] > 0) & (df['size_sqft'] < 500))
        df = df.drop(df[filter_size].index)
        
    df.drop('price per sq ft', axis=1, inplace=True)
        
    return df

def transform_data(df) -> pd.DataFrame:
    # TODO: Test against not doing this.
    df.loc[df["built_year"] > 2022, "built_year"] = 2022
    
    # Convert built_year into the aforementioned depreciation factor
    df["depreciation"] = (2022-df["built_year"])
    
    # TODO: Add details on why we are doing so
    df.loc[df.property_type.str.contains('hdb', flags=re.IGNORECASE, regex=True), 'property_type'] = 'hdb'
    df.loc[df.property_type.str.contains('condo', flags=re.IGNORECASE, regex=True), 'property_type'] = 'condo'
    df['property_type'] = df['property_type'].str.lower()
    df.loc[df.property_type.str.contains('cluster house', flags=re.IGNORECASE, regex=True), 'property_type'] = 'landed'
    df.loc[df.property_type.str.contains('townhouse', flags=re.IGNORECASE, regex=True), 'property_type'] = 'landed'
    df.loc[df.property_type.str.contains('land only', flags=re.IGNORECASE, regex=True), 'property_type'] = 'landed'
    df.loc[df.property_type.str.contains('apartment',  flags=re.IGNORECASE, regex=True), 'property_type'] = 'condo'
    df.loc[df.property_type.str.contains('bungalow', flags=re.IGNORECASE, regex=True), 'property_type'] = 'bungalow'
    df.loc[df.property_type.str.contains('semi-detached house', flags=re.IGNORECASE, regex=True), 'property_type'] = 'corner'
    df.loc[df.property_type.str.contains('corner terrace',flags=re.IGNORECASE, regex=True), 'property_type'] = 'corner'
    df.loc[df.property_type.str.contains('shophouse', flags=re.IGNORECASE, regex=True), 'property_type'] = 'protected'
    df.loc[df.property_type.str.contains('conservation house', flags=re.IGNORECASE, regex=True), 'property_type'] = 'protected'
    
    # Get one hot encoding of columns property_type
    property_columns = ['bungalow', 'condo', 'hdb', 'corner', 'landed', 'protected', 'terraced house', 'walk-up']
    one_hot = pd.get_dummies(df['property_type'], columns=property_columns)
    # Join the encoded df
    df = df.join(one_hot)
    
    df['tenure'] = df['tenure'].fillna(value=df.property_type)
    df.loc[df.tenure.str.contains('hdb', flags=re.IGNORECASE, regex=True), 'tenure'] = '99-year leasehold'
    df.loc[df.tenure.str.contains('condo', flags=re.IGNORECASE, regex=True), 'tenure'] = '99-year leasehold'
    df.loc[df.tenure.str.contains('terraced house', flags=re.IGNORECASE, regex=True), 'tenure'] = '99-year leasehold'
    df.loc[df.tenure.str.contains('corner', flags=re.IGNORECASE, regex=True), 'tenure'] = '99-year leasehold'
    df.loc[df.tenure.str.contains('landed', flags=re.IGNORECASE, regex=True), 'tenure'] = '99-year leasehold'
    df.loc[df.tenure.str.contains('protected', flags=re.IGNORECASE, regex=True), 'tenure'] = 'freehold'
    df.loc[df.tenure.str.contains('bungalow', flags=re.IGNORECASE, regex=True), 'tenure'] = 'freehold'

    df.loc[df.tenure.str.contains('110-year leasehold', flags=re.IGNORECASE, regex=True), 'tenure'] = '99-year leasehold'
    df.loc[df.tenure.str.contains('103-year leasehold', flags=re.IGNORECASE, regex=True), 'tenure'] = '99-year leasehold'
    df.loc[df.tenure.str.contains('102-year leasehold', flags=re.IGNORECASE, regex=True), 'tenure'] = '99-year leasehold'
    df.loc[df.tenure.str.contains('100-year leasehold', flags=re.IGNORECASE, regex=True), 'tenure'] = '99-year leasehold'

    df.loc[df.tenure.str.contains('999-year leasehold', flags=re.IGNORECASE, regex=True), 'tenure'] = 'freehold'
    df.loc[df.tenure.str.contains('946-year leasehold', flags=re.IGNORECASE, regex=True), 'tenure'] = 'freehold'
    df.loc[df.tenure.str.contains('956-year leasehold', flags=re.IGNORECASE, regex=True), 'tenure'] = 'freehold'
    df.loc[df.tenure.str.contains('947-year leasehold', flags=re.IGNORECASE, regex=True), 'tenure'] = 'freehold'
    df.loc[df.tenure.str.contains('929-year leasehold', flags=re.IGNORECASE, regex=True), 'tenure'] = 'freehold'

    df['encoded_tenure'] = 0
    df.loc[df.tenure.str.contains('freehold', flags=re.IGNORECASE, regex=True), 'encoded_tenure'] = 1
    
    df['encoded_furnishing'] = 0
    df.loc[df.furnishing.str.contains('partial', flags=re.IGNORECASE, regex=True), 'encoded_furnishing'] = 0.5
    df.loc[df.furnishing.str.contains('unfurnished', flags=re.IGNORECASE, regex=True), 'encoded_furnishing'] = -1
    df.loc[df.furnishing.str.contains('fully', flags=re.IGNORECASE, regex=True), 'encoded_furnishing'] = 1
    
    return df
    
def pre_process(df, mode='train') -> pd.DataFrame:
    df = handle_missing_values(df)
    if mode=='train': df = handle_invalid_values(df)
    df = transform_data(df)
    df = ignore_attributes(df)
    
    return df

In [11]:
df_preprocessed = pre_process(df, mode='train')
print(df_preprocessed.shape)
df_preprocessed.head()

(20060, 15)


Unnamed: 0,num_beds,num_baths,size_sqft,price,depreciation,bungalow,condo,corner,hdb,landed,protected,terraced house,walk-up,encoded_tenure,encoded_furnishing
0,3.0,2.0,1115,514500.0,34.0,0,0,0,1,0,0,0,0,0,0.0
1,4.0,2.0,1575,995400.0,30.0,0,0,0,1,0,0,0,0,0,0.0
2,4.0,6.0,3070,8485000.0,0.0,0,1,0,0,0,0,0,0,1,0.5
3,3.0,2.0,958,2626000.0,0.0,0,1,0,0,0,0,0,0,1,0.5
4,2.0,1.0,732,1764000.0,0.0,0,1,0,0,0,0,0,0,0,0.0


In [14]:
df_preprocessed_ = pre_process(df_, mode='train')
df_preprocessed_['subzone_pop'].fillna((df_preprocessed_['subzone_pop'].mean()), inplace=True)
df_preprocessed_['subzone_pop_density'].fillna((df_preprocessed_['subzone_pop_density'].mean()), inplace=True)
print(df_preprocessed_.shape)
df_preprocessed_.head()

(20060, 27)


Unnamed: 0,num_beds,num_baths,size_sqft,price,subzone_pop,subzone_pop_density,dist_2_nearest_cc,nearest_cc_count,dist_2_nearest_mrt,nearest_mrt_count,...,bungalow,condo,corner,hdb,landed,protected,terraced house,walk-up,encoded_tenure,encoded_furnishing
0,3.0,2.0,1115,514500.0,42240.0,42240.0,3.339,0,0.574,1,...,0,0,0,1,0,0,0,0,0,0.0
1,4.0,2.0,1575,995400.0,15940.0,15940.0,2.402,0,1.734,0,...,0,0,0,1,0,0,0,0,0,0.0
2,4.0,6.0,3070,8485000.0,9980.0,9980.0,2.171,0,1.32,0,...,0,1,0,0,0,0,0,0,1,0.5
3,3.0,2.0,958,2626000.0,6180.0,6180.0,1.606,0,0.726,2,...,0,1,0,0,0,0,0,0,1,0.5
4,2.0,1.0,732,1764000.0,80.0,80.0,1.87,0,0.371,3,...,0,1,0,0,0,0,0,0,0,0.0


In [16]:
# Split data into train and validation set

X_housing = df_preprocessed.loc[:, df_preprocessed.columns != 'price']
y_housing = df_preprocessed['price']

X_train, X_val, y_train, y_val = train_test_split(X_housing, y_housing, train_size=TRAIN_SIZE, random_state=RANDOM_CONTROL, shuffle=True) 

print(X_train.shape)
print(y_train.shape)


(16048, 14)
(16048,)


In [17]:
# Split data into train and validation set

X_housing_ = df_preprocessed_.loc[:, df_preprocessed_.columns != 'price']
y_housing_ = df_preprocessed_['price']

X_train_, X_val_, y_train_, y_val_ = train_test_split(X_housing_, y_housing_, train_size=TRAIN_SIZE, random_state=RANDOM_CONTROL, shuffle=True) 

print(X_train_.shape)
print(y_train_.shape)


(16048, 26)
(16048,)


In [18]:
# Standardize/Normalize

def post_process(df) -> pd.DataFrame:
    return df

In [19]:
X_train = post_process(X_train)
X_val = post_process(X_val)

In [20]:
X_train_ = post_process(X_train_)
X_val_ = post_process(X_val_)

**Models**

(Add outline of steps here later)

- Linear Regression
- Random Forest
- Gradient Boosting
- AdaBoost
- Extra Trees Regressor
- Bagging Regressor
- Neural Net

In [22]:
def predict(model, X_feature) -> pd.DataFrame: # Fix it. Numpy array is returned
    y_hat = model.predict(X_feature)
    return y_hat

def validate(y, y_hat) -> None:
    rmse = mean_squared_error(y, y_hat, squared=False)
    print('Validation RMSE: {:.3}'.format(rmse))
    return

In [23]:
val_split_indices = [-1 if x in X_train.index else 0 for x in X_housing.index]
ps = PredefinedSplit(test_fold=val_split_indices)

In [24]:
val_split_indices_ = [-1 if x in X_train_.index else 0 for x in X_housing_.index]
ps_ = PredefinedSplit(test_fold=val_split_indices_)

# Linear Regression

In [25]:
def train_lr(X_feature, y_label):
    lr = LinearRegression().fit(X_feature, y_label)
    return lr

In [26]:
model_lr = train_lr(X_train, y_train)
y_hat_lr = predict(model_lr, X_val)
validate(y_val, y_hat_lr)

Validation RMSE: 3.66e+06


In [27]:
model_lr_ = train_lr(X_train_, y_train_)
y_hat_lr_ = predict(model_lr_, X_val_)
validate(y_val_, y_hat_lr_)

Validation RMSE: 3.57e+06


# Random Forest

In [28]:
def train_rf(X_feature, y_label):
    rf = RandomForestRegressor(n_estimators=RF_NUM_ESTIMATORS, criterion=RF_CRITERION, max_depth=RF_MAX_DEPTH,
                          min_samples_split=RF_MIN_SPLIT, min_samples_leaf=RF_MIN_LEAF, max_features=RF_MAX_FEATURES,
                          bootstrap=RF_BOOTSTRAP).fit(X_feature, y_label)
    return rf

def bestfit_rf(X_feature, y_label, train_test_split):
    estimator = RandomForestRegressor()
    params = {'n_estimators': [25, 50, 100],
              'max_depth': [5, 10, 25, 50],
              'min_samples_split': [2],
              'min_samples_leaf': [1],
              'criterion': ["squared_error"],
              'max_features': [1],
              'bootstrap': [True, False],
              'random_state': [RANDOM_CONTROL]}
    model_rf = GridSearchCV(estimator=estimator,
                         param_grid=params,
                         cv=train_test_split)
    model_rf.fit(X_feature, y_label)
    print('Random Forest Best Parameters: {}'.format(model_rf.best_params_))
    return model_rf

In [29]:
model_rf = bestfit_rf(X_housing, y_housing, ps)
y_hat_rf = predict(model_rf, X_val)
validate(y_val, y_hat_rf)

Random Forest Best Parameters: {'bootstrap': False, 'criterion': 'squared_error', 'max_depth': 50, 'max_features': 1, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100, 'random_state': 42}
Validation RMSE: 2.49e+05


In [30]:
model_rf_ = bestfit_rf(X_housing_, y_housing_, ps_)
y_hat_rf_ = predict(model_rf_, X_val_)
validate(y_val_, y_hat_rf_)

Random Forest Best Parameters: {'bootstrap': False, 'criterion': 'squared_error', 'max_depth': 50, 'max_features': 1, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100, 'random_state': 42}
Validation RMSE: 2.14e+05


# Gradient Boosting

In [31]:
def train_gb(X_feature, y_label):
    gb = GradientBoostingRegressor(n_estimators=GB_NUM_ESTIMATORS, learning_rate=GB_LEARNING_RATE,
                                  max_depth=GB_MAX_DEPTH, criterion=GB_CRITERION).fit(X_feature, y_label)
    return gb

def bestfit_gb(X_feature, y_label, train_test_split):
    estimator = GradientBoostingRegressor()
    params = {'n_estimators': [25, 50, 100],
              'learning_rate': [0.001, 0.01, 0.1, 0.5, 1],
              'max_depth': [5, 10, 25, 50],
              'min_samples_split': [2],
              'min_samples_leaf': [1],
              'criterion': ["squared_error", "friedman_mse"],
              'max_features': [1],
              'random_state': [RANDOM_CONTROL]}
    model_gb = GridSearchCV(estimator=estimator,
                         param_grid=params,
                         cv=train_test_split)
    model_gb.fit(X_feature, y_label)
    print('Gradient Boost Best Parameters: {}'.format(model_gb.best_params_))
    return model_gb

In [32]:
model_gb = bestfit_gb(X_housing, y_housing, ps)
y_hat_gb = predict(model_gb, X_val)
validate(y_val, y_hat_gb)

Gradient Boost Best Parameters: {'criterion': 'squared_error', 'learning_rate': 0.1, 'max_depth': 25, 'max_features': 1, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100, 'random_state': 42}
Validation RMSE: 2.49e+05


In [33]:
model_gb_ = bestfit_gb(X_housing_, y_housing_, ps_)
y_hat_gb_ = predict(model_gb_, X_val_)
validate(y_val_, y_hat_gb_)

Gradient Boost Best Parameters: {'criterion': 'squared_error', 'learning_rate': 0.1, 'max_depth': 10, 'max_features': 1, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100, 'random_state': 42}
Validation RMSE: 4.2e+05


# AdaBoost

In [34]:
def train_ab():
    pass

def bestfit_ab(X_feature, y_label, train_test_split):
    base_estimator = DecisionTreeRegressor()
    estimator = AdaBoostRegressor(base_estimator=base_estimator)
    params = {'base_estimator__max_depth': [5, 10, 25, 50],
              'base_estimator__splitter': ['best', 'random'],
              'n_estimators': [25, 50, 100],
              'learning_rate': [0.001, 0.01, 0.1, 0.5, 1],
              'random_state': [RANDOM_CONTROL]}
    model_ab = GridSearchCV(estimator=estimator,
                         param_grid=params,
                         cv=ps)
    model_ab.fit(X_feature, y_label)
    print('AdaBoost Best Parameters: {}'.format(model_ab.best_params_))
    return model_ab

In [35]:
model_ab = bestfit_ab(X_housing, y_housing, ps)
y_hat_ab = predict(model_ab, X_val)
validate(y_val, y_hat_ab)

AdaBoost Best Parameters: {'base_estimator__max_depth': 50, 'base_estimator__splitter': 'random', 'learning_rate': 0.01, 'n_estimators': 25, 'random_state': 42}
Validation RMSE: 2.75e+05


In [36]:
model_ab_ = bestfit_ab(X_housing_, y_housing_, ps_)
y_hat_ab_ = predict(model_ab_, X_val_)
validate(y_val_, y_hat_ab_)

AdaBoost Best Parameters: {'base_estimator__max_depth': 25, 'base_estimator__splitter': 'random', 'learning_rate': 1, 'n_estimators': 25, 'random_state': 42}
Validation RMSE: 3.91e+05


# Extra Trees Regressor

In [37]:
def train_et():
    pass

def bestfit_et(X_feature, y_label, train_test_split):
    estimator = ExtraTreesRegressor()
    params = {'n_estimators': [25, 50, 100],
              'max_depth': [5, 10, 25, 50],
              'min_samples_split': [2],
              'min_samples_leaf': [1],
              'criterion': ["squared_error", "friedman_mse"],
              'max_features': [1],
              'random_state': [RANDOM_CONTROL]}
    model_et = GridSearchCV(estimator=estimator,
                         param_grid=params,
                         cv=ps)
    model_et.fit(X_feature, y_label)
    print('Extra Trees Best Parameters: {}'.format(model_et.best_params_))
    return model_et

In [38]:
model_et = bestfit_et(X_housing, y_housing, ps)
y_hat_et = predict(model_et, X_val)
validate(y_val, y_hat_et)

Extra Trees Best Parameters: {'criterion': 'squared_error', 'max_depth': 25, 'max_features': 1, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 25, 'random_state': 42}
Validation RMSE: 4.71e+05


In [39]:
model_et_ = bestfit_et(X_housing_, y_housing_, ps_)
y_hat_et_ = predict(model_et_, X_val_)
validate(y_val_, y_hat_et_)

Extra Trees Best Parameters: {'criterion': 'friedman_mse', 'max_depth': 50, 'max_features': 1, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 25, 'random_state': 42}
Validation RMSE: 2.14e+05


# Bagging Regressor

In [40]:
def train_br():
    pass

def bestfit_br(X_feature, y_label, train_test_split):
    base_estimator = DecisionTreeRegressor()
    estimator = BaggingRegressor(base_estimator=base_estimator)
    params = {'n_estimators': [25, 50, 100],
              'max_features': [1],
              'random_state': [RANDOM_CONTROL]}
    model_br = GridSearchCV(estimator=estimator,
                         param_grid=params,
                         cv=ps)
    model_br.fit(X_feature, y_label)
    print('Bagging Best Parameters: {}'.format(model_br.best_params_))
    return model_br

In [41]:
model_br = bestfit_br(X_housing, y_housing, ps)
y_hat_br = predict(model_br, X_val)
validate(y_val, y_hat_br)

Bagging Best Parameters: {'max_features': 1, 'n_estimators': 100, 'random_state': 42}
Validation RMSE: 4.58e+06


In [42]:
model_br_ = bestfit_br(X_housing_, y_housing_, ps_)
y_hat_br_ = predict(model_br_, X_val_)
validate(y_val_, y_hat_br_)

Bagging Best Parameters: {'max_features': 1, 'n_estimators': 25, 'random_state': 42}
Validation RMSE: 4.4e+06


# Neural Net

In [None]:
X_housing = X_housing.to_numpy()
y_housing = y_housing.to_numpy()
X_train = X_train.to_numpy()
y_train = y_train.to_numpy()
X_val = X_val.to_numpy()
y_val = y_val.to_numpy()

#print(X_train.shape)
#print(y_train.shape)
train_dataloader = DataLoader([ [X_train[i], y_train[i]] for i in range(len(X_train)) ], batch_size=NN_BATCH_SIZE, shuffle=True, num_workers=4)

In [None]:
# Define MLP architecture
class Model(nn.Module):
    
    def __init__(self, device='cpu'):
        super(Model, self).__init__()
        self.device = device
        
        # Modify accordingly
        self.fc1 = nn.Linear(1, 4)
        self.fc2 = nn.Linear(4, 16)
        self.fc3 = nn.Linear(16, 64)
        self.fc4 = nn.Linear(64, 4)
        self.fc5 = nn.Linear(4, 1)
        self.relu = nn.ReLU()
        
        self.dense = nn.Sequential(self.fc1, self.relu, self.fc2, self.relu, self.fc3, self.relu, 
                                   self.fc4, self.relu, self.fc5)
        
    def forward(self, x):
        pred = self.dense(x)
        return pred
    
# Train
device = 'cpu'
model = Model()
optimizer = optim.Adam(model.parameters(), NN_LEARNING_RATE)
criterion = nn.MSELoss()
model.to(device)
for epoch in range(NN_NUM_EPOCHS):
    running_loss = 0
    for idx, (x_features, y_labels) in enumerate(train_dataloader):
        optimizer.zero_grad()
        x_features = x_features.to(device, dtype=torch.float)
        y_labels = y_labels.to(device, dtype=torch.float)
        prediction = model(x_features)
        loss = torch.sqrt(criterion(prediction, y_labels)) # Standardize RMSE loss
        loss.backward()
        optimizer.step()
        running_loss += loss
        if (idx+1) %100 == 0: 
            running_loss = format(running_loss/100, '.4f')
            print(f"Epoch [{epoch+1} Batches processed | {idx}] Loss: {running_loss}")
            running_loss = 0
print("Finished Training.")


In [None]:
# Validate
X_features = torch.from_numpy(X_val)
y_labels = torch.from_numpy(y_val)
X_features = X_features.to(device, dtype=torch.float)
y_labels = y_labels.to(device, dtype=torch.float)
prediction = model(X_features)
rmse_nn = torch.sqrt(criterion(prediction, y_labels))

print('Neural Net Validation rmse: {:.3}'.format(rmse_nn))

# Test




In [49]:
# Choose one model here for final predictions - Linear Regression (TODO: CHANGE THIS) 
# Train over entire training set
df_train = pd.read_csv('data/train.csv') 
df_train = pre_process(df_train, mode='train')
X_train = df_train.loc[:, df_train.columns != 'price']
y_train = df_train['price']
X_train = post_process(X_train)
model = train_rf(X_train, y_train)

# Predict labels for test set
df_test = pd.read_csv('data/test.csv') 
df_test = pre_process(df_test, mode='test')
df_test = post_process(df_test)
predictions = predict(model, df_test)
predictions = pd.DataFrame(predictions)
predictions.to_csv('predictions.csv', header=['Predicted'], index=True, index_label='Id')

In [None]:
# Choose one model here for final predictions - Linear Regression (TODO: CHANGE THIS) 
# Train over entire training set
df_train_ = pd.read_csv('data/train.csv')
df_train_ = improve_dataset(df_train_)
df_train_ = pre_process(df_train_, mode='train')
df_train_['subzone_pop'].fillna((df_train_['subzone_pop'].mean()), inplace=True)
df_train_['subzone_pop_density'].fillna((df_train_['subzone_pop_density'].mean()), inplace=True)
X_train_ = df_train_.loc[:, df_train_.columns != 'price']
y_train_ = df_train_['price']
X_train_ = post_process(X_train_)
#model_ = train_lr(X_train_, y_train_)

In [45]:
# Predict labels for test set
df_test_ = pd.read_csv('data/test.csv') 
df_test_ = improve_dataset(df_test_)
df_test_ = pre_process(df_test_, mode='test')
df_test_['subzone_pop'].fillna((df_test_['subzone_pop'].mean()), inplace=True)
df_test_['subzone_pop_density'].fillna((df_test_['subzone_pop_density'].mean()), inplace=True)
df_test = post_process(df_test_)
#predictions = predict(model_, df_test_)

In [48]:
model_ = train_rf(X_train_, y_train_)
predictions = predict(model_, df_test_)

In [49]:
predictions = pd.DataFrame(predictions)
predictions.to_csv("predictions.csv", header=["Predicted"], index=True, index_label="Id")

# Do not execute

(Add misc. code here that was not utilized)

In [None]:
# working with property details
import requests
import csv
from bs4 import BeautifulSoup

In [None]:
url = 'https://www.99.co/singapore/condos-apartments/meyerhouse'

In [None]:
page = requests.get(url)

In [None]:
print(page.status_code)
print(page.content)

In [None]:
#The website has anti-crawl software on it