In [6]:
# import packages
# !pip install category_encoders
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
import category_encoders as ce
import seaborn as sns

import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from geopy.distance import geodesic
import geopy
import networkx as nx


# load data from csv
commercial_center = pd.read_csv('./data/auxiliary-data/sg-commerical-centres.csv')
# hawker_center = pd.read_csv('./data/auxiliary-data/sg-gov-markets-hawker-centres.csv')
primary_school = pd.read_csv('./data/auxiliary-data/sg-primary-schools.csv')
secondary_school = pd.read_csv('./data/auxiliary-data/sg-secondary-schools.csv')
shopping_malls = pd.read_csv('./data/auxiliary-data/sg-shopping-malls.csv')
train_station = pd.read_csv('./data/auxiliary-data/sg-mrt-stations.csv')

In [7]:
cleaned_train_df = pd.read_csv('data/train.csv')
cleaned_test_df = pd.read_csv('data/test.csv')

In [8]:
# duplicate dataset
train_df = cleaned_train_df.copy()
test_df = cleaned_test_df.copy()

In [9]:
print(train_df.shape)
print(test_df.shape)

(20254, 21)
(6966, 20)


In [10]:
train_df.head()

Unnamed: 0,listing_id,title,address,property_name,property_type,tenure,built_year,num_beds,num_baths,size_sqft,...,furnishing,available_unit_types,total_num_units,property_details_url,lat,lng,elevation,subzone,planning_area,price
0,122881,hdb flat for sale in 866 yishun street 81,sembawang / yishun (d27),866 yishun street 81,hdb 4 rooms,,1988.0,3.0,2.0,1115,...,unspecified,,116.0,https://www.99.co/singapore/hdb/866-yishun-str...,1.414399,103.837196,0,yishun south,yishun,514500.0
1,259374,hdb flat for sale in 506b serangoon north aven...,hougang / punggol / sengkang (d19),hdb-serangoon estate,hdb,99-year leasehold,1992.0,4.0,2.0,1575,...,unspecified,"1, 2, 3, 4, 5, 6 br",,https://www.99.co/singapore/hdb/hdbserangoon-e...,1.372597,103.875625,0,serangoon north,serangoon,995400.0
2,665422,4 bed condo for sale in meyerhouse,128 meyer road,meyerhouse,condo,freehold,2022.0,4.0,6.0,3070,...,partial,"studio, 3, 4, 5, 6 br",56.0,https://www.99.co/singapore/condos-apartments/...,1.298773,103.895798,0,mountbatten,marine parade,8485000.0
3,857699,3 bed condo for sale in leedon green,26 leedon heights,leedon green,Condo,freehold,2023.0,3.0,2.0,958,...,partial,"studio, 1, 2, 3, 4 br",638.0,https://www.99.co/singapore/condos-apartments/...,1.312364,103.803271,0,farrer court,bukit timah,2626000.0
4,216061,2 bed condo for sale in one bernam,1 bernam street,one bernam,condo,99-year leasehold,2026.0,2.0,1.0,732,...,unspecified,"studio, 1, 2, 3, 4, 5 br",351.0,https://www.99.co/singapore/condos-apartments/...,1.273959,103.843635,0,anson,downtown core,1764000.0


In [11]:
print(train_df.dtypes)

listing_id                int64
title                    object
address                  object
property_name            object
property_type            object
tenure                   object
built_year              float64
num_beds                float64
num_baths               float64
size_sqft                 int64
floor_level              object
furnishing               object
available_unit_types     object
total_num_units         float64
property_details_url     object
lat                     float64
lng                     float64
elevation                 int64
subzone                  object
planning_area            object
price                   float64
dtype: object


In [12]:
# compute distance between locations
def compute_distance(lat1, lng1, lat2, lng2):
    c1 = (lat1, lng1)
    c2 = (lat2, lng2)
    return(geopy.distance.geodesic(c1, c2).km)

In [13]:
lat = cleaned_train_df.lat
lng = cleaned_train_df.lng

In [14]:
compute_distance(lat[0], lng[0], lat[1], lng[1])

6.297250231498214

In [15]:
def np_vec_impl(lat, lng, df, i):
    return np.vectorize(compute_distance)(lat, lng, df['lat'][i], df['lng'][i])

In [16]:
def compute_auxiliary(lat, lng):
    # initialization
    near_cc = np.zeros(len(lat))
#     near_hc = np.zeros(len(lat))
    near_ps = np.zeros(len(lat))
    near_ss = np.zeros(len(lat))
    near_sm = np.zeros(len(lat))
    near_ts = np.zeros(len(lat))
    
    dist_cc = np.zeros(len(lat))+1000
#     dist_hc = np.zeros(len(lat))+1000
    dist_ps = np.zeros(len(lat))+1000
    dist_ss = np.zeros(len(lat))+1000
    dist_sm = np.zeros(len(lat))+1000
    dist_ts = np.zeros(len(lat))+1000
    
#     for i in range(len(hawker_center)):
#         d = np_vec_impl(lat, lng, hawker_center, i)
#         within = d < 1.5
#         dist_hc[(d < dist_hc)] = d[(d < dist_hc)]
#         near_hc[within] += 1
    
    for i in range(len(commercial_center)):
        d = np_vec_impl(lat, lng, commercial_center, i)
        within = d < 3.0
        dist_cc[(d < dist_cc)] = d[(d < dist_cc)]
        near_cc[within] += 1
    
    for i in range(len(primary_school)):
        d = np_vec_impl(lat, lng, primary_school, i)
        within = d < 3.0
        dist_ps[(d < dist_ps)] = d[(d < dist_ps)]
        near_ps[within] += 1
    
    for i in range(len(secondary_school)):
        d = np_vec_impl(lat, lng, secondary_school, i)
        within = d < 3.0
        dist_ss[(d < dist_ss)] = d[(d < dist_ss)]
        near_ss[within] += 1
        
    for i in range(len(shopping_malls)):
        d = np_vec_impl(lat, lng, shopping_malls, i)
        within = d < 2.0
        dist_sm[(d < dist_sm)] = d[(d < dist_sm)]
        near_sm[within] += 1


    for i in range(len(train_station)):
        d = np_vec_impl(lat, lng, train_station, i)
        within = d < 1.5
        dist_ts[(d < dist_ts)] = d[(d < dist_ts)]
        near_ts[within] += 1
        
    d = {'num_comercial_3km': near_cc, 'num_primary_3km': near_ps, 'num_secondary_3km': near_ss, 'num_mall_2km': near_sm, 'num_mrt_1.5km': near_ts}

    num_of_amenties_test = pd.DataFrame(data=d)
    
    d = {'closest_comercial': dist_cc,'closest_primary': dist_ps, 'closest_secondary': dist_ss,'closest_mall': dist_sm, 'closest_mrt': dist_ts}

    closest_distance_test = pd.DataFrame(data=d)
    return num_of_amenties_test, closest_distance_test

In [17]:
near_cc = np.zeros(len(lat))
near_ps = np.zeros(len(lat))
near_ss = np.zeros(len(lat))
near_sm = np.zeros(len(lat))
near_ts = np.zeros(len(lat))

In [18]:
dist_cc = np.zeros(len(lat))+1000
dist_ps = np.zeros(len(lat))+1000
dist_ss = np.zeros(len(lat))+1000
dist_sm = np.zeros(len(lat))+1000
dist_ts = np.zeros(len(lat))+1000

In [21]:
num_of_amenties_test, closest_distance_test = compute_auxiliary(lat,lng)

In [24]:
import time
start = time.time()
for i in range(len(commercial_center)):
    d = np_vec_impl(lat, lng, commercial_center, i)
    within = d < 3.0
    dist_cc[(d < dist_cc)] = d[(d < dist_cc)]
    near_cc[within] += 1
print(f'cost time: {time.time() - start}')

0
cost time: 103.45691013336182


In [31]:
for i in range(len(primary_school)):
    d = np_vec_impl(lat, lng, primary_school, i)
    within = d < 3.0
    dist_ps[(d < dist_ps)] = d[(d < dist_ps)]
    near_ps[within] += 1
print(f'cost time: {time.time() - start}')

cost time: 680.4127485752106


In [32]:
for i in range(len(secondary_school)):
    d = np_vec_impl(lat, lng, secondary_school, i)
    within = d < 3.0
    dist_ss[(d < dist_ss)] = d[(d < dist_ss)]
    near_ss[within] += 1
print(f'cost time: {time.time() - start}')

cost time: 1082.7566471099854


In [None]:
for i in range(len(shopping_malls)):
    d = np_vec_impl(lat, lng, shopping_malls, i)
    within = d < 2.0
    dist_sm[(d < dist_sm)] = d[(d < dist_sm)]
    near_sm[within] += 1
    
    
for i in range(len(train_station)):
    d = np_vec_impl(lat, lng, train_station, i)
    within = d < 1.5
    dist_ts[(d < dist_ts)] = d[(d < dist_ts)]
    near_ts[within] += 1

In [None]:
# dataframe to store number of amenties

d = {'num_comercial_3km': near_cc, 'num_primary_3km': near_ps, 'num_secondary_3km': near_ss, 'num_mall_2km': near_sm, 'num_mrt_1.5km': near_ts}

num_of_amenties = pd.DataFrame(data=d)
num_of_amenties

In [None]:
d2 = {'closest_comercial': dist_cc,'closest_primary': dist_ps, 'closest_secondary': dist_ss,'closest_mall': dist_sm, 'closest_mrt': dist_ts}

closest_distance = pd.DataFrame(data=d2)
closest_distance

In [None]:
filepath = './data/auxiliary-number-amenties-test.csv'
num_of_amenties_test.to_csv(filepath, index=True)

filepath = './data/auxiliary-distance-amenties-test.csv'
closest_distance_test.to_csv(filepath, index=True)