In [1]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
import seaborn as sns

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV


from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score


## Load and clean rent 2017 data

In [2]:
#load and clean data
final17 = pd.read_csv("Data/final17_dataset.csv")

nationality_col = ['P_Afghanistan_Tot','P_Australia_Tot','P_Bangladesh_Tot','P_Bosnia_Herzegov_Tot','P_Cambodia_Tot','P_Canada_Tot','P_Chile_Tot','P_China_Tot','P_Croatia_Tot','P_Egypt_Tot','P_England_Tot','P_Fiji_Tot','P_France_Tot','P_Germany_Tot','P_Greece_Tot','P_Hong_Kong_Tot',
 'P_India_Tot','P_Indonesia_Tot','P_Iran_Tot','P_Iraq_Tot','P_Ireland_Tot','P_Italy_Tot','P_Japan_Tot','P_Korea_South_Tot','P_Lebanon_Tot','P_Malaysia_Tot','P_Malta_Tot','P_Mauritius_Tot','P_Myanmar_Tot','P_Nepal_Tot','P_Netherlands_Tot','P_New_Zealand_Tot',
 'P_Nthern_Ireland_Tot','P_Pakistan_Tot','P_PNG_Tot','P_Philippines_Tot','P_Poland_Tot','P_Scotland_Tot','P_Singapore_Tot','P_South_Africa_Tot','P_SE_Europe_nfd_Tot','P_Sri_Lanka_Tot','P_Taiwan_Tot','P_Thailand_Tot','P_FYROM_Tot',
 'P_Turkey_Tot','P_USA_Tot','P_Vietnam_Tot','P_Wales_Tot','P_Zimbabwe_Tot','P_Elsewhere_Tot']

col_rename = {'Age_below_15_yr_perc':'pct_age_14below', 'Age_15_64_yr_perc' : 'pct_age_15_64',
              'Age_over_65_yr_perc' : 'pct_age_65above', 'BP_Aus_Perc' : 'pct_birthplace_au',
              'BP_Non_Aus_Perc' : 'pct_birthplace_non_au', 'SE_Yr10_above_Perc' : 'pct_educ_yr10above',
              'SE_Yr9_below_Perc' : 'pct_educ_yr9below', 'MS_Married_Perc' : 'pct_married',
              'MS_Single_Perc' : 'pct_single', 'Percent_Unem_loyment_P' : 'unemployment_rate',
              'Percnt_LabForc_prticipation_P' : 'lfpr', 'Percnt_Employment_to_populn_P' : 'employment_rate', 
              'Median_age_persons' : 'median_age', 'Median_tot_prsnl_inc_weekly' : 'median_weekly_pincome',
              'Median_tot_fam_inc_weekly' : 'median_weekly_fincome',
              'Median_tot_hhd_inc_weekly' : 'median_weekly_hincome', 'Average_household_size' : 'ave_hhsize'}

final17_c = final17.fillna(0)
final17_c = final17_c.drop(nationality_col, axis=1)
final17_c = final17_c.rename(columns = col_rename)

final17_c['ntl17_mean_per_land_area'] = final17_c.ntl17_sum / final17_c.land_area
final17_c['bstop_per_land_area'] = final17_c.bus_stop / final17_c.land_area
final17_c['tstation_per_land_area'] = final17_c.train_station / final17_c.land_area
final17_c['transpo_per_land_area'] = final17_c.all_transpo / final17_c.land_area

final17_c = final17_c[final17_c.r_weekly_rent_median > 0]
print("shape of final df: ", final17_c.shape)

shape of final df:  (121, 124)


In [3]:
sales17_col = ['s_mean_nonstrata','s_mean_strata','s_median_nonstrata','s_median_strata','s_mean_sales','s_median_sales']
sales17=final17_c[sales17_col]

rent17_col = ['r_one_br','r_two_br','r_three_br','r_four_or_more_br',
 'r_bedsitter','r_not_specified','r_flat','r_house','r_other','r_townhouse','r_weekly_rent_median']

rent17 = final17_c[rent17_col]

In [4]:
print("shape of rent df: ",rent17.shape)

shape of rent df:  (121, 11)


In [5]:
rent17.head()

Unnamed: 0,r_one_br,r_two_br,r_three_br,r_four_or_more_br,r_bedsitter,r_not_specified,r_flat,r_house,r_other,r_townhouse,r_weekly_rent_median
0,162.5,220.0,290.0,375.0,0.0,280.0,200.0,310.0,277.5,277.5,270.0
1,170.0,240.0,322.5,410.0,0.0,250.0,230.0,330.0,0.0,0.0,280.0
2,240.0,360.0,470.0,580.0,0.0,460.0,350.0,510.0,415.0,455.0,450.0
4,190.0,270.0,320.0,415.0,0.0,310.0,260.0,340.0,310.0,280.0,310.0
5,205.0,270.0,350.0,400.0,0.0,0.0,265.0,355.0,350.0,330.0,320.0


In [6]:
lga_id = ['LGA_CODE', 'LGA_NAME']

X_ntl = ['land_area', 'ntl17_mean', 'ntl17_mean_per_land_area']
X_transpo_bus = ['bus_stop', 'bstop_per_land_area']
X_transpo_train = ['train_station', 'tstation_per_land_area']
X_transpo_all = ['all_transpo', 'transpo_per_land_area']
X_census_age = ['pct_age_14below', 'pct_age_15_64', 'pct_age_65above']
X_census_origin = ['pct_birthplace_au', 'pct_birthplace_non_au']
X_census_educ = ['pct_educ_yr10above', 'pct_educ_yr9below']
X_census_marital_stat = ['pct_married', 'pct_single']
X_census_employment = ['unemployment_rate', 'lfpr', 'employment_rate']
X_census_educ_qual = ['pct_pgrad', 'pct_grad_cert_dip', 'pct_bach_deg', 'pct_adv_dip', 'pct_cert_iii_iv',
                      'pct_cert_i_ii', 'pct_others']
X_census_stats = ['median_age', 'median_weekly_pincome', 'median_weekly_fincome', 'median_weekly_hincome',
                  'ave_hhsize']

X_transpo_bus_col= ['bstop_per_land_area']
X_transpo_train_col= ['tstation_per_land_area']
X_transpo_all_col = ['transpo_per_land_area']
X_ntl_col = ['ntl17_mean_per_land_area']
X_census_col = X_census_age+X_census_origin+X_census_origin+X_census_educ+X_census_marital_stat+X_census_employment+X_census_educ_qual+X_census_stats

X_all_col = X_ntl_col+X_transpo_all_col+X_census_age+X_census_origin+X_census_educ+X_census_marital_stat+X_census_employment+X_census_educ_qual+X_census_stats
X_all_excl_train_col = X_ntl_col+X_transpo_bus_col+X_census_age+X_census_origin+X_census_educ+X_census_marital_stat+X_census_employment+X_census_educ_qual+X_census_stats


# Predict house rent median

## Using all variables

#### Choose and split data

In [7]:
X = final17_c[X_all_col]
y = rent17['r_weekly_rent_median']

In [8]:
print("shape of X: ", X.shape)
print("shape of y: ", y.shape)

shape of X:  (121, 26)
shape of y:  (121,)


In [9]:
# 75% split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)


print("shape of X_train: ", X_train.shape)
print("shape of y_train: ", y_train.shape)
print("shape of X_test: ", X_test.shape)
print("shape of y_test: ", y_test.shape)


shape of X_train:  (90, 26)
shape of y_train:  (90,)
shape of X_test:  (31, 26)
shape of y_test:  (31,)


#### Use cross validation to find best max depth

In [10]:
# best depth
dtr = DecisionTreeRegressor(random_state = 42)
param_grid = {'max_depth':np.arange(1,50)}
dtr_grid_cv = GridSearchCV(dtr, param_grid, cv = 10, scoring = 'r2')
dtr_grid_cv.fit(X_train, y_train)
print("best max depth para is: ", dtr_grid_cv.best_params_)
print("R2 = ", dtr_grid_cv.best_score_)

best max depth para is:  {'max_depth': 8}
R2 =  0.7441506814829111


**-> Based on cross validataion result, best max_depth parameter for the regression is 8**

#### Scoring

In [11]:
dtr_d10 = DecisionTreeRegressor(max_depth=8, random_state=42)

dtr_d10.fit(X_train,y_train)

y_pre_d10_train = dtr_d10.predict(X_train)
y_pre_d10_test = dtr_d10.predict(X_test)

rmse_train = np.sqrt(mean_squared_error(y_train,y_pre_d10_train))
rsq_train = r2_score(y_train,y_pre_d10_train)
#adj_rsq_train = 1 - (1 - rsq_train) * (len(y_train) - 1) / (len(y_train) - X_train.shape[1] - 1)

rmse_test = np.sqrt(mean_squared_error(y_test,y_pre_d10_test))
rsq_test = r2_score(y_test,y_pre_d10_test)
#adj_rsq_test = 1 - (1 - rsq_test) * (len(y_test) - 1) / (len(y_test) - X_test.shape[1] - 1)


print("rmse train data of max_depth=10: ", rmse_train)
print("r2 train data of max_depth=10: ", rsq_train)
#print("adjusted r2 train data of max_depth = 10: ", adj_rsq_train)

print("rmse test data of max_depth=10: ", rmse_test)
print("r2 test data of max_depth=10: ", rsq_test)
#print("adjusted r2 test data of max_depth = 10: ", adj_rsq_test)


rmse train data of max_depth=10:  2.3186542233389194
r2 train data of max_depth=10:  0.9997344545638964
rmse test data of max_depth=10:  55.73454589152734
r2 test data of max_depth=10:  0.8477305866528717


## Defining functions for using other variables

Other grouped variables will be tested using same process above with bellowed functions

In [12]:
def load_data(X,y):
    #print("shape of X: ", X.shape)
    #print("shape of y: ", y.shape)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)
    #print("shape of X_train: ", X_train.shape)
    #print("shape of y_train: ", y_train.shape)
    #print("shape of X_test: ", X_test.shape)
    #print("shape of y_test: ", y_test.shape)
    
    return X_train, y_train, X_test, y_test

def dtr_best_depth(X_train, y_train):

    # best depth
    dtr = DecisionTreeRegressor(random_state = 42)
    param_grid = {'max_depth':np.arange(1,50)}
    dtr_grid_cv = GridSearchCV(dtr, param_grid, cv = 10, scoring = 'r2')
    dtr_grid_cv.fit(X_train, y_train)
    best_depth = dtr_grid_cv.best_params_
    best_score = dtr_grid_cv.best_score_
    
    #print("best max depth para is: ", dtr_grid_cv.best_params_)
    #print("R2 from CV = ", dtr_grid_cv.best_score_)

    return best_depth, best_score

def scoring(max_depth, X_train, y_train, X_test, y_test):
    dtr = DecisionTreeRegressor(max_depth = max_depth, random_state=42)
    dtr.fit(X_train, y_train)
    y_pre_train = dtr.predict(X_train)
    y_pre_test = dtr.predict(X_test)
    
    #print("rmse of {}: ".format(max_depth), np.sqrt(mean_squared_error(y_test,y_pre)))
    #print("r2 of {}: ".format(max_depth), r2_score(y_test,y_pre))
    
    rmse_train = np.sqrt(mean_squared_error(y_train,y_pre_train))
    rsq_train = r2_score(y_train,y_pre_train)
    adj_rsq_train = 1 - (1 - rsq_train) * (len(y_train) - 1) / (len(y_train) - X_train.shape[1] - 1)
    
    rmse_test = np.sqrt(mean_squared_error(y_test,y_pre_test))
    rsq_test = r2_score(y_test,y_pre_test)
    adj_rsq_test = 1 - (1 - rsq_test) * (len(y_test) - 1) / (len(y_test) - X_test.shape[1] - 1)
    
    return rmse_train, rsq_train, adj_rsq_train, rmse_test, rsq_test, adj_rsq_test
    
    
    
def dtr_assess(X, y):
    
    data = load_data(X, y)
    best_depth = dtr_best_depth(data[0], data[1])
    scores = scoring(best_depth[0]['max_depth'],data[0], data[1], data[2], data[3])
    #scores = scoring(1,data[0], data[1], data[2], data[3])
    
    print("shape of X: ", X.shape)
    print("shape of y: ", y.shape)
    
    print("shape of X_train: ", data[0].shape)
    print("shape of y_train: ", data[1].shape)
    print("shape of X_test: ", data[2].shape)
    print("shape of y_test: ", data[3].shape)               

    print("best max depth para is: ", best_depth[0])
    print("R2 from CV: ", best_depth[1])
    
    print("rmse train data of max_depth = {}: ".format(best_depth[0]['max_depth']), scores[0])
    print("r2 train data of max_depth = {}: ".format(best_depth[0]['max_depth']), scores[1]) 
    #print("adjusted r2 train data of max_depth = {}: ".format(best_depth[0]['max_depth']), scores[2])
    
    print("rmse test data of max_depth = {}: ".format(best_depth[0]['max_depth']), scores[3])
    print("r2 test data of max_depth = {}: ".format(best_depth[0]['max_depth']), scores[4])      
    #print("adjusted r2 test data of max_depth = {}: ".format(best_depth[0]['max_depth']), scores[5])            
                                  

## Using census variables

In [13]:
X = final17_c[X_census_col]
y = rent17['r_weekly_rent_median']

dtr_assess(X, y)

shape of X:  (121, 26)
shape of y:  (121,)
shape of X_train:  (90, 26)
shape of y_train:  (90,)
shape of X_test:  (31, 26)
shape of y_test:  (31,)
best max depth para is:  {'max_depth': 8}
R2 from CV:  0.7771055464913436
rmse train data of max_depth = 8:  3.6839419880650364
r2 train data of max_depth = 8:  0.9993296641736006
rmse test data of max_depth = 8:  84.22007910717586
r2 test data of max_depth = 8:  0.6523077338669976


## Using transportation variables

### All transportations

In [14]:
X = final17_c[X_transpo_all_col]
y = rent17['r_weekly_rent_median']

dtr_assess(X, y)

shape of X:  (121, 1)
shape of y:  (121,)
shape of X_train:  (90, 1)
shape of y_train:  (90,)
shape of X_test:  (31, 1)
shape of y_test:  (31,)
best max depth para is:  {'max_depth': 5}
R2 from CV:  0.7434863940431771
rmse train data of max_depth = 5:  32.41154177070065
r2 train data of max_depth = 5:  0.9481120818346076
rmse test data of max_depth = 5:  72.24275459367833
r2 test data of max_depth = 5:  0.7441694910411718


### Bus stops only

In [15]:
X = final17_c[X_transpo_bus_col]
y = rent17['r_weekly_rent_median']

dtr_assess(X, y)

shape of X:  (121, 1)
shape of y:  (121,)
shape of X_train:  (90, 1)
shape of y_train:  (90,)
shape of X_test:  (31, 1)
shape of y_test:  (31,)
best max depth para is:  {'max_depth': 5}
R2 from CV:  0.7336154297609776
rmse train data of max_depth = 5:  32.422787442946344
r2 train data of max_depth = 5:  0.9480760689997415
rmse test data of max_depth = 5:  72.2856130764312
r2 test data of max_depth = 5:  0.7438658547831098


### Train stations only

In [16]:
X = final17_c[X_transpo_train_col]
y = rent17['r_weekly_rent_median']

dtr_assess(X, y)

shape of X:  (121, 1)
shape of y:  (121,)
shape of X_train:  (90, 1)
shape of y_train:  (90,)
shape of X_test:  (31, 1)
shape of y_test:  (31,)
best max depth para is:  {'max_depth': 2}
R2 from CV:  0.3574138733604668
rmse train data of max_depth = 2:  98.83129427416294
r2 train data of max_depth = 2:  0.5175460888675221
rmse test data of max_depth = 2:  111.03574345483992
r2 test data of max_depth = 2:  0.39564885376009384


**-> Results above shows that inclunding train staions counts does not generate more accurate prediction**

## Using nighttime lights variables

In [17]:
X = final17_c[X_ntl_col]
y = rent17['r_weekly_rent_median']

dtr_assess(X, y)

shape of X:  (121, 1)
shape of y:  (121,)
shape of X_train:  (90, 1)
shape of y_train:  (90,)
shape of X_test:  (31, 1)
shape of y_test:  (31,)
best max depth para is:  {'max_depth': 2}
R2 from CV:  0.4822577313007422
rmse train data of max_depth = 2:  69.53474793752518
r2 train data of max_depth = 2:  0.7611800323573423
rmse test data of max_depth = 2:  74.71430899314855
r2 test data of max_depth = 2:  0.7263652125036986


## Using bus stops and nighttime lights variables

In [18]:
X = final17_c[X_transpo_bus + X_ntl_col]
y = rent17['r_weekly_rent_median']

dtr_assess(X, y)

shape of X:  (121, 3)
shape of y:  (121,)
shape of X_train:  (90, 3)
shape of y_train:  (90,)
shape of X_test:  (31, 3)
shape of y_test:  (31,)
best max depth para is:  {'max_depth': 3}
R2 from CV:  0.6479930039258802
rmse train data of max_depth = 3:  46.0683757664991
r2 train data of max_depth = 3:  0.8951731182944531
rmse test data of max_depth = 3:  71.74371491703849
r2 test data of max_depth = 3:  0.7476917436517823


## Using all variables excluding train station counts

In [19]:
X = final17_c[X_all_excl_train_col]
y = rent17['r_weekly_rent_median']

dtr_assess(X, y)

shape of X:  (121, 26)
shape of y:  (121,)
shape of X_train:  (90, 26)
shape of y_train:  (90,)
shape of X_test:  (31, 26)
shape of y_test:  (31,)
best max depth para is:  {'max_depth': 8}
R2 from CV:  0.7564160418234713
rmse train data of max_depth = 8:  2.723780733003573
r2 train data of max_depth = 8:  0.9996335530149788
rmse test data of max_depth = 8:  59.7081180728323
r2 test data of max_depth = 8:  0.8252446394234482


----
# Predict house rent median by dwelling type

In [20]:
df_dw =  final17_c.melt(id_vars = ['LGA_CODE'], value_vars = ['r_flat', 'r_house', 'r_other', 'r_townhouse'])
df_dw = df_dw[df_dw.value > 0]

df17_by_dw = final17_c.drop(columns = ['r_flat', 'r_house', 'r_other', 'r_townhouse'])
df17_by_dw = df17_by_dw.merge(df_dw, on = 'LGA_CODE', how = 'inner')
df17_by_dw = df17_by_dw.rename(columns = {'variable' : 'dwell_type', 'value' : 'weekly_rent_median'})

dum_df17_by_dw = pd.get_dummies(df17_by_dw.dwell_type).drop(columns = ['r_other'])
df17_by_dw = df17_by_dw.merge(dum_df17_by_dw, how = 'inner', left_index = True, right_index = True)

X_dw_col = ['r_flat', 'r_house', 'r_townhouse']

In [21]:
print('Shape of the df17_by_dw dataset: ', df17_by_dw.shape)
print('The number of LGAs represented in the dataset is: ', len(df17_by_dw.LGA_CODE.unique()))

df17_by_dw.head()


Shape of the df17_by_dw dataset:  (346, 125)
The number of LGAs represented in the dataset is:  116


Unnamed: 0,LGA_CODE,LGA_NAME,s_mean_nonstrata,s_mean_strata,s_median_nonstrata,s_median_strata,s_mean_sales,s_median_sales,r_one_br,r_two_br,...,ntl19_sum,ntl17_mean_per_land_area,bstop_per_land_area,tstation_per_land_area,transpo_per_land_area,dwell_type,weekly_rent_median,r_flat,r_house,r_townhouse
0,LGA10050,Albury,350.72904,197.23513,320.0,206.0,320.42552,290.0,162.5,220.0,...,4417.75,13.711019,0.29417,0.003269,0.297438,r_flat,200.0,1,0,0
1,LGA10050,Albury,350.72904,197.23513,320.0,206.0,320.42552,290.0,162.5,220.0,...,4417.75,13.711019,0.29417,0.003269,0.297438,r_house,310.0,0,1,0
2,LGA10050,Albury,350.72904,197.23513,320.0,206.0,320.42552,290.0,162.5,220.0,...,4417.75,13.711019,0.29417,0.003269,0.297438,r_other,277.5,0,0,0
3,LGA10050,Albury,350.72904,197.23513,320.0,206.0,320.42552,290.0,162.5,220.0,...,4417.75,13.711019,0.29417,0.003269,0.297438,r_townhouse,277.5,0,0,1
4,LGA10130,Armidale Regional,372.99474,0.0,352.5,0.0,361.56796,345.0,170.0,240.0,...,10910.72168,1.139977,0.001972,0.000116,0.002436,r_flat,230.0,1,0,0


## Using all variables

In [22]:
X = df17_by_dw[X_all_col + X_dw_col]
y = df17_by_dw['r_weekly_rent_median']

dtr_assess(X, y)

shape of X:  (346, 29)
shape of y:  (346,)
shape of X_train:  (259, 29)
shape of y_train:  (259,)
shape of X_test:  (87, 29)
shape of y_test:  (87,)
best max depth para is:  {'max_depth': 10}
R2 from CV:  0.9842876041789863
rmse train data of max_depth = 10:  3.9265796716771093
r2 train data of max_depth = 10:  0.999179263436791
rmse test data of max_depth = 10:  20.254714727190343
r2 test data of max_depth = 10:  0.9828455413255597




## Using census variables

In [23]:
X = df17_by_dw[X_census_col + X_dw_col]
y = df17_by_dw['r_weekly_rent_median']

dtr_assess(X, y)

shape of X:  (346, 29)
shape of y:  (346,)
shape of X_train:  (259, 29)
shape of y_train:  (259,)
shape of X_test:  (87, 29)
shape of y_test:  (87,)
best max depth para is:  {'max_depth': 12}
R2 from CV:  0.9811274042024123
rmse train data of max_depth = 12:  0.0
r2 train data of max_depth = 12:  1.0
rmse test data of max_depth = 12:  20.07171052049104
r2 test data of max_depth = 12:  0.9831541268535929




## Using transportation variables

### All transportations

In [24]:
X = df17_by_dw[X_transpo_all_col + X_dw_col]
y = df17_by_dw['r_weekly_rent_median']

dtr_assess(X, y)

shape of X:  (346, 4)
shape of y:  (346,)
shape of X_train:  (259, 4)
shape of y_train:  (259,)
shape of X_test:  (87, 4)
shape of y_test:  (87,)
best max depth para is:  {'max_depth': 12}
R2 from CV:  0.9773216370954231
rmse train data of max_depth = 12:  0.0
r2 train data of max_depth = 12:  1.0
rmse test data of max_depth = 12:  9.29250099341634
r2 test data of max_depth = 12:  0.9963893117828136




### Bus stops only

In [25]:
X = df17_by_dw[X_transpo_bus_col  + X_dw_col]
y = df17_by_dw['r_weekly_rent_median']

dtr_assess(X, y)

shape of X:  (346, 4)
shape of y:  (346,)
shape of X_train:  (259, 4)
shape of y_train:  (259,)
shape of X_test:  (87, 4)
shape of y_test:  (87,)
best max depth para is:  {'max_depth': 12}
R2 from CV:  0.9694377900744011
rmse train data of max_depth = 12:  5.968061713635392
r2 train data of max_depth = 12:  0.9981039858927456
rmse test data of max_depth = 12:  15.015689963601874
r2 test data of max_depth = 12:  0.9905720918773466




### Train stations only

In [26]:
X = df17_by_dw[X_transpo_train_col  + X_dw_col]
y = df17_by_dw['r_weekly_rent_median']

dtr_assess(X, y)

shape of X:  (346, 4)
shape of y:  (346,)
shape of X_train:  (259, 4)
shape of y_train:  (259,)
shape of X_test:  (87, 4)
shape of y_test:  (87,)
best max depth para is:  {'max_depth': 8}
R2 from CV:  0.5880482437240938
rmse train data of max_depth = 8:  75.87639383071607
r2 train data of max_depth = 8:  0.693529753465352
rmse test data of max_depth = 8:  109.65020382199384
r2 test data of max_depth = 8:  0.49725976601446475




**-> Results above also shows that inclunding train staions counts does not generate more accurate prediction**

## Using nighttime lights variables

In [27]:
X = df17_by_dw[X_ntl_col + X_dw_col]
y = df17_by_dw['r_weekly_rent_median']

dtr_assess(X, y)

shape of X:  (346, 4)
shape of y:  (346,)
shape of X_train:  (259, 4)
shape of y_train:  (259,)
shape of X_test:  (87, 4)
shape of y_test:  (87,)
best max depth para is:  {'max_depth': 16}
R2 from CV:  0.8645422596559246
rmse train data of max_depth = 16:  0.0
r2 train data of max_depth = 16:  1.0
rmse test data of max_depth = 16:  22.883073601928224
r2 test data of max_depth = 16:  0.9781045703701567




## Using bus stops and nighttime lights variables

In [28]:
X = df17_by_dw[X_transpo_bus + X_ntl_col + X_dw_col]
y = df17_by_dw['r_weekly_rent_median']

dtr_assess(X, y)

shape of X:  (346, 6)
shape of y:  (346,)
shape of X_train:  (259, 6)
shape of y_train:  (259,)
shape of X_test:  (87, 6)
shape of y_test:  (87,)
best max depth para is:  {'max_depth': 11}
R2 from CV:  0.9807880083472734
rmse train data of max_depth = 11:  0.9423539080573116
r2 train data of max_depth = 11:  0.999952728103559
rmse test data of max_depth = 11:  17.583380999009318
r2 test data of max_depth = 11:  0.9870720539982784




## Using all variables excluding train station counts

In [29]:
X = df17_by_dw[X_all_excl_train_col + X_dw_col]
y = df17_by_dw['r_weekly_rent_median']

dtr_assess(X, y)

shape of X:  (346, 29)
shape of y:  (346,)
shape of X_train:  (259, 29)
shape of y_train:  (259,)
shape of X_test:  (87, 29)
shape of y_test:  (87,)
best max depth para is:  {'max_depth': 12}
R2 from CV:  0.9811975991534801
rmse train data of max_depth = 12:  0.0
r2 train data of max_depth = 12:  1.0
rmse test data of max_depth = 12:  17.469184691710815
r2 test data of max_depth = 12:  0.9872394313256175




----
# Predict house rent median by bedroom

In [30]:
df_br =  final17_c.melt(id_vars = ['LGA_CODE'], value_vars = ['r_one_br', 'r_two_br', 'r_three_br',
                                                             'r_four_or_more_br', 'r_bedsitter'])
df_br = df_br[df_br.value > 0]

df17_by_br = final17_c.drop(columns = ['r_one_br', 'r_two_br', 'r_three_br',
                                        'r_four_or_more_br', 'r_bedsitter'])
df17_by_br = df17_by_br.merge(df_br, on = 'LGA_CODE', how = 'inner')
df17_by_br = df17_by_br.rename(columns = {'variable' : 'bedroom', 'value' : 'weekly_rent_median'})

dum_df17_by_br = pd.get_dummies(df17_by_br.bedroom).drop(columns = ['r_bedsitter'])
df17_by_br = df17_by_br.merge(dum_df17_by_br, how = 'inner', left_index = True, right_index = True)

X_br_col = ['r_one_br', 'r_two_br', 'r_three_br','r_four_or_more_br']


In [31]:
print('Shape of the df17_by_dw dataset: ', df17_by_br.shape)
print('The number of LGAs represented in the dataset is: ', len(df17_by_br.LGA_CODE.unique()))

df17_by_br.head()


Shape of the df17_by_dw dataset:  (375, 125)
The number of LGAs represented in the dataset is:  110


Unnamed: 0,LGA_CODE,LGA_NAME,s_mean_nonstrata,s_mean_strata,s_median_nonstrata,s_median_strata,s_mean_sales,s_median_sales,r_not_specified,r_flat,...,ntl17_mean_per_land_area,bstop_per_land_area,tstation_per_land_area,transpo_per_land_area,bedroom,weekly_rent_median,r_four_or_more_br,r_one_br,r_three_br,r_two_br
0,LGA10050,Albury,350.72904,197.23513,320.0,206.0,320.42552,290.0,280.0,200.0,...,13.711019,0.29417,0.003269,0.297438,r_one_br,162.5,0,1,0,0
1,LGA10050,Albury,350.72904,197.23513,320.0,206.0,320.42552,290.0,280.0,200.0,...,13.711019,0.29417,0.003269,0.297438,r_two_br,220.0,0,0,0,1
2,LGA10050,Albury,350.72904,197.23513,320.0,206.0,320.42552,290.0,280.0,200.0,...,13.711019,0.29417,0.003269,0.297438,r_three_br,290.0,0,0,1,0
3,LGA10050,Albury,350.72904,197.23513,320.0,206.0,320.42552,290.0,280.0,200.0,...,13.711019,0.29417,0.003269,0.297438,r_four_or_more_br,375.0,1,0,0,0
4,LGA10130,Armidale Regional,372.99474,0.0,352.5,0.0,361.56796,345.0,250.0,230.0,...,1.139977,0.001972,0.000116,0.002436,r_one_br,170.0,0,1,0,0


## Using all variables

In [32]:
X = df17_by_br[X_all_col + X_br_col]
y = df17_by_br['r_weekly_rent_median']

dtr_assess(X, y)

shape of X:  (375, 30)
shape of y:  (375,)
shape of X_train:  (281, 30)
shape of y_train:  (281,)
shape of X_test:  (94, 30)
shape of y_test:  (94,)
best max depth para is:  {'max_depth': 10}
R2 from CV:  0.980213074206678
rmse train data of max_depth = 10:  0.0
r2 train data of max_depth = 10:  1.0
rmse test data of max_depth = 10:  26.067628902448046
r2 test data of max_depth = 10:  0.9678407293703832




## Using census variables

In [33]:
X = df17_by_br[X_census_col + X_br_col]
y = df17_by_br['r_weekly_rent_median']

dtr_assess(X, y)

shape of X:  (375, 30)
shape of y:  (375,)
shape of X_train:  (281, 30)
shape of y_train:  (281,)
shape of X_test:  (94, 30)
shape of y_test:  (94,)
best max depth para is:  {'max_depth': 11}
R2 from CV:  0.9775584563460243
rmse train data of max_depth = 11:  0.2790107078392655
r2 train data of max_depth = 11:  0.9999963297691916
rmse test data of max_depth = 11:  24.507851949545703
r2 test data of max_depth = 11:  0.9715741390608613




## Using transportation variables

### All transportations

In [34]:
X = df17_by_br[X_transpo_all_col + X_br_col]
y = df17_by_br['r_weekly_rent_median']

dtr_assess(X, y)

shape of X:  (375, 5)
shape of y:  (375,)
shape of X_train:  (281, 5)
shape of y_train:  (281,)
shape of X_test:  (94, 5)
shape of y_test:  (94,)
best max depth para is:  {'max_depth': 11}
R2 from CV:  0.9670337255757643
rmse train data of max_depth = 11:  3.890233953090368
r2 train data of max_depth = 11:  0.9992864850542684
rmse test data of max_depth = 11:  9.467460939664543
r2 test data of max_depth = 11:  0.9957580047791992




### Bus stops only

In [35]:
X = df17_by_br[X_transpo_bus_col  + X_br_col]
y = df17_by_br['r_weekly_rent_median']

dtr_assess(X, y)

shape of X:  (375, 5)
shape of y:  (375,)
shape of X_train:  (281, 5)
shape of y_train:  (281,)
shape of X_test:  (94, 5)
shape of y_test:  (94,)
best max depth para is:  {'max_depth': 11}
R2 from CV:  0.9439695828963974
rmse train data of max_depth = 11:  5.883044201324099
r2 train data of max_depth = 11:  0.9983682415985144
rmse test data of max_depth = 11:  10.58010500930774
r2 test data of max_depth = 11:  0.9947023521366462




### Train stations only

In [36]:
X = df17_by_br[X_transpo_train_col  + X_br_col]
y = df17_by_br['r_weekly_rent_median']

dtr_assess(X, y)

shape of X:  (375, 5)
shape of y:  (375,)
shape of X_train:  (281, 5)
shape of y_train:  (281,)
shape of X_test:  (94, 5)
shape of y_test:  (94,)
best max depth para is:  {'max_depth': 13}
R2 from CV:  0.6267358661820231
rmse train data of max_depth = 13:  75.28636600673447
r2 train data of max_depth = 13:  0.7327705584747988
rmse test data of max_depth = 13:  95.23583032424735
r2 test data of max_depth = 13:  0.570756353070359




**-> Results above also shows that inclunding train staions counts does not generate more accurate prediction**

## Using nighttime lights variables

In [37]:
X = df17_by_br[X_ntl_col + X_br_col]
y = df17_by_br['r_weekly_rent_median']

dtr_assess(X, y)

shape of X:  (375, 5)
shape of y:  (375,)
shape of X_train:  (281, 5)
shape of y_train:  (281,)
shape of X_test:  (94, 5)
shape of y_test:  (94,)
best max depth para is:  {'max_depth': 10}
R2 from CV:  0.9559288579304089
rmse train data of max_depth = 10:  5.996489680927974
r2 train data of max_depth = 10:  0.998304702912305
rmse test data of max_depth = 10:  14.532256043747653
r2 test data of max_depth = 10:  0.9900053126541384




## Using bus stops and nighttime lights variables

In [38]:
X = df17_by_br[X_transpo_bus + X_ntl_col + X_br_col]
y = df17_by_br['r_weekly_rent_median']

dtr_assess(X, y)

shape of X:  (375, 7)
shape of y:  (375,)
shape of X_train:  (281, 7)
shape of y_train:  (281,)
shape of X_test:  (94, 7)
shape of y_test:  (94,)
best max depth para is:  {'max_depth': 11}
R2 from CV:  0.988645607534871
rmse train data of max_depth = 11:  0.6466447758949041
r2 train data of max_depth = 11:  0.999980285617372
rmse test data of max_depth = 11:  17.670296027574853
r2 test data of max_depth = 11:  0.9852228466126878




## Using all variables excluding train station counts

In [39]:
X = df17_by_br[X_all_excl_train_col + X_br_col]
y = df17_by_br['r_weekly_rent_median']

dtr_assess(X, y)

shape of X:  (375, 30)
shape of y:  (375,)
shape of X_train:  (281, 30)
shape of y_train:  (281,)
shape of X_test:  (94, 30)
shape of y_test:  (94,)
best max depth para is:  {'max_depth': 9}
R2 from CV:  0.9818587179181298
rmse train data of max_depth = 9:  1.2716372970754894
r2 train data of max_depth = 9:  0.9999237608318269
rmse test data of max_depth = 9:  23.850066117884175
r2 test data of max_depth = 9:  0.9730795506004615


