In [1]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
import seaborn as sns

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV


from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score



## Load and sales 2017 data

In [2]:
#load and clean data
final17 = pd.read_csv("Data/final17_dataset.csv")

nationality_col = ['P_Afghanistan_Tot','P_Australia_Tot','P_Bangladesh_Tot','P_Bosnia_Herzegov_Tot','P_Cambodia_Tot','P_Canada_Tot','P_Chile_Tot','P_China_Tot','P_Croatia_Tot','P_Egypt_Tot','P_England_Tot','P_Fiji_Tot','P_France_Tot','P_Germany_Tot','P_Greece_Tot','P_Hong_Kong_Tot',
 'P_India_Tot','P_Indonesia_Tot','P_Iran_Tot','P_Iraq_Tot','P_Ireland_Tot','P_Italy_Tot','P_Japan_Tot','P_Korea_South_Tot','P_Lebanon_Tot','P_Malaysia_Tot','P_Malta_Tot','P_Mauritius_Tot','P_Myanmar_Tot','P_Nepal_Tot','P_Netherlands_Tot','P_New_Zealand_Tot',
 'P_Nthern_Ireland_Tot','P_Pakistan_Tot','P_PNG_Tot','P_Philippines_Tot','P_Poland_Tot','P_Scotland_Tot','P_Singapore_Tot','P_South_Africa_Tot','P_SE_Europe_nfd_Tot','P_Sri_Lanka_Tot','P_Taiwan_Tot','P_Thailand_Tot','P_FYROM_Tot',
 'P_Turkey_Tot','P_USA_Tot','P_Vietnam_Tot','P_Wales_Tot','P_Zimbabwe_Tot','P_Elsewhere_Tot']

col_rename = {'Age_below_15_yr_perc':'pct_age_14below', 'Age_15_64_yr_perc' : 'pct_age_15_64',
              'Age_over_65_yr_perc' : 'pct_age_65above', 'BP_Aus_Perc' : 'pct_birthplace_au',
              'BP_Non_Aus_Perc' : 'pct_birthplace_non_au', 'SE_Yr10_above_Perc' : 'pct_educ_yr10above',
              'SE_Yr9_below_Perc' : 'pct_educ_yr9below', 'MS_Married_Perc' : 'pct_married',
              'MS_Single_Perc' : 'pct_single', 'Percent_Unem_loyment_P' : 'unemployment_rate',
              'Percnt_LabForc_prticipation_P' : 'lfpr', 'Percnt_Employment_to_populn_P' : 'employment_rate', 
              'Median_age_persons' : 'median_age', 'Median_tot_prsnl_inc_weekly' : 'median_weekly_pincome',
              'Median_tot_fam_inc_weekly' : 'median_weekly_fincome',
              'Median_tot_hhd_inc_weekly' : 'median_weekly_hincome', 'Average_household_size' : 'ave_hhsize'}

final17_c = final17.fillna(0)
final17_c = final17_c.drop(nationality_col, axis=1)
final17_c = final17_c.rename(columns = col_rename)

final17_c['ntl17_mean_per_land_area'] = final17_c.ntl17_sum / final17_c.land_area
final17_c['bstop_per_land_area'] = final17_c.bus_stop / final17_c.land_area
final17_c['tstation_per_land_area'] = final17_c.train_station / final17_c.land_area
final17_c['transpo_per_land_area'] = final17_c.all_transpo / final17_c.land_area

final17_c = final17_c[final17_c.s_median_sales>0]
print("shape of final df: ", final17_c.shape)

shape of final df:  (113, 124)


In [3]:
sales17_col = ['s_mean_nonstrata','s_mean_strata','s_median_nonstrata','s_median_strata','s_mean_sales','s_median_sales']
sales17=final17_c[sales17_col]

rent17_col = ['r_one_br','r_two_br','r_three_br','r_four_or_more_br',
 'r_bedsitter','r_not_specified','r_flat','r_house','r_other','r_townhouse','r_weekly_rent_median']

sales17 = final17_c[sales17_col]

In [4]:
print("shape of sales df: ",sales17.shape)

shape of sales df:  (113, 6)


In [5]:
sales17.head()

Unnamed: 0,s_mean_nonstrata,s_mean_strata,s_median_nonstrata,s_median_strata,s_mean_sales,s_median_sales
0,350.72904,197.23513,320.0,206.0,320.42552,290.0
1,372.99474,0.0,352.5,0.0,361.56796,345.0
2,695.14002,463.46875,639.5,424.0,616.27321,587.5
4,436.94776,293.44444,417.0,280.0,426.22967,405.0
5,480.04897,292.83431,460.0,285.0,431.58223,420.0


In [6]:
lga_id = ['LGA_CODE', 'LGA_NAME']

X_ntl = ['land_area', 'ntl17_mean', 'ntl17_mean_per_land_area']
X_transpo_bus = ['bus_stop', 'bstop_per_land_area']
X_transpo_train = ['train_station', 'tstation_per_land_area']
X_transpo_all = ['all_transpo', 'transpo_per_land_area']
X_census_age = ['pct_age_14below', 'pct_age_15_64', 'pct_age_65above']
X_census_origin = ['pct_birthplace_au', 'pct_birthplace_non_au']
X_census_educ = ['pct_educ_yr10above', 'pct_educ_yr9below']
X_census_marital_stat = ['pct_married', 'pct_single']
X_census_employment = ['unemployment_rate', 'lfpr', 'employment_rate']
X_census_educ_qual = ['pct_pgrad', 'pct_grad_cert_dip', 'pct_bach_deg', 'pct_adv_dip', 'pct_cert_iii_iv',
                      'pct_cert_i_ii', 'pct_others']
X_census_stats = ['median_age', 'median_weekly_pincome', 'median_weekly_fincome', 'median_weekly_hincome',
                  'ave_hhsize']

X_transpo_bus_col= ['bstop_per_land_area']
X_transpo_train_col= ['tstation_per_land_area']
X_transpo_all_col = ['transpo_per_land_area']
X_ntl_col = ['ntl17_mean_per_land_area']
X_census_col = X_census_age+X_census_origin+X_census_origin+X_census_educ+X_census_marital_stat+X_census_employment+X_census_educ_qual+X_census_stats

X_all_col = X_ntl_col+X_transpo_all_col+X_census_age+X_census_origin+X_census_educ+X_census_marital_stat+X_census_employment+X_census_educ_qual+X_census_stats
X_all_excl_train_col = X_ntl_col+X_transpo_bus_col+X_census_age+X_census_origin+X_census_educ+X_census_marital_stat+X_census_employment+X_census_educ_qual+X_census_stats


# Predict house sales median

## Using all variables

#### Choose and split data

In [7]:
X = final17_c[X_all_col]
y = sales17['s_median_sales']

In [8]:
print("shape of X: ", X.shape)
print("shape of y: ", y.shape)

shape of X:  (113, 26)
shape of y:  (113,)


In [9]:
# 75% split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)


print("shape of X_train: ", X_train.shape)
print("shape of y_train: ", y_train.shape)
print("shape of X_test: ", X_test.shape)
print("shape of y_test: ", y_test.shape)


shape of X_train:  (84, 26)
shape of y_train:  (84,)
shape of X_test:  (29, 26)
shape of y_test:  (29,)


#### Use cross validation to find best max depth

In [10]:
# best depth
dtr = DecisionTreeRegressor(random_state = 42)
param_grid = {'max_depth':np.arange(1,50)}
dtr_grid_cv = GridSearchCV(dtr, param_grid, cv = 10, scoring = 'r2')
dtr_grid_cv.fit(X_train, y_train)
print("best max depth para is: ", dtr_grid_cv.best_params_)
print("R2 = ", dtr_grid_cv.best_score_)

best max depth para is:  {'max_depth': 4}
R2 =  0.676867889842009




**-> Based on cross validataion result, best max_depth parameter for the regression is 4**

#### Scoring

In [31]:
dtr_d10 = DecisionTreeRegressor(max_depth=4, random_state=42)

dtr_d10.fit(X_train,y_train)

y_pre_d10_train = dtr_d10.predict(X_train)
y_pre_d10_test = dtr_d10.predict(X_test)

rmse_train = np.sqrt(mean_squared_error(y_train,y_pre_d10_train))
rsq_train = r2_score(y_train,y_pre_d10_train)
#adj_rsq_train = 1 - (1 - rsq_train) * (len(y_train) - 1) / (len(y_train) - X_train.shape[1] - 1)

rmse_test = np.sqrt(mean_squared_error(y_test,y_pre_d10_test))
rsq_test = r2_score(y_test,y_pre_d10_test)
#adj_rsq_test = 1 - (1 - rsq_test) * (len(y_test) - 1) / (len(y_test) - X_test.shape[1] - 1)


print("rmse train data of max_depth=4: ", rmse_train)
print("r2 train data of max_depth=4: ", rsq_train)
#print("adjusted r2 train data of max_depth = 10: ", adj_rsq_train)

print("rmse test data of max_depth=4: ", rmse_test)
print("r2 test data of max_depth=4: ", rsq_test)
#print("adjusted r2 test data of max_depth = 10: ", adj_rsq_test)


rmse train data of max_depth=4:  64.42356810713441
r2 train data of max_depth=4:  0.9731620214811663
rmse test data of max_depth=4:  334.5812889135171
r2 test data of max_depth=4:  0.6614049692429425


## Defining functions for using other variables

Other grouped variables will be tested using same process above with bellowed functions

In [12]:
def load_data(X,y):
    #print("shape of X: ", X.shape)
    #print("shape of y: ", y.shape)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)
    #print("shape of X_train: ", X_train.shape)
    #print("shape of y_train: ", y_train.shape)
    #print("shape of X_test: ", X_test.shape)
    #print("shape of y_test: ", y_test.shape)
    
    return X_train, y_train, X_test, y_test

def dtr_best_depth(X_train, y_train):

    # best depth
    dtr = DecisionTreeRegressor(random_state = 42)
    param_grid = {'max_depth':np.arange(1,50)}
    dtr_grid_cv = GridSearchCV(dtr, param_grid, cv = 10, scoring = 'r2')
    dtr_grid_cv.fit(X_train, y_train)
    best_depth = dtr_grid_cv.best_params_
    best_score = dtr_grid_cv.best_score_
    
    #print("best max depth para is: ", dtr_grid_cv.best_params_)
    #print("R2 from CV = ", dtr_grid_cv.best_score_)

    return best_depth, best_score

def scoring(max_depth, X_train, y_train, X_test, y_test):
    dtr = DecisionTreeRegressor(max_depth = max_depth, random_state=42)
    dtr.fit(X_train, y_train)
    y_pre_train = dtr.predict(X_train)
    y_pre_test = dtr.predict(X_test)
    
    #print("rmse of {}: ".format(max_depth), np.sqrt(mean_squared_error(y_test,y_pre)))
    #print("r2 of {}: ".format(max_depth), r2_score(y_test,y_pre))
    
    rmse_train = np.sqrt(mean_squared_error(y_train,y_pre_train))
    rsq_train = r2_score(y_train,y_pre_train)
    adj_rsq_train = 1 - (1 - rsq_train) * (len(y_train) - 1) / (len(y_train) - X_train.shape[1] - 1)
    
    rmse_test = np.sqrt(mean_squared_error(y_test,y_pre_test))
    rsq_test = r2_score(y_test,y_pre_test)
    adj_rsq_test = 1 - (1 - rsq_test) * (len(y_test) - 1) / (len(y_test) - X_test.shape[1] - 1)
    
    return rmse_train, rsq_train, adj_rsq_train, rmse_test, rsq_test, adj_rsq_test
    
    
    
def dtr_assess(X, y):
    
    data = load_data(X, y)
    best_depth = dtr_best_depth(data[0], data[1])
    scores = scoring(best_depth[0]['max_depth'],data[0], data[1], data[2], data[3])
    
    print("shape of X: ", X.shape)
    print("shape of y: ", y.shape)
    
    print("shape of X_train: ", data[0].shape)
    print("shape of y_train: ", data[1].shape)
    print("shape of X_test: ", data[2].shape)
    print("shape of y_test: ", data[3].shape)               

    print("best max depth para is: ", best_depth[0])
    print("R2 from CV: ", best_depth[1])
    
    print("rmse train data of max_depth = {}: ".format(best_depth[0]['max_depth']), scores[0])
    print("r2 train data of max_depth = {}: ".format(best_depth[0]['max_depth']), scores[1]) 
    #print("adjusted r2 train data of max_depth = {}: ".format(best_depth[0]['max_depth']), scores[2])
    
    print("rmse test data of max_depth = {}: ".format(best_depth[0]['max_depth']), scores[3])
    print("r2 test data of max_depth = {}: ".format(best_depth[0]['max_depth']), scores[4])      
    #print("adjusted r2 test data of max_depth = {}: ".format(best_depth[0]['max_depth']), scores[5])            
                                  

## Using census variables

In [13]:
X = final17_c[X_census_col]
y = sales17['s_median_sales']

dtr_assess(X, y)

shape of X:  (113, 26)
shape of y:  (113,)
shape of X_train:  (84, 26)
shape of y_train:  (84,)
shape of X_test:  (29, 26)
shape of y_test:  (29,)
best max depth para is:  {'max_depth': 6}
R2 from CV:  0.6606560326632243
rmse train data of max_depth = 6:  28.41030549566396
r2 train data of max_depth = 6:  0.9947807024181088
rmse test data of max_depth = 6:  261.638066353022
r2 test data of max_depth = 6:  0.7929481372766466




## Using transportation variables

### All transportations

In [14]:
X = final17_c[X_transpo_all_col]
y = sales17['s_median_sales']

dtr_assess(X, y)

shape of X:  (113, 1)
shape of y:  (113,)
shape of X_train:  (84, 1)
shape of y_train:  (84,)
shape of X_test:  (29, 1)
shape of y_test:  (29,)
best max depth para is:  {'max_depth': 2}
R2 from CV:  0.42490424233809015
rmse train data of max_depth = 2:  211.7211062056669
r2 train data of max_depth = 2:  0.7101398179612339
rmse test data of max_depth = 2:  356.2274778973721
r2 test data of max_depth = 2:  0.6161760129000582




### Bus stops only

In [15]:
X = final17_c[X_transpo_bus_col]
y = sales17['s_median_sales']

dtr_assess(X, y)

shape of X:  (113, 1)
shape of y:  (113,)
shape of X_train:  (84, 1)
shape of y_train:  (84,)
shape of X_test:  (29, 1)
shape of y_test:  (29,)
best max depth para is:  {'max_depth': 2}
R2 from CV:  0.40623093278563926
rmse train data of max_depth = 2:  211.7211062056669
r2 train data of max_depth = 2:  0.7101398179612339
rmse test data of max_depth = 2:  356.2274778973721
r2 test data of max_depth = 2:  0.6161760129000582




### Train stations only

In [16]:
X = final17_c[X_transpo_train_col]
y = sales17['s_median_sales']

dtr_assess(X, y)

shape of X:  (113, 1)
shape of y:  (113,)
shape of X_train:  (84, 1)
shape of y_train:  (84,)
shape of X_test:  (29, 1)
shape of y_test:  (29,)
best max depth para is:  {'max_depth': 6}
R2 from CV:  0.22798827613954833
rmse train data of max_depth = 6:  274.760136081774
r2 train data of max_depth = 6:  0.5118338098801809
rmse test data of max_depth = 6:  445.4408218873744
r2 test data of max_depth = 6:  0.39985345508469883




**-> Results above shows that inclunding train staions counts does not generate more accurate prediction**

## Using nighttime lights variables

In [17]:
X = final17_c[X_ntl_col]
y = sales17['s_median_sales']

dtr_assess(X, y)

shape of X:  (113, 1)
shape of y:  (113,)
shape of X_train:  (84, 1)
shape of y_train:  (84,)
shape of X_test:  (29, 1)
shape of y_test:  (29,)
best max depth para is:  {'max_depth': 1}
R2 from CV:  0.28058845751959344
rmse train data of max_depth = 1:  258.39382999857077
r2 train data of max_depth = 1:  0.568257770155604
rmse test data of max_depth = 1:  395.27481275073177
r2 test data of max_depth = 1:  0.5274197545103678




## Using bus stops and nighttime lights variables

In [18]:
X = final17_c[X_transpo_bus + X_ntl_col]
y = sales17['s_median_sales']

dtr_assess(X, y)

shape of X:  (113, 3)
shape of y:  (113,)
shape of X_train:  (84, 3)
shape of y_train:  (84,)
shape of X_test:  (29, 3)
shape of y_test:  (29,)
best max depth para is:  {'max_depth': 11}
R2 from CV:  0.5399859787790998
rmse train data of max_depth = 11:  1.164964745021435
r2 train data of max_depth = 11:  0.9999912242181886
rmse test data of max_depth = 11:  413.5298538792428
r2 test data of max_depth = 11:  0.48276129433892667




## Using all variables including only census and bus stops

In [19]:
X = final17_c[X_census_col+X_transpo_bus_col]
y = sales17['s_median_sales']

dtr_assess(X, y)

shape of X:  (113, 27)
shape of y:  (113,)
shape of X_train:  (84, 27)
shape of y_train:  (84,)
shape of X_test:  (29, 27)
shape of y_test:  (29,)
best max depth para is:  {'max_depth': 5}
R2 from CV:  0.7479496505590264
rmse train data of max_depth = 5:  44.92327919761741
r2 train data of max_depth = 5:  0.9869502211371789
rmse test data of max_depth = 5:  304.4118856668184
r2 test data of max_depth = 5:  0.7197145903601332




----
# Predict house sales median by dwelling type

In [20]:
df_dw =  final17_c.melt(id_vars = ['LGA_CODE'], value_vars = ['s_median_nonstrata','s_median_strata'])
df_dw = df_dw[df_dw.value > 0]

df17_by_dw = final17_c.drop(columns = ['s_median_nonstrata','s_median_strata'])
df17_by_dw = df17_by_dw.merge(df_dw, on = 'LGA_CODE', how = 'inner')
df17_by_dw = df17_by_dw.rename(columns = {'variable' : 'dwell_type', 'value' : 'sales_median_by_dw'})

dum_df17_by_dw = pd.get_dummies(df17_by_dw.dwell_type).drop(columns = ['s_median_nonstrata'])
df17_by_dw = df17_by_dw.merge(dum_df17_by_dw, how = 'inner', left_index = True, right_index = True)

X_dw_col = ['s_median_strata']

In [21]:
print('Shape of the df17_by_dw dataset: ', df17_by_dw.shape)
print('The number of LGAs represented in the dataset is: ', len(df17_by_dw.LGA_CODE.unique()))

df17_by_dw.head()


Shape of the df17_by_dw dataset:  (176, 125)
The number of LGAs represented in the dataset is:  113


Unnamed: 0,LGA_CODE,LGA_NAME,s_mean_nonstrata,s_mean_strata,s_mean_sales,s_median_sales,r_one_br,r_two_br,r_three_br,r_four_or_more_br,...,ntl19_mean,ntl19_max,ntl19_sum,ntl17_mean_per_land_area,bstop_per_land_area,tstation_per_land_area,transpo_per_land_area,dwell_type,sales_median_by_dw,s_median_strata
0,LGA10050,Albury,350.72904,197.23513,320.42552,290.0,162.5,220.0,290.0,375.0,...,2.501557,56.82,4417.75,13.711019,0.29417,0.003269,0.297438,s_median_nonstrata,320.0,0
1,LGA10050,Albury,350.72904,197.23513,320.42552,290.0,162.5,220.0,290.0,375.0,...,2.501557,56.82,4417.75,13.711019,0.29417,0.003269,0.297438,s_median_strata,206.0,1
2,LGA10130,Armidale Regional,372.99474,0.0,361.56796,345.0,170.0,240.0,322.5,410.0,...,0.234065,41.060001,10910.72168,1.139977,0.001972,0.000116,0.002436,s_median_nonstrata,352.5,0
3,LGA10250,Ballina,695.14002,463.46875,616.27321,587.5,240.0,360.0,470.0,580.0,...,0.650873,23.889999,1676.650024,3.293157,0.123727,0.0,0.138162,s_median_nonstrata,639.5,0
4,LGA10250,Ballina,695.14002,463.46875,616.27321,587.5,240.0,360.0,470.0,580.0,...,0.650873,23.889999,1676.650024,3.293157,0.123727,0.0,0.138162,s_median_strata,424.0,1


## Using all variables

In [22]:
X = df17_by_dw[X_all_col + X_dw_col]
y = df17_by_dw['sales_median_by_dw']

dtr_assess(X, y)

shape of X:  (176, 27)
shape of y:  (176,)
shape of X_train:  (132, 27)
shape of y_train:  (132,)
shape of X_test:  (44, 27)
shape of y_test:  (44,)
best max depth para is:  {'max_depth': 12}
R2 from CV:  0.8304104570018573
rmse train data of max_depth = 12:  1.429324578253659
r2 train data of max_depth = 12:  0.9999956834711148
rmse test data of max_depth = 12:  239.21689501121534
r2 test data of max_depth = 12:  0.7979729888233872




## Using census variables

In [23]:
X = df17_by_dw[X_census_col + X_dw_col]
y = df17_by_dw['sales_median_by_dw']

dtr_assess(X, y)

shape of X:  (176, 27)
shape of y:  (176,)
shape of X_train:  (132, 27)
shape of y_train:  (132,)
shape of X_test:  (44, 27)
shape of y_test:  (44,)
best max depth para is:  {'max_depth': 13}
R2 from CV:  0.8269484732195795
rmse train data of max_depth = 13:  0.0
r2 train data of max_depth = 13:  1.0
rmse test data of max_depth = 13:  276.7944827776825
r2 test data of max_depth = 13:  0.7295166109388111




## Using transportation variables

### All transportations

In [24]:
X = df17_by_dw[X_transpo_all_col + X_dw_col]
y = df17_by_dw['sales_median_by_dw']

dtr_assess(X, y)

shape of X:  (176, 2)
shape of y:  (176,)
shape of X_train:  (132, 2)
shape of y_train:  (132,)
shape of X_test:  (44, 2)
shape of y_test:  (44,)
best max depth para is:  {'max_depth': 4}
R2 from CV:  0.7067789170893073
rmse train data of max_depth = 4:  231.9337137000768
r2 train data of max_depth = 4:  0.8863418214884189
rmse test data of max_depth = 4:  329.1616471993258
r2 test data of max_depth = 4:  0.6174887657933603




### Bus stops only

In [25]:
X = df17_by_dw[X_transpo_bus_col  + X_dw_col]
y = df17_by_dw['sales_median_by_dw']

dtr_assess(X, y)

shape of X:  (176, 2)
shape of y:  (176,)
shape of X_train:  (132, 2)
shape of y_train:  (132,)
shape of X_test:  (44, 2)
shape of y_test:  (44,)
best max depth para is:  {'max_depth': 4}
R2 from CV:  0.7182977841931987
rmse train data of max_depth = 4:  212.70304511598812
r2 train data of max_depth = 4:  0.9044082653114646
rmse test data of max_depth = 4:  353.1035251048799
r2 test data of max_depth = 4:  0.5598204650192834




### Train stations only

In [26]:
X = df17_by_dw[X_transpo_train_col  + X_dw_col]
y = df17_by_dw['sales_median_by_dw']

dtr_assess(X, y)

shape of X:  (176, 2)
shape of y:  (176,)
shape of X_train:  (132, 2)
shape of y_train:  (132,)
shape of X_test:  (44, 2)
shape of y_test:  (44,)
best max depth para is:  {'max_depth': 2}
R2 from CV:  0.251419995407782
rmse train data of max_depth = 2:  513.6912321640226
r2 train data of max_depth = 2:  0.4424584551038053
rmse test data of max_depth = 2:  395.87110538855376
r2 test data of max_depth = 2:  0.4467348075011456




**-> Results above also shows that transportation variables does not affeect sales median by dwelling type much**

## Using nighttime lights variables

In [27]:
X = df17_by_dw[X_ntl_col + X_dw_col]
y = df17_by_dw['sales_median_by_dw']

dtr_assess(X, y)

shape of X:  (176, 2)
shape of y:  (176,)
shape of X_train:  (132, 2)
shape of y_train:  (132,)
shape of X_test:  (44, 2)
shape of y_test:  (44,)
best max depth para is:  {'max_depth': 2}
R2 from CV:  0.4654474907675229
rmse train data of max_depth = 2:  374.2492952719113
r2 train data of max_depth = 2:  0.7040659686493627
rmse test data of max_depth = 2:  414.85774443184096
r2 test data of max_depth = 2:  0.3923910751615668




## Using bus stops and nighttime lights variables

In [28]:
X = df17_by_dw[X_transpo_bus + X_ntl_col + X_dw_col]
y = df17_by_dw['sales_median_by_dw']

dtr_assess(X, y)

shape of X:  (176, 4)
shape of y:  (176,)
shape of X_train:  (132, 4)
shape of y_train:  (132,)
shape of X_test:  (44, 4)
shape of y_test:  (44,)
best max depth para is:  {'max_depth': 10}
R2 from CV:  0.7431169420079622
rmse train data of max_depth = 10:  3.7921037044166535
r2 train data of max_depth = 10:  0.9999696168121144
rmse test data of max_depth = 10:  275.6864159497532
r2 test data of max_depth = 10:  0.7316778806240403




## Using all variables including only census and bus stops

In [29]:
X = df17_by_dw[X_census_col+X_transpo_bus_col + X_dw_col]
y = df17_by_dw['sales_median_by_dw']

dtr_assess(X, y)

shape of X:  (176, 28)
shape of y:  (176,)
shape of X_train:  (132, 28)
shape of y_train:  (132,)
shape of X_test:  (44, 28)
shape of y_test:  (44,)
best max depth para is:  {'max_depth': 7}
R2 from CV:  0.827991217711173
rmse train data of max_depth = 7:  27.2380291690118
r2 train data of max_depth = 7:  0.9984324395803813
rmse test data of max_depth = 7:  302.4498566786456
r2 test data of max_depth = 7:  0.6770520687306396


