In [2]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
import seaborn as sns

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV


from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.metrics import r2_score

In [3]:
# Read the source CSV from the relative Data/ directory into a DataFrame.
final17 = pd.read_csv(filepath_or_buffer="Data/final17_dataset.csv")

In [4]:
# Print the column index of the loaded frame to see which fields are available.
print(final17.columns)

Index(['LGA_CODE', 'LGA_NAME', 's_mean_nonstrata', 's_mean_strata',
       's_median_nonstrata', 's_median_strata', 's_mean_sales',
       's_median_sales', 'r_one_br', 'r_two_br',
       ...
       'ntl17_count', 'ntl17_min', 'ntl17_mean', 'ntl17_max', 'ntl17_sum',
       'ntl19_count', 'ntl19_min', 'ntl19_mean', 'ntl19_max', 'ntl19_sum'],
      dtype='object', length=171)


In [5]:
# Per-country `P_*_Tot` columns; they are dropped from the working frame below.
nationality_col = [
    'P_Afghanistan_Tot',
    'P_Australia_Tot',
    'P_Bangladesh_Tot',
    'P_Bosnia_Herzegov_Tot',
    'P_Cambodia_Tot',
    'P_Canada_Tot',
    'P_Chile_Tot',
    'P_China_Tot',
    'P_Croatia_Tot',
    'P_Egypt_Tot',
    'P_England_Tot',
    'P_Fiji_Tot',
    'P_France_Tot',
    'P_Germany_Tot',
    'P_Greece_Tot',
    'P_Hong_Kong_Tot',
    'P_India_Tot',
    'P_Indonesia_Tot',
    'P_Iran_Tot',
    'P_Iraq_Tot',
    'P_Ireland_Tot',
    'P_Italy_Tot',
    'P_Japan_Tot',
    'P_Korea_South_Tot',
    'P_Lebanon_Tot',
    'P_Malaysia_Tot',
    'P_Malta_Tot',
    'P_Mauritius_Tot',
    'P_Myanmar_Tot',
    'P_Nepal_Tot',
    'P_Netherlands_Tot',
    'P_New_Zealand_Tot',
    'P_Nthern_Ireland_Tot',
    'P_Pakistan_Tot',
    'P_PNG_Tot',
    'P_Philippines_Tot',
    'P_Poland_Tot',
    'P_Scotland_Tot',
    'P_Singapore_Tot',
    'P_South_Africa_Tot',
    'P_SE_Europe_nfd_Tot',
    'P_Sri_Lanka_Tot',
    'P_Taiwan_Tot',
    'P_Thailand_Tot',
    'P_FYROM_Tot',
    'P_Turkey_Tot',
    'P_USA_Tot',
    'P_Vietnam_Tot',
    'P_Wales_Tot',
    'P_Zimbabwe_Tot',
    'P_Elsewhere_Tot',
]

# Original column name -> shorter name used by the feature lists later on.
col_rename = {
    'Age_below_15_yr_perc': 'pct_age_14below',
    'Age_15_64_yr_perc': 'pct_age_15_64',
    'Age_over_65_yr_perc': 'pct_age_65above',
    'BP_Aus_Perc': 'pct_birthplace_au',
    'BP_Non_Aus_Perc': 'pct_birthplace_non_au',
    'SE_Yr10_above_Perc': 'pct_educ_yr10above',
    'SE_Yr9_below_Perc': 'pct_educ_yr9below',
    'MS_Married_Perc': 'pct_married',
    'MS_Single_Perc': 'pct_single',
    'Percent_Unem_loyment_P': 'unemployment_rate',
    'Percnt_LabForc_prticipation_P': 'lfpr',
    'Percnt_Employment_to_populn_P': 'employment_rate',
    'Median_age_persons': 'median_age',
    'Median_tot_prsnl_inc_weekly': 'median_weekly_pincome',
    'Median_tot_fam_inc_weekly': 'median_weekly_fincome',
    'Median_tot_hhd_inc_weekly': 'median_weekly_hincome',
    'Average_household_size': 'ave_hhsize',
}

# Build the working frame in one pass:
#   1. replace every missing value with 0,
#   2. drop the per-country columns and apply the short names,
#   3. add per-land-area ratios,
#   4. keep only rows with a positive median sale price. Because missing values
#      were filled with 0 in step 1, rows with a missing s_median_sales are
#      removed here as well.
# NOTE(review): `ntl17_mean_per_land_area` is computed from `ntl17_sum`, not
# `ntl17_mean` -- confirm the name matches the intent.
# NOTE(review): a missing `land_area` becomes 0 after fillna, which would make the
# ratios inf/NaN -- confirm the data has no such rows.
final17_c = (
    final17
    .fillna(0)
    .drop(columns=nationality_col)
    .rename(columns=col_rename)
    .assign(
        ntl17_mean_per_land_area=lambda df: df.ntl17_sum / df.land_area,
        bstop_per_land_area=lambda df: df.bus_stop / df.land_area,
        tstation_per_land_area=lambda df: df.train_station / df.land_area,
        transpo_per_land_area=lambda df: df.all_transpo / df.land_area,
    )
    .loc[lambda df: df.s_median_sales > 0]
)
print("shape of final df: ", final17_c.shape)

shape of final df:  (113, 124)


In [6]:
# Sale-price summary columns (mean/median for non-strata, strata and all sales).
sales17_col = [
    's_mean_nonstrata',
    's_mean_strata',
    's_median_nonstrata',
    's_median_strata',
    's_mean_sales',
    's_median_sales',
]

# Slice those columns out of the cleaned frame; the bare name on the last line
# renders the result as the cell output.
sales17 = final17_c.loc[:, sales17_col]
sales17

Unnamed: 0,s_mean_nonstrata,s_mean_strata,s_median_nonstrata,s_median_strata,s_mean_sales,s_median_sales
0,350.72904,197.23513,320.00,206.000,320.42552,290.00
1,372.99474,0.00000,352.50,0.000,361.56796,345.00
2,695.14002,463.46875,639.50,424.000,616.27321,587.50
4,436.94776,293.44444,417.00,280.000,426.22967,405.00
5,480.04897,292.83431,460.00,285.000,431.58223,420.00
6,561.55357,0.00000,547.50,0.000,556.78947,545.00
7,249.65000,0.00000,225.00,0.000,241.62121,222.50
8,795.08317,550.45910,780.00,550.000,750.62768,737.00
9,185.63043,0.00000,165.00,0.000,185.63043,165.00
10,324.76562,0.00000,271.75,0.000,324.76562,271.75


In [7]:
# Identifier columns (not used as model features).
lga_id = ['LGA_CODE', 'LGA_NAME']

# Candidate feature groups, one list per theme.
X_ntl = ['land_area', 'ntl17_mean', 'ntl17_mean_per_land_area']
X_transpo_bus = ['bus_stop', 'bstop_per_land_area']
X_transpo_train = ['train_station', 'tstation_per_land_area']
X_transpo_all = ['all_transpo', 'transpo_per_land_area']
X_census_age = ['pct_age_14below', 'pct_age_15_64', 'pct_age_65above']
X_census_origin = ['pct_birthplace_au', 'pct_birthplace_non_au']
X_census_educ = ['pct_educ_yr10above', 'pct_educ_yr9below']
X_census_marital_stat = ['pct_married', 'pct_single']
X_census_employment = ['unemployment_rate', 'lfpr', 'employment_rate']
X_census_educ_qual = [
    'pct_pgrad',
    'pct_grad_cert_dip',
    'pct_bach_deg',
    'pct_adv_dip',
    'pct_cert_iii_iv',
    'pct_cert_i_ii',
    'pct_others',
]
X_census_stats = [
    'median_age',
    'median_weekly_pincome',
    'median_weekly_fincome',
    'median_weekly_hincome',
    'ave_hhsize',
]

# Flatten the selected groups, in this order, into the final feature list.
# The separate bus and train groups are defined above but not included; only the
# combined transport group is.
X_all_col = [
    column
    for group in (
        X_ntl,
        X_transpo_all,
        X_census_age,
        X_census_origin,
        X_census_educ,
        X_census_marital_stat,
        X_census_employment,
        X_census_educ_qual,
        X_census_stats,
    )
    for column in group
]



In [8]:
# Feature matrix: the selected feature columns of the cleaned frame.
X = final17_c.loc[:, X_all_col]
# Target vector: median sale price.
y = final17_c.loc[:, 's_median_sales']

# Show both shapes, one per line, to confirm the row counts line up.
print(X.shape, y.shape, sep="\n")

(113, 29)
(113,)


In [9]:
# Hold out a test set. No test_size is given, so scikit-learn's default split
# applies; the seed is fixed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

# Report the shape of each piece, one per line.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep="\n")

(84, 29)
(84,)
(29, 29)
(29,)


In [10]:
# Choose the tree depth with a 10-fold cross-validated grid search over
# max_depth = 1..49, scored by R^2 and fitted on the training split only.
dtr = DecisionTreeRegressor(random_state=42)
param_grid = dict(max_depth=np.arange(1, 50))
dtr_grid_cv = GridSearchCV(
    estimator=dtr,
    param_grid=param_grid,
    scoring='r2',
    cv=10,
)
dtr_grid_cv.fit(X_train, y_train)

# Report the winning depth and its mean cross-validated score.
print("best max depth para is: ", dtr_grid_cv.best_params_)
print("R2 = ", dtr_grid_cv.best_score_)

best max depth para is:  {'max_depth': 5}
R2 =  0.6932838308809042




In [11]:
# Fit one tree at the depth selected by the grid search and score it on the
# held-out test split.
# The depth is read from the search result rather than hard-coded, so this cell
# (and its printed labels) cannot drift from the search if the data or the
# parameter grid change.
best_depth = dtr_grid_cv.best_params_['max_depth']

# NOTE(review): random_state=143 differs from the 42 used for the estimator inside
# the grid search; kept unchanged so results reproduce -- confirm this is intended.
dtr = DecisionTreeRegressor(max_depth=best_depth, random_state=143)
dtr.fit(X_train, y_train)

# Predictions for the test rows, which were not used for fitting or tuning.
y_pre = dtr.predict(X_test)

print(f"rmse of max_depth={best_depth}: ", np.sqrt(mean_squared_error(y_test, y_pre)))
print(f"r2 of max_depth={best_depth}: ", r2_score(y_test, y_pre))

rmse of max_depth=5:  226.9744051486297
r2 of max_depth=5:  0.8441771822298191
