In [2]:
import json
import urllib
import urllib.request

import pandas as pd

In [3]:
# Look up the dataset's metadata through the CKAN `package_show` action.
url = 'https://ckan0.cf.opendata.inter.prod-toronto.ca/api/3/action/package_show'
params = { "id": "498d16dd-31cf-4d68-8ba5-a9df131accc6"}
# Supplying `data=` makes urlopen send the JSON-encoded parameters as the request body.
# The context manager closes the HTTP response once the body has been read,
# instead of leaving the connection open for the lifetime of the kernel.
with urllib.request.urlopen(url, data=bytes(json.dumps(params), encoding="utf-8")) as response:
    package = json.loads(response.read())


In [4]:
# Request the same `package_show` response again and let pandas parse the JSON body
# into a DataFrame. The context manager closes the HTTP response after parsing
# so the connection is not leaked.
with urllib.request.urlopen(url, data=bytes(json.dumps(params), encoding="utf-8")) as metadata_response:
    data = pd.read_json(metadata_response)

In [5]:
# Summarise the parsed API response: count / unique / top / freq for each column.
data.describe()

Unnamed: 0,help,success,result
count,41,41,35
unique,1,1,31
top,https://ckan0.cf.opendata.inter.prod-toronto.c...,True,False
freq,41,41,3


In [6]:
# Preview the first rows; each row is labelled with a metadata field name
# (civic_issues, collection_method, ...) and `result` holds that field's value.
data.head()

Unnamed: 0,help,success,result
civic_issues,https://ckan0.cf.opendata.inter.prod-toronto.c...,True,"Affordable housing,Poverty reduction"
collection_method,https://ckan0.cf.opendata.inter.prod-toronto.c...,True,
creator_user_id,https://ckan0.cf.opendata.inter.prod-toronto.c...,True,150d5301-86ec-44a3-a070-50f2cea839c9
dataset_category,https://ckan0.cf.opendata.inter.prod-toronto.c...,True,Document
excerpt,https://ckan0.cf.opendata.inter.prod-toronto.c...,True,This dataset contains information relating to ...


In [7]:
# Open the locally stored housing workbook so its sheets can be listed and
# parsed individually in the following cells.
xls = pd.ExcelFile('./ressources/wellbeing-toronto-housing.xlsx')

In [8]:
# List the worksheets contained in the workbook.
xls.sheet_names

['IndicatorMetadata', 'RawDataRef-Period2008', 'RawDataRef_2011']

In [9]:
# Load the 'RawDataRef_2011' sheet into a DataFrame (one row per neighbourhood)
# and preview its first rows.
toronto_data = xls.parse('RawDataRef_2011')
toronto_data.head()

Unnamed: 0,Neighbourhood,Neighbourhood Id,Home Prices,Mid-Century Highrise Households,Mid-Century Highrise Population,Percent Mid-Century Highrise Households,Percent Mid-Century Highrise Population,Rent Bank Applicants,Social Housing Turnover,Social Housing Units,Social Housing Waiting List
0,West Humber-Clairville,1,317508,690,1810,6.8,5.3,38,0.3125,952,557
1,Mount Olive-Silverstone-Jamestown,2,251119,4110,13395,42.9,40.9,41,7.0,1146,1049
2,Thistletown-Beaumond Heights,3,414216,430,1260,13.4,12.4,7,1.333333,447,270
3,Rexdale-Kipling,4,392271,600,1050,15.2,10.0,19,4.2,432,186
4,Elms-Old Rexdale,5,233832,870,2305,27.3,24.1,14,1.0,463,239


In [10]:
# Prediction target: the 'Home Prices' column of the neighbourhood table.
y = toronto_data['Home Prices']

In [11]:
# Columns used as model inputs: high-rise and social-housing indicators.
features = [
    'Mid-Century Highrise Households',
    'Percent Mid-Century Highrise Households',
    'Social Housing Units',
    'Social Housing Waiting List',
]
# Feature matrix restricted to the selected columns, followed by its summary statistics.
X = toronto_data[features]
X.describe()

Unnamed: 0,Mid-Century Highrise Households,Percent Mid-Century Highrise Households,Social Housing Units,Social Housing Waiting List
count,140.0,140.0,140.0,140.0
mean,1807.178571,23.058571,657.95,410.3
std,1716.867045,17.579026,719.753318,314.164432
min,0.0,0.0,0.0,16.0
25%,508.75,8.875,161.25,169.75
50%,1240.0,19.6,468.0,321.0
75%,2691.25,32.3,919.75,573.5
max,8410.0,76.1,3990.0,1573.0


In [12]:
from sklearn.tree import DecisionTreeRegressor
# Fit a single decision tree on every available row (no hold-out split at this point).
# random_state=1 fixes the estimator's seed so the fitted tree is repeatable.
toronto_model = DecisionTreeRegressor(random_state=1)
toronto_model.fit(X,y)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=1, splitter='best')

In [13]:
# Show the first feature rows; the same rows are passed to the model in the next cell.
X.head()

Unnamed: 0,Mid-Century Highrise Households,Percent Mid-Century Highrise Households,Social Housing Units,Social Housing Waiting List
0,690,6.8,952,557
1,4110,42.9,1146,1049
2,430,13.4,447,270
3,600,15.2,432,186
4,870,27.3,463,239


In [14]:
# Predict on rows the tree was trained on. The output reproduces the 'Home Prices'
# of those rows exactly, so this is an in-sample check and says nothing about
# accuracy on unseen neighbourhoods.
toronto_model.predict(X.head())

array([317508., 251119., 414216., 392271., 233832.])

## Model Validation

In [15]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Hold out part of the rows for validation; random_state=0 makes the split repeatable.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

# Seed the tree as well (same seed as the first model in this notebook): without
# random_state the fitted tree, and therefore the printed MAE, can differ from
# one run to the next.
toronto_model = DecisionTreeRegressor(random_state=1)
toronto_model.fit(train_X,train_y)

# Mean absolute error on the held-out rows, in the same units as 'Home Prices'.
val_predictions = toronto_model.predict(val_X)
print(mean_absolute_error(val_y, val_predictions))

183605.6


In [16]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation MAE of a decision tree limited to ``max_leaf_nodes`` leaves.

    A fresh ``DecisionTreeRegressor`` (``random_state=0``) is fitted on
    ``train_X`` / ``train_y`` and its predictions for ``val_X`` are scored
    against ``val_y`` with ``mean_absolute_error``.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    tree.fit(train_X, train_y)
    return mean_absolute_error(val_y, tree.predict(val_X))

In [17]:
# Compare validation error across tree sizes; get_mae trains one tree per setting
# on the train/validation split created earlier.
# Note: the MAE is formatted with %d, so it is printed truncated to a whole number.
for max_leaf_nodes in [2, 20, 500, 5000]:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print("Max leaf nodes: %d \t\t Mean Absolute Error: %d" %(max_leaf_nodes, my_mae))

Max leaf nodes: 2 		 Mean Absolute Error: 136836
Max leaf nodes: 20 		 Mean Absolute Error: 176412
Max leaf nodes: 500 		 Mean Absolute Error: 192473
Max leaf nodes: 5000 		 Mean Absolute Error: 192473


# Random Forests

Another type of model is the **random forest**, which builds many decision trees and makes a prediction by averaging the predictions of the component trees. It generally achieves much better predictive accuracy than a single decision tree.

In [20]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Train a seeded random forest on the training split (fit returns the estimator itself).
forest_model = RandomForestRegressor(random_state=1).fit(train_X, train_y)

# Score the forest on the held-out rows with the same metric used for the single trees.
toronto_preds = forest_model.predict(val_X)
forest_mae = mean_absolute_error(val_y, toronto_preds)
print(forest_mae)

163377.90000000002


