# Model selection

## Predicting house prices with California data

## Load libs

In [12]:
import warnings
warnings.simplefilter("ignore")
from local.lib import calhousing as ch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
%matplotlib inline

## Load the data into memory

In [13]:
# Load the small California-housing sample and take a first look at it.
d = pd.read_csv("local/data/cal_housing_small.data")
print(d.shape)   # (rows, columns)
d.head(5)        # preview of the first five rows

(500, 6)


Unnamed: 0,longitude,latitude,housingMedianAge,totalRooms,totalBedrooms,medianHouseValue
0,-120.58,35.0,37.0,523.0,119.0,106300.0
1,-118.17,33.98,31.0,1236.0,329.0,155400.0
2,-122.22,37.81,52.0,1971.0,335.0,273700.0
3,-117.91,33.66,21.0,1708.0,505.0,193800.0
4,-121.92,37.24,27.0,1265.0,216.0,281200.0


In [14]:
# Feature matrix: every column except the last one, which holds the target.
# NOTE: `DataFrame.as_matrix()` / `Series.as_matrix()` were deprecated in pandas 0.23
# and removed in pandas 1.0; `.values` yields the same NumPy array and works on
# both old and current pandas releases.
X = d.values[:, :-1]
# Target vector: the median house value of each row.
y = d["medianHouseValue"].values
print(X.shape, y.shape)

(500, 5) (500,)


## train/test

In [15]:
# Share of the samples held out as the test set; passed as `test_size`
# to the split and grid-search calls below.
testsize = 0.3

In [16]:
# Candidate tree depths 1..19 to be compared by the search.
parameters = {"max_depth": [depth for depth in range(1, 20)]}
estimator = DecisionTreeRegressor()

# Model selection on the complete data set. `ch.gridsearch_best` is a project helper:
# it returns the selected record (`best`) plus a second value (`r`) that is not used here.
# n_iter=10 matches the ten split*_test_score entries in the displayed record;
# presumably test_size is the held-out fraction of every split -- confirm in the helper.
best, r = ch.gridsearch_best(
    X, y, estimator, parameters,
    n_iter=10, test_size=testsize,
)

In [17]:
# Display the record that the grid search returned for the selected setting.
best

mean_fit_time               0.00141549
std_fit_time               0.000236293
mean_score_time            0.000138998
std_score_time             2.55942e-05
param_max_depth                      9
params                {'max_depth': 9}
split0_test_score             0.440598
split1_test_score              0.46156
split2_test_score             0.412266
split3_test_score             0.432222
split4_test_score              0.49257
split5_test_score             0.487204
split6_test_score             0.369789
split7_test_score             0.403604
split8_test_score             0.368248
split9_test_score             0.400892
mean_test_score               0.426895
std_test_score               0.0418346
rank_test_score                     19
split0_train_score           0.0847402
split1_train_score            0.166943
split2_train_score            0.105417
split3_train_score             0.12864
split4_train_score            0.191847
split5_train_score            0.114409
split6_train_score       

## trainval/test

### 1. select model with `trainval`

In [7]:
# Passed as `test_size` to the grid search that runs on the trainval subset
# (presumably the validation share of trainval -- confirm in ch.gridsearch_best).
valpct = 0.2

In [18]:
# Hold out `testsize` of the samples as the final test set; the remainder ("trainval")
# is the only data that model selection is allowed to see.
# random_state pins the shuffle so that Restart & Run All reproduces this exact
# partition; without it every run draws a different split.
Xtrainval, Xtest, ytrainval, ytest = train_test_split(
    X, y, test_size=testsize, random_state=0
)
print(Xtrainval.shape, Xtest.shape)
print(ytrainval.shape, ytest.shape)

(350, 5) (150, 5)
(350,) (150,)


In [20]:
# Same depth grid as before (1..19), but the search now only sees the trainval subset,
# so the test split stays untouched until the final evaluation.
parameters = {"max_depth": [depth for depth in range(1, 20)]}
estimator = DecisionTreeRegressor()
best, r = ch.gridsearch_best(
    Xtrainval, ytrainval, estimator, parameters,
    n_iter=10, test_size=valpct,
)
best  # show the selected record

mean_fit_time              0.000856853
std_fit_time               7.15291e-06
mean_score_time            9.31025e-05
std_score_time             1.86287e-06
param_max_depth                      9
params                {'max_depth': 9}
split0_test_score             0.384802
split1_test_score             0.410857
split2_test_score             0.348173
split3_test_score              0.45885
split4_test_score             0.416217
split5_test_score             0.412626
split6_test_score             0.573379
split7_test_score             0.588875
split8_test_score             0.294029
split9_test_score              0.41583
mean_test_score               0.430364
std_test_score               0.0865538
rank_test_score                     19
split0_train_score            0.117731
split1_train_score            0.112995
split2_train_score           0.0854319
split3_train_score           0.0903051
split4_train_score            0.089344
split5_train_score            0.113482
split6_train_score       

### 2. use selected model on `test`

In [21]:
# Fresh, unfitted tree configured with the hyperparameters stored in the selected record.
dt = DecisionTreeRegressor(**best.params)
dt  # echo the estimator so its full configuration is displayed

DecisionTreeRegressor(criterion='mse', max_depth=9, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [22]:
# Train the selected configuration on the whole trainval set ...
dt.fit(Xtrainval, ytrainval)
# ... then score it on the test split, which took no part in the selection.
# (ch.rel_rmse is a project helper; presumably a relative RMSE where lower is better -- confirm.)
ch.rel_rmse(dt, Xtest, ytest)

0.48313553199098974

In [28]:
# Repeat the whole select-then-evaluate procedure on ten fresh partitions and print,
# per run: the selected parameters, the selection score and the score on the test split.
for _ in range(10):
    # new trainval/test partition of the full data set
    Xtrainval, Xtest, ytrainval, ytest = train_test_split(X, y, test_size=testsize)

    # model selection sees only the trainval part
    parameters = {"max_depth": [depth for depth in range(1, 20)]}
    estimator = DecisionTreeRegressor()
    best, r = ch.gridsearch_best(
        Xtrainval, ytrainval, estimator, parameters,
        n_iter=10, test_size=valpct,
    )

    # refit the selected configuration on all of trainval (fit returns the estimator itself),
    # then measure it on the held-out test part
    dt = DecisionTreeRegressor(**best.params).fit(Xtrainval, ytrainval)
    score = ch.rel_rmse(dt, Xtest, ytest)
    print(best.params, "{:.3f}".format(best.mean_test_score), "{:.3f}".format(score))


{'max_depth': 8} 0.429 0.403
{'max_depth': 9} 0.456 0.409
{'max_depth': 11} 0.447 0.383
{'max_depth': 8} 0.419 0.450
{'max_depth': 14} 0.445 0.432
{'max_depth': 11} 0.397 0.436
{'max_depth': 8} 0.412 0.547
{'max_depth': 7} 0.438 0.419
{'max_depth': 7} 0.467 0.477
{'max_depth': 8} 0.430 0.480


## Suggested experiments:

- play with different values of `testsize` and `valpct`
- use `cal_housing.data`
- use `cal_housing_full.data`
- measure stability of selected model