In [1]:
import pandas as pd
import numpy as np

from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_squared_log_error

In [29]:
perth = pd.read_csv('perth_clean.csv')
perth = pd.get_dummies(perth, columns=['suburb'])

train_indices, test_indices = train_test_split(perth.index, test_size=0.2, random_state=0)

perth_train = perth.loc[train_indices].copy()
perth_test = perth.loc[test_indices].copy()

In [33]:
print("Training set length", len(perth_train))
print("Tresting set length", len(perth_test))

Training set length 26924
Tresting set length 6732


In [34]:
indices = list(perth_train.index.copy())
np.random.shuffle(indices)

sampled_indices = indices[:int(len(indices) * 0.2)]
sampled_perth_train = perth_train.loc[sampled_indices, :].copy()

print("Sampled Training set length", len(sampled_indices))

Sampled Training set length 5384


In [35]:
n = len(sampled_perth_train)
sampled_train_indices, sampled_valid_indices = train_test_split(np.arange(n), test_size=0.2, random_state=0)

sampled_x_train = sampled_perth_train.drop('log10_price', axis=1)
sampled_y_train = sampled_perth_train['log10_price']

print("Training length of sampled data", len(sampled_train_indices))
print("Validiation length of sampled data", len(sampled_valid_indices))

Training length of sampled data 4307
Validiation length of sampled data 1077


# Random Forests

In [36]:
parameter_grid = {'min_samples_leaf': np.arange(1, 100, 5),
                  'max_features': np.arange(1, len(perth_train.columns) - 1, 5)}

In [38]:
base_model = RandomForestRegressor(criterion='squared_error', random_state=0)

search = RandomizedSearchCV(base_model, parameter_grid, 
                            cv=[(sampled_train_indices, sampled_valid_indices)], 
                            n_iter=(20 + 67 + 5 + 67),
                            scoring="neg_mean_squared_error", verbose=3, refit=False)

search.fit(sampled_x_train, sampled_y_train)

Fitting 1 folds for each of 159 candidates, totalling 159 fits
[CV 1/1] END max_features=281, min_samples_leaf=31;, score=-0.013 total time=   3.4s
[CV 1/1] END max_features=41, min_samples_leaf=21;, score=-0.020 total time=   0.6s
[CV 1/1] END max_features=51, min_samples_leaf=86;, score=-0.023 total time=   0.6s
[CV 1/1] END max_features=56, min_samples_leaf=36;, score=-0.019 total time=   0.8s
[CV 1/1] END max_features=86, min_samples_leaf=26;, score=-0.014 total time=   1.3s
[CV 1/1] END max_features=291, min_samples_leaf=61;, score=-0.016 total time=   3.1s
[CV 1/1] END max_features=201, min_samples_leaf=46;, score=-0.014 total time=   2.4s
[CV 1/1] END max_features=21, min_samples_leaf=46;, score=-0.034 total time=   0.3s
[CV 1/1] END max_features=181, min_samples_leaf=81;, score=-0.017 total time=   1.9s
[CV 1/1] END max_features=316, min_samples_leaf=36;, score=-0.014 total time=   3.7s
[CV 1/1] END max_features=291, min_samples_leaf=1;, score=-0.010 total time=   7.1s
[CV 1/1]

RandomizedSearchCV(cv=[(array([ 639, 2724, 5105, ..., 1653, 2607, 2732]),
                        array([ 499, 3624, 2104, ..., 2584, 1363, 2744]))],
                   estimator=RandomForestRegressor(random_state=0), n_iter=159,
                   param_distributions={'max_features': array([  1,   6,  11,  16,  21,  26,  31,  36,  41,  46,  51,  56,  61,
        66,  71,  76,  81,  86,  91,  96, 101, 106, 111, 116, 121, 126,
       131, 136, 141, 146, 151, 156, 161, 166, 171, 176, 181, 186, 191,
       196, 201, 206, 211, 216, 221, 226, 231, 236, 241, 246, 251, 256,
       261, 266, 271, 276, 281, 286, 291, 296, 301, 306, 311, 316, 321,
       326, 331]),
                                        'min_samples_leaf': array([ 1,  6, 11, 16, 21, 26, 31, 36, 41, 46, 51, 56, 61, 66, 71, 76, 81,
       86, 91, 96])},
                   refit=False, scoring='neg_mean_squared_error', verbose=3)

In [39]:
search.best_params_

{'max_features': 191, 'min_samples_leaf': 1}

In [40]:
final_model = RandomForestRegressor(criterion='squared_error', random_state=0, **search.best_params_)
final_model.fit(perth_train.drop('log10_price', axis=1), perth_train['log10_price'])

y = perth_test['log10_price']
y_pred = final_model.predict(perth_test.drop('log10_price', axis=1))
print(np.mean((y - y_pred) ** 2))

0.007857488928888677


In [41]:
# Best Parameteres from Coordinate Descentib 
base_parameters = {'max_features': 106, 'min_samples_leaf': 1}

final_model = RandomForestRegressor(criterion='squared_error', random_state=0, **base_parameters)
final_model.fit(perth_train.drop('log10_price', axis=1), perth_train['log10_price'])

y = perth_test['log10_price']
y_pred = final_model.predict(perth_test.drop('log10_price', axis=1))
print(np.mean((y - y_pred) ** 2))

0.00781962457241284


# XGBoost

In [42]:
parameter_grid = {'n_estimators': np.arange(50, 200 + 1, 10),
                  'learning_rate': np.linspace(0.01, 0.10, 10),
                  'max_depth': np.arange(1, 100 + 1, 10),
                  'subsample': np.linspace(0.1, 1.0, 10)}

In [43]:
[len(val) for val in parameter_grid.values()]

[16, 10, 10, 10]

In [44]:
base_model = XGBRegressor(objective='reg:squarederror', random_state=0)

search = RandomizedSearchCV(base_model, parameter_grid, 
                            cv=[(sampled_train_indices, sampled_valid_indices)], 
                            n_iter=(6 + 10 + 10 + 10 + 11 + 13 + 10 + 5),
                            scoring="neg_mean_squared_error", verbose=3, refit=False)

search.fit(sampled_x_train, sampled_y_train)

Fitting 1 folds for each of 75 candidates, totalling 75 fits
[CV 1/1] END learning_rate=0.05000000000000001, max_depth=71, n_estimators=120, subsample=1.0;, score=-0.009 total time=  25.6s
[CV 1/1] END learning_rate=0.04000000000000001, max_depth=21, n_estimators=120, subsample=0.8;, score=-0.010 total time=  18.0s
[CV 1/1] END learning_rate=0.05000000000000001, max_depth=81, n_estimators=80, subsample=0.4;, score=-0.019 total time=  10.3s
[CV 1/1] END learning_rate=0.030000000000000006, max_depth=51, n_estimators=170, subsample=0.6;, score=-0.010 total time=  33.1s
[CV 1/1] END learning_rate=0.01, max_depth=41, n_estimators=130, subsample=0.5;, score=-2.053 total time=   5.9s
[CV 1/1] END learning_rate=0.020000000000000004, max_depth=11, n_estimators=80, subsample=0.9;, score=-1.112 total time=   4.3s
[CV 1/1] END learning_rate=0.08, max_depth=1, n_estimators=130, subsample=0.5;, score=-0.015 total time=   2.5s
[CV 1/1] END learning_rate=0.07, max_depth=21, n_estimators=170, subsample

RandomizedSearchCV(cv=[(array([ 639, 2724, 5105, ..., 1653, 2607, 2732]),
                        array([ 499, 3624, 2104, ..., 2584, 1363, 2744]))],
                   estimator=XGBRegressor(objective='reg:squarederror'),
                   n_iter=75,
                   param_distributions={'learning_rate': array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ]),
                                        'max_depth': array([ 1, 11, 21, 31, 41, 51, 61, 71, 81, 91]),
                                        'n_estimators': array([ 50,  60,  70,  80,  90, 100, 110, 120, 130, 140, 150, 160, 170,
       180, 190, 200]),
                                        'subsample': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])},
                   refit=False, scoring='neg_mean_squared_error', verbose=3)

In [45]:
search.best_params_

{'learning_rate': 0.08, 'max_depth': 21, 'n_estimators': 160, 'subsample': 0.5}

In [46]:
final_model = XGBRegressor(objective='reg:squarederror', random_state=0, **search.best_params_)
final_model.fit(perth_train.drop('log10_price', axis=1), perth_train['log10_price'])

y = perth_test['log10_price']
y_pred = final_model.predict(perth_test.drop('log10_price', axis=1))

print(np.mean((y - y_pred) ** 2))

0.007533777469334824


In [47]:
# Best Parameteres from Coordinate Descent
base_parameters = {'learning_rate': 0.12999999999999998,
                   'max_depth': 11,
                   'n_estimators': 200,
                   'subsample': 0.7999999999999999}

final_model = XGBRegressor(objective='reg:squarederror', random_state=0, **base_parameters)
final_model.fit(perth_train.drop('log10_price', axis=1), perth_train['log10_price'])

y = perth_test['log10_price']
y_pred = final_model.predict(perth_test.drop('log10_price', axis=1))

print(np.mean((y - y_pred) ** 2))

0.0075060996446797215


You'll see that random coordinate descent is just a bit better than randomized search, but it is basically quite similar. There are a couple of reasons why this might be

1. The search for XGBoost wasn't ran till convergence (but the Random Forest was). You will know you have reached convergence when the previous iteration through the variables gives you the exact same values as the most recent iteration. This is why you should be running coordinate descent for way longer than I have done here.

    Note that I ran coordinate descent a 2 more iterations after the notebook (so 4 in total) and it is almost at convergence with the parameters `{'learning_rate': 0.09, 'max_depth': 11, 'n_estimators': 170, 'subsample': 0.8}` and loss of `0.007313312056798277`. So defintely a significant improvement.

2. It may just be that this dataset is very hard and the very small 0.00003 increase was actually very good.

3. The more models you fit the more likely you are to converge to the correct answer. This is both true for both randomize search and coordinate descent. This means that if you sample a lot of parameters (i.e. if you try a lot of models) randomize search will give you similar results to coordinate descent.

    Now since we are looping over the parameters twice, in this case, we end up sampling a lot of parameters. And with randomize search, if it can also sample a lot of parameters it can also do well as well.

4. Coordinate descent works best when your loss function is convex (and smooth). If it is convex then convergence occurs in 1 loop over the parameters. But since we are using validation approach, rather than CV, it is highly unlikely that our loss function is convex. Making it harder for coordinate descent to work well.

There are a couple of ways to make coordinate descent to work better. Since each time we are simply searching through one parameter, we can perform early stopping on this parameter. As an example, lets say that the grid for one of our parameters was

```
[ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]
```

When we tried to optimize this parameter we found that the resulting validation error was

```
feature=1;, score=-10
feature=2;, score=-9
feature=3;, score=-8
feature=4;, score=-10
feature=5;, score=-11
feature=6;, score=-12
```

There is no reason to continue and try the features `7, 8, 9, 10` because we know that these are simply going to give us larger values. So we can just stop here, meaning that rather than trying 10 parameters we only need to try 6 parameters. But I don't write the code for this because it takes too long, but you can try to do it yourself.