In [1]:
import pandas as pd
import numpy as np

from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_squared_log_error

In [2]:
import xgboost
# Display the installed xgboost version string as the cell output.
xgboost.__version__

'1.5.2'

In [3]:
# Read perth_clean.csv and expand the 'suburb' column into indicator (dummy) columns.
perth = pd.get_dummies(pd.read_csv('perth_clean.csv'), columns=['suburb'])

# Split the row labels of `perth` 80/20 with a fixed seed, then materialise
# independent copies of the two partitions.
train_indices, test_indices = train_test_split(
    np.array(perth.index),
    test_size=0.2,
    random_state=0,
)

perth_train, perth_test = (
    perth.loc[row_labels].copy() for row_labels in (train_indices, test_indices)
)

In [4]:
# Candidate values for each XGBoost hyper-parameter explored by the search below.
parameter_grid = {'n_estimators': np.arange(50, 100 + 1, 10),
                  'learning_rate': np.linspace(0.05, 0.15, 11),
                  'max_depth': np.arange(50, 60),
                  'subsample': np.linspace(0.5, 1.0, 6)}

# Draw the starting configuration from a *seeded* generator so that a fresh
# "Restart & Run All" always begins the search from the same point.
# `.item()` turns each NumPy scalar into a plain Python int/float.
rng = np.random.default_rng(0)
base_parameters = {key: rng.choice(val).item() for key, val in parameter_grid.items()}
base_parameters

{'n_estimators': 80,
 'learning_rate': 0.09999999999999999,
 'max_depth': 56,
 'subsample': 0.6}

In [5]:
# Number of combinations in the full Cartesian grid (product of per-parameter sizes).
np.prod(list(map(len, parameter_grid.values())))

3960

In [6]:
# Number of candidates when each parameter is searched on its own (sum of per-parameter sizes).
np.sum(list(map(len, parameter_grid.values())))

33

In [16]:
# Split the row *positions* of perth_train 80/20 (fixed seed) into a fitting part
# and a validation part.
# NOTE: this rebinds `train_indices`, which earlier held row labels of `perth`.
train_indices, valid_indices = train_test_split(
    np.arange(perth_train.shape[0]),
    test_size=0.2,
    random_state=0,
)

In [17]:
# Separate the training frame into the target vector ('log10_price') and the
# matrix of all remaining columns, both as NumPy arrays.
y_train = perth_train['log10_price'].values
x_train = perth_train.drop(columns=['log10_price']).values

In [None]:
# Coordinate-wise tuning: search one hyper-parameter at a time while the others
# stay at their current values, then carry the best value forward into
# `base_parameters` before moving on to the next parameter.
print(base_parameters)
for parameter, grid in parameter_grid.items():
    # Rebuilt on every pass so the estimator starts from the values chosen so far.
    base_model = XGBRegressor(objective='reg:squarederror', random_state=0, **base_parameters)

    parameter_sub_grid = {parameter: grid}
    # `cv` is a single predefined (train, validation) split rather than k-fold CV.
    # refit=False: only `best_params_` is read below, so skip the default extra
    # refit of the best candidate on all of x_train after each search.
    search = GridSearchCV(base_model, parameter_sub_grid, cv=[(train_indices, valid_indices)],
                          scoring="neg_mean_squared_error", refit=False)

    search.fit(x_train, y_train)

    base_parameters.update(search.best_params_)
    print(base_parameters)

In [None]:
# Fit the final regressor on the whole training matrix using the hyper-parameters
# currently stored in `base_parameters`.
final_model = XGBRegressor(
    objective='reg:squarederror',
    random_state=0,
    **base_parameters,
)
final_model.fit(x_train, y_train)