### Import Libraries

In [2]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

### Import Dataset

In [3]:
# Load the Ames housing CSV, then preview every 200th row as a quick sanity check.
df = pd.read_csv("Ames_housing.csv")
df.iloc[::200]

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,Remodeled,GrLivArea,BsmtFullBath,BsmtHalfBath,...,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl,PavedDrive_P,PavedDrive_Y,SalePrice
0,60,65.0,8450,7,5,2003,0,1710,1,0,...,0,0,0,0,1,0,0,0,1,208500
200,20,80.0,8546,4,5,2003,1,1121,0,0,...,0,1,0,0,0,0,0,0,1,140000
400,120,38.0,14963,8,5,1996,0,1288,1,0,...,0,1,0,0,0,0,0,0,1,245500
600,60,74.0,10927,8,5,2005,0,1904,1,0,...,0,0,0,0,1,0,0,0,1,275000
800,60,79.0,12798,6,5,1997,0,1688,1,0,...,0,0,0,0,1,0,0,0,1,200000
1000,20,74.0,10206,3,3,1952,0,944,0,0,...,0,1,0,0,0,0,0,0,1,82000
1200,20,71.0,9353,4,5,1970,0,864,0,0,...,0,1,0,0,0,0,0,0,1,116050
1400,50,50.0,6000,6,7,1929,1,1158,0,0,...,0,0,0,0,0,0,0,0,1,120000


In [4]:
# Target: the sale price column.
y = df["SalePrice"]

# Features: every column except the target.
X = df.drop(columns=["SalePrice"])

### Untuned Model Example

In [5]:
# Baseline run: only the objective is set, and num_boost_round is not passed,
# so xgb.cv falls back to its library default number of rounds.
housing_dmatrix = xgb.DMatrix(data=X, label=y)

untuned_params = {
    "objective": "reg:squarederror",
}

# 4-fold cross-validation; the result is a DataFrame with one row per boosting round.
untuned_cv_results_rmse = xgb.cv(
    params=untuned_params,
    dtrain=housing_dmatrix,
    nfold=4,
    metrics="rmse",
    as_pandas=True,
    seed=123,
)

# Report the mean held-out RMSE of the last boosting round.
print("Untuned RMSE =", untuned_cv_results_rmse["test-rmse-mean"].iloc[-1])

Untuned RMSE = 34624.22998025


### Tuned Model Example

In [6]:
# Same 4-fold CV as the baseline, but with hand-picked hyperparameters and
# an explicit 200 boosting rounds.
housing_dmatrix = xgb.DMatrix(data=X, label=y)

tuned_params = {
    "objective": "reg:squarederror",
    "colsample_bytree": 0.3,
    "learning_rate": 0.1,
    "max_depth": 5,
}

tuned_cv_results_rmse = xgb.cv(
    params=tuned_params,
    dtrain=housing_dmatrix,
    num_boost_round=200,
    nfold=4,
    metrics="rmse",
    as_pandas=True,
    seed=123,
)

# Report the mean held-out RMSE of the last boosting round.
print("Tuned RMSE =", tuned_cv_results_rmse["test-rmse-mean"].iloc[-1])

Tuned RMSE = 29965.41113275


### Tuning the number of boosting rounds

In [7]:
# Wrap features and target in the DMatrix that xgb.cv consumes.
housing_dmatrix = xgb.DMatrix(data=X, label=y)

# Booster settings shared by every run in this sweep: parameters
parameters = {
    "objective": "reg:squarederror",
    "max_depth": 3,
}

# Candidate values for num_boost_round.
num_rounds = [5, 25, 50, 75, 100]

# Collects the last-round test RMSE of each run, in the same order as num_rounds.
final_rmse_per_round = []

# One 3-fold cross-validation run per candidate number of boosting rounds.
for n_rounds in num_rounds:
    cv_results = xgb.cv(
        params=parameters,
        dtrain=housing_dmatrix,
        num_boost_round=n_rounds,
        nfold=3,
        metrics="rmse",
        as_pandas=True,
        seed=123,
    )

    # The last row of the history is the score after all n_rounds rounds.
    final_rmse_per_round.append(cv_results["test-rmse-mean"].iloc[-1])

final_rmse_per_round

[50903.30078166667,
 31849.582031333335,
 30943.686198333333,
 30579.745442666666,
 30680.307942999996]

In [8]:
# Pair each round count with its RMSE and show the pairs as a table rounded to 2 decimals.
num_rounds_rmses = list(zip(num_rounds, final_rmse_per_round))
pd.DataFrame(
    data=num_rounds_rmses,
    columns=["num_boosting_rounds", "rmse"],
).round(2)

Unnamed: 0,num_boosting_rounds,rmse
0,5,50903.3
1,25,31849.58
2,50,30943.69
3,75,30579.75
4,100,30680.31


### Automated boosting round selection using early_stopping

In [9]:
# Wrap features and target in the DMatrix that xgb.cv consumes.
housing_dmatrix = xgb.DMatrix(data=X, label=y)

# Booster settings for this run: params
params = {
    "objective": "reg:squarederror",
    "max_depth": 4,
}

# 3-fold CV with at most 50 rounds and early_stopping_rounds=10.
# Per the XGBoost docs, training halts once the CV metric has not improved
# for that many consecutive rounds.
cv_results = xgb.cv(
    params=params,
    dtrain=housing_dmatrix,
    num_boost_round=50,
    early_stopping_rounds=10,
    nfold=3,
    metrics="rmse",
    as_pandas=True,
    seed=123,
)

# Report the last row of the returned history as the best score.
# NOTE(review): this relies on xgb.cv trimming the history at the best round - confirm.
print("Best Test RMSE Mean =", cv_results["test-rmse-mean"].iloc[-1])

Best Test RMSE Mean = 30720.854818000003


### Tuning eta

In [10]:
# Wrap features and target in the DMatrix that xgb.cv consumes.
housing_dmatrix = xgb.DMatrix(data=X, label=y)

# Base booster settings; "eta" is filled in on each loop iteration below.
params = {
    "objective": "reg:squarederror",
    "max_depth": 3,
}

# Learning rates to try, and the list that collects one RMSE per learning rate.
eta_vals = [0.001, 0.01, 0.1, 1]
best_rmse = []

# One 3-fold CV run (at most 10 rounds, early stopping after 5) per eta value.
for eta in eta_vals:
    params.update(eta=eta)

    cv_results = xgb.cv(
        params=params,
        dtrain=housing_dmatrix,
        num_boost_round=10,
        early_stopping_rounds=5,
        nfold=3,
        metrics="rmse",
        as_pandas=True,
        seed=123,
    )

    # Keep the last row of the CV history for this eta.
    best_rmse.append(cv_results["test-rmse-mean"].iloc[-1])

In [11]:
# Tabulate each eta value next to the RMSE recorded for it.
eta_rmse_pairs = list(zip(eta_vals, best_rmse))
pd.DataFrame(eta_rmse_pairs, columns=["eta", "best_rmse"])

Unnamed: 0,eta,best_rmse
0,0.001,195736.401042
1,0.01,179932.182292
2,0.1,79759.414063
3,1.0,37900.03125


### Tuning max_depth

In [12]:
# Wrap features and target in the DMatrix that xgb.cv consumes.
housing_dmatrix = xgb.DMatrix(data=X, label=y)

# Base booster settings; "max_depth" is filled in on each loop iteration below.
parameters = {"objective": "reg:squarederror"}

# Tree depths to try, and the list that collects one RMSE per depth.
max_depths = [2, 5, 10, 20]
best_rmse = []

# One 2-fold CV run (at most 10 rounds, early stopping after 5) per depth.
for depth in max_depths:
    parameters.update(max_depth=depth)

    cv_results = xgb.cv(
        params=parameters,
        dtrain=housing_dmatrix,
        num_boost_round=10,
        early_stopping_rounds=5,
        nfold=2,
        metrics="rmse",
        as_pandas=True,
        seed=123,
    )

    # Keep the last row of the CV history for this depth.
    best_rmse.append(cv_results["test-rmse-mean"].iloc[-1])

best_rmse

[37957.4687505, 35596.5996095, 36065.5468755, 36739.578125]

In [13]:
# Tabulate each max_depth value next to the RMSE recorded for it.
depth_rmse_pairs = list(zip(max_depths, best_rmse))
pd.DataFrame(depth_rmse_pairs, columns=["Max Depths", "RMSE"])

Unnamed: 0,Max Depths,RMSE
0,2,37957.46875
1,5,35596.59961
2,10,36065.546875
3,20,36739.578125


### Tuning colsample_bytree

In [14]:
# Wrap features and target in the DMatrix that xgb.cv consumes.
housing_dmatrix = xgb.DMatrix(data=X, label=y)

# Base booster settings; "colsample_bytree" is filled in on each loop iteration below.
parameters = {
    "objective": "reg:squarederror",
    "max_depth": 3,
}

# Column-sampling fractions to try, and the list that collects one RMSE per fraction.
colsample_bytree_vals = [0.1, 0.5, 0.8, 1]
best_rmse = []

# One 2-fold CV run (at most 10 rounds, early stopping after 5) per fraction.
for fraction in colsample_bytree_vals:
    parameters.update(colsample_bytree=fraction)

    cv_results = xgb.cv(
        params=parameters,
        dtrain=housing_dmatrix,
        num_boost_round=10,
        early_stopping_rounds=5,
        nfold=2,
        metrics="rmse",
        as_pandas=True,
        seed=123,
    )

    # Keep the last row of the CV history for this fraction.
    best_rmse.append(cv_results["test-rmse-mean"].iloc[-1])

best_rmse

[40918.1152345, 35813.90625, 35995.6796875, 35836.044922]

In [15]:
# Tabulate each colsample_bytree value next to the RMSE recorded for it.
colsample_rmse_pairs = list(zip(colsample_bytree_vals, best_rmse))
pd.DataFrame(colsample_rmse_pairs, columns=["Colsample_bytree", "RMSE"])

Unnamed: 0,Colsample_bytree,RMSE
0,0.1,40918.115235
1,0.5,35813.90625
2,0.8,35995.679688
3,1.0,35836.044922


### Grid Search

In [16]:
# Re-read the Ames housing CSV for the search sections and preview every 200th row.
df = pd.read_csv("Ames_housing.csv")
df.iloc[::200]

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,Remodeled,GrLivArea,BsmtFullBath,BsmtHalfBath,...,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl,PavedDrive_P,PavedDrive_Y,SalePrice
0,60,65.0,8450,7,5,2003,0,1710,1,0,...,0,0,0,0,1,0,0,0,1,208500
200,20,80.0,8546,4,5,2003,1,1121,0,0,...,0,1,0,0,0,0,0,0,1,140000
400,120,38.0,14963,8,5,1996,0,1288,1,0,...,0,1,0,0,0,0,0,0,1,245500
600,60,74.0,10927,8,5,2005,0,1904,1,0,...,0,0,0,0,1,0,0,0,1,275000
800,60,79.0,12798,6,5,1997,0,1688,1,0,...,0,0,0,0,1,0,0,0,1,200000
1000,20,74.0,10206,3,3,1952,0,944,0,0,...,0,1,0,0,0,0,0,0,1,82000
1200,20,71.0,9353,4,5,1970,0,864,0,0,...,0,1,0,0,0,0,0,0,1,116050
1400,50,50.0,6000,6,7,1929,1,1158,0,0,...,0,0,0,0,0,0,0,0,1,120000


In [17]:
# Target: the sale price column.
y = df["SalePrice"]

# Features: every column except the target.
X = df.drop(columns=["SalePrice"])

In [18]:
# Exhaustive search over learning_rate x subsample (4 * 3 = 12 candidates),
# with n_estimators fixed at 200.
# The scikit-learn wrapper is fitted directly on X and y, so no DMatrix is built here.
gbm_param_grid = {"learning_rate": [0.01, 0.1, 0.5, 0.9],
                  "n_estimators": [200],
                  "subsample": [0.3, 0.5, 0.9]}

gbm = xgb.XGBRegressor()

# 4-fold CV per candidate, scored with negated MSE (scikit-learn maximises scores).
grid_mse = GridSearchCV(estimator = gbm,
                        param_grid = gbm_param_grid,
                        scoring = "neg_mean_squared_error",
                        cv = 4,
                        verbose = 1)

grid_mse.fit(X, y)

print("Best Parameters =", grid_mse.best_params_, "\n")
# best_score_ is a negated mean MSE; take its magnitude and square root to report an RMSE.
print("Lower RMSE =", np.sqrt(np.abs(grid_mse.best_score_)))

Fitting 4 folds for each of 12 candidates, totalling 48 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed:  2.2min finished


Best Parameters = {'learning_rate': 0.1, 'n_estimators': 200, 'subsample': 0.5} 

Lower RMSE = 29105.179169382693


### Random Search

In [20]:
# Random search: sample 25 candidates from a grid of learning_rate and subsample
# values (each 0.05, 0.10, ..., 1.00), with n_estimators fixed at 200.
# The scikit-learn wrapper is fitted directly on X and y, so no DMatrix is built here.
gbm_param_grid = {"learning_rate": np.arange(0.05, 1.05, 0.05),
                  "n_estimators": [200],
                  "subsample": np.arange(0.05, 1.05, 0.05)}

gbm = xgb.XGBRegressor()

# random_state pins which 25 candidates are drawn, so the reported best parameters
# and RMSE are reproducible on rerun. The value 123 matches the seed used for xgb.cv
# elsewhere in this notebook.
randomized_mse = RandomizedSearchCV(estimator = gbm,
                                    param_distributions = gbm_param_grid,
                                    n_iter = 25,
                                    scoring = "neg_mean_squared_error",
                                    cv = 4,
                                    verbose = 1,
                                    random_state = 123)

randomized_mse.fit(X, y)

print("Best Parameters =", randomized_mse.best_params_, "\n")
# best_score_ is a negated mean MSE; take its magnitude and square root to report an RMSE.
print("Lower RMSE =", np.sqrt(np.abs(randomized_mse.best_score_)))

Fitting 4 folds for each of 25 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  3.3min finished


Best Parameters = {'subsample': 0.4, 'n_estimators': 200, 'learning_rate': 0.2} 

Lower RMSE = 29666.410368346937
