In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
plt.style.use("default")

In [2]:
from sklearn.datasets import load_boston

In [5]:
boston = load_boston()

In [6]:
boston.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])

In [7]:
boston.feature_names

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')

In [11]:
print(boston.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [12]:
df = pd.DataFrame(data=boston.data, columns=boston.feature_names)
df.head(3)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03


In [13]:
df["PRICE"] = boston.target
df.head(3)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7


In [14]:
df.shape

(506, 14)

In [15]:
from sklearn.model_selection import train_test_split

In [32]:
X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target, shuffle=True,
                                                    test_size=0.2, random_state=12)

In [17]:
import xgboost as xgb

In [18]:
xg_reg = xgb.XGBRegressor(objective ="reg:squarederror", colsample_bytree = 0.3, learning_rate = 0.1,
                          max_depth = 5, alpha = 10, n_estimators = 10)

In [33]:
xg_reg.fit(X_train, y_train)

XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.3, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=5,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=10, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=10, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [34]:
y_pred = xg_reg.predict(X_test)

In [23]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [35]:
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [36]:
print(f"mse: {mse:0.2f}, rmse: {rmse:0.2f}, mae: {mae:0.2f}, r2: {r2:0.2f}")

mse: 106.60, rmse: 10.32, mae: 8.49, r2: -0.30


### K-Fold CV

#### DMatrices
Instead of numpy arrays or pandas dataFrame, XGBoost uses DMatrices. A DMatrix can contain both the features and the target.

In [39]:
data_dmatrix = xgb.DMatrix(data=boston.data,label=boston.target)

In [45]:
params = {"objective":"reg:squarederror","colsample_bytree": 0.3,"learning_rate": 0.1,
          "max_depth": 5, "alpha": 10}

cv_results = xgb.cv(dtrain=data_dmatrix, params=params, nfold=3,
                    num_boost_round=50, early_stopping_rounds=10,
                    metrics=["rmse", "mae"], as_pandas=True, seed=123)

In [46]:
cv_results.head(3)

Unnamed: 0,train-rmse-mean,train-rmse-std,train-mae-mean,train-mae-std,test-rmse-mean,test-rmse-std,test-mae-mean,test-mae-std
0,21.750757,0.036152,19.87447,0.099426,21.765523,0.02885,19.879341,0.170341
1,19.778532,0.077649,17.923921,0.089126,19.83076,0.03176,17.960867,0.126975
2,18.05281,0.118633,16.168603,0.079432,18.157336,0.116038,16.242643,0.098806


In [47]:
cv_results.tail(3)

Unnamed: 0,train-rmse-mean,train-rmse-std,train-mae-mean,train-mae-std,test-rmse-mean,test-rmse-std,test-mae-mean,test-mae-std
47,2.358588,0.108396,1.646851,0.082153,4.027098,0.375358,2.662418,0.200856
48,2.330911,0.103723,1.631041,0.080232,4.023613,0.377495,2.664285,0.200703
49,2.289405,0.100094,1.607475,0.078872,3.99692,0.39378,2.649759,0.20555


### GridSearchCV

In [48]:
from sklearn.model_selection import GridSearchCV

In [50]:
estimator = xgb.XGBRegressor()
param_grid = {"colsample_bytree": [i/10.0 for i in range(2,6)],
              "learning_rate": [0.01, 0.1],
              "max_depth": [3, 4, 5], 
              "alpha": 10}

reg_xgb = GridSearchCV(estimator, param_grid, cv=5)

ValueError: Parameter grid for parameter (alpha) needs to be a list or numpy array, but got (<class 'int'>). Single values need to be wrapped in a list with one element.

https://blog.cambridgespark.com/hyperparameter-tuning-in-xgboost-4ff9100a3b2f