# 6.4.2 用XGBoost实现

## 1.业务理解

In [19]:
# 略，详见教材内容。本文件为教材《数据分析理论与实践：基于经典算法及Python编程实现》（朝乐门主编，机械工业出版社，2022年）的配套代码。
! pip install xgboost



## 2.数据读入

In [20]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

In [21]:
#数据导入 内置数据集（波士顿房价数据）
import pandas as pd
from sklearn.datasets import fetch_openml

def load_boston():
    # 从 OpenML 拉取同一份数据，字段完全一致
    data = fetch_openml(name="boston", version=1, as_frame=True)
    return data
boston = load_boston()

In [45]:
import sklearn
sklearn.__version__

'1.7.1'

In [23]:
#显示数据集字典的键
print(boston.keys())

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])


In [24]:
#数据集的完整描述
print(boston.DESCR)

**Author**:   
**Source**: Unknown - Date unknown  
**Please cite**:   

The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic
prices and the demand for clean air', J. Environ. Economics & Management,
vol.5, 81-102, 1978.   Used in Belsley, Kuh & Welsch, 'Regression diagnostics
...', Wiley, 1980.   N.B. Various transformations are used in the table on
pages 244-261 of the latter.
Variables in order:
CRIM     per capita crime rate by town
ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
INDUS    proportion of non-retail business acres per town
CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
NOX      nitric oxides concentration (parts per 10 million)
RM       average number of rooms per dwelling
AGE      proportion of owner-occupied units built prior to 1940
DIS      weighted distances to five Boston employment centres
RAD      index of accessibility to radial highways
TAX      full-value property-tax rate per $10

In [25]:
#数据集的特征名称
print(boston.feature_names)

['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']


In [26]:
#数据形状
print(boston.data.shape)

(506, 13)


In [27]:
#将数据（data）与回归目标（target）转化为数据框类型
X= pd.DataFrame(boston.data, columns=boston.feature_names)
y= pd.DataFrame(boston.target, columns=['MEDV'])

In [28]:
#合并数据框
df= pd.concat([X, y], axis=1)
df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273.0,21.0,393.45,6.48,22.0


In [29]:
help(load_boston)

Help on function load_boston in module __main__:

load_boston()



## 3.数据理解

In [30]:
#查看数据基本信息
df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   CRIM     506 non-null    float64 
 1   ZN       506 non-null    float64 
 2   INDUS    506 non-null    float64 
 3   CHAS     506 non-null    category
 4   NOX      506 non-null    float64 
 5   RM       506 non-null    float64 
 6   AGE      506 non-null    float64 
 7   DIS      506 non-null    float64 
 8   RAD      506 non-null    category
 9   TAX      506 non-null    float64 
 10  PTRATIO  506 non-null    float64 
 11  B        506 non-null    float64 
 12  LSTAT    506 non-null    float64 
 13  MEDV     506 non-null    float64 
dtypes: category(2), float64(12)
memory usage: 49.0 KB


In [31]:
#查看描述性统计信息
df.describe()

Unnamed: 0,CRIM,ZN,INDUS,NOX,RM,AGE,DIS,TAX,PTRATIO,B,LSTAT,MEDV
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.554695,6.284634,68.574901,3.795043,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.115878,0.702617,28.148861,2.10571,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.385,3.561,2.9,1.1296,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.449,5.8855,45.025,2.100175,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.538,6.2085,77.5,3.20745,330.0,19.05,391.44,11.36,21.2
75%,3.677083,12.5,18.1,0.624,6.6235,94.075,5.188425,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,0.871,8.78,100.0,12.1265,711.0,22.0,396.9,37.97,50.0


## 4.数据准备

In [32]:
#划分训练集与测试集
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y
                                                    ,test_size=0.3
                                                    ,random_state=42)  
X_test

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
173,0.09178,0.0,4.05,0,0.510,6.416,84.1,2.6463,5,296.0,16.6,395.50,9.04
274,0.05644,40.0,6.41,1,0.447,6.758,32.9,4.0776,4,254.0,17.6,396.90,3.53
491,0.10574,0.0,27.74,0,0.609,5.983,98.8,1.8681,4,711.0,20.1,390.11,18.07
72,0.09164,0.0,10.81,0,0.413,6.065,7.8,5.2873,4,305.0,19.2,390.91,5.52
452,5.09017,0.0,18.10,0,0.713,6.297,91.8,2.3682,24,666.0,20.2,385.09,17.27
...,...,...,...,...,...,...,...,...,...,...,...,...,...
441,9.72418,0.0,18.10,0,0.740,6.406,97.2,2.0651,24,666.0,20.2,385.96,19.52
23,0.98843,0.0,8.14,0,0.538,5.813,100.0,4.0952,4,307.0,21.0,394.54,19.88
225,0.52693,0.0,6.20,0,0.504,8.725,83.0,2.8944,8,307.0,17.4,382.00,4.63
433,5.58107,0.0,18.10,0,0.713,6.436,87.9,2.3158,24,666.0,20.2,100.19,16.22


## 5.模型训练

In [33]:
#模型训练
model=XGBRegressor(enable_categorical=True)
model.fit(X_train,y_train)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,True


In [34]:
help(XGBRegressor)

Help on class XGBRegressor in module xgboost.sklearn:

class XGBRegressor(sklearn.base.RegressorMixin, XGBModel)
 |  XGBRegressor(
 |      *,
 |      objective: Union[str, xgboost.sklearn._SklObjWProto, Callable[[Any, Any], Tuple[numpy.ndarray, numpy.ndarray]], NoneType] = 'reg:squarederror',
 |      **kwargs: Any
 |  ) -> None
 |
 |  Implementation of the scikit-learn API for XGBoost regression.
 |  See :doc:`/python/sklearn_estimator` for more information.
 |
 |  Parameters
 |  ----------
 |
 |      n_estimators : typing.Optional[int]
 |          Number of gradient boosted trees.  Equivalent to number of boosting
 |          rounds.
 |
 |      max_depth :  typing.Optional[int]
 |
 |          Maximum tree depth for base learners.
 |
 |      max_leaves : typing.Optional[int]
 |
 |          Maximum number of leaves; 0 indicates no limit.
 |
 |      max_bin : typing.Optional[int]
 |
 |          If using histogram-based algorithm, maximum number of bins per feature
 |
 |      grow_policy 

In [35]:
#默认参数模型预测结果y_pred
y_pred=model.predict(X_test)
y_pred

array([24.18011  , 32.275204 , 14.555893 , 24.992914 , 16.927103 ,
       23.385    , 19.331778 , 15.812956 , 21.038889 , 20.392742 ,
       21.269129 , 19.842466 ,  9.193318 , 23.79937  , 19.57628  ,
       27.833176 , 18.67142  ,  8.09512  , 48.03767  , 15.462807 ,
       24.653164 , 24.55675  , 13.912267 , 21.68695  , 15.782123 ,
       15.87253  , 22.29041  , 13.486744 , 18.161703 , 20.244566 ,
       20.744814 , 23.485144 , 21.100922 , 21.135052 , 14.800542 ,
       15.406718 , 32.747494 , 18.739288 , 21.79198  , 23.758926 ,
       19.429377 , 27.95749  , 48.053005 , 17.75301  , 22.03793  ,
       12.051628 , 15.410907 , 23.869837 , 18.488842 , 27.873665 ,
       22.098478 , 35.53837  , 16.430672 , 25.664473 , 49.743706 ,
       21.760406 , 15.794965 , 30.26715  , 22.968016 , 20.397764 ,
       25.110641 , 32.918003 , 30.133533 , 17.604536 , 23.806646 ,
       16.964783 , 12.726663 , 22.390095 , 28.68543  , 14.781225 ,
       20.18192  , 24.764055 , 10.638801 , 22.670666 , 23.0744

## 6.模型评价

In [36]:
#回归指标——平均绝对误差（MAE）、均方根误差（RMSE）、R_squared（决定系数）
print("MAE:",metrics.mean_absolute_error(y_test, y_pred))
print("RMSE:",metrics.mean_squared_error(y_test, y_pred))
    #True 返回 MSE 值，如果 False 返回 RMSE 值
print("R_squared:",metrics.r2_score(y_test, y_pred))

MAE: 2.0339202880859375
RMSE: 9.465873718261719
R_squared: 0.8729636073112488


## 7.模型调参

In [37]:
#以n_estimators、max_depth、subsample、colsample_bytree、learning_rate参数和值设置字典列表param_grid。设置cv参数值为5。
param_grid={'n_estimators':[50,150,300]
            ,'max_depth':[3,4,5,6]
            ,'subsample':[0.8,0.9,1]
            ,'colsample_bytree':[0.8,0.9,1]
            ,'learning_rate':[0.05,0.1,0.2,0.3]}

In [38]:
#调用GridSearchCV函数，进行5折交叉验证，对估计器XGBRegressor()的指定参数值param_grid进行详尽搜索，得到最终的最优模型参数。
n_folds=5
estimator=GridSearchCV(model
                       ,param_grid
                       ,cv=n_folds)
estimator.fit(X_train,y_train)

0,1,2
,estimator,"XGBRegressor(...ree=None, ...)"
,param_grid,"{'colsample_bytree': [0.8, 0.9, ...], 'learning_rate': [0.05, 0.1, ...], 'max_depth': [3, 4, ...], 'n_estimators': [50, 150, ...], ...}"
,scoring,
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,1
,device,
,early_stopping_rounds,
,enable_categorical,True


In [39]:
#通过best_estimator_属性，得出通过搜索选择的最高分（或最小损失）的估计量。
estimator.best_estimator_

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,1
,device,
,early_stopping_rounds,
,enable_categorical,True


## 8.模型预测

In [42]:
#调参后的模型训练
model1=XGBRegressor(
    n_estimators=150,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.9,
    learning_rate=0.2,
    enable_categorical=True
)
model1.fit(X_train,y_train)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.9
,device,
,early_stopping_rounds,
,enable_categorical,True


In [43]:
#调参后的模型预测结果y_pred1
y_pred1=model1.predict(X_test)
y_pred1

array([23.923468 , 31.730278 , 16.087963 , 22.972803 , 17.44532  ,
       22.813885 , 18.544043 , 14.14657  , 19.954084 , 18.767136 ,
       21.507803 , 20.461235 ,  5.617635 , 21.655096 , 19.674654 ,
       26.667122 , 20.584059 ,  8.93592  , 45.355225 , 15.081056 ,
       24.784899 , 24.600218 , 13.203917 , 22.050678 , 17.0832   ,
       16.682404 , 21.6432   , 12.623802 , 20.663229 , 20.79867  ,
       20.573254 , 22.696484 , 23.776262 , 20.754581 , 14.64874  ,
       15.807152 , 34.375088 , 18.389507 , 22.626825 , 22.907358 ,
       18.501038 , 27.92512  , 45.86971  , 19.579906 , 22.312283 ,
       14.442314 , 15.821603 , 23.915186 , 18.58936  , 26.53378  ,
       21.589775 , 37.9726   , 16.307587 , 25.862392 , 51.461933 ,
       21.716747 , 15.131218 , 32.310066 , 21.616747 , 19.182951 ,
       23.74493  , 34.064262 , 33.119793 , 18.372274 , 22.994488 ,
       17.561565 , 13.84247  , 23.071178 , 30.658049 , 15.456505 ,
       20.462421 , 28.74388  , 12.264974 , 20.107924 , 22.6837

In [46]:
#调参后的模型回归指标评价结果
mse  = metrics.mean_squared_error(y_test, y_pred1)
rmse = np.sqrt(mse) 
print("MAE1:",metrics.mean_absolute_error(y_test, y_pred1))
print("RMSE1:", rmse)
print("R_squared1:",metrics.r2_score(y_test, y_pred1))

MAE1: 1.992936372756958
RMSE1: 3.013686748079793
R_squared1: 0.8781112432479858
