# XGBoost 模板

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

## 1. 读取数据集，构建训练数据

In [2]:
data = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
data.dropna(axis=0, subset=['SalePrice'], inplace=True)

In [3]:
y = data.SalePrice
X = data.drop(['SalePrice'], axis=1).select_dtypes(exclude=['object'])
print(y.shape)
print(X.shape)

(1460,)
(1460, 37)


In [4]:
train_X, test_X, train_y, test_y = train_test_split(
    X, 
    y, 
    test_size=0.25
)

my_imputer = SimpleImputer()
train_X = my_imputer.fit_transform(train_X)
test_X = my_imputer.transform(test_X)

In [5]:
print(train_X.shape)
print(train_y.shape)

print(test_X.shape)
print(test_y.shape)

(1095, 37)
(1095,)
(365, 37)
(365,)


## 2. 定义XGBoost模型和超参数配置

In [6]:
my_model1 = XGBRegressor()
my_model2 = XGBRegressor(n_estimators=1000)
my_model3 = XGBRegressor(n_estimators=1000, learning_rate=0.05)


## 3. 模型训练

In [7]:
my_model1.fit(
    train_X, 
    train_y, 
    verbose=False
)

my_model2.fit(
    train_X, 
    train_y,
    eval_set=[(test_X, test_y)],
    verbose=False
)

my_model3.fit(
    train_X, 
    train_y,
    eval_set=[(test_X, test_y)],
    verbose=False
)


## 4. 使用模型预测数据

In [8]:
predictions1 = my_model1.predict(test_X)
predictions2 = my_model2.predict(test_X)
predictions3 = my_model3.predict(test_X)


error1 = mean_absolute_error(predictions1, test_y)
error2 = mean_absolute_error(predictions2, test_y)
error3 = mean_absolute_error(predictions3, test_y)


print(f"Mean Absolute Error : {error1}")
print(f"Mean Absolute Error : {error2}")
print(f"Mean Absolute Error : {error3}")

Mean Absolute Error : 18528.91368792808
Mean Absolute Error : 18560.811065924656
Mean Absolute Error : 17154.058604452053
