# 安装模块包

In [1]:
import pandas, sklearn, xgboost

# Print the installed versions so they can be compared with the pinned versions below.
print(pandas.__version__)
print(sklearn.__version__)
print(xgboost.__version__)

# The string literal below is not executed as shell commands; it only records the
# pip commands (using the Tsinghua PyPI mirror) that install the pinned versions.
"""
pip install pandas==1.1.4 -i https://pypi.tuna.tsinghua.edu.cn/simple/
pip install scikit-learn==1.2.2 -i https://pypi.tuna.tsinghua.edu.cn/simple/
pip install xgboost==2.0.3 -i https://pypi.tuna.tsinghua.edu.cn/simple/
"""

1.1.4
1.2.2
2.0.3


In [2]:
import pandas as pd

In [3]:
# Path to the Melbourne housing CSV (absolute, machine-specific path)
melbourne_file_path = "/home/lc/code/ai/input/melb_data.csv"
# Load the CSV into a DataFrame
melbourne_data = pd.read_csv(melbourne_file_path)

In [4]:
melbourne_data.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

墨尔本数据有一些缺失值（一些房屋的一些变量没有记录。）
我们将在后面的教程中学习处理缺失值。
您的爱荷华州数据在您使用的列中没有缺失值。
所以我们现在将采取最简单的选择，从我们的数据中删除含有缺失值的房屋。
现在不用担心这么多，尽管代码是：
dropna 删除缺失值（将 na 视为“不可用”）

In [5]:
melbourne_data = melbourne_data.dropna(axis=0)  # axis=0 drops rows (houses) containing any missing value, not columns

In [6]:
# Dot notation selects the column we want to predict, called the prediction target.
# By convention the prediction target is named y.
# Here the target is the sale price of each Melbourne house.
y = melbourne_data.Price

In [7]:
# Choose the features fed to the model (any column other than the target may serve as a feature)
melbourne_features = ["Rooms", "Bathroom", "Landsize", "Lattitude", "Longtitude"]

# By convention the feature data is named X
X = melbourne_data[melbourne_features]

In [8]:
# Quick look at the feature data used to predict prices: head() shows the first few rows
X.head()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
1,2,1.0,156.0,-37.8079,144.9934
2,3,2.0,134.0,-37.8093,144.9944
4,4,1.0,120.0,-37.8072,144.9941
6,3,2.0,245.0,-37.8024,144.9993
7,2,1.0,256.0,-37.806,144.9954


In [9]:
# Import the decision-tree regressor
from sklearn.tree import DecisionTreeRegressor

# The steps for building and using a model are:
# Define:   what type of model is it (decision tree or something else)? Other parameters of the model type are set here too.
# Fit:      capture patterns from the provided data. This is the heart of modelling.
# Predict:  just what it sounds like.
# Evaluate: determine how accurate the model's predictions are.

# Define the model. Fixing random_state makes every run give the same result
melbourne_model = DecisionTreeRegressor(random_state=1)

# Fit the model
melbourne_model.fit(X, y)

In [10]:
# In practice you would predict for new houses coming onto the market rather than houses whose price is already known.
# Here we predict for the first few training rows only to see how predict() works.
print("Making predictions for the following 5 houses:")
X.head()

Making predictions for the following 5 houses:


Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
1,2,1.0,156.0,-37.8079,144.9934
2,3,2.0,134.0,-37.8093,144.9944
4,4,1.0,120.0,-37.8072,144.9941
6,3,2.0,245.0,-37.8024,144.9993
7,2,1.0,256.0,-37.806,144.9954


In [11]:
print("The predictions are")
melbourne_model.predict(X.head())

The predictions are


array([1035000., 1465000., 1600000., 1876000., 1636000.])

In [12]:
# With a fitted model in hand we can compute the mean absolute error (MAE)
from sklearn.metrics import mean_absolute_error

# In-sample predictions: the model is scored on the same rows it was fitted on
predicted_home_prices = melbourne_model.predict(X)

# MAE = mean of |actual - predicted|. Computed once and reused for printing
# (the original computed it twice and discarded the first result).
in_sample_mae = mean_absolute_error(y, predicted_home_prices)
print(in_sample_mae)
"""使用训练数据进行模型评估容易出现过拟合(在训练数据表现好，在新数据表现差)"""

1115.7467183128902


'使用训练数据进行模型评估容易出现过拟合(在训练数据表现好，在新数据表现差)'

In [13]:
# scikit-learn's train_test_split splits the data into two parts.
# One part is used as training data to fit the model, the other as validation
# data for computing mean_absolute_error.
from sklearn.model_selection import train_test_split

# Split both the features and the target into training and validation sets.
# The split is driven by a random number generator; passing a fixed
# random_state guarantees the same split on every run.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)
# train_X: training features, used to fit the model
# val_X:   held-out features, fed to the fitted model to obtain predictions
# train_y: training targets (prices), used to fit the model
# val_y:   held-out targets (prices), compared with the predictions to get the error

# Define the model. A fixed random_state keeps the fitted tree, and therefore
# the validation MAE, reproducible between runs.
melbourne_model = DecisionTreeRegressor(random_state=1)
# Fit on the training split only
melbourne_model.fit(train_X, train_y)

# Predict prices for the validation split; the MAE itself is displayed by the next cell
val_predictions = melbourne_model.predict(val_X)

"""在上述代码中使用分割数据方法，让训练数据只用于训练，模型评估使用未出现的验证数据，提高泛化能力"""

'在上述代码中使用分割数据方法，让训练数据只用于训练，模型评估使用未出现的验证数据，提高泛化能力'

In [14]:
mean_absolute_error(val_y, val_predictions)

273596.5780073165

我们可以使用实用函数来帮助比较 max_leaf_nodes 不同值的 MAE 分数

In [15]:
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor


def get_mat(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation MAE of a decision tree limited to ``max_leaf_nodes`` leaves.

    The tree is fitted on (train_X, train_y) with random_state=0 and scored
    on (val_X, val_y) with mean_absolute_error.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    tree.fit(train_X, train_y)
    return mean_absolute_error(val_y, tree.predict(val_X))

我们可以使用 for 循环来比较使用不同 max_leaf_nodes 值构建的模型的准确性。

In [16]:
# Try tree sizes spanning several orders of magnitude and report the validation MAE of each
for max_leaf_nodes in [5, 50, 500, 5000]:
    my_mae = get_mat(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print(f"max leaf node: {max_leaf_nodes}, mae: {my_mae}")

max leaf node: 5, mae: 385696.54278937966
max leaf node: 50, mae: 279794.61143891385
max leaf node: 500, mae: 261718.1134423186
max leaf node: 5000, mae: 271320.97310092533


过拟合：训练过多，导致对训练数据预测精确对新数据预测表现差
欠拟合：训练过少，导致未捕捉到特征从而预测结果表现差

通过上述所得到的最佳节点，使用全部数据进行训练，得到预测值

In [17]:
# Leaf count with the lowest validation MAE in the comparison loop
best_node = 500
# Refit on all rows now that the tree size is chosen; use best_node rather than
# repeating the literal so the two cannot drift apart.
best_node_model = DecisionTreeRegressor(max_leaf_nodes=best_node, random_state=0)
best_node_model.fit(X, y)
# NOTE: this is an in-sample MAE (the model has already seen X and y), so it is
# optimistic and not comparable with the validation MAEs.
predicted_best = best_node_model.predict(X)
mean_absolute_error(y, predicted_best)

126642.40540214001

我们构建一个随机森林模型，类似于在 scikit-learn 中构建决策树的方式 - 这次使用 RandomForestRegressor 类而不是 DecisionTreeRegressor。

In [18]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Define the model
forest_model = RandomForestRegressor(random_state=0)
# Fit on the training split
forest_model.fit(train_X, train_y)
# Predict on the validation split and compute the validation MAE
melb_preds = forest_model.predict(val_X)
mae = mean_absolute_error(val_y, melb_preds)
mae

206868.39967967046

# 处理缺失值(null)

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the data
data = pd.read_csv("/home/lc/code/ai/input/melb_data.csv")

# Select the target
y = data.Price

# To keep things simple only numeric predictors are used
melb_predictors = data.drop(["Price"], axis="columns")
# Exclude non-numeric columns; the object dtype usually holds strings / categorical data
X = melb_predictors.select_dtypes(exclude=["object"])
# X therefore contains every non-object column except Price

# Split into training and validation sets
train_X, val_X, train_y, val_y = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

In [20]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error


# Helper used to compare the different missing-value strategies
def score_dataset(train_X, val_X, train_y, val_y):
    """Fit a 10-tree random forest (random_state=0) on the training split and return its validation MAE."""
    forest = RandomForestRegressor(n_estimators=10, random_state=0)
    forest.fit(train_X, train_y)
    return mean_absolute_error(val_y, forest.predict(val_X))

## 法1：删除缺失值的列

In [21]:
# Flag every column with at least one missing value in the training split.
# isnull() marks the missing cells; any() collapses each column to one boolean.
missing_mask = train_X.isnull().any()
cols_with_missing = list(missing_mask[missing_mask].index)

# Drop those columns from both splits
reduced_train_X = train_X.drop(columns=cols_with_missing)
reduced_val_X = val_X.drop(columns=cols_with_missing)

# Report the validation error
print(
    f"删除缺失值:\nmae={score_dataset(reduced_train_X, reduced_val_X, train_y, val_y)}"
)

删除缺失值:
mae=183550.22137772635


## 法2：简单插补

In [22]:
# Import the imputer
from sklearn.impute import SimpleImputer

# Imputer with its default strategy
my_imputer = SimpleImputer()
# fit():       learns per-column statistics (mean, median, most frequent value, ... depending on the strategy)
# transform(): fills missing values using the statistics learned by fit()
# The imputer returns a bare array, so the original column names AND row index
# are restored here; otherwise the frames get a fresh 0..n-1 index that no
# longer lines up with train_y / val_y.
imputed_train_X = pd.DataFrame(
    my_imputer.fit_transform(train_X), columns=train_X.columns, index=train_X.index
)
imputed_val_X = pd.DataFrame(
    my_imputer.transform(val_X), columns=val_X.columns, index=val_X.index
)

print(
    f"插补缺失值:\nmae={score_dataset(imputed_train_X, imputed_val_X, train_y, val_y)}"
)

插补缺失值:
mae=178166.46269899711


## 法3：插补扩展法(法2的基础上加入了额外的特征列表明哪些列存在缺失值)
> 一开始特征列值为True或False，后面进行填充后变成了1或0

In [23]:
# Impute missing values while also recording which values were imputed.
# Work on copies so the original splits are not modified.
train_X_plus = train_X.copy()
val_X_plus = val_X.copy()

# Add one boolean indicator column per column that had missing values
for col in cols_with_missing:
    train_X_plus[col + "_was_missing"] = train_X_plus[col].isnull()
    val_X_plus[col + "_was_missing"] = val_X_plus[col].isnull()

# Impute. The imputer returns a bare array, so column names and the row index
# are restored to keep the frames aligned with train_y / val_y.
my_imputer = SimpleImputer()
imputed_train_X_plus = pd.DataFrame(
    my_imputer.fit_transform(train_X_plus),
    columns=train_X_plus.columns,
    index=train_X_plus.index,
)
imputed_val_X_plus = pd.DataFrame(
    my_imputer.transform(val_X_plus),
    columns=val_X_plus.columns,
    index=val_X_plus.index,
)

print(
    f"插补扩展法缺失值:\nmae={score_dataset(imputed_train_X_plus, imputed_val_X_plus, train_y, val_y)}"
)

插补扩展法缺失值:
mae=178927.503183954


# 分类变量(object)

In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the data
data = pd.read_csv("/home/lc/code/ai/input/melb_data.csv")

# Select the features and the prediction target
X = data.drop(["Price"], axis="columns")
y = data.Price

# Split into training and validation sets
train_X, val_X, train_y, val_y = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

# Work on copies so the original splits are not modified
train_X_plus = train_X.copy()
val_X_plus = val_X.copy()

# Handle missing values by dropping every column that has any in the training split
columns_missing = [
    col for col in train_X_plus.columns if train_X_plus[col].isnull().any()
]
reduced_train_X = train_X_plus.drop(columns_missing, axis="columns")
reduced_val_X = val_X_plus.drop(columns_missing, axis="columns")

# "Cardinality" is the number of unique values in a column.
# Keep categorical columns with relatively low cardinality (convenient to encode).
low_cardinality_cols = [
    col
    for col in reduced_train_X.columns
    if reduced_train_X[col].nunique() < 10 and reduced_train_X[col].dtype == "object"
]
# nunique() returns the number of unique values in the column, i.e. its cardinality
# dtype == "object": in pandas the object dtype is what string columns are stored as

# Select the numeric columns
number_col = [
    col
    for col in reduced_train_X.columns
    if reduced_train_X[col].dtype in ["int64", "float64"]
]

# Build the working splits from the low-cardinality categorical columns plus the numeric columns
new_cols = low_cardinality_cols + number_col
new_train_X = train_X[new_cols].copy()
new_val_X = val_X[new_cols].copy()

In [25]:
new_train_X

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount
12167,u,S,Southern Metropolitan,1,5.0,3182.0,1.0,1.0,0.0,-37.85984,144.98670,13240.0
6524,h,SA,Western Metropolitan,2,8.0,3016.0,2.0,2.0,193.0,-37.85800,144.90050,6380.0
8413,h,S,Western Metropolitan,3,12.6,3020.0,3.0,1.0,555.0,-37.79880,144.82200,3755.0
2919,u,SP,Northern Metropolitan,3,13.0,3046.0,3.0,1.0,265.0,-37.70830,144.91580,8870.0
6043,h,S,Western Metropolitan,3,13.3,3020.0,3.0,1.0,673.0,-37.76230,144.82720,4217.0
...,...,...,...,...,...,...,...,...,...,...,...,...
13123,h,SP,Northern Metropolitan,3,5.2,3056.0,3.0,1.0,212.0,-37.77695,144.95785,11918.0
3264,h,S,Eastern Metropolitan,3,10.5,3081.0,3.0,1.0,748.0,-37.74160,145.04810,2947.0
9845,h,PI,Northern Metropolitan,4,6.7,3058.0,4.0,2.0,441.0,-37.73572,144.97256,11204.0
10799,h,S,Northern Metropolitan,3,12.0,3073.0,3.0,1.0,606.0,-37.72057,145.02615,21650.0


## 分类列

In [26]:
# Boolean Series indexed by column name: True where the column has object dtype
obj = new_train_X.dtypes == "object"
# obj[obj] keeps only the True entries; its index holds the categorical column names
object_cols = list(obj[obj].index)
print(object_cols)  # same columns as low_cardinality_cols

['Type', 'Method', 'Regionname']


定义衡量各种方法性能的函数

In [27]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error


def score_dataset(train_X, val_X, train_y, val_y):
    """Fit a random forest (default tree count, random_state=0) and return its validation MAE.

    Note: this redefines the earlier score_dataset helper, which used n_estimators=10.
    """
    forest = RandomForestRegressor(random_state=0)
    forest.fit(train_X, train_y)
    predictions = forest.predict(val_X)
    mae = mean_absolute_error(val_y, predictions)
    return mae

## 法1：删除分类列

In [28]:
# Baseline: discard every object (categorical) column and keep only the numeric ones
drop_train_X = new_train_X.select_dtypes(exclude=["object"])
drop_val_X = new_val_X.select_dtypes(exclude=["object"])

print(
    f"删除分类列分类变量:\nmae={score_dataset(drop_train_X, drop_val_X, train_y, val_y)}"
)

删除分类列分类变量:
mae=175703.48185157913


## 法2：OrdinalEncoder(为每一个分类对象转换一个等级，等级越高值越高)

In [29]:
from sklearn.preprocessing import OrdinalEncoder

# Work on copies so new_train_X / new_val_X are not modified
ordinal_train_X = new_train_X.copy()
ordinal_val_X = new_val_X.copy()

# Encode each categorical column (not each row) as integer codes
ordinal_encoder = OrdinalEncoder()
# Learn the category -> code mapping on the training split only, then reuse it on validation.
# NOTE(review): with default settings the encoder is expected to raise on a validation
# category never seen in training — confirm this is acceptable for these columns.
ordinal_train_X[object_cols] = ordinal_encoder.fit_transform(
    ordinal_train_X[object_cols]
)
ordinal_val_X[object_cols] = ordinal_encoder.transform(ordinal_val_X[object_cols])

print(
    f"OrdinalEncoder分类变量:\nmae={score_dataset(ordinal_train_X, ordinal_val_X, train_y, val_y)}"
)

OrdinalEncoder分类变量:
mae=165936.40548390493


## 法3：One-Hot Encoding(独热编码)最优
> 新建数据，列名为类别名称，列内容为1/0

In [30]:
from sklearn.preprocessing import OneHotEncoder

# Apply one-hot encoding to the categorical columns.
# handle_unknown="ignore" avoids errors when validation data contains categories absent from training.
# sparse_output=False returns the encoded columns as a numpy array instead of a sparse matrix.
OH_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
OH_train_col = pd.DataFrame(OH_encoder.fit_transform(new_train_X[object_cols]))
OH_val_col = pd.DataFrame(OH_encoder.transform(new_val_X[object_cols]))

# The new frames get a fresh integer index; put the original row index back so concat aligns rows
OH_train_col.index = new_train_X.index
OH_val_col.index = new_val_X.index

# Numeric part of the data: everything except the categorical columns
train_X_OH = new_train_X.drop(object_cols, axis="columns")
val_X_OH = new_val_X.drop(object_cols, axis="columns")

# Concatenate the one-hot columns (left) with the numeric columns (right)
OH_train_X = pd.concat([OH_train_col, train_X_OH], axis="columns")
OH_val_X = pd.concat([OH_val_col, val_X_OH], axis="columns")

# The one-hot columns are labelled with integers; make every column name a string
# TODO: always make sure column names are strings when preparing data
OH_train_X.columns = OH_train_X.columns.astype(str)
OH_val_X.columns = OH_val_X.columns.astype(str)

In [31]:
new_train_X

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount
12167,u,S,Southern Metropolitan,1,5.0,3182.0,1.0,1.0,0.0,-37.85984,144.98670,13240.0
6524,h,SA,Western Metropolitan,2,8.0,3016.0,2.0,2.0,193.0,-37.85800,144.90050,6380.0
8413,h,S,Western Metropolitan,3,12.6,3020.0,3.0,1.0,555.0,-37.79880,144.82200,3755.0
2919,u,SP,Northern Metropolitan,3,13.0,3046.0,3.0,1.0,265.0,-37.70830,144.91580,8870.0
6043,h,S,Western Metropolitan,3,13.3,3020.0,3.0,1.0,673.0,-37.76230,144.82720,4217.0
...,...,...,...,...,...,...,...,...,...,...,...,...
13123,h,SP,Northern Metropolitan,3,5.2,3056.0,3.0,1.0,212.0,-37.77695,144.95785,11918.0
3264,h,S,Eastern Metropolitan,3,10.5,3081.0,3.0,1.0,748.0,-37.74160,145.04810,2947.0
9845,h,PI,Northern Metropolitan,4,6.7,3058.0,4.0,2.0,441.0,-37.73572,144.97256,11204.0
10799,h,S,Northern Metropolitan,3,12.0,3073.0,3.0,1.0,606.0,-37.72057,145.02615,21650.0


In [32]:
OH_train_X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount
12167,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1,5.0,3182.0,1.0,1.0,0.0,-37.85984,144.98670,13240.0
6524,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,2,8.0,3016.0,2.0,2.0,193.0,-37.85800,144.90050,6380.0
8413,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3,12.6,3020.0,3.0,1.0,555.0,-37.79880,144.82200,3755.0
2919,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,3,13.0,3046.0,3.0,1.0,265.0,-37.70830,144.91580,8870.0
6043,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3,13.3,3020.0,3.0,1.0,673.0,-37.76230,144.82720,4217.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13123,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,3,5.2,3056.0,3.0,1.0,212.0,-37.77695,144.95785,11918.0
3264,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,3,10.5,3081.0,3.0,1.0,748.0,-37.74160,145.04810,2947.0
9845,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4,6.7,3058.0,4.0,2.0,441.0,-37.73572,144.97256,11204.0
10799,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3,12.0,3073.0,3.0,1.0,606.0,-37.72057,145.02615,21650.0


在数据集较为简单规模小的情况下，可以使用更为简单的get_dummies方法进行独热编码，该方法会自动识别分类列

In [33]:
# Work on copies so new_train_X / new_val_X are not modified
onehot_train_X = new_train_X.copy()
onehot_val_X = new_val_X.copy()

# One-hot encode; get_dummies detects the categorical columns on its own
onehot_train_X = pd.get_dummies(onehot_train_X)
onehot_val_X = pd.get_dummies(onehot_val_X)

# Required: give the validation frame exactly the training columns, in the
# training order. A category that never occurs in the validation split has no
# dummy column there; fill_value=0 creates it as all zeros instead of NaN.
# Aligning BEFORE the dtype conversion also guarantees every training column
# exists in the validation frame when it is indexed below.
onehot_train_X, onehot_val_X = onehot_train_X.align(
    onehot_val_X, join="left", axis="columns", fill_value=0
)

# Required: convert boolean dummy columns to int
bool_cols = onehot_train_X.select_dtypes(include=["bool"]).columns
onehot_train_X[bool_cols] = onehot_train_X[bool_cols].astype(int)
onehot_val_X[bool_cols] = onehot_val_X[bool_cols].astype(int)

# Make sure every column name is a string
# TODO: always make sure column names are strings when preparing data
onehot_train_X.columns = onehot_train_X.columns.astype(str)
onehot_val_X.columns = onehot_val_X.columns.astype(str)

你也可以通过指定分类列用get_dummies方法进行独热编码，但过程稍微复杂些

In [34]:
# The categorical columns are already known: object_cols
onehot_train_encoded = new_train_X[object_cols]
onehot_val_encoded = new_val_X[object_cols]

# One-hot encode the categorical columns into new DataFrames
onehot_train_X_encoded = pd.get_dummies(onehot_train_encoded, columns=object_cols)
onehot_val_encoded = pd.get_dummies(onehot_val_encoded, columns=object_cols)

# Numeric part of the data: everything except the categorical columns
onehot_train_X_nonobj = new_train_X.drop(columns=object_cols)
onehot_val_nonobj = new_val_X.drop(columns=object_cols)

# Concatenate the dummy columns (left) with the numeric columns (right)
final_onehot_train_X = pd.concat(
    [onehot_train_X_encoded, onehot_train_X_nonobj], axis="columns"
)
final_onehot_val_X = pd.concat([onehot_val_encoded, onehot_val_nonobj], axis="columns")

# Keep the feature columns consistent: the validation frame gets exactly the
# training columns. fill_value=0 turns a dummy column missing from validation
# into all zeros instead of NaN.
final_onehot_train_X, final_onehot_val_X = final_onehot_train_X.align(
    final_onehot_val_X, join="left", axis="columns", fill_value=0
)

# Convert boolean dummy columns to int. Done after alignment so every training
# column is guaranteed to exist in the validation frame.
bool_cols = final_onehot_train_X.select_dtypes(include=["bool"]).columns
final_onehot_train_X[bool_cols] = final_onehot_train_X[bool_cols].astype(int)
final_onehot_val_X[bool_cols] = final_onehot_val_X[bool_cols].astype(int)

# Make sure every column name is a string
# TODO: always make sure column names are strings when preparing data
final_onehot_train_X.columns = final_onehot_train_X.columns.astype(str)
final_onehot_val_X.columns = final_onehot_val_X.columns.astype(str)

第一种和第三种方法分类列在左，数据列在右
而第二种方法数据列在左，分类列在右
列顺序不同，导致第一、第三种方法的结果与第二种方法的结果不同

In [35]:
# Variant 1: scikit-learn OneHotEncoder
print(
    f"One-Hot Encoding分类变量:\nmae={score_dataset(OH_train_X, OH_val_X, train_y, val_y)}"
)

# Variant 2: pd.get_dummies with automatic detection of the categorical columns
print(
    f"One-Hot Encoding分类变量:\nmae={score_dataset(onehot_train_X, onehot_val_X, train_y, val_y)}"
)

# Variant 3: pd.get_dummies with the categorical columns given explicitly
print(
    f"One-Hot Encoding分类变量:\nmae={score_dataset(final_onehot_train_X, final_onehot_val_X, train_y, val_y)}"
)

One-Hot Encoding分类变量:
mae=165699.58889227855
One-Hot Encoding分类变量:
mae=166089.4893009678
One-Hot Encoding分类变量:
mae=165699.58889227855


In [36]:
OH_train_X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount
12167,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1,5.0,3182.0,1.0,1.0,0.0,-37.85984,144.98670,13240.0
6524,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,2,8.0,3016.0,2.0,2.0,193.0,-37.85800,144.90050,6380.0
8413,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3,12.6,3020.0,3.0,1.0,555.0,-37.79880,144.82200,3755.0
2919,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,3,13.0,3046.0,3.0,1.0,265.0,-37.70830,144.91580,8870.0
6043,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3,13.3,3020.0,3.0,1.0,673.0,-37.76230,144.82720,4217.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13123,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,3,5.2,3056.0,3.0,1.0,212.0,-37.77695,144.95785,11918.0
3264,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,3,10.5,3081.0,3.0,1.0,748.0,-37.74160,145.04810,2947.0
9845,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4,6.7,3058.0,4.0,2.0,441.0,-37.73572,144.97256,11204.0
10799,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3,12.0,3073.0,3.0,1.0,606.0,-37.72057,145.02615,21650.0


In [37]:
onehot_train_X

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount,Type_h,...,Method_SP,Method_VB,Regionname_Eastern Metropolitan,Regionname_Eastern Victoria,Regionname_Northern Metropolitan,Regionname_Northern Victoria,Regionname_South-Eastern Metropolitan,Regionname_Southern Metropolitan,Regionname_Western Metropolitan,Regionname_Western Victoria
12167,1,5.0,3182.0,1.0,1.0,0.0,-37.85984,144.98670,13240.0,0,...,0,0,0,0,0,0,0,1,0,0
6524,2,8.0,3016.0,2.0,2.0,193.0,-37.85800,144.90050,6380.0,1,...,0,0,0,0,0,0,0,0,1,0
8413,3,12.6,3020.0,3.0,1.0,555.0,-37.79880,144.82200,3755.0,1,...,0,0,0,0,0,0,0,0,1,0
2919,3,13.0,3046.0,3.0,1.0,265.0,-37.70830,144.91580,8870.0,0,...,1,0,0,0,1,0,0,0,0,0
6043,3,13.3,3020.0,3.0,1.0,673.0,-37.76230,144.82720,4217.0,1,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13123,3,5.2,3056.0,3.0,1.0,212.0,-37.77695,144.95785,11918.0,1,...,1,0,0,0,1,0,0,0,0,0
3264,3,10.5,3081.0,3.0,1.0,748.0,-37.74160,145.04810,2947.0,1,...,0,0,1,0,0,0,0,0,0,0
9845,4,6.7,3058.0,4.0,2.0,441.0,-37.73572,144.97256,11204.0,1,...,0,0,0,0,1,0,0,0,0,0
10799,3,12.0,3073.0,3.0,1.0,606.0,-37.72057,145.02615,21650.0,1,...,0,0,0,0,1,0,0,0,0,0


In [38]:
final_onehot_train_X

Unnamed: 0,Type_h,Type_t,Type_u,Method_PI,Method_S,Method_SA,Method_SP,Method_VB,Regionname_Eastern Metropolitan,Regionname_Eastern Victoria,...,Regionname_Western Victoria,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount
12167,0,0,1,0,1,0,0,0,0,0,...,0,1,5.0,3182.0,1.0,1.0,0.0,-37.85984,144.98670,13240.0
6524,1,0,0,0,0,1,0,0,0,0,...,0,2,8.0,3016.0,2.0,2.0,193.0,-37.85800,144.90050,6380.0
8413,1,0,0,0,1,0,0,0,0,0,...,0,3,12.6,3020.0,3.0,1.0,555.0,-37.79880,144.82200,3755.0
2919,0,0,1,0,0,0,1,0,0,0,...,0,3,13.0,3046.0,3.0,1.0,265.0,-37.70830,144.91580,8870.0
6043,1,0,0,0,1,0,0,0,0,0,...,0,3,13.3,3020.0,3.0,1.0,673.0,-37.76230,144.82720,4217.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13123,1,0,0,0,0,0,1,0,0,0,...,0,3,5.2,3056.0,3.0,1.0,212.0,-37.77695,144.95785,11918.0
3264,1,0,0,0,1,0,0,0,1,0,...,0,3,10.5,3081.0,3.0,1.0,748.0,-37.74160,145.04810,2947.0
9845,1,0,0,1,0,0,0,0,0,0,...,0,4,6.7,3058.0,4.0,2.0,441.0,-37.73572,144.97256,11204.0
10799,1,0,0,0,1,0,0,0,0,0,...,0,3,12.0,3073.0,3.0,1.0,606.0,-37.72057,145.02615,21650.0


# Pipelines 管道


许多数据科学家在没有管道的情况下组合模型，但管道有一些重要的好处。其中包括：
更清晰的代码：在预处理的每个步骤中计算数据可能会变得混乱。使用管道，您无需在每个步骤中手动跟踪训练和验证数据。
错误更少：误用步骤或忘记预处理步骤的机会更少。
更容易生产：将模型从原型转变为可大规模部署的模型可能非常困难。我们不会在这里讨论许多相关的问题，但管道可以提供帮助。


In [39]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the data
data = pd.read_csv("/home/lc/code/ai/input/melb_data.csv")

# Select the target and the features
y = data.Price
X = data.drop(["Price"], axis="columns")

# Split into training and validation sets
train_X, val_X, train_y, val_y = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

# Categorical (object) columns with low cardinality: fewer than 10 unique values
categorical_cols = [
    cols
    for cols in train_X.columns
    if train_X[cols].nunique() < 10 and train_X[cols].dtype == "object"
]

# Numeric columns
numerical_cols = [
    cols for cols in train_X.columns if train_X[cols].dtype in ["int64", "float64"]
]

# Keep only the selected columns, categorical first and numeric after
final_cols = categorical_cols + numerical_cols
train_X = train_X[final_cols].copy()
val_X = val_X[final_cols].copy()

In [40]:
train_X

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
12167,u,S,Southern Metropolitan,1,5.0,3182.0,1.0,1.0,1.0,0.0,,1940.0,-37.85984,144.98670,13240.0
6524,h,SA,Western Metropolitan,2,8.0,3016.0,2.0,2.0,1.0,193.0,,,-37.85800,144.90050,6380.0
8413,h,S,Western Metropolitan,3,12.6,3020.0,3.0,1.0,1.0,555.0,,,-37.79880,144.82200,3755.0
2919,u,SP,Northern Metropolitan,3,13.0,3046.0,3.0,1.0,1.0,265.0,,1995.0,-37.70830,144.91580,8870.0
6043,h,S,Western Metropolitan,3,13.3,3020.0,3.0,1.0,2.0,673.0,673.0,1970.0,-37.76230,144.82720,4217.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13123,h,SP,Northern Metropolitan,3,5.2,3056.0,3.0,1.0,2.0,212.0,,,-37.77695,144.95785,11918.0
3264,h,S,Eastern Metropolitan,3,10.5,3081.0,3.0,1.0,1.0,748.0,101.0,1950.0,-37.74160,145.04810,2947.0
9845,h,PI,Northern Metropolitan,4,6.7,3058.0,4.0,2.0,2.0,441.0,255.0,2002.0,-37.73572,144.97256,11204.0
10799,h,S,Northern Metropolitan,3,12.0,3073.0,3.0,1.0,1.0,606.0,,,-37.72057,145.02615,21650.0


数据包含分类数据和具有缺失值的列，使用管道可以解决上述问题

第一步：定义预处理部分

1. 估算数值数据中的缺失值
2. 估算缺失值并对分类数据应用 one-hot 编码。


Optuna选择超参数时，以下为最佳超参数
'n_estimators': 126
'max_depth': 14
'max_features': 0.6824672350008439,
'min_samples_split': 7
'min_samples_leaf': 1

In [41]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Numeric preprocessing: replace missing values with a constant
# (no fill_value is given; scikit-learn documents 0 as the default for numeric data)
numerical_transformer = SimpleImputer(strategy="constant")

# Categorical preprocessing: fill missing values with the most frequent category, then one-hot encode
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Bundle the numeric and categorical preprocessing, each applied to its own column list
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

第二步：定义模型

In [42]:
from sklearn.ensemble import RandomForestRegressor

# Only n_estimators is taken from the Optuna result listed in the text before this cell;
# the remaining tuned hyperparameters are kept commented out, so scikit-learn defaults apply.
model = RandomForestRegressor(
    n_estimators=126,
    # max_depth=14,
    # max_features=0.6824672350008439,
    # min_samples_split=7,
    # min_samples_leaf=1,
    random_state=0,
)

第三步：创建并评估管道
- 通过管道，我们预处理训练数据并在一行代码中拟合模型。 （相反，如果没有管道，我们必须在单独的步骤中进行插补、one-hot 编码和模型训练。如果我们必须处理数值变量和分类变量，这会变得特别混乱！）

- 通过管道，我们将 X_valid 中未处理的特征提供给 Predict() 命令，管道在生成预测之前自动预处理这些特征。 （但是，如果没有管道，我们必须记住在进行预测之前对验证数据进行预处理。）


In [43]:
from sklearn.metrics import mean_absolute_error

# Bundle the preprocessing and the model into one pipeline
my_pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])

# Preprocess the training data and fit the model in a single call
my_pipeline.fit(train_X, train_y)

# Preprocess the raw validation data and get predictions
preds = my_pipeline.predict(val_X)

# Evaluate the model
score = mean_absolute_error(val_y, preds)
print("mae:", score)

mae: 160099.51368531375


# 交叉验证
- 对于小型数据集，额外的计算负担并不是什么大问题，您应该运行交叉验证。
- 对于较大的数据集，单个验证集就足够了。您的代码将运行得更快，并且您可能拥有足够的数据，几乎不需要重复使用其中的部分数据来做留出验证。

我们将输入数据加载到 X 中，将输出数据加载到 y 中。

In [44]:
import pandas as pd

# Load the data
data = pd.read_csv("/home/lc/code/ai/input/melb_data.csv")

# Select the feature columns
cols_to_use = ["Rooms", "Distance", "Landsize", "BuildingArea", "YearBuilt"]
X = data[cols_to_use]

# Select the target
y = data.Price

然后，我们定义一个管道，使用输入器来填充缺失值，并使用随机森林模型来进行预测。
虽然可以在没有管道的情况下进行交叉验证，但这非常困难！使用管道将使代码变得非常简单。

In [45]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Imputation and model in one pipeline, so each cross-validation fold fits the
# imputer on that fold's training part only
my_pipeline = Pipeline(
    steps=[
        ("preprocessor", SimpleImputer()),
        ("model", RandomForestRegressor(n_estimators=50, random_state=0)),
    ]
)

我们使用 scikit-learn 中的 cross_val_score() 函数获取交叉验证分数。我们使用 cv 参数设置折叠次数。

In [46]:
from sklearn.model_selection import cross_val_score

# Multiply by -1 because the "neg_mean_absolute_error" scorer returns the negative MAE
scores = -1 * cross_val_score(
    my_pipeline, X, y, cv=5, scoring="neg_mean_absolute_error"
)

print("MAE scores:\n", scores)

MAE scores:
 [301628.7893587  303164.4782723  287298.331666   236061.84754543
 260383.45111427]


将交叉验证评估模型的方法获得的预测分数求平均值

In [47]:
print("Average MAE score (across experiments):")
print(scores.mean())

Average MAE score (across experiments):
277707.3795913405


# XGBoost
## 梯度提升
- 首先，我们使用当前的集合来为数据集中的每个观察生成预测。为了进行预测，我们将集合中所有模型的预测相加。
- 这些预测用于计算损失函数（例如均方误差）
- 然后，我们使用损失函数来拟合将添加到集成中的新模型。具体来说，我们确定模型参数，以便将这个新模型添加到集成中将减少损失。 （旁注：“梯度提升”中的“梯度”指的是我们将在损失函数上使用梯度下降来确定这个新模型中的参数。）
- 最后，我们将新模型添加到集成中，然后......
- ... 重复！

In [50]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the data
data = pd.read_csv("/home/lc/code/ai/input/melb_data.csv")

# Select subset of predictors
cols_to_use = ["Rooms", "Distance", "Landsize", "BuildingArea", "YearBuilt"]
X = data[cols_to_use]

# Select target
y = data.Price

# Separate data into training and validation sets. A fixed random_state keeps
# the split, and every MAE computed from it afterwards, reproducible between runs.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=0)

导入 XGBoost，这使我们能够像在 scikit-learn 中一样构建和拟合模型。

In [52]:
from xgboost import XGBRegressor

# Baseline gradient-boosted model with all-default hyperparameters
my_model = XGBRegressor()
my_model.fit(X_train, y_train)

进行预测并评估模型

In [53]:
from sklearn.metrics import mean_absolute_error

predictions = my_model.predict(X_valid)
# mean_absolute_error is declared as (y_true, y_pred); pass the arguments in that order
print("Mean Absolute Error: " + str(mean_absolute_error(y_valid, predictions)))

Mean Absolute Error: 241758.91595636966


# 调整超参数
## n_estimators
n_estimators 指定经历上述建模周期的次数。它等于我们包含在集成中的模型数量。
- 值太低会导致欠拟合，从而导致训练数据和测试数据的预测不准确。
- 太高的值会导致过拟合，从而导致对训练数据的预测准确，但对测试数据的预测不准确（这是我们关心的）。

In [54]:
# Run 500 boosting rounds
my_model = XGBRegressor(n_estimators=500)
my_model.fit(X_train, y_train)

## early_stopping_rounds
early_stopping_rounds 提供了一种自动查找 n_estimators 理想值的方法。当验证分数停止提高时，提前停止会导致模型停止迭代，即使我们还没有达到 n_estimators 的硬性上限。明智的做法是为 n_estimators 设置一个较高的值，然后使用 early_stopping_rounds 来找到停止迭代的最佳时间。
> 使用early_stopping_rounds时，您还需要留出一些数据来计算验证分数 - 这是通过设置eval_set参数来完成的。

In [55]:
# n_estimators is only an upper bound here: early_stopping_rounds=5 lets
# training stop early based on the score on eval_set.
my_model = XGBRegressor(n_estimators=500, early_stopping_rounds=5)
my_model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)

predictions = my_model.predict(X_valid)
# mean_absolute_error is declared as (y_true, y_pred); pass the arguments in that order.
# NOTE: (X_valid, y_valid) also drove early stopping, so this MAE is slightly optimistic.
print("Mean Absolute Error: " + str(mean_absolute_error(y_valid, predictions)))

Mean Absolute Error: 245753.20658136965


## learning_rate
我们不是通过简单地将每个组件模型的预测相加来获得预测，而是可以将每个模型的预测乘以一个小数（称为学习率），然后再将它们相加。
这意味着我们添加到集合中的每棵树对我们的帮助都会减少。因此，我们可以为 n_estimators 设置更高的值而不会过度拟合。如果我们使用提前停止，则会自动确定适当的树木数量。
一般来说，较小的学习率和大量的估计器将产生更准确的 XGBoost 模型，但模型的训练时间也会更长，因为它在循环中进行了更多的迭代。在本笔记本使用的 XGBoost 2.0.3 中，未显式指定时 learning_rate 取默认值 0.3。

In [56]:
# Smaller learning rate combined with a larger cap on the number of trees;
# early stopping picks the actual number of rounds.
my_model = XGBRegressor(n_estimators=1000, early_stopping_rounds=5, learning_rate=0.05)
my_model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)

predictions = my_model.predict(X_valid)
# mean_absolute_error is declared as (y_true, y_pred); pass the arguments in that order
print("Mean Absolute Error: " + str(mean_absolute_error(y_valid, predictions)))

Mean Absolute Error: 240273.052195324


## n_jobs
在考虑运行时间的较大数据集上，您可以使用并行性来更快地构建模型。通常将参数 n_jobs 设置为等于计算机上的核心数。对于较小的数据集，这没有帮助。
生成的模型不会更好，因此对拟合时间进行微观优化通常只会分散注意力。但是，它在大型数据集中非常有用，否则您将在 fit 命令期间等待很长时间。

In [57]:
# Same model as before, with n_jobs=4 to build it using 4 parallel threads
my_model = XGBRegressor(
    n_estimators=1000, early_stopping_rounds=5, learning_rate=0.05, n_jobs=4
)
my_model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)

predictions = my_model.predict(X_valid)
# mean_absolute_error is declared as (y_true, y_pred); pass the arguments in that order
print("Mean Absolute Error: " + str(mean_absolute_error(y_valid, predictions)))

Mean Absolute Error: 240273.052195324


# Data Leakage
泄漏会导致模型看起来很准确，直到您开始使用模型做出决策，然后模型就会变得非常不准确。

## Target leakage(目标泄露)
根据以下例子可以很好的表示出目标泄露

| got_pneumonia | age | weight | male | took_antibiotic_medicine |
|---------------|-----|--------|------|--------------------------|
| False         |  65 |  100   | False| False                    |
| False         |  72 |  130   | True | False                    |
| True          |  58 |  100   | False| True                     |

人们在患肺炎后服用抗生素药物（took_antibiotic_medicine）才能康复。原始数据显示这些列之间存在很强的关系，但 took_antibiotic_medicine 的值经常是在 got_pneumonia 的值确定之后才发生更改的。这就是目标泄漏。
该模型会发现，任何 took_antibiotic_medicine 值为 False 的人都没有患有肺炎。由于验证数据与训练数据来自同一来源，因此该模式将在验证中重复，并且模型将具有很高的验证（或交叉验证）分数。
但当随后在现实世界中部署时，该模型将非常不准确，因为当我们需要预测他们未来的健康状况时，即使是患有肺炎的患者也不会接受抗生素治疗。
为了防止这种类型的数据泄漏，应排除在实现目标值后更新（或创建）的任何变量。

## Train-Test Contamination(训练测试污染)
当您不小心区分训练数据和验证数据时，就会发生另一种类型的泄漏。
回想一下，验证旨在衡量模型如何处理之前未考虑过的数据。如果验证数据影响预处理行为，您可能会以微妙的方式破坏此过程。这有时称为训练测试污染。
例如，假设您在调用 train_test_split() 之前运行预处理（例如为缺失值拟合输入器）。最终结果？您的模型可能会获得良好的验证分数，让您对它充满信心，但在部署它来做出决策时却表现不佳。
毕竟，您将验证或测试数据中的数据合并到预测中，因此即使无法推广到新数据，也可能在该特定数据上表现良好。当您进行更复杂的特征工程时，这个问题变得更加微妙（也更危险）。
如果您的验证基于简单的训练测试分割，请从任何类型的拟合中排除验证数据，包括预处理步骤的拟合。如果您使用 scikit-learn 管道，这会更容易。使用交叉验证时，在管道内进行预处理更为重要！

将学习一种检测和消除目标泄漏的方法。

In [58]:
import pandas as pd

# Read the data; true_values / false_values tell pandas to parse "yes" / "no" as booleans
data = pd.read_csv(
    "/home/lc/code/ai/input/AER_credit_card_data.csv",
    true_values=["yes"],
    false_values=["no"],
)

# Select target (the card column)
y = data.card

# Select predictors (every column except card)
X = data.drop(["card"], axis=1)

print("Number of rows in the dataset:", X.shape[0])
X.head()

Number of rows in the dataset: 1319


Unnamed: 0,reports,age,income,share,expenditure,owner,selfemp,dependents,months,majorcards,active
0,0,37.66667,4.52,0.03327,124.9833,True,False,3,54,1,12
1,0,33.25,2.42,0.005217,9.854167,False,False,3,34,1,13
2,0,33.66667,4.5,0.004156,15.0,True,False,4,58,1,5
3,0,30.5,2.54,0.065214,137.8692,False,False,0,25,1,7
4,0,32.16667,9.7867,0.067051,546.5033,True,False,2,64,1,5


由于这是一个小数据集，我们将使用交叉验证来确保模型质量的准确测量

In [59]:
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# There is no preprocessing, so a pipeline is not strictly needed; it is used anyway as best practice.
# A fixed random_state makes the forest, and therefore the cross-validation accuracy, reproducible.
my_pipeline = make_pipeline(RandomForestClassifier(n_estimators=100, random_state=0))
cv_scores = cross_val_score(my_pipeline, X, y, cv=5, scoring="accuracy")

print("Cross-validation accuracy: %f" % cv_scores.mean())

Cross-validation accuracy: 0.980294


根据经验，您会发现很难找到准确率达到 98% 的模型。这种情况确实发生过，但这种情况并不常见，因此我们应该更仔细地检查数据是否存在目标泄漏。

以下是数据摘要，您也可以在数据选项卡下找到：

- card：如果接受信用卡申请则为 1，如果不接受则为 0
- reports：重大不良信用记录的数量
- age：年龄，以岁为单位，不足一岁的部分按十二分之一岁（月）计
- income：年收入（除以10,000）
- share：每月信用卡支出与年收入的比率
- expenditure：平均每月信用卡支出
- owner：如果拥有房屋，则为 1；如果租房，则为 0
- selfemp：如果是自雇人士，则为 1；如果不是，则为 0
- dependents：1 + 家属人数
- months：在当前地址居住的月数
- majorcards：持有的主要信用卡数量
- active：活跃信用账户数量

一些变量看起来很可疑。例如，支出是指这张卡上的支出还是申请前使用过的卡上的支出？

In [60]:
# Expenditure of cardholders: y is used as a boolean mask (card parsed from "yes"/"no")
expenditures_cardholders = X.expenditure[y]
# Expenditure of non-cardholders: the inverted mask
expenditures_noncardholders = X.expenditure[~y]

# Share of each group whose expenditure is exactly zero
print(
    "未收到卡且没有支出的人所占比例: %.2f" % ((expenditures_noncardholders == 0).mean())
)
print("收到卡但没有支出的人所占比例: %.2f" % ((expenditures_cardholders == 0).mean()))

未收到卡且没有支出的人所占比例: 1.00
收到卡但没有支出的人所占比例: 0.02


所有没有收到卡的人都没有支出，而只有2%的收到卡的人没有支出。我们的模型似乎具有很高的准确性，这并不奇怪。但这似乎也是一种目标泄漏的情况，其中支出可能意味着他们申请的卡上的支出。

由于 share 由 expenditure 决定，因此也应排除在外。变量 active 和 majorcards 不太清楚，但从描述来看，它们听起来令人担忧。在大多数情况下，如果您无法追踪数据创建者以了解更多信息，那么安全总比后悔好。

运行一个没有目标泄漏的模型

In [61]:
# Drop the predictors suspected of leaking the target
potential_leaks = ["expenditure", "share", "active", "majorcards"]
X2 = X.drop(potential_leaks, axis=1)

# Re-evaluate the same pipeline with the leaky predictors removed
cv_scores = cross_val_score(my_pipeline, X2, y, cv=5, scoring="accuracy")

print("Cross-val accuracy: %f" % cv_scores.mean())

Cross-val accuracy: 0.829410
