# 使用heamy模块进行模型在线融合


In [2]:
import pandas as pd
import numpy as np
import warnings
import os
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')

In [3]:
import seaborn as sns
import matplotlib.pyplot as plt

# Plot styling (seaborn + matplotlib).
#
# The theme is applied in ONE call. The previous sequence ended with a bare
# `sns.set(font='SimHei')`, which re-applies seaborn's default style and
# context and therefore silently discarded the "whitegrid" / "talk" choices
# made just before it.
#   style   : one of darkgrid (seaborn default), whitegrid, dark, white, ticks
#   context : one of paper, notebook (seaborn default), talk, poster (small -> large)
#   font    : SimHei, so that Chinese labels render
sns.set(style='whitegrid', context='talk', font='SimHei')

# rcParams are set AFTER the theme call so that it cannot overwrite them.
# Chinese font (SimHei) for sans-serif text.
plt.rcParams['font.sans-serif'] = ['SimHei']
# Draw the minus sign '-' correctly instead of as an empty box in figures.
plt.rcParams['axes.unicode_minus'] = False

数据读取

In [4]:
from utils_jessie import reduce_mem_usage

# Read the modelling table and pass it straight through the project helper
# reduce_mem_usage (its printed report shows the frame's memory usage
# before and after the optimisation).
df_data = reduce_mem_usage(pd.read_csv('user_data/data_for_model.csv'))

Memory usage of dataframe is 440.60 MB
Memory usage after optimization is: 288.01 MB
Decreased by 34.6%


数据处理

In [5]:
# Split the combined table into its labelled and unlabelled parts.
# The id, issue date and target columns are not used as model inputs.
features = [col for col in df_data.columns if col not in ('id', 'issueDate', 'isDefault')]

# Rows with a known `isDefault` form the training set; rows where it is
# missing form the test set. Both get a fresh 0..n-1 index.
has_label = df_data['isDefault'].notnull()
train = df_data[has_label].reset_index(drop=True)
test = df_data[~has_label].reset_index(drop=True)

X_train, X_test = train[features], test[features]
y_train = train['isDefault']

In [6]:
from model_utils import xgb_model,lgb_model

In [7]:
from heamy.dataset import Dataset
from heamy.estimator import Classifier

# One shared heamy Dataset feeds both first-level learners.
model_dataset = Dataset(X_train=X_train, y_train=y_train, X_test=X_test)

# Wrap the two project-defined estimators (imported from model_utils) as heamy
# classifiers; caching is switched off for both.
model_xgb, model_lgb = (
    Classifier(dataset=model_dataset, estimator=base_estimator, name=tag, use_cache=False)
    for base_estimator, tag in ((xgb_model, 'xgb'), (lgb_model, 'lgb'))
)


## 使用stacking 方法进行模型融合

In [8]:
from heamy.pipeline import ModelsPipeline

# Bundle the two first-level learners into a single pipeline object; the
# stacking and blending sections below both call methods on it.
base_learners = [model_xgb, model_lgb]
pipeline = ModelsPipeline(*base_learners)
pipeline

<heamy.pipeline.ModelsPipeline at 0x25b55813520>

In [9]:
# Build the first-layer meta-features by stacking.
# Per the original author's note: k (default 5) is the number of
# cross-validation folds, and full_test=True means the base learners are
# trained on the whole training set and then used to predict the test set,
# which yields the new test features.
stack_ds = pipeline.stack(
    k=5,
    seed=111,
    full_test=True,
)

[0]	train-auc:0.69328	eval-auc:0.69336
[200]	train-auc:0.72887	eval-auc:0.72258
[400]	train-auc:0.73668	eval-auc:0.72614
[600]	train-auc:0.74262	eval-auc:0.72811
[800]	train-auc:0.74741	eval-auc:0.72919
[1000]	train-auc:0.75174	eval-auc:0.72982
[1200]	train-auc:0.75569	eval-auc:0.73016
[1400]	train-auc:0.75959	eval-auc:0.73059
[1600]	train-auc:0.76310	eval-auc:0.73081
[1800]	train-auc:0.76664	eval-auc:0.73099
[2000]	train-auc:0.77010	eval-auc:0.73103
[2176]	train-auc:0.77298	eval-auc:0.73102
调参后xgboost单模型在验证集上的AUC：0.7310387122418293
[0]	train-auc:0.69476	eval-auc:0.69305
[200]	train-auc:0.72904	eval-auc:0.72335
[400]	train-auc:0.73701	eval-auc:0.72687
[600]	train-auc:0.74273	eval-auc:0.72852
[800]	train-auc:0.74759	eval-auc:0.72969
[1000]	train-auc:0.75187	eval-auc:0.73032
[1200]	train-auc:0.75594	eval-auc:0.73085
[1400]	train-auc:0.75971	eval-auc:0.73106
[1600]	train-auc:0.76317	eval-auc:0.73120
[1800]	train-auc:0.76646	eval-auc:0.73136
[2000]	train-auc:0.76989	eval-auc:0.73143
[2200]

In [26]:
from sklearn.linear_model import LogisticRegression

# Second layer: logistic regression on the stacked first-layer features.
# heamy receives the estimator CLASS together with its parameters, so no
# instance has to be built here. The stray `LogisticRegression(solver='lbfgs')`
# expression, whose result was discarded, has been removed.
stacker = Classifier(dataset=stack_ds, estimator=LogisticRegression, parameters={'solver': 'lbfgs'})
# Predictions for the test set.
test_pred = stacker.predict()
test_pred



array([0.099757  , 0.3480528 , 0.74377717, ..., 0.19295431, 0.15915452,
       0.07000274])

In [12]:
# Shape of the stacking predictions (sanity check on the number of predictions).
test_pred.shape

(200000,)

In [17]:
"""生成提交格式的DataFrame"""
# Submission-format table: the test ids next to their predictions.
df_result = test[['id']].assign(isDefault=test_pred)
# Preview the first 20 rows ordered by id.
df_result.sort_values('id').head(20)

Unnamed: 0,id,isDefault
0,800000,0.099757
1,800001,0.348053
2,800002,0.743777
3,800003,0.292312
4,800004,0.262373
5,800005,0.068968
6,800006,0.267375
7,800007,0.076076
8,800008,0.757172
9,800009,0.074873


In [18]:
"""保存数据用于预测建模"""
# Write the stacking predictions to CSV (without the index column).
# The output folder is created first, because to_csv fails when the target
# directory does not exist.
stack_csv_path = './prediction_result/pred_by_stack_20250307_V1.csv'
os.makedirs(os.path.dirname(stack_csv_path), exist_ok=True)
df_result.to_csv(stack_csv_path, index=False)

## 使用blending方法进行模型融合

In [19]:
# First layer via blending. Per the original author's note the training set
# is split 8:2: 80% is used to train the base learners and the remaining 20%
# is used to build the new features.
blend_ds = pipeline.blend(
    proportion=0.2,
    seed=111,
)
# Second layer: logistic regression on the blended features.
blender = Classifier(
    dataset=blend_ds,
    estimator=LogisticRegression,
    parameters={'solver': 'lbfgs'},
)
# Predictions for the test set.
# NOTE(review): this rebinds `test_pred`, which the stacking section also assigns.
test_pred = blender.predict()
test_pred

[0]	train-auc:0.69341	eval-auc:0.69403
[200]	train-auc:0.72842	eval-auc:0.72483
[400]	train-auc:0.73613	eval-auc:0.72846
[600]	train-auc:0.74187	eval-auc:0.73044
[800]	train-auc:0.74660	eval-auc:0.73157
[1000]	train-auc:0.75090	eval-auc:0.73219
[1200]	train-auc:0.75491	eval-auc:0.73260
[1400]	train-auc:0.75843	eval-auc:0.73286
[1600]	train-auc:0.76200	eval-auc:0.73301
[1800]	train-auc:0.76532	eval-auc:0.73314
[2000]	train-auc:0.76845	eval-auc:0.73320
[2200]	train-auc:0.77195	eval-auc:0.73317
[2294]	train-auc:0.77344	eval-auc:0.73315
调参后xgboost单模型在验证集上的AUC：0.7332487758617261
Training until validation scores don't improve for 500 rounds
[500]	training's auc: 0.725703	valid_1's auc: 0.723424
[1000]	training's auc: 0.733103	valid_1's auc: 0.727678
[1500]	training's auc: 0.738143	valid_1's auc: 0.729677
[2000]	training's auc: 0.742272	valid_1's auc: 0.730995
[2500]	training's auc: 0.74599	valid_1's auc: 0.731843
[3000]	training's auc: 0.749465	valid_1's auc: 0.732426
[3500]	training's auc: 

array([0.09234938, 0.33190246, 0.70607175, ..., 0.18517285, 0.17166879,
       0.06734737])

In [20]:
# Submission-format table for the blending run: test ids next to their predictions.
df_result2 = test[['id']].assign(isDefault=test_pred)
# Preview the first 20 rows ordered by id.
df_result2.sort_values('id').head(20)

Unnamed: 0,id,isDefault
0,800000,0.092349
1,800001,0.331902
2,800002,0.706072
3,800003,0.248769
4,800004,0.285136
5,800005,0.066736
6,800006,0.261221
7,800007,0.072459
8,800008,0.722855
9,800009,0.073296


In [21]:
# Write the blending predictions to CSV (without the index column).
# The output folder is created first, because to_csv fails when the target
# directory does not exist.
blend_csv_path = './prediction_result/pred_by_blend_20250307_V1.csv'
os.makedirs(os.path.dirname(blend_csv_path), exist_ok=True)
df_result2.to_csv(blend_csv_path, index=False)