In [1]:
#!/usr/bin/env python
# coding: utf-8


# 导入第三方包
import pandas as pd
import numpy as np

# from xgboost import XGBClassifier  as xgb
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import train_test_split

from catboost import CatBoostClassifier

import warnings
warnings.filterwarnings('ignore')



# 读取数据集，具体下载方式可见操作手册
train = pd.read_csv('dataset/train.csv')
test = pd.read_csv('dataset/test.csv')

sample_submit = pd.read_csv('submit.csv')


# 训练数据及测试数据准备
all_cols = [f for f in train.columns if f not in ['customer_id','loan_default']]

x_train = train[all_cols]
x_test = test[all_cols]

y_train = train['loan_default']

catb = CatBoostClassifier(
    iterations=20000, 
    learning_rate=0.05, 
    depth=2,
    loss_function='Logloss',
    logging_level='Verbose'
)


In [2]:
# 作为baseline部分仅使用经典的**LightGBM**作为训练模型，我们还能尝试**XGBoost、CatBoost和NN（神经网络）**
def cv_model(clf, train_x, train_y, test_x, clf_name='catb'):
    folds = 5
    seed = 2021
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    train = np.zeros(train_x.shape[0])
    test = np.zeros(test_x.shape[0])

    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index]

        model = clf.fit(trn_x, trn_y,cat_features=[0,1],eval_set=(val_x, val_y),verbose_eval=800,early_stopping_rounds=500)
        val_pred = model.predict(val_x)
        test_pred = model.predict(test_x)

        # print(list(sorted(zip(features, model.feature_importance("gain")), key=lambda x: x[1], reverse=True))[:20])

        train[valid_index] = val_pred
        test += test_pred / kf.n_splits
        cv_scores.append(roc_auc_score(val_y, val_pred))
        
        print(cv_scores)
       
    print("%s_scotrainre_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return train, test



catb_train, catb_test = cv_model(catb, x_train, y_train, x_test)

print(catb_test)
# 预测结果
sample_submit['loan_default'] = catb_test
sample_submit['loan_default'] = sample_submit['loan_default'].apply(lambda x:1 if x>0.25 else 0).values
sample_submit.to_csv('catb_result.csv', index=False)

************************************ 1 ************************************
0:	learn: 0.6682157	test: 0.6684387	best: 0.6684387 (0)	total: 114ms	remaining: 37m 53s
800:	learn: 0.4423791	test: 0.4480493	best: 0.4480459 (799)	total: 41.4s	remaining: 16m 33s
1600:	learn: 0.4389884	test: 0.4466216	best: 0.4466216 (1600)	total: 1m 24s	remaining: 16m 8s
2400:	learn: 0.4367811	test: 0.4460701	best: 0.4460644 (2289)	total: 2m 4s	remaining: 15m 10s
3200:	learn: 0.4351364	test: 0.4458329	best: 0.4458216 (3191)	total: 2m 43s	remaining: 14m 15s
4000:	learn: 0.4336853	test: 0.4457166	best: 0.4456945 (3921)	total: 3m 21s	remaining: 13m 24s
4800:	learn: 0.4323417	test: 0.4456833	best: 0.4456572 (4733)	total: 3m 59s	remaining: 12m 37s
5600:	learn: 0.4312100	test: 0.4456343	best: 0.4456258 (5537)	total: 4m 34s	remaining: 11m 45s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.4456213425
bestIteration = 5655

Shrink model to first 5656 iterations.
[0.5021813538493902]
**************