In [4]:
# !pip install bayesian-optimization==1.1.0  -t $HOME/external-libraries/bayesian-optimization
# !pip install toad==0.0.64  -t $HOME/external-libraries/toad
# !pip install catboost==0.26.1 -t $HOME/external-libraries/catboost
# !pip install ipywidgets==7.6.5 -t $HOME/external-libraries/ipywidgets
# !pip install pandas-profiling==3.1.0 -t $HOME/external-libraries/pandas-profiling

In [5]:
import sys
sys.path.append('/home/aistudio/external-libraries/catboost104')
sys.path.append('/home/aistudio/external-libraries/ipywidgets')
sys.path.append('/home/aistudio/external-libraries/bayesian-optimization')
sys.path.append('/home/aistudio/external-libraries/toad')
sys.path.append('/home/aistudio/external-libraries/pandas-profiling')

In [6]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib # 注意这个也要import一次
import matplotlib.pyplot as plt 
import seaborn as sns
import pickle
from sklearn import metrics
from sklearn.metrics import roc_auc_score, accuracy_score, recall_score, mean_squared_error, roc_curve, auc
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import lightgbm as lgb
import toad
from toad.plot import bin_plot, badrate_plot, proportion_plot
import pandas_profiling as pp
from bayes_opt import BayesianOptimization
# from skopt.space import Real, Categorical, Integer
import os
import shutil
import warnings
warnings.filterwarnings("ignore")

In [7]:
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)

In [8]:
# reduce_mem_usage 函数通过调整数据类型，帮助我们减少数据在内存中占用的空间
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() 
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
#         else:
#             df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() 
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

In [9]:
# 导入数据
train_data = pd.read_csv('./data/data119778/train.csv', index_col= 'id')
testA_data = pd.read_csv('./data/data119778/testA.csv', index_col= 'id')

In [10]:
train_data = reduce_mem_usage(train_data)
testA_data = reduce_mem_usage(testA_data)

Memory usage of dataframe is 300800000.00 MB
Memory usage after optimization is: 102400000.00 MB
Decreased by 66.0%
Memory usage of dataframe is 73600000.00 MB
Memory usage after optimization is: 25400000.00 MB
Decreased by 65.5%


In [11]:
toad.detect(train_data)

Unnamed: 0,type,size,missing,unique,mean_or_top1,std_or_top2,min_or_top3,1%_or_top4,10%_or_top5,50%_or_bottom5,75%_or_bottom4,90%_or_bottom3,99%_or_bottom2,max_or_bottom1
loanAmnt,float16,800000,0.00%,1488,,,500,1500,5000,12000,20000,28000,35008,40000
term,int8,800000,0.00%,2,3.48274,0.855832,3,3,3,3,3,5,5,5
interestRate,float16,800000,0.00%,620,,0,5.30859,5.32031,7.39062,12.7422,15.9922,19.4844,26.2969,30.9844
installment,float16,800000,0.00%,5589,,,15.6875,53.1562,157.625,375.25,580.5,807.5,1220,1715
grade,object,800000,0.00%,7,B:29.21%,C:28.39%,A:17.46%,D:14.93%,E:6.96%,A:17.46%,D:14.93%,E:6.96%,F:2.38%,G:0.67%
subGrade,object,800000,0.00%,35,C1:6.35%,B4:6.19%,B5:6.12%,B3:6.08%,C2:5.88%,G1:0.22%,G2:0.15%,G3:0.12%,G4:0.09%,G5:0.08%
employmentTitle,float32,800000,0.00%,248683,72005.4,106586,0,19,54,7755,117664,256721,365010,378351
employmentLength,object,800000,5.85%,11,10+ years:32.84%,2 years:9.04%,< 1 year:8.03%,3 years:8.02%,1 year:6.56%,4 years:6.00%,6 years:4.66%,8 years:4.52%,7 years:4.43%,9 years:3.78%
homeOwnership,int8,800000,0.00%,6,0.614213,0.675749,0,0,0,1,1,2,2,5
annualIncome,float32,800000,0.00%,44926,76133.9,68947.5,0,18000,34000,65000,90000,125000,250000,1.09992e+07


In [12]:
# iv_info = toad.quality(train_data,'isDefault', iv_only=True)
# iv_info.sort_values(by='iv', ascending=False)

In [13]:
train = train_data.copy()
testA = testA_data.copy()

## 一、特征工程

### 1.特征标签编码

In [14]:
#标签编码 grade、subGrade
grade_map = dict(zip(('A', 'B', 'C', 'D', 'E', 'F', 'G'), (0, 5, 10, 15, 20, 25, 30, 35)))
subGrade_map =dict(zip(('A1', 'A2', 'A3', 'A4', 'A5', 'B1', 'B2', 'B3', 'B4', 'B5', 'C1', 'C2', 'C3', 'C4', 'C5', 'D1', 'D2', 'D3', 'D4', 'D5', 'E1', 'E2','E3', 'E4', 'E5', 'F1', 'F2', 'F3', 'F4', 'F5', 'G1', 'G2', 'G3', 'G4', 'G5'),
(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34)))

In [15]:
train['grade'] = train['grade'].map(grade_map)
testA['grade'] = testA['grade'].map(grade_map)

train['subGrade'] = train['subGrade'].map(subGrade_map)
testA['subGrade'] = testA['subGrade'].map(subGrade_map)

In [16]:
# 标签编码 employmentLength
employmentLength_map = dict(zip(( '< 1 year', '1 year', '2 years', '3 years', '4 years', '5 years', '6 years', '7 years', '8 years', '9 years', '10+ years'),
(0 , 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)))

In [17]:
train['employmentLength'] = train['employmentLength'].map(employmentLength_map)
testA['employmentLength'] = testA['employmentLength'].map(employmentLength_map)

In [18]:
#标签编码 issueDate	
train['issueDate'] = pd.to_datetime(train['issueDate'])
testA['issueDate'] = pd.to_datetime(testA['issueDate'])

In [19]:
train['issueDate_year'] = pd.to_datetime(train['issueDate']).dt.year
train['issueDate_month'] = pd.to_datetime(train['issueDate']).dt.month

testA['issueDate_year'] = pd.to_datetime(testA['issueDate']).dt.year
testA['issueDate_month'] = pd.to_datetime(testA['issueDate']).dt.month

In [20]:
base_time = pd.datetime.strptime('2007-06-01', '%Y-%m-%d')
train['issueDate_diff'] = train['issueDate'].apply(lambda x: (x - base_time).days)
testA['issueDate_diff'] = testA['issueDate'].apply(lambda x: (x - base_time).days)

In [21]:
# 标签编码 earliesCreditLine
train['earliesCreditLine'] = pd.to_datetime(train['earliesCreditLine'])
testA['earliesCreditLine'] = pd.to_datetime(testA['earliesCreditLine'])

In [22]:
train['earliesCreditLine_year'] = pd.to_datetime(train['earliesCreditLine']).dt.year
train['earliesCreditLine_month'] = pd.to_datetime(train['earliesCreditLine']).dt.month

testA['earliesCreditLine_year'] = pd.to_datetime(testA['earliesCreditLine']).dt.year
testA['earliesCreditLine_month'] = pd.to_datetime(testA['earliesCreditLine']).dt.month

In [23]:
base_time2 = pd.datetime.strptime('1944-01-01', '%Y-%m-%d')
train['earliesCreditLine_diff'] = train['earliesCreditLine'].apply(lambda x: (x - base_time2).days)
testA['earliesCreditLine_diff'] = testA['earliesCreditLine'].apply(lambda x: (x - base_time2).days)

In [24]:
# 使用FreqEncode 编码employmentTitle、title
f_en_list = ['employmentTitle', 'title']
for col in f_en_list:
    merge_data = pd.concat((train[[col]], testA[[col]]))
    col_map = merge_data[col].value_counts(dropna=False,normalize=True).to_dict()
#     temp1 = data[col].value_counts().to_dict()
    train['{}_freq_encode'.format(col)] = train[col].map(col_map)
    testA['{}_freq_encode'.format(col)] = testA[col].map(col_map)

### 2.业务特征

In [25]:
train['earliesCreditLine_issueDate_diff'] = (train.issueDate - train.earliesCreditLine).apply(lambda x:x.days)
testA['earliesCreditLine_issueDate_diff'] = (testA.issueDate - testA.earliesCreditLine).apply(lambda x:x.days)

In [26]:
train['loan_2_income'] = train.loanAmnt / (train.annualIncome)
testA['loan_2_income'] = testA.loanAmnt / (testA.annualIncome)

In [27]:
train['loan_2_income_per_year'] = train.loanAmnt / train.term / (train.annualIncome)
testA['loan_2_income_per_year'] = testA.loanAmnt / testA.term / (testA.annualIncome)

In [28]:
train['installment_2_income'] = train.installment * 12 / (train.annualIncome)
testA['installment_2_income'] = testA.installment * 12 / (testA.annualIncome)

In [29]:
train['openAcc_totalAcc_rate'] = train.openAcc / train.totalAcc
testA['openAcc_totalAcc_rate'] = testA.openAcc / testA.totalAcc

In [30]:
train['pubRec_diff'] = train.pubRec - train.pubRecBankruptcies
testA['pubRec_diff'] = testA.pubRec - testA.pubRecBankruptcies

In [31]:
train['revolamnt'] = train.revolBal / train.revolUtil * 100
testA['revolamnt'] = testA.revolBal / testA.revolUtil * 100

In [32]:
train['revol_2_income'] = train.revolBal / (train.annualIncome)
testA['revol_2_income'] = testA.revolBal / (testA.annualIncome)

In [33]:
train['installment_revol_2_income'] = (train.installment + train.revolBal) / ((train.annualIncome) / 12)
testA['installment_revol_2_income'] = (testA.installment + train.revolBal) / ((testA.annualIncome) / 12)

In [34]:
train['total_income'] = train.annualIncome * train.employmentLength
testA['total_income'] = testA.annualIncome * testA.employmentLength

In [35]:
train['null_num'] = train.isnull().sum(axis=1)
testA['null_num'] = testA.isnull().sum(axis=1)

In [36]:
train['end_year'] = train.issueDate_year + train.term
testA['end_year'] = testA.issueDate_year + testA.term

In [37]:
train.drop(columns='issueDate', inplace = True)
testA.drop(columns='issueDate', inplace = True)
train.drop(columns='earliesCreditLine', inplace = True)
testA.drop(columns='earliesCreditLine', inplace = True)

In [38]:
def inf_2_nan(df):
    for col, inf_cnt in (np.isinf(df.select_dtypes(exclude='O')).sum()).to_dict().items():
        if inf_cnt >0:
            df[col][np.isinf(df[col])] = np.nan

In [39]:
inf_2_nan(train)
inf_2_nan(testA)

### 3.保存初步特征工程后的数据

In [40]:
X_train = train.drop(columns=['isDefault'])
y_train = train[['isDefault']]

X_testA = testA.copy()

In [41]:
with open('./work/features/X_train.csv', 'wb') as file:
    pickle.dump(X_train, file)
    
with open('./work/features/y_train.csv', 'wb') as file:
    pickle.dump(y_train, file)
    
with open('./work/features/X_testA.csv', 'wb') as file:
    pickle.dump(X_testA, file)

### 4. 分箱(耗时太长，效果不佳，暂未完成，跳过)

#### 4.1 分箱

In [None]:
X_train_s, X_eval,y_train_s, y_eval = train_test_split(X_train,y_train,test_size=0.2,random_state=1, stratify=y_train)

In [None]:
data_tr = pd.concat((X_train_s, y_train_s), axis=1)
data_tr['type'] = 'train'

data_ts = pd.concat((X_eval,y_eval), axis=1)
data_ts['type'] = 'eval'

In [None]:
# feat_lst = list(X_eval.columns)
# psi_df = toad.metrics.PSI(X_train_s[feat_lst], X_eval[feat_lst]).sort_values(0)

In [None]:
# psi_df[psi_df.values > 0.3]

In [None]:
bins = dict()

# 初始化一个combiner类
combiner = toad.transform.Combiner()

In [None]:
# 训练数据并指定分箱方法，其它参数可选
# combiner.fit(data_tr, y='isDefault', method='chi', min_samples =  0.05, exclude = 'type')

for col in list(data_tr.columns):
    print('col:', col)
    combiner.fit(data_tr[[col, 'isDefault']], y='isDefault', method='chi', min_samples =  0.05)
    b = combiner.export()
    print(b)
    bins.update(combiner.export())

col: loanAmnt
{'loanAmnt': [3524.0, 9504.0, 10024.0, 15024.0, 28032.0]}
col: term
{'term': [5]}
col: interestRate
{'interestRate': [7.890625, 10.40625, 12.59375, 14.4609375, 17.796875]}
col: installment


In [None]:
bins

#### 4.1.1保存、读取bins信息

In [None]:
# with open('./work/features/bins.csv', 'wb') as file:
#     pickle.dump(bins, file)

In [None]:
with open('./work/features/bins.csv', 'rb') as file:
    bins = pickle.load(file)

#### 4.2 分箱可视化化，调整分箱

In [None]:
c2 = toad.transform.Combiner()
c2.set_rules(bins)

data_ = pd.concat((data_tr, data_ts))
temp_data = c2.transform(data_)

In [None]:
temp_data.columns

Index(['loanAmnt', 'term', 'interestRate', 'installment', 'grade', 'subGrade',
       'employmentTitle', 'employmentLength', 'homeOwnership', 'annualIncome',
       'verificationStatus', 'purpose', 'postCode', 'regionCode', 'dti',
       'delinquency_2years', 'ficoRangeLow', 'ficoRangeHigh', 'openAcc',
       'pubRec', 'pubRecBankruptcies', 'revolBal', 'revolUtil', 'totalAcc',
       'initialListStatus', 'applicationType', 'title', 'policyCode', 'n0',
       'n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7', 'n8', 'n9', 'n10', 'n11',
       'n12', 'n13', 'n14', 'issueDate_year', 'issueDate_month',
       'issueDate_diff', 'earliesCreditLine_year', 'earliesCreditLine_month',
       'earliesCreditLine_diff', 'employmentTitle_freq_encode',
       'title_freq_encode', 'earliesCreditLine_issueDate_diff',
       'loan_2_income', 'loan_2_income_per_year', 'installment_2_income',
       'openAcc_totalAcc_rate', 'pubRec_diff', 'revolamnt', 'revol_2_income',
       'installment_revol_2_income', 'total_in

In [None]:
adj_bin = {'installment': [162.0, 251.5, 335.0, 501.0],
           'employmentLength': [0.0, 2.0, 5.0, 10.0],
           'homeOwnership': [1],
           'postCode': [21.0, 272.0],
           'regionCode': [11],
           'title': [4.0, 26.0],
           'n5': [7.0, 11.0],
           'n8': [1.0],
           'issueDate_diff': [2496, 3196],
           'employmentTitle_freq_encode': [4e-06, 3.4e-05, 0.00559, 0.063978],
           'title_freq_encode': [0.004412, 0.4914],
           'total_income': [0.0, 92000.0, 450027.0, 606267.0, 758070.0, 1018000.0],
           'null_num': [4],
}
bins.update(adj_bin)
c2.set_rules(bins)
temp_data = c2.transform(data_)

In [None]:
# badrate_plot(temp_data, target = 'isDefault', x = 'type', by = 'end_year')
# bin_plot(temp_data, x='end_year', target='isDefault')

#### 4.3 分箱后的特征选择

In [None]:
selected_data, drop_lst= toad.selection.select(temp_data,target = 'isDefault', empty = 0.5, iv = 0.003,
                                               corr = 0.8, 
                                               return_drop=True, exclude=['type'])

In [None]:
selected_data.columns

Index(['loanAmnt', 'term', 'subGrade', 'employmentTitle', 'employmentLength',
       'homeOwnership', 'annualIncome', 'verificationStatus', 'purpose', 'dti',
       'ficoRangeLow', 'pubRec', 'revolUtil', 'title', 'n0', 'n2', 'n7', 'n8',
       'n10', 'n13', 'n14', 'issueDate_diff', 'earliesCreditLine_diff',
       'employmentTitle_freq_encode', 'loan_2_income', 'openAcc_totalAcc_rate',
       'revolamnt', 'installment_revol_2_income', 'total_income', 'end_year',
       'isDefault', 'type'],
      dtype='object')

In [None]:
drop_lst

{'empty': array([], dtype=float64),
 'iv': array(['postCode', 'regionCode', 'delinquency_2years', 'revolBal',
        'totalAcc', 'initialListStatus', 'applicationType', 'policyCode',
        'n4', 'n5', 'n11', 'issueDate_month', 'earliesCreditLine_month',
        'pubRec_diff'], dtype=object),
 'corr': array(['null_num', 'n1', 'n6', 'earliesCreditLine_issueDate_diff', 'n9',
        'loan_2_income_per_year', 'interestRate', 'openAcc',
        'pubRecBankruptcies', 'n12', 'revol_2_income',
        'earliesCreditLine_year', 'title_freq_encode', 'issueDate_year',
        'installment', 'n3', 'installment_2_income', 'ficoRangeHigh',
        'grade'], dtype=object)}

In [None]:
selected_fea = list(selected_data.columns)
selected_fea.remove('isDefault')
selected_fea.remove('type')
# selected_fea

In [None]:
# X_train_bined = c2.transform(X_train)
# X_testA_bined = c2.transform(X_testA)

X_train_bined = c2.transform(X_train[selected_fea])
X_testA_bined = c2.transform(X_testA[selected_fea])

#### 4.4 WOE映射

In [None]:
woe_transer = toad.transform.WOETransformer()
X_train_woe = woe_transer.fit_transform(X_train_bined,y_train.values)
X_testA_woe = woe_transer.transform(X_testA_bined)

# data = pd.concat([dev_slct2_woe,off_woe]

#### 4.5 稳定性筛选特征

In [None]:
psi_df = toad.metrics.PSI(X_train_woe, X_testA_woe).sort_values(0)

In [None]:
psi_df[psi_df.values > 0.05]

installment_revol_2_income    0.630002
dtype: float64

In [None]:
X_train_woe.drop(columns=['installment_revol_2_income'], inplace=True)
X_testA_woe.drop(columns=['installment_revol_2_income'], inplace=True)

#### 4.6 stepwise 选择特征

In [None]:
X_train_stepwise = toad.selection.stepwise(pd.concat((X_train_woe, y_train), axis=1), target = 'isDefault', direction = 'both', criterion = 'aic')

In [None]:
X_train_stepwise.drop(columns=['isDefault'], inplace=True)

#### 4.7 输出数据

In [None]:
X_train = X_train_woe[X_train_stepwise.columns].copy()
X_testA = X_testA_woe[X_train_stepwise.columns].copy()

In [None]:
X_train.shape, X_testA.shape

((800000, 25), (200000, 25))

## 二、提取、切割数据

In [3]:
# with open('./work/features/X_train.csv', 'rb') as file:
#     X_train = pickle.load(file)

# with open('./work/features/y_train.csv', 'rb') as file:
#     y_train = pickle.load(file)
    
# with open('./work/features/X_testA.csv', 'rb') as file:
#     X_testA = pickle.load(file)

## 三、使用五折交叉验证的CatBoost

In [42]:
from sklearn.model_selection import StratifiedKFold,KFold
import catboost as cb
from catboost import CatBoostClassifier, cv, Pool

In [43]:
data_X = X_train.copy()
data_y = y_train.copy()
data_X_testA = X_testA.copy()

In [44]:
### 标记分类变量
categorical_fea = ['initialListStatus', 'employmentTitle', 'applicationType', 
                   'title', 'policyCode', 'purpose', 'regionCode', 'postCode',
                   'verificationStatus', 'homeOwnership']

In [45]:
for i in data_X.columns:
    if i in categorical_fea:
        data_X[i] = data_X[i].astype('str')

In [46]:
for i in data_X_testA.columns:
    if i in categorical_fea:
        data_X_testA[i] = data_X_testA[i].astype('str')

In [47]:
clfs = []
answers = []
mean_score = 0
cv_scores = []
NFOLD = 5
seed = 2020
CB_INFO_PATH = '/home/aistudio/catboost_info'

In [48]:
kf = StratifiedKFold(n_splits=NFOLD, shuffle=True, random_state=seed)

In [49]:
from catboost.utils import get_gpu_device_count
print('I see %i GPU devices' % get_gpu_device_count())

I see 1 GPU devices


In [50]:
cv_cat_model = cb.CatBoostClassifier(loss_function='Logloss', eval_metric='AUC', 
                                     iterations=20000, depth=6, learning_rate=0.1,
                                     random_state=2020, od_type="Iter",
                                    #  scale_pos_weight = 4,
#                                      bagging_temperature=0.5, sampling_frequency='PerTree', sampling_unit='Object',
#                                      colsample_bylevel=0.8,
#                                      task_type='GPU', devices=0,
                                     use_best_model=True, metric_period=10)

In [51]:
for fold, (train_index, val_index) in enumerate(kf.split(data_X, data_y)):
    X_train_fold, X_val_fold = data_X.iloc[train_index], data_X.iloc[val_index]
    y_train_fold, y_val_fold = data_y.iloc[train_index], data_y.iloc[val_index]
    
    print("fold:", fold)
    clf = cv_cat_model.fit(X_train_fold, y_train_fold, eval_set=(X_val_fold, y_val_fold),
                           verbose=10, 
                        #    cat_features=categorical_fea
                           )
    clfs.append(clf)

    pred_val_fold = clfs[fold].predict(X_val_fold, prediction_type='Probability',
                                       ntree_end = clfs[fold].get_best_iteration())[:,-1]
    
    print('cat验证的auc:{}'.format(roc_auc_score(y_val_fold, pred_val_fold)))
    mean_score += roc_auc_score(y_val_fold, pred_val_fold) / NFOLD
    cv_scores.append(roc_auc_score(y_val_fold, pred_val_fold))

    pred = clfs[fold].predict(data_X_testA, prediction_type='Probability',
                              ntree_end = clfs[fold].get_best_iteration())[:,-1]
    answers.append(pred)

fold: 0




0:	test: 0.6915922	best: 0.6915922 (0)	total: 206ms	remaining: 1h 8m 49s
10:	test: 0.7145014	best: 0.7145014 (10)	total: 1.61s	remaining: 48m 43s
20:	test: 0.7188148	best: 0.7188148 (20)	total: 3.03s	remaining: 48m 2s
30:	test: 0.7218563	best: 0.7218563 (30)	total: 4.38s	remaining: 46m 58s
40:	test: 0.7236641	best: 0.7236641 (40)	total: 5.75s	remaining: 46m 38s
50:	test: 0.7253461	best: 0.7253461 (50)	total: 7.07s	remaining: 46m 5s
60:	test: 0.7264182	best: 0.7264182 (60)	total: 8.45s	remaining: 46m
70:	test: 0.7272788	best: 0.7272788 (70)	total: 9.84s	remaining: 46m 3s
80:	test: 0.7279537	best: 0.7279537 (80)	total: 11.2s	remaining: 45m 53s
90:	test: 0.7286769	best: 0.7286769 (90)	total: 12.5s	remaining: 45m 41s
100:	test: 0.7290605	best: 0.7290605 (100)	total: 13.8s	remaining: 45m 18s
110:	test: 0.7295800	best: 0.7295800 (110)	total: 15.1s	remaining: 45m
120:	test: 0.7300569	best: 0.7300569 (120)	total: 16.4s	remaining: 44m 50s
130:	test: 0.7305179	best: 0.7305179 (130)	total: 17.6s	



0:	test: 0.6882217	best: 0.6882217 (0)	total: 154ms	remaining: 51m 12s
10:	test: 0.7149007	best: 0.7149007 (10)	total: 1.53s	remaining: 46m 13s
20:	test: 0.7191212	best: 0.7191212 (20)	total: 2.91s	remaining: 46m 6s
30:	test: 0.7214940	best: 0.7214940 (30)	total: 4.32s	remaining: 46m 22s
40:	test: 0.7235415	best: 0.7235415 (40)	total: 5.8s	remaining: 47m 4s
50:	test: 0.7250216	best: 0.7250216 (50)	total: 7.12s	remaining: 46m 23s
60:	test: 0.7259633	best: 0.7259633 (60)	total: 8.43s	remaining: 45m 54s
70:	test: 0.7269721	best: 0.7269721 (70)	total: 9.72s	remaining: 45m 27s
80:	test: 0.7276527	best: 0.7276527 (80)	total: 11s	remaining: 45m 11s
90:	test: 0.7281887	best: 0.7281887 (90)	total: 12.3s	remaining: 44m 53s
100:	test: 0.7286897	best: 0.7286897 (100)	total: 13.6s	remaining: 44m 46s
110:	test: 0.7291350	best: 0.7291350 (110)	total: 14.9s	remaining: 44m 37s
120:	test: 0.7296331	best: 0.7296331 (120)	total: 16.2s	remaining: 44m 26s
130:	test: 0.7301951	best: 0.7301951 (130)	total: 17



0:	test: 0.6865921	best: 0.6865921 (0)	total: 168ms	remaining: 55m 49s
10:	test: 0.7137799	best: 0.7137799 (10)	total: 1.65s	remaining: 50m 6s
20:	test: 0.7190556	best: 0.7190556 (20)	total: 3.21s	remaining: 50m 58s
30:	test: 0.7214629	best: 0.7214629 (30)	total: 4.74s	remaining: 50m 53s
40:	test: 0.7233266	best: 0.7233266 (40)	total: 6.36s	remaining: 51m 37s
50:	test: 0.7248191	best: 0.7248191 (50)	total: 7.96s	remaining: 51m 52s
60:	test: 0.7259015	best: 0.7259015 (60)	total: 9.96s	remaining: 54m 14s
70:	test: 0.7267732	best: 0.7267732 (70)	total: 12s	remaining: 55m 57s
80:	test: 0.7274887	best: 0.7274887 (80)	total: 13.6s	remaining: 55m 56s
90:	test: 0.7280847	best: 0.7280847 (90)	total: 15.1s	remaining: 55m 1s
100:	test: 0.7286442	best: 0.7286442 (100)	total: 16.6s	remaining: 54m 24s
110:	test: 0.7291086	best: 0.7291086 (110)	total: 18s	remaining: 53m 52s
120:	test: 0.7296630	best: 0.7296630 (120)	total: 19.5s	remaining: 53m 25s
130:	test: 0.7300160	best: 0.7300160 (130)	total: 21s



0:	test: 0.6859496	best: 0.6859496 (0)	total: 154ms	remaining: 51m 19s
10:	test: 0.7148832	best: 0.7148832 (10)	total: 1.68s	remaining: 50m 58s
20:	test: 0.7188214	best: 0.7188214 (20)	total: 3.17s	remaining: 50m 14s
30:	test: 0.7215393	best: 0.7215393 (30)	total: 4.57s	remaining: 49m 3s
40:	test: 0.7233264	best: 0.7233264 (40)	total: 5.97s	remaining: 48m 24s
50:	test: 0.7245544	best: 0.7245544 (50)	total: 7.25s	remaining: 47m 13s
60:	test: 0.7254395	best: 0.7254395 (60)	total: 8.51s	remaining: 46m 22s
70:	test: 0.7262225	best: 0.7262225 (70)	total: 9.82s	remaining: 45m 57s
80:	test: 0.7269049	best: 0.7269049 (80)	total: 11.1s	remaining: 45m 32s
90:	test: 0.7276831	best: 0.7276831 (90)	total: 12.4s	remaining: 45m 20s
100:	test: 0.7281905	best: 0.7281905 (100)	total: 13.7s	remaining: 45m 1s
110:	test: 0.7288156	best: 0.7288156 (110)	total: 15.1s	remaining: 44m 56s
120:	test: 0.7291788	best: 0.7291788 (120)	total: 16.4s	remaining: 44m 53s
130:	test: 0.7296507	best: 0.7296507 (130)	total:



0:	test: 0.6858355	best: 0.6858355 (0)	total: 159ms	remaining: 52m 51s
10:	test: 0.7153163	best: 0.7153163 (10)	total: 1.81s	remaining: 54m 52s
20:	test: 0.7200331	best: 0.7200331 (20)	total: 3.32s	remaining: 52m 43s
30:	test: 0.7228007	best: 0.7228007 (30)	total: 4.91s	remaining: 52m 41s
40:	test: 0.7246989	best: 0.7246989 (40)	total: 6.4s	remaining: 51m 57s
50:	test: 0.7261323	best: 0.7261323 (50)	total: 7.97s	remaining: 51m 57s
60:	test: 0.7271467	best: 0.7271467 (60)	total: 9.46s	remaining: 51m 31s
70:	test: 0.7279873	best: 0.7279873 (70)	total: 10.9s	remaining: 51m 12s
80:	test: 0.7286357	best: 0.7286357 (80)	total: 12.5s	remaining: 51m 8s
90:	test: 0.7291870	best: 0.7291870 (90)	total: 14s	remaining: 50m 55s
100:	test: 0.7296564	best: 0.7296564 (100)	total: 15.5s	remaining: 51m 1s
110:	test: 0.7300889	best: 0.7300889 (110)	total: 16.9s	remaining: 50m 31s
120:	test: 0.7304718	best: 0.7304718 (120)	total: 18.4s	remaining: 50m 21s
130:	test: 0.7308228	best: 0.7308228 (130)	total: 19

In [52]:
print("cat_scotrainre_list:{}".format(cv_scores))
print("cat_score_mean:{}".format(np.mean(cv_scores)))
print("cat_score_std:{}".format(np.std(cv_scores)))

cat_scotrainre_list:[0.7370405544009001, 0.737727855350864, 0.7360848698353699, 0.7366609646358647, 0.7372610085273018]
cat_score_mean:0.73695505055006
cat_score_std:0.000555310625441075


In [53]:
#最终加权平均的预测结果
cat_pre=sum(answers)/NFOLD
cat_pre

array([0.08095502, 0.38732961, 0.63035361, ..., 0.1981108 , 0.29199864,
       0.03566144])

In [54]:
submission = pd.DataFrame({'id' : X_testA.index, 'isDefault' : cat_pre})
# submission['isDefault'] = submission['isDefault'].apply(lambda x : x if x > 0 else 0)
# submission['isDefault'] = submission['isDefault'].apply(lambda x : x if x < 1 else 1)
submission.to_csv('./work/cv_catboost/submission_cv_catboost20211219(全训练数据+scale_pos_weight).csv', index = None)

#### 暂时调参优化的效果还没有没有优化的好

## 四、参数调优

In [55]:
def rm_catboost_info_dir(path):
    # 文件路径
    if os.path.exists(path):
        shutil.rmtree(path)
        # os.rmdir()
        # os.removedirs()
    else:
        print('no such file:%s'%path)  # 则返回文件不存在
    
    pass

In [56]:
cv_result_global = []

"""定义优化函数"""
def cv_cat_eval(depth, max_leaves, min_data_in_leaf, random_strength, bagging_temperature, 
            #  colsample_bylevel, 
             border_count, l2_leaf_reg, scale_pos_weight):
    # 建立模型
    params = {'loss_function': 'Logloss', 'eval_metric': 'AUC', 'grow_policy': 'Lossguide', #'logging_level': 'Silent',
              'iterations': 3000, 'learning_rate': 0.1,
              'random_state': 2020, 'od_type': "Iter",
              'depth': int(depth), 'max_leaves': int(max_leaves),
              'min_data_in_leaf': int(min_data_in_leaf),
              'random_strength': random_strength,
              'bagging_temperature': round(bagging_temperature, 2),
            #   'colsample_bylevel': min(colsample_bylevel, 0.999), #rsm on GPU is supported for pairwise modes only
              'border_count': int(border_count),
              'l2_leaf_reg': round(l2_leaf_reg, 2),
              'scale_pos_weight': round(scale_pos_weight, 2),
              # 'sampling_frequency': 'PerTree', 'sampling_unit': 'Object',
              'task_type': 'GPU', 'devices': '0', 'pinned_memory_size': '10737418240',
              'thread_count': 2, 'verbose': 500,
              'use_best_model': True, 'metric_period': 10}

    cv_dataset = Pool(data = data_X, label = data_y, cat_features = categorical_fea, thread_count = 2)
    
    rm_catboost_info_dir(CB_INFO_PATH)

    cv_result_cb = cb.cv(cv_dataset,
                         params,
                         fold_count= NFOLD,
                         stratified = True, shuffle=True, partition_random_seed=seed,
                         # plot="True"
                         )

    rm_catboost_info_dir(CB_INFO_PATH)

    global cv_result_global
    cv_result_global.append(cv_result_cb)

    return cv_result_cb['test-AUC-mean'].max()
    
    # val = cross_val_score(model_cat, data_X, data_y, cv=5, scoring='roc_auc').mean()
    # print(val)
    # return val

In [57]:
"""定义优化参数"""
cat_opt = BayesianOptimization(cv_cat_eval, {'depth': (4, 8),
                                             'max_leaves': (10, 50),
                                             'min_data_in_leaf': (10,100),
                                             'random_strength': (1e-9, 10),
                                             'bagging_temperature': (0, 10),
                                            #  'colsample_bylevel': (0,1),
                                             'border_count': (1,255),
                                             'l2_leaf_reg': (0, 30),
                                             'scale_pos_weight': (0.1, 5.0)})

"""开始优化"""
cat_opt.maximize(n_iter=20)

|   iter    |  target   | baggin... | border... |   depth   | l2_lea... | max_le... | min_da... | random... | scale_... |
-------------------------------------------------------------------------------------------------------------------------




Training on fold [0/5]
0:	learn: 0.6939226	test: 0.6974331	best: 0.6974331 (0)	total: 33ms	remaining: 1m 38s
500:	learn: 0.7415850	test: 0.7382085	best: 0.7382085 (500)	total: 14.2s	remaining: 1m 10s
1000:	learn: 0.7495371	test: 0.7397317	best: 0.7397317 (1000)	total: 29.7s	remaining: 59.2s
bestTest = 0.7400788665
bestIteration = 1170
Training on fold [1/5]
0:	learn: 0.6966583	test: 0.6943708	best: 0.6943708 (0)	total: 32.9ms	remaining: 1m 38s
500:	learn: 0.7431604	test: 0.7336750	best: 0.7336750 (500)	total: 14.6s	remaining: 1m 12s
bestTest = 0.7344437838
bestIteration = 742
Training on fold [2/5]
0:	learn: 0.6976165	test: 0.6980253	best: 0.6980253 (0)	total: 33.7ms	remaining: 1m 41s
500:	learn: 0.7423071	test: 0.7362721	best: 0.7362779 (495)	total: 15.4s	remaining: 1m 16s
1000:	learn: 0.7502112	test: 0.7376959	best: 0.7376959 (1000)	total: 30.4s	remaining: 1m
bestTest = 0.7377085984
bestIteration = 1003
Training on fold [3/5]
0:	learn: 0.6999359	test: 0.6979571	best: 0.6979571 (0)	to



0:	learn: 0.6652480	test: 0.6677816	best: 0.6677816 (0)	total: 29.1ms	remaining: 1m 27s
500:	learn: 0.7352768	test: 0.7351256	best: 0.7351256 (500)	total: 12.1s	remaining: 1m
1000:	learn: 0.7418087	test: 0.7378840	best: 0.7378840 (1000)	total: 24.2s	remaining: 48.3s
bestTest = 0.7379463911
bestIteration = 1020
Training on fold [1/5]
0:	learn: 0.6755511	test: 0.6734095	best: 0.6734095 (0)	total: 26ms	remaining: 1m 17s
500:	learn: 0.7364123	test: 0.7304314	best: 0.7304314 (500)	total: 12.6s	remaining: 1m 2s
1000:	learn: 0.7429910	test: 0.7333730	best: 0.7333730 (1000)	total: 24.8s	remaining: 49.5s
1500:	learn: 0.7476493	test: 0.7345520	best: 0.7345520 (1500)	total: 37.5s	remaining: 37.4s
bestTest = 0.7347373068
bestIteration = 1578
Training on fold [2/5]
0:	learn: 0.6696383	test: 0.6710070	best: 0.6710070 (0)	total: 27.1ms	remaining: 1m 21s
500:	learn: 0.7357810	test: 0.7333435	best: 0.7333435 (500)	total: 13.1s	remaining: 1m 5s
1000:	learn: 0.7426258	test: 0.7363816	best: 0.7363864 (998



0:	learn: 0.6957644	test: 0.6983480	best: 0.6983480 (0)	total: 24.5ms	remaining: 1m 13s
500:	learn: 0.7377232	test: 0.7384293	best: 0.7384308 (499)	total: 11.2s	remaining: 55.8s
1000:	learn: 0.7417079	test: 0.7400110	best: 0.7400110 (1000)	total: 21.7s	remaining: 43.3s
bestTest = 0.740763247
bestIteration = 1393
Training on fold [1/5]
0:	learn: 0.6956671	test: 0.6936293	best: 0.6936293 (0)	total: 22.3ms	remaining: 1m 6s
500:	learn: 0.7392933	test: 0.7340646	best: 0.7340646 (500)	total: 9.65s	remaining: 48.1s
bestTest = 0.7351928353
bestIteration = 888
Training on fold [2/5]
0:	learn: 0.6949012	test: 0.6956131	best: 0.6956131 (0)	total: 25ms	remaining: 1m 15s
500:	learn: 0.7383157	test: 0.7361292	best: 0.7361292 (500)	total: 11s	remaining: 54.8s
1000:	learn: 0.7424421	test: 0.7376963	best: 0.7376968 (999)	total: 21.2s	remaining: 42.4s
bestTest = 0.7381860018
bestIteration = 1289
Training on fold [3/5]
0:	learn: 0.6976484	test: 0.6961239	best: 0.6961239 (0)	total: 26ms	remaining: 1m 17s




0:	learn: 0.6657144	test: 0.6682739	best: 0.6682739 (0)	total: 39.4ms	remaining: 1m 58s
500:	learn: 0.7347078	test: 0.7359179	best: 0.7359179 (500)	total: 12.9s	remaining: 1m 4s
1000:	learn: 0.7402501	test: 0.7389359	best: 0.7389359 (1000)	total: 29.2s	remaining: 58.3s
1500:	learn: 0.7438172	test: 0.7400475	best: 0.7400514 (1499)	total: 48.6s	remaining: 48.5s
2000:	learn: 0.7468587	test: 0.7407733	best: 0.7407733 (1997)	total: 1m 1s	remaining: 30.6s
bestTest = 0.7411688566
bestIteration = 2309
Training on fold [1/5]
0:	learn: 0.6582599	test: 0.6553875	best: 0.6553875 (0)	total: 25.2ms	remaining: 1m 15s
500:	learn: 0.7356667	test: 0.7307904	best: 0.7307904 (500)	total: 12.4s	remaining: 1m 1s
1000:	learn: 0.7412087	test: 0.7337410	best: 0.7337410 (1000)	total: 24.2s	remaining: 48.4s
1500:	learn: 0.7449003	test: 0.7349123	best: 0.7349123 (1500)	total: 37.1s	remaining: 37s
bestTest = 0.735075891
bestIteration = 1593
Training on fold [2/5]
0:	learn: 0.6665347	test: 0.6679758	best: 0.6679758



0:	learn: 0.6254365	test: 0.6278834	best: 0.6278834 (0)	total: 38ms	remaining: 1m 54s
500:	learn: 0.7348854	test: 0.7350296	best: 0.7350296 (500)	total: 18s	remaining: 1m 29s
1000:	learn: 0.7412896	test: 0.7382152	best: 0.7382152 (1000)	total: 36.7s	remaining: 1m 13s
1500:	learn: 0.7459124	test: 0.7396604	best: 0.7396604 (1500)	total: 55.1s	remaining: 55s
bestTest = 0.7398480177
bestIteration = 1593
Training on fold [1/5]
0:	learn: 0.6471864	test: 0.6451367	best: 0.6451367 (0)	total: 38ms	remaining: 1m 53s
500:	learn: 0.7357854	test: 0.7301116	best: 0.7301116 (500)	total: 19s	remaining: 1m 34s
1000:	learn: 0.7425205	test: 0.7334299	best: 0.7334299 (1000)	total: 36.8s	remaining: 1m 13s
1500:	learn: 0.7470857	test: 0.7347527	best: 0.7347553 (1497)	total: 54.8s	remaining: 54.7s
bestTest = 0.7355172038
bestIteration = 1934
Training on fold [2/5]
0:	learn: 0.6664709	test: 0.6673651	best: 0.6673651 (0)	total: 28.5ms	remaining: 1m 25s
500:	learn: 0.7352617	test: 0.7336096	best: 0.7336096 (500



0:	learn: 0.6652030	test: 0.6669779	best: 0.6669779 (0)	total: 27.6ms	remaining: 1m 22s
500:	learn: 0.7337061	test: 0.7351332	best: 0.7351332 (500)	total: 13.6s	remaining: 1m 7s
1000:	learn: 0.7386825	test: 0.7382488	best: 0.7382488 (1000)	total: 25.8s	remaining: 51.5s
bestTest = 0.7393679917
bestIteration = 1445
Training on fold [1/5]
0:	learn: 0.6798350	test: 0.6767035	best: 0.6767035 (0)	total: 26ms	remaining: 1m 17s
500:	learn: 0.7348196	test: 0.7308773	best: 0.7308773 (500)	total: 12.1s	remaining: 1m
1000:	learn: 0.7397905	test: 0.7338969	best: 0.7338969 (1000)	total: 24.1s	remaining: 48.1s
bestTest = 0.735031724
bestIteration = 1429
Training on fold [2/5]
0:	learn: 0.6792665	test: 0.6797629	best: 0.6797629 (0)	total: 27.3ms	remaining: 1m 21s
500:	learn: 0.7341594	test: 0.7334341	best: 0.7334341 (500)	total: 11.5s	remaining: 57.6s
1000:	learn: 0.7392533	test: 0.7364355	best: 0.7364355 (1000)	total: 23.5s	remaining: 47s
1500:	learn: 0.7425392	test: 0.7376984	best: 0.7376984 (1500)	



0:	learn: 0.7028822	test: 0.7046610	best: 0.7046610 (0)	total: 28.1ms	remaining: 1m 24s
500:	learn: 0.7501015	test: 0.7412040	best: 0.7412040 (500)	total: 12.6s	remaining: 1m 2s
bestTest = 0.7420064211
bestIteration = 844
Training on fold [1/5]
0:	learn: 0.7032326	test: 0.7004382	best: 0.7004382 (0)	total: 22.7ms	remaining: 1m 8s
500:	learn: 0.7516798	test: 0.7363644	best: 0.7363743 (497)	total: 11s	remaining: 55s
bestTest = 0.7368984818
bestIteration = 649
Training on fold [2/5]
0:	learn: 0.7025368	test: 0.7027010	best: 0.7027010 (0)	total: 23.9ms	remaining: 1m 11s
500:	learn: 0.7509851	test: 0.7388820	best: 0.7388870 (495)	total: 12.4s	remaining: 1m 2s
bestTest = 0.7394677997
bestIteration = 762
Training on fold [3/5]
0:	learn: 0.7036194	test: 0.7014566	best: 0.7014566 (0)	total: 22.8ms	remaining: 1m 8s
500:	learn: 0.7514254	test: 0.7366574	best: 0.7366687 (497)	total: 10.9s	remaining: 54.5s
bestTest = 0.7368879914
bestIteration = 594
Training on fold [4/5]
0:	learn: 0.7034452	test: 



0:	learn: 0.6744259	test: 0.6772050	best: 0.6772050 (0)	total: 20.7ms	remaining: 1m 2s
500:	learn: 0.7309682	test: 0.7332355	best: 0.7332355 (500)	total: 9.86s	remaining: 49.2s
1000:	learn: 0.7348565	test: 0.7355756	best: 0.7355769 (998)	total: 19.9s	remaining: 39.7s
1500:	learn: 0.7373462	test: 0.7366802	best: 0.7366826 (1499)	total: 29.6s	remaining: 29.6s
bestTest = 0.7374768555
bestIteration = 1841
Training on fold [1/5]
0:	learn: 0.6826523	test: 0.6812575	best: 0.6812575 (0)	total: 54.3ms	remaining: 2m 42s
500:	learn: 0.7318625	test: 0.7288526	best: 0.7288526 (500)	total: 8.8s	remaining: 43.9s
1000:	learn: 0.7358859	test: 0.7312015	best: 0.7312015 (1000)	total: 19.2s	remaining: 38.3s
1500:	learn: 0.7383907	test: 0.7322834	best: 0.7322843 (1499)	total: 30.7s	remaining: 30.7s
2000:	learn: 0.7405989	test: 0.7330412	best: 0.7330412 (2000)	total: 40.6s	remaining: 20.3s
bestTest = 0.7331592143
bestIteration = 2136
Training on fold [2/5]
0:	learn: 0.6837532	test: 0.6846732	best: 0.6846732



0:	learn: 0.6895719	test: 0.6925960	best: 0.6925960 (0)	total: 27.1ms	remaining: 1m 21s
500:	learn: 0.7360505	test: 0.7359178	best: 0.7359178 (500)	total: 12.5s	remaining: 1m 2s
1000:	learn: 0.7419127	test: 0.7381786	best: 0.7381788 (999)	total: 25.1s	remaining: 50.1s
bestTest = 0.739167273
bestIteration = 1450
Training on fold [1/5]
0:	learn: 0.6928208	test: 0.6905401	best: 0.6905401 (0)	total: 28.6ms	remaining: 1m 25s
500:	learn: 0.7375917	test: 0.7314892	best: 0.7314899 (499)	total: 12.6s	remaining: 1m 3s
1000:	learn: 0.7432427	test: 0.7332799	best: 0.7332799 (1000)	total: 26.3s	remaining: 52.5s
1500:	learn: 0.7475238	test: 0.7342674	best: 0.7342698 (1496)	total: 39s	remaining: 39s
bestTest = 0.7347192168
bestIteration = 1801
Training on fold [2/5]
0:	learn: 0.6917111	test: 0.6933782	best: 0.6933782 (0)	total: 40.9ms	remaining: 2m 2s
500:	learn: 0.7368900	test: 0.7340670	best: 0.7340670 (500)	total: 18.3s	remaining: 1m 31s
1000:	learn: 0.7424032	test: 0.7356648	best: 0.7356648 (1000



0:	learn: 0.6619055	test: 0.6641541	best: 0.6641541 (0)	total: 29.4ms	remaining: 1m 28s
500:	learn: 0.7336717	test: 0.7347775	best: 0.7347775 (500)	total: 15.2s	remaining: 1m 15s
1000:	learn: 0.7393878	test: 0.7379560	best: 0.7379560 (1000)	total: 28.4s	remaining: 56.6s
bestTest = 0.7392341495
bestIteration = 1409
Training on fold [1/5]
0:	learn: 0.6472173	test: 0.6447699	best: 0.6447699 (0)	total: 35.9ms	remaining: 1m 47s
500:	learn: 0.7345461	test: 0.7300595	best: 0.7300595 (500)	total: 12.5s	remaining: 1m 2s
1000:	learn: 0.7404200	test: 0.7332692	best: 0.7332692 (1000)	total: 25.2s	remaining: 50.3s
1500:	learn: 0.7441881	test: 0.7346383	best: 0.7346383 (1500)	total: 36.7s	remaining: 36.7s
2000:	learn: 0.7472504	test: 0.7354817	best: 0.7354817 (1991)	total: 47.9s	remaining: 23.9s
2500:	learn: 0.7499484	test: 0.7360841	best: 0.7360945 (2490)	total: 59s	remaining: 11.8s
bestTest = 0.7360945344
bestIteration = 2490
Training on fold [2/5]
0:	learn: 0.6337341	test: 0.6359175	best: 0.63591



0:	learn: 0.6911342	test: 0.6946656	best: 0.6946656 (0)	total: 24.2ms	remaining: 1m 12s
500:	learn: 0.7374227	test: 0.7369525	best: 0.7369525 (500)	total: 12.1s	remaining: 1m
1000:	learn: 0.7428010	test: 0.7386331	best: 0.7386342 (983)	total: 22.6s	remaining: 45s
bestTest = 0.739400804
bestIteration = 1403
Training on fold [1/5]
0:	learn: 0.6956398	test: 0.6934020	best: 0.6934020 (0)	total: 35ms	remaining: 1m 45s
500:	learn: 0.7386998	test: 0.7329116	best: 0.7329116 (500)	total: 16.2s	remaining: 1m 20s
1000:	learn: 0.7438940	test: 0.7343990	best: 0.7344006 (999)	total: 32.9s	remaining: 1m 5s
bestTest = 0.7349691987
bestIteration = 1295
Training on fold [2/5]
0:	learn: 0.6963496	test: 0.6976501	best: 0.6976501 (0)	total: 33.9ms	remaining: 1m 41s
500:	learn: 0.7382737	test: 0.7356412	best: 0.7356421 (498)	total: 15.5s	remaining: 1m 17s
1000:	learn: 0.7436631	test: 0.7371759	best: 0.7371765 (999)	total: 27.6s	remaining: 55s
bestTest = 0.7377358675
bestIteration = 1309
Training on fold [3/



0:	learn: 0.6968606	test: 0.6995028	best: 0.6995028 (0)	total: 22.7ms	remaining: 1m 7s
500:	learn: 0.7400815	test: 0.7395401	best: 0.7395401 (500)	total: 9.17s	remaining: 45.8s
1000:	learn: 0.7445818	test: 0.7410427	best: 0.7410427 (1000)	total: 18.3s	remaining: 36.5s
bestTest = 0.7415855527
bestIteration = 1300
Training on fold [1/5]
0:	learn: 0.6978665	test: 0.6956244	best: 0.6956244 (0)	total: 21.4ms	remaining: 1m 4s
500:	learn: 0.7415267	test: 0.7349308	best: 0.7349327 (499)	total: 9.58s	remaining: 47.8s
1000:	learn: 0.7461398	test: 0.7363628	best: 0.7363628 (1000)	total: 18.7s	remaining: 37.3s
bestTest = 0.7366746664
bestIteration = 1221
Training on fold [2/5]
0:	learn: 0.6975344	test: 0.6986906	best: 0.6986906 (0)	total: 20.6ms	remaining: 1m 1s
500:	learn: 0.7406865	test: 0.7374417	best: 0.7374417 (500)	total: 9.37s	remaining: 46.7s
1000:	learn: 0.7452084	test: 0.7388452	best: 0.7388459 (999)	total: 18.8s	remaining: 37.5s
bestTest = 0.7393548489
bestIteration = 1288
Training on f



0:	learn: 0.6667732	test: 0.6693208	best: 0.6693208 (0)	total: 28.2ms	remaining: 1m 24s
500:	learn: 0.7361645	test: 0.7358212	best: 0.7358212 (500)	total: 13.7s	remaining: 1m 8s
1000:	learn: 0.7426258	test: 0.7382092	best: 0.7382092 (1000)	total: 26.2s	remaining: 52.3s
bestTest = 0.7382567525
bestIteration = 1024
Training on fold [1/5]
0:	learn: 0.6690050	test: 0.6656888	best: 0.6656888 (0)	total: 28ms	remaining: 1m 24s
500:	learn: 0.7369084	test: 0.7305424	best: 0.7305424 (500)	total: 13.2s	remaining: 1m 5s
1000:	learn: 0.7435337	test: 0.7331758	best: 0.7331771 (999)	total: 24.7s	remaining: 49.3s
1500:	learn: 0.7484208	test: 0.7343884	best: 0.7343958 (1486)	total: 36.2s	remaining: 36.2s
bestTest = 0.7344481647
bestIteration = 1523
Training on fold [2/5]
0:	learn: 0.6741776	test: 0.6749186	best: 0.6749186 (0)	total: 26.5ms	remaining: 1m 19s
500:	learn: 0.7361318	test: 0.7334359	best: 0.7334359 (500)	total: 11.9s	remaining: 59.4s
1000:	learn: 0.7429148	test: 0.7359539	best: 0.7359539 (1



0:	learn: 0.6697372	test: 0.6722181	best: 0.6722181 (0)	total: 26.1ms	remaining: 1m 18s
500:	learn: 0.7357268	test: 0.7352403	best: 0.7352403 (500)	total: 12.1s	remaining: 1m
1000:	learn: 0.7418994	test: 0.7376359	best: 0.7376447 (995)	total: 23.1s	remaining: 46.2s
bestTest = 0.7383947968
bestIteration = 1343
Training on fold [1/5]
0:	learn: 0.6698364	test: 0.6683846	best: 0.6683846 (0)	total: 28.8ms	remaining: 1m 26s
500:	learn: 0.7367901	test: 0.7309892	best: 0.7309922 (499)	total: 12.9s	remaining: 1m 4s
1000:	learn: 0.7427499	test: 0.7331499	best: 0.7331499 (1000)	total: 25.3s	remaining: 50.6s
bestTest = 0.7339369655
bestIteration = 1280
Training on fold [2/5]
0:	learn: 0.6810496	test: 0.6825665	best: 0.6825665 (0)	total: 25.6ms	remaining: 1m 16s
500:	learn: 0.7362401	test: 0.7341142	best: 0.7341142 (500)	total: 12.5s	remaining: 1m 2s
1000:	learn: 0.7424614	test: 0.7364562	best: 0.7364562 (1000)	total: 25.9s	remaining: 51.8s
1500:	learn: 0.7470695	test: 0.7376260	best: 0.7376331 (14



0:	learn: 0.7047867	test: 0.7066324	best: 0.7066324 (0)	total: 28.5ms	remaining: 1m 25s
bestTest = 0.741638422
bestIteration = 376
Training on fold [1/5]
0:	learn: 0.7057650	test: 0.7031619	best: 0.7031619 (0)	total: 39.4ms	remaining: 1m 58s
bestTest = 0.7370615005
bestIteration = 474
Training on fold [2/5]
0:	learn: 0.7053730	test: 0.7049503	best: 0.7049503 (0)	total: 39.3ms	remaining: 1m 57s
bestTest = 0.7391845584
bestIteration = 433
Training on fold [3/5]
0:	learn: 0.7062667	test: 0.7042470	best: 0.7042470 (0)	total: 37.1ms	remaining: 1m 51s
bestTest = 0.7369781733
bestIteration = 311
Training on fold [4/5]
0:	learn: 0.7057149	test: 0.7045504	best: 0.7045504 (0)	total: 37.1ms	remaining: 1m 51s
bestTest = 0.7384781241
bestIteration = 330
| [0m 15      [0m | [0m 0.7386  [0m | [0m 0.2089  [0m | [0m 12.54   [0m | [0m 6.652   [0m | [0m 14.22   [0m | [0m 36.99   [0m | [0m 47.86   [0m | [0m 5.734   [0m | [0m 4.722   [0m |
no such file:/home/aistudio/catboost_info
Train



0:	learn: 0.6978656	test: 0.6998840	best: 0.6998840 (0)	total: 29.5ms	remaining: 1m 28s
500:	learn: 0.7473826	test: 0.7416728	best: 0.7416927 (495)	total: 14.7s	remaining: 1m 13s
bestTest = 0.7420065701
bestIteration = 603
Training on fold [1/5]
0:	learn: 0.6980748	test: 0.6954314	best: 0.6954314 (0)	total: 28.7ms	remaining: 1m 25s
500:	learn: 0.7486605	test: 0.7366935	best: 0.7366947 (499)	total: 13.6s	remaining: 1m 7s
bestTest = 0.7370699644
bestIteration = 590
Training on fold [2/5]
0:	learn: 0.6973249	test: 0.6984730	best: 0.6984730 (0)	total: 28.9ms	remaining: 1m 26s
500:	learn: 0.7484095	test: 0.7394811	best: 0.7394860 (499)	total: 13.9s	remaining: 1m 9s
bestTest = 0.7399599552
bestIteration = 784
Training on fold [3/5]
0:	learn: 0.6982395	test: 0.6964140	best: 0.6964140 (0)	total: 29.4ms	remaining: 1m 28s
500:	learn: 0.7486284	test: 0.7369853	best: 0.7369924 (498)	total: 14.3s	remaining: 1m 11s
bestTest = 0.7378280759
bestIteration = 820
Training on fold [4/5]
0:	learn: 0.697823



0:	learn: 0.7007433	test: 0.7030842	best: 0.7030842 (0)	total: 33.5ms	remaining: 1m 40s
500:	learn: 0.7415903	test: 0.7391178	best: 0.7391178 (500)	total: 12.5s	remaining: 1m 2s
1000:	learn: 0.7478351	test: 0.7405181	best: 0.7405185 (993)	total: 23.7s	remaining: 47.3s
bestTest = 0.7409712076
bestIteration = 1261
Training on fold [1/5]
0:	learn: 0.7001355	test: 0.6981873	best: 0.6981873 (0)	total: 27.2ms	remaining: 1m 21s
500:	learn: 0.7431533	test: 0.7350655	best: 0.7350655 (500)	total: 12.2s	remaining: 1m
bestTest = 0.7363841534
bestIteration = 956
Training on fold [2/5]
0:	learn: 0.7010373	test: 0.7016376	best: 0.7016376 (0)	total: 25.4ms	remaining: 1m 16s
500:	learn: 0.7422335	test: 0.7374418	best: 0.7374525 (498)	total: 10.6s	remaining: 53s
1000:	learn: 0.7484762	test: 0.7389010	best: 0.7389145 (996)	total: 21s	remaining: 41.9s
bestTest = 0.738928318
bestIteration = 1006
Training on fold [3/5]
0:	learn: 0.7013930	test: 0.6995886	best: 0.6995886 (0)	total: 56.3ms	remaining: 2m 48s
5



0:	learn: 0.7038774	test: 0.7058736	best: 0.7058736 (0)	total: 29.3ms	remaining: 1m 27s
500:	learn: 0.7510781	test: 0.7413242	best: 0.7413242 (500)	total: 12s	remaining: 59.9s
bestTest = 0.7416752577
bestIteration = 605
Training on fold [1/5]
0:	learn: 0.7048671	test: 0.7024022	best: 0.7024022 (0)	total: 24.6ms	remaining: 1m 13s
500:	learn: 0.7527411	test: 0.7366716	best: 0.7366769 (499)	total: 12s	remaining: 1m
bestTest = 0.7372493148
bestIteration = 828
Training on fold [2/5]
0:	learn: 0.7041165	test: 0.7042146	best: 0.7042146 (0)	total: 29.3ms	remaining: 1m 27s
500:	learn: 0.7519528	test: 0.7394938	best: 0.7394938 (500)	total: 12.1s	remaining: 1m
bestTest = 0.7397505343
bestIteration = 643
Training on fold [3/5]
0:	learn: 0.7043402	test: 0.7026448	best: 0.7026448 (0)	total: 28.9ms	remaining: 1m 26s
500:	learn: 0.7525160	test: 0.7374630	best: 0.7374852 (490)	total: 13.1s	remaining: 1m 5s
bestTest = 0.7380842566
bestIteration = 806
Training on fold [4/5]
0:	learn: 0.7045967	test: 0.70



0:	learn: 0.7030941	test: 0.7053933	best: 0.7053933 (0)	total: 25.4ms	remaining: 1m 16s
500:	learn: 0.7570809	test: 0.7421862	best: 0.7422019 (494)	total: 11.4s	remaining: 56.9s
bestTest = 0.7422417402
bestIteration = 519
Training on fold [1/5]
0:	learn: 0.7041122	test: 0.7014216	best: 0.7014216 (0)	total: 27.4ms	remaining: 1m 22s
500:	learn: 0.7582587	test: 0.7375344	best: 0.7375344 (500)	total: 12.3s	remaining: 1m 1s
bestTest = 0.7375866771
bestIteration = 513
Training on fold [2/5]
0:	learn: 0.7033628	test: 0.7037789	best: 0.7037789 (0)	total: 25.2ms	remaining: 1m 15s
500:	learn: 0.7579743	test: 0.7400746	best: 0.7400768 (499)	total: 12.6s	remaining: 1m 3s
bestTest = 0.7403504252
bestIteration = 600
Training on fold [3/5]
0:	learn: 0.7043251	test: 0.7022900	best: 0.7022900 (0)	total: 24.9ms	remaining: 1m 14s
500:	learn: 0.7583489	test: 0.7380106	best: 0.7380202 (498)	total: 11s	remaining: 55.1s
bestTest = 0.7382983565
bestIteration = 635
Training on fold [4/5]
0:	learn: 0.7037923	te



0:	learn: 0.7016524	test: 0.7042324	best: 0.7042324 (0)	total: 27.5ms	remaining: 1m 22s
500:	learn: 0.7449215	test: 0.7404709	best: 0.7404709 (500)	total: 13.4s	remaining: 1m 6s
bestTest = 0.7416541874
bestIteration = 933
Training on fold [1/5]
0:	learn: 0.7029272	test: 0.6996459	best: 0.6996459 (0)	total: 24.7ms	remaining: 1m 14s
500:	learn: 0.7466109	test: 0.7357370	best: 0.7357394 (499)	total: 12.2s	remaining: 1m
bestTest = 0.7369678617
bestIteration = 963
Training on fold [2/5]
0:	learn: 0.7021874	test: 0.7026593	best: 0.7026593 (0)	total: 25.4ms	remaining: 1m 16s
500:	learn: 0.7458804	test: 0.7382096	best: 0.7382134 (497)	total: 11.7s	remaining: 58.2s
1000:	learn: 0.7532861	test: 0.7394840	best: 0.7394888 (997)	total: 23.5s	remaining: 46.9s
bestTest = 0.7397165298
bestIteration = 1217
Training on fold [3/5]
0:	learn: 0.7024520	test: 0.7004123	best: 0.7004123 (0)	total: 26.2ms	remaining: 1m 18s
500:	learn: 0.7463431	test: 0.7361596	best: 0.7361623 (499)	total: 11.7s	remaining: 58.4



0:	learn: 0.7061848	test: 0.7081253	best: 0.7081253 (0)	total: 29.5ms	remaining: 1m 28s
500:	learn: 0.7640315	test: 0.7425953	best: 0.7426208 (496)	total: 13.2s	remaining: 1m 5s
bestTest = 0.7427509427
bestIteration = 532
Training on fold [1/5]
0:	learn: 0.7071671	test: 0.7035021	best: 0.7035021 (0)	total: 28.2ms	remaining: 1m 24s
bestTest = 0.7372590303
bestIteration = 415
Training on fold [2/5]
0:	learn: 0.7062594	test: 0.7063934	best: 0.7063934 (0)	total: 65.6ms	remaining: 3m 16s
bestTest = 0.7398683429
bestIteration = 365
Training on fold [3/5]
0:	learn: 0.7072505	test: 0.7049785	best: 0.7049785 (0)	total: 30.1ms	remaining: 1m 30s
bestTest = 0.7380760312
bestIteration = 448
Training on fold [4/5]
0:	learn: 0.7063945	test: 0.7047938	best: 0.7047938 (0)	total: 29.2ms	remaining: 1m 27s
500:	learn: 0.7653199	test: 0.7394253	best: 0.7394449 (482)	total: 12.2s	remaining: 1m
bestTest = 0.7394448519
bestIteration = 482
| [0m 21      [0m | [0m 0.7395  [0m | [0m 0.7489  [0m | [0m 24.3



0:	learn: 0.6976029	test: 0.6998872	best: 0.6998872 (0)	total: 22ms	remaining: 1m 5s
500:	learn: 0.7480738	test: 0.7419157	best: 0.7419253 (497)	total: 9.8s	remaining: 48.9s
bestTest = 0.7426923513
bestIteration = 813
Training on fold [1/5]
0:	learn: 0.6985776	test: 0.6957003	best: 0.6957003 (0)	total: 22.6ms	remaining: 1m 7s
500:	learn: 0.7492802	test: 0.7369455	best: 0.7369455 (500)	total: 9.93s	remaining: 49.5s
bestTest = 0.7374585271
bestIteration = 658
Training on fold [2/5]
0:	learn: 0.6977085	test: 0.6990909	best: 0.6990909 (0)	total: 20ms	remaining: 59.9s
500:	learn: 0.7485291	test: 0.7392493	best: 0.7392495 (499)	total: 11.2s	remaining: 55.7s
bestTest = 0.7399367094
bestIteration = 740
Training on fold [3/5]
0:	learn: 0.6981213	test: 0.6966008	best: 0.6966008 (0)	total: 20.9ms	remaining: 1m 2s
500:	learn: 0.7493795	test: 0.7369750	best: 0.7369804 (499)	total: 9.15s	remaining: 45.6s
bestTest = 0.7376890182
bestIteration = 700
Training on fold [4/5]
0:	learn: 0.6983403	test: 0.6



0:	learn: 0.6969691	test: 0.6995758	best: 0.6995758 (0)	total: 22.9ms	remaining: 1m 8s
500:	learn: 0.7412588	test: 0.7402511	best: 0.7402522 (499)	total: 9.49s	remaining: 47.4s
1000:	learn: 0.7463259	test: 0.7417820	best: 0.7417825 (999)	total: 19.3s	remaining: 38.6s
bestTest = 0.7421043515
bestIteration = 1227
Training on fold [1/5]
0:	learn: 0.6976775	test: 0.6950227	best: 0.6950227 (0)	total: 23.5ms	remaining: 1m 10s
500:	learn: 0.7426779	test: 0.7356476	best: 0.7356523 (499)	total: 10.4s	remaining: 51.9s
1000:	learn: 0.7477996	test: 0.7370592	best: 0.7370592 (1000)	total: 21.6s	remaining: 43s
bestTest = 0.7376655638
bestIteration = 1347
Training on fold [2/5]
0:	learn: 0.6977101	test: 0.6986707	best: 0.6986707 (0)	total: 19.9ms	remaining: 59.6s
500:	learn: 0.7419016	test: 0.7378218	best: 0.7378299 (498)	total: 8.88s	remaining: 44.3s
1000:	learn: 0.7468581	test: 0.7390375	best: 0.7390375 (1000)	total: 17.7s	remaining: 35.4s
bestTest = 0.7392765284
bestIteration = 1142
Training on fo



0:	learn: 0.7067506	test: 0.7083851	best: 0.7083851 (0)	total: 43.1ms	remaining: 2m 9s
500:	learn: 0.7702186	test: 0.7426168	best: 0.7426513 (498)	total: 18.4s	remaining: 1m 31s
bestTest = 0.7426512837
bestIteration = 498
Training on fold [1/5]
0:	learn: 0.7073806	test: 0.7044751	best: 0.7044751 (0)	total: 47.7ms	remaining: 2m 22s
bestTest = 0.7371564209
bestIteration = 297
Training on fold [2/5]
0:	learn: 0.7071538	test: 0.7058966	best: 0.7058966 (0)	total: 29.9ms	remaining: 1m 29s
bestTest = 0.7400183082
bestIteration = 376
Training on fold [3/5]
0:	learn: 0.7077284	test: 0.7052859	best: 0.7052859 (0)	total: 32ms	remaining: 1m 35s
bestTest = 0.737629354
bestIteration = 447
Training on fold [4/5]
0:	learn: 0.7071167	test: 0.7052518	best: 0.7052518 (0)	total: 28.3ms	remaining: 1m 24s
bestTest = 0.7395144999
bestIteration = 407
| [0m 24      [0m | [0m 0.7394  [0m | [0m 0.4284  [0m | [0m 31.42   [0m | [0m 6.571   [0m | [0m 19.18   [0m | [0m 46.22   [0m | [0m 64.26   [0m |



0:	learn: 0.6999377	test: 0.7032522	best: 0.7032522 (0)	total: 26.2ms	remaining: 1m 18s
500:	learn: 0.7419901	test: 0.7393273	best: 0.7393273 (500)	total: 13.1s	remaining: 1m 5s
1000:	learn: 0.7484758	test: 0.7409016	best: 0.7409021 (999)	total: 26.8s	remaining: 53.5s
bestTest = 0.7414176464
bestIteration = 1266
Training on fold [1/5]
0:	learn: 0.6998509	test: 0.6971237	best: 0.6971237 (0)	total: 29.4ms	remaining: 1m 28s
500:	learn: 0.7434390	test: 0.7349268	best: 0.7349297 (499)	total: 13.3s	remaining: 1m 6s
1000:	learn: 0.7498697	test: 0.7363071	best: 0.7363071 (1000)	total: 26s	remaining: 51.9s
bestTest = 0.7366576791
bestIteration = 1221
Training on fold [2/5]
0:	learn: 0.7006403	test: 0.7017781	best: 0.7017781 (0)	total: 28.3ms	remaining: 1m 25s
500:	learn: 0.7429014	test: 0.7373592	best: 0.7373592 (500)	total: 12.5s	remaining: 1m 2s
1000:	learn: 0.7495075	test: 0.7389256	best: 0.7389256 (1000)	total: 25.4s	remaining: 50.6s
bestTest = 0.7392787933
bestIteration = 1216
Training on 

In [72]:
cv_result_global

[     iterations  test-AUC-mean  test-AUC-std  test-Logloss-mean  \
 0             0       0.697488      0.001932           0.679490   
 1            10       0.712917      0.002129           0.626455   
 2            20       0.717829      0.002075           0.615289   
 3            30       0.721765      0.001711           0.610537   
 4            40       0.724080      0.001694           0.607930   
 ..          ...            ...           ...                ...   
 116        1160       0.737088      0.002123           0.595578   
 117        1170       0.737103      0.002129           0.595567   
 118        1180       0.737100      0.002126           0.595569   
 119        1190       0.737102      0.002122           0.595568   
 120        1191       0.737101      0.002122                NaN   
 
      test-Logloss-std  train-Logloss-mean  train-Logloss-std  
 0            0.000245            0.679474           0.000348  
 1            0.000819            0.626314           0

In [59]:
"""显示优化结果"""
cat_opt.max

{'target': 0.7395971357822418,
 'params': {'bagging_temperature': 0.5037619741645072,
  'border_count': 18.976123358421376,
  'depth': 5.5682973820573505,
  'l2_leaf_reg': 17.391754278266006,
  'max_leaves': 39.00066812365198,
  'min_data_in_leaf': 51.742870385668986,
  'random_strength': 0.5232209175547612,
  'scale_pos_weight': 4.625102763650997}}

In [60]:
par_bagging_temperature = 0.5037619741645072
par_border_count = 18.976123358421376
par_depth = 5.5682973820573505
par_l2_leaf_reg = 17.391754278266006
par_max_leaves = 39.00066812365198
par_min_data_in_leaf = 51.742870385668986
par_random_strength = 0.5232209175547612
par_scale_pos_weight = 4.625102763650997

In [61]:
"""调整一个较小的学习率，并通过cv函数确定当前最优的迭代次数"""

params = {'loss_function': 'Logloss', 'eval_metric': 'AUC', 'grow_policy': 'Lossguide', #'logging_level': 'Silent',
           'iterations': 20000, 'learning_rate': 0.01,
           'random_state': 2020, 'od_type': "Iter",
           'depth': int(par_depth), 'max_leaves': int(par_max_leaves),
           'min_data_in_leaf': int(par_min_data_in_leaf),
           'random_strength': par_random_strength,
           'bagging_temperature': round(par_bagging_temperature, 2),
            #   'colsample_bylevel': min(colsample_bylevel, 0.999), #rsm on GPU is supported for pairwise modes only
            'border_count': int(par_border_count),
            'l2_leaf_reg': round(par_l2_leaf_reg, 2),
            'scale_pos_weight': round(par_scale_pos_weight, 2),
            # 'sampling_frequency': 'PerTree', 'sampling_unit': 'Object',
            'task_type': 'GPU', 'devices': '0', 'pinned_memory_size': '10737418240',
            'thread_count': 2, 'verbose': 500,
            'use_best_model': True, 'metric_period': 10}

cv_dataset = Pool(data = data_X, label = data_y, cat_features = categorical_fea, thread_count = 2)
    
# 文件路径
rm_catboost_info_dir(CB_INFO_PATH)

scores = cb.cv(cv_dataset,
               params,
               fold_count= NFOLD,
               stratified = True, shuffle=True, partition_random_seed=seed,
               # plot="True"
               )

rm_catboost_info_dir(CB_INFO_PATH)

# return scores['test-AUC-mean'].max()

print('迭代次数{}'.format(scores['test-AUC-mean'].argmax() + 1))
print('最终模型的AUC为{}'.format(scores['test-AUC-mean'].max()))

no such file:/home/aistudio/catboost_info
Training on fold [0/5]




0:	learn: 0.7030941	test: 0.7053933	best: 0.7053933 (0)	total: 37.4ms	remaining: 12m 27s
500:	learn: 0.7320927	test: 0.7327000	best: 0.7327000 (500)	total: 17.3s	remaining: 11m 12s
1000:	learn: 0.7386786	test: 0.7375127	best: 0.7375127 (1000)	total: 34.8s	remaining: 11m 1s
1500:	learn: 0.7426929	test: 0.7396033	best: 0.7396033 (1500)	total: 46.4s	remaining: 9m 31s
2000:	learn: 0.7455929	test: 0.7407473	best: 0.7407473 (2000)	total: 57.4s	remaining: 8m 36s
2500:	learn: 0.7481725	test: 0.7415001	best: 0.7415001 (2500)	total: 1m 8s	remaining: 8m 1s
3000:	learn: 0.7504249	test: 0.7420131	best: 0.7420131 (3000)	total: 1m 20s	remaining: 7m 36s
3500:	learn: 0.7524911	test: 0.7423753	best: 0.7423753 (3500)	total: 1m 33s	remaining: 7m 19s
bestTest = 0.7426194549
bestIteration = 3879
Training on fold [1/5]
0:	learn: 0.7041122	test: 0.7014216	best: 0.7014216 (0)	total: 34.7ms	remaining: 11m 33s
500:	learn: 0.7331631	test: 0.7287759	best: 0.7287759 (500)	total: 13.9s	remaining: 9m
1000:	learn: 0.7

In [62]:
print('迭代次数{}'.format(scores.iterations[scores['test-AUC-mean'].argmax()]))
print('最终模型的AUC为{}'.format(scores['test-AUC-mean'].max()))

迭代次数5969
最终模型的AUC为0.7402165770530701


#### 模型参数已经确定，建立最终模型并对验证集进行验证

In [63]:
cv_cat_model = cb.CatBoostClassifier(loss_function = 'Logloss', eval_metric = 'AUC', grow_policy = 'Lossguide', #logging_level = 'Silent',
                                     iterations = 5969, learning_rate = 0.01,
                                     random_state = 2020, od_type = "Iter",
                                     depth = int(par_depth), max_leaves = int(par_max_leaves),
                                     min_data_in_leaf = int(par_min_data_in_leaf),
                                     random_strength = par_random_strength,
                                     bagging_temperature = round(par_bagging_temperature, 2),
                                     #   colsample_bylevel = min(colsample_bylevel, 0.999), #rsm on GPU is supported for pairwise modes only
                                     border_count = int(par_border_count),
                                     l2_leaf_reg = round(par_l2_leaf_reg, 2),
                                     scale_pos_weight = round(par_scale_pos_weight, 2),
                                    #  sampling_frequency = 'PerTree', sampling_unit = 'Object',
                                     task_type = 'GPU', devices = '0', pinned_memory_size = '10737418240',
                                     thread_count = 2, verbose = 500,
                                     use_best_model = True, metric_period = 10)

In [64]:
clfs = []
answers = []
mean_score = 0
cv_scores = []

In [65]:
for fold, (train_index, val_index) in enumerate(kf.split(data_X, data_y)):
    X_train_fold, X_val_fold = data_X.iloc[train_index], data_X.iloc[val_index]
    y_train_fold, y_val_fold = data_y.iloc[train_index], data_y.iloc[val_index]
    
    print("fold:", fold)
    clf = cv_cat_model.fit(X_train_fold, y_train_fold, eval_set=(X_val_fold, y_val_fold),
                           cat_features=categorical_fea)
    clfs.append(clf)

    pred_val_fold = clfs[fold].predict(X_val_fold, prediction_type='Probability',
                                       ntree_end = clfs[fold].get_best_iteration())[:,-1]
    
    print('cat验证的auc:{}'.format(roc_auc_score(y_val_fold, pred_val_fold)))
    mean_score += roc_auc_score(y_val_fold, pred_val_fold) / NFOLD
    cv_scores.append(roc_auc_score(y_val_fold, pred_val_fold))

    pred = clfs[fold].predict(data_X_testA, prediction_type='Probability',
                              ntree_end = clfs[fold].get_best_iteration())[:,-1]
    answers.append(pred)

fold: 0
0:	learn: 0.7038276	test: 0.7033719	best: 0.7033719 (0)	total: 23.6ms	remaining: 2m 20s
500:	learn: 0.7328880	test: 0.7296177	best: 0.7296177 (500)	total: 12.7s	remaining: 2m 18s
1000:	learn: 0.7395415	test: 0.7342659	best: 0.7342659 (1000)	total: 24.6s	remaining: 2m 1s
1500:	learn: 0.7436759	test: 0.7362799	best: 0.7362799 (1500)	total: 41.6s	remaining: 2m 3s
2000:	learn: 0.7466781	test: 0.7374201	best: 0.7374201 (2000)	total: 57.7s	remaining: 1m 54s
2500:	learn: 0.7492211	test: 0.7381709	best: 0.7381709 (2500)	total: 1m 13s	remaining: 1m 41s
3000:	learn: 0.7514995	test: 0.7386770	best: 0.7386770 (3000)	total: 1m 29s	remaining: 1m 28s
3500:	learn: 0.7536110	test: 0.7390461	best: 0.7390462 (3499)	total: 1m 46s	remaining: 1m 14s
4000:	learn: 0.7555768	test: 0.7394036	best: 0.7394036 (4000)	total: 2m 2s	remaining: 1m
4500:	learn: 0.7574908	test: 0.7396578	best: 0.7396578 (4500)	total: 2m 17s	remaining: 45s
bestTest = 0.7398664951
bestIteration = 4965
Shrink model to first 4966 it

In [66]:
print("cat_scotrainre_list:{}".format(cv_scores))
print("cat_score_mean:{}".format(np.mean(cv_scores)))
print("cat_score_std:{}".format(np.std(cv_scores)))

cat_scotrainre_list:[0.7398652728279432, 0.7408151269017267, 0.7400836406176304, 0.7394105299369708, 0.7403068820703156]
cat_score_mean:0.7400962904709173
cat_score_std:0.0004658049626330782


In [67]:
#最终加权平均的预测结果
cat_pre=sum(answers)/NFOLD
cat_pre

array([0.24133653, 0.69075798, 0.8806343 , ..., 0.45419934, 0.69965486,
       0.09784952])

In [68]:
submission = pd.DataFrame({'id' : data_X_testA.index, 'isDefault' : cat_pre})
# submission['isDefault'] = submission['isDefault'].apply(lambda x : x if x > 0 else 0)
# submission['isDefault'] = submission['isDefault'].apply(lambda x : x if x < 1 else 1)
submission.to_csv('./work/cv_catboost/submission_cv_catboost20211216(全训练数据、参数优化、五折交叉验证).csv', index = None)

#### 通过5折交叉验证可以发现，模型迭代次数在10173次的时候会停之，那么我们在建立新模型时直接设置最大迭代次数，并使用验证集进行模型预测

In [69]:
data_X,X_eval,data_y,y_eval=train_test_split(data_X,data_y,test_size=0.2,random_state=1, stratify=data_y)

In [70]:
final_cat_model = cb.CatBoostClassifier(loss_function = 'Logloss', eval_metric = 'AUC', grow_policy = 'Lossguide', #logging_level = 'Silent',
                                     iterations = 20173, learning_rate = 0.01,
                                     random_state = 2020, od_type = "Iter",
                                     depth = int(par_depth), max_leaves = int(par_max_leaves),
                                     min_data_in_leaf = int(par_min_data_in_leaf),
                                     random_strength = par_random_strength,
                                     bagging_temperature = round(par_bagging_temperature, 2),
                                     #   colsample_bylevel = min(colsample_bylevel, 0.999), #rsm on GPU is supported for pairwise modes only
                                     border_count = int(par_border_count),
                                     l2_leaf_reg = round(par_l2_leaf_reg, 2),
                                     scale_pos_weight = round(par_scale_pos_weight, 2),
                                    #  sampling_frequency = 'PerTree', sampling_unit = 'Object',
                                     task_type = 'GPU', devices = '0', pinned_memory_size = '10737418240',
                                     thread_count = 2, verbose = 500,
                                     use_best_model = True, metric_period = 10)

final_cat_model.fit(data_X, data_y, eval_set=(X_eval, y_eval), cat_features=categorical_fea)

y_pred = final_cat_model.predict(data_X_testA, prediction_type='Probability',
                                 ntree_end = final_cat_model.get_best_iteration())[:,-1]

0:	learn: 0.7038437	test: 0.7042420	best: 0.7042420 (0)	total: 25.5ms	remaining: 8m 34s
500:	learn: 0.7323768	test: 0.7310732	best: 0.7310732 (500)	total: 12.3s	remaining: 8m 2s
1000:	learn: 0.7390128	test: 0.7357012	best: 0.7357012 (1000)	total: 23.8s	remaining: 7m 35s
1500:	learn: 0.7430415	test: 0.7376584	best: 0.7376584 (1500)	total: 36.1s	remaining: 7m 28s
2000:	learn: 0.7460886	test: 0.7387739	best: 0.7387739 (2000)	total: 46.8s	remaining: 7m 5s
2500:	learn: 0.7486027	test: 0.7394626	best: 0.7394626 (2500)	total: 58s	remaining: 6m 49s
3000:	learn: 0.7508562	test: 0.7399707	best: 0.7399720 (2987)	total: 1m 9s	remaining: 6m 35s
3500:	learn: 0.7530599	test: 0.7403523	best: 0.7403523 (3500)	total: 1m 20s	remaining: 6m 23s
bestTest = 0.7404764593
bestIteration = 3726
Shrink model to first 3727 iterations.


In [71]:
submission = pd.DataFrame({'id' : data_X_testA.index, 'isDefault' : y_pred})
# submission['isDefault'] = submission['isDefault'].apply(lambda x : x if x > 0 else 0)
# submission['isDefault'] = submission['isDefault'].apply(lambda x : x if x < 1 else 1)
submission.to_csv('./work/cv_catboost/submission_cv_catboost20211217(全训练数据、参数优化、单独模型).csv', index = None)