In [None]:
import numpy as np
import pandas as pd
import pickle
%matplotlib inline
import matplotlib # 注意这个也要import一次
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from sklearn.metrics import roc_auc_score, accuracy_score, recall_score, mean_squared_error, roc_curve, auc
from sklearn.model_selection import train_test_split
import gc

import warnings
warnings.filterwarnings("ignore")

In [None]:
from sklearn.model_selection import StratifiedKFold,KFold
import catboost as cb
from catboost import CatBoostClassifier, cv, Pool

In [None]:
# Cross-validation configuration: number of folds and the random seed
# used when shuffling rows into folds.
NFOLD, seed = 5, 2022

In [None]:
# Shuffled, stratified splitter shared by the cross-validation routine;
# the fixed random_state makes the fold assignment repeatable.
kf = StratifiedKFold(
    random_state=seed,
    shuffle=True,
    n_splits=NFOLD,
)

In [None]:
# Print the GPU device count reported by catboost.utils.
from catboost.utils import get_gpu_device_count

print('I see %i GPU devices' % (get_gpu_device_count(),))

In [None]:
def cv_train_model(X_train, X_test, y_train, kf, cat_features=None):
    """Fit one CatBoost classifier per CV fold and average their test predictions.

    For every ``(train_index, val_index)`` pair produced by
    ``kf.split(X_train, y_train)`` a *new* ``CatBoostClassifier`` is fitted on
    the training rows, with both the training rows and the held-out rows
    passed as ``eval_set``. Probabilities are taken from the last column of
    ``predict(..., prediction_type='Probability')`` and truncated at the
    model's best iteration.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; rows are selected positionally with ``.iloc``.
    X_test : pandas.DataFrame
        Features to score with every fold model.
    y_train : pandas.Series
        Training labels aligned with ``X_train``; selected with ``.iloc``.
    kf : object
        Splitter exposing ``split(X, y)`` that yields positional
        ``(train_index, val_index)`` pairs (e.g. ``StratifiedKFold``).
    cat_features : list, optional
        Categorical feature spec forwarded to ``fit``. When omitted, the
        notebook-level ``cat_feature`` variable is used, which matches the
        previous behaviour of this function.

    Returns
    -------
    y_pred : numpy.ndarray
        Mean of the per-fold test predictions.
    train_pred : numpy.ndarray
        Out-of-fold predictions for ``X_train`` (zeros for rows that never
        appear in a validation fold).
    clfs : list
        The fitted models, one distinct object per fold.
    answers : list of numpy.ndarray
        Per-fold test predictions.
    """
    if cat_features is None:
        # Backward-compatible fallback to the notebook-level variable.
        cat_features = cat_feature

    model_params = dict(iterations=10000,
                        depth=7,
                        learning_rate=0.001,
                        loss_function='Logloss',
                        eval_metric='AUC',
                        logging_level='Verbose',
                        metric_period=50)

    clfs = []
    answers = []
    cv_scores = []
    train_pred = np.zeros(X_train.shape[0])

    for fold, (train_index, val_index) in enumerate(kf.split(X_train, y_train)):
        X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

        print("fold:", fold)
        # A fresh estimator per fold: re-fitting a single shared instance
        # would make every entry of `clfs` point at the same (last) model.
        clf = cb.CatBoostClassifier(**model_params)
        clf.fit(X_train_fold, y_train_fold,
                eval_set=[(X_train_fold, y_train_fold), (X_val_fold, y_val_fold)],
                verbose=200,
                cat_features=cat_features)
        clfs.append(clf)

        # Use only the trees up to (and including) the best iteration.
        best_ntree = clf.get_best_iteration() + 1

        pred_val_fold = clf.predict(X_val_fold, prediction_type='Probability', ntree_end=best_ntree)[:, -1]
        train_pred[val_index] = pred_val_fold

        fold_auc = roc_auc_score(y_val_fold, pred_val_fold)
        print('cat验证的auc:{}'.format(fold_auc))
        cv_scores.append(fold_auc)

        answers.append(clf.predict(X_test, prediction_type='Probability', ntree_end=best_ntree)[:, -1])

        # Drop the per-fold slices before the next iteration; the model itself
        # stays alive through `clfs`.
        del clf, X_train_fold, X_val_fold, y_train_fold, y_val_fold
        gc.collect()

    print("cat_scotrainre_list:{}".format(cv_scores))
    print("cat_score_mean:{}".format(np.mean(cv_scores)))
    print("cat_score_std:{}".format(np.std(cv_scores)))

    print('Full AUC score %.6f' % roc_auc_score(y_train, train_pred))

    # Final prediction: plain average over the folds that were actually run
    # (not a global fold count that may disagree with `kf`).
    y_pred = sum(answers) / len(answers)

    return y_pred, train_pred, clfs, answers

In [None]:
# Run the cross-validated training; only the averaged test prediction and
# the out-of-fold training prediction are kept, the remaining two return
# values are discarded.
(y_pred, train_pred, _, _) = cv_train_model(
    X_train,
    X_test,
    y_train,
    kf,
)