In [1]:
"""
作者：librauee
微信公众号：老肥码码码
日期：2020.12.15
线上得分：0.816
截至日期排名：1
"""
import pandas as pd
from tqdm import tqdm
import warnings
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score

warnings.filterwarnings('ignore')
KF = StratifiedKFold(n_splits=5, shuffle=True, random_state=2020)

In [2]:
# 数据读取，去除无关列和标签列
train = pd.read_csv('train.csv')
test = pd.read_csv('test_noLabel.csv')
X_train = train.drop(['uid', 'y'], axis=1)
y = train['y']
X_test = test.drop(['uid'],  axis=1)
features = X_train.columns

In [3]:
# LGB模型五折交叉验证
params = {
          'objective':'binary',
          'metric':'binary_error', 
          'learning_rate':0.05, 
          'subsample':0.8, 
          'subsample_freq':3, 
          'colsample_btree':0.8,
          'num_iterations': 10000, 
}

oof_lgb = np.zeros(len(X_train))
predictions_lgb = np.zeros((len(X_test)))

for fold_, (trn_idx, val_idx) in enumerate(KF.split(X_train.values, y.values)):
    print("fold n°{}".format(fold_))
    print('trn_idx:',trn_idx)
    print('val_idx:',val_idx)
    trn_data = lgb.Dataset(X_train.iloc[trn_idx][features],label=y.iloc[trn_idx])    
    val_data = lgb.Dataset(X_train.iloc[val_idx][features],label=y.iloc[val_idx])
    num_round = 10000
    clf = lgb.train(
        params,
                    trn_data,
                    num_round,
                    valid_sets = [trn_data, val_data],
                    verbose_eval=500,
                    early_stopping_rounds=200,  
    )
        
    oof_lgb[val_idx] = clf.predict(X_train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    predictions_lgb[:] += clf.predict(X_test[features], num_iteration=clf.best_iteration) / 5
print("AUC score: {}".format(roc_auc_score(y, oof_lgb)))
print("ACC score: {}".format(accuracy_score(y, [1 if i >= 0.5 else 0 for i in oof_lgb])))

fold n°0
trn_idx: [   2    3    4 ... 7996 7997 7998]
val_idx: [   0    1   44 ... 7990 7995 7999]
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[189]	training's binary_error: 0.107344	valid_1's binary_error: 0.206875
fold n°1
trn_idx: [   0    1    2 ... 7997 7998 7999]
val_idx: [  15   28   35 ... 7986 7993 7996]
Training until validation scores don't improve for 200 rounds
[500]	training's binary_error: 0.0151562	valid_1's binary_error: 0.19375
Early stopping, best iteration is:
[792]	training's binary_error: 0.0009375	valid_1's binary_error: 0.186875
fold n°2
trn_idx: [   0    1    2 ... 7997 7998 7999]
val_idx: [   3    4   14 ... 7977 7978 7994]
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[128]	training's binary_error: 0.135	valid_1's binary_error: 0.188125
fold n°3
trn_idx: [   0    1    3 ... 7995 7996 7999]
val_idx: [   2    6    8 ... 7992 7997 7998]
Training until validati

In [4]:
# XGB模型五折交叉验证
import xgboost as xgb
oof_xgb = np.zeros(len(X_train))
predictions_xgb = np.zeros((len(X_test)))

for fold_, (trn_idx, val_idx) in enumerate(KF.split(X_train.values, y.values)):
    print("fold n°{}".format(fold_))
    print('trn_idx:',trn_idx)
    print('val_idx:',val_idx)

    X_trn, X_val = X_train.iloc[trn_idx], X_train.iloc[val_idx]
    y_trn, y_val = y.iloc[trn_idx], y.iloc[val_idx]

    model = xgb.XGBClassifier(n_estimators=9999,
                               tree_method='gpu_hist',
                                  )
    model.fit(X_trn[features], y_trn, early_stopping_rounds=200, verbose=100, 
              eval_set=[(X_trn[features], y_trn), (X_val[features], y_val)], eval_metric=['error'])
        
    oof_xgb[val_idx] = model.predict(X_train.iloc[val_idx][features])
    predictions_xgb[:] += model.predict(X_test[features]) / 5
print("AUC score: {}".format(roc_auc_score(y, oof_xgb)))
print("ACC score: {}".format(accuracy_score(y, [1 if i >= 0.5 else 0 for i in oof_xgb])))

fold n°0
trn_idx: [   2    3    4 ... 7996 7997 7998]
val_idx: [   0    1   44 ... 7990 7995 7999]
[0]	validation_0-error:0.20859	validation_1-error:0.25875
[1]	validation_0-error:0.20016	validation_1-error:0.24000
[2]	validation_0-error:0.19172	validation_1-error:0.23125
[3]	validation_0-error:0.19016	validation_1-error:0.23375
[4]	validation_0-error:0.18688	validation_1-error:0.22750
[5]	validation_0-error:0.17969	validation_1-error:0.22250
[6]	validation_0-error:0.17484	validation_1-error:0.22187
[7]	validation_0-error:0.16875	validation_1-error:0.22125
[8]	validation_0-error:0.16641	validation_1-error:0.22062
[9]	validation_0-error:0.16219	validation_1-error:0.21875
[10]	validation_0-error:0.16062	validation_1-error:0.21688
[11]	validation_0-error:0.15750	validation_1-error:0.22062
[12]	validation_0-error:0.15734	validation_1-error:0.22125
[13]	validation_0-error:0.15641	validation_1-error:0.22187
[14]	validation_0-error:0.15328	validation_1-error:0.22187
[15]	validation_0-error:0.

[137]	validation_0-error:0.01219	validation_1-error:0.21563
[138]	validation_0-error:0.01203	validation_1-error:0.21563
[139]	validation_0-error:0.01203	validation_1-error:0.21500
[140]	validation_0-error:0.01188	validation_1-error:0.21688
[141]	validation_0-error:0.01109	validation_1-error:0.21375
[142]	validation_0-error:0.01094	validation_1-error:0.21313
[143]	validation_0-error:0.01109	validation_1-error:0.21375
[144]	validation_0-error:0.01094	validation_1-error:0.21313
[145]	validation_0-error:0.01094	validation_1-error:0.21313
[146]	validation_0-error:0.01109	validation_1-error:0.21313
[147]	validation_0-error:0.01109	validation_1-error:0.21438
[148]	validation_0-error:0.01109	validation_1-error:0.21563
[149]	validation_0-error:0.01109	validation_1-error:0.21563
[150]	validation_0-error:0.01031	validation_1-error:0.21688
[151]	validation_0-error:0.01016	validation_1-error:0.21813
[152]	validation_0-error:0.01016	validation_1-error:0.21813
[153]	validation_0-error:0.01000	validat

[32]	validation_0-error:0.11828	validation_1-error:0.21438
[33]	validation_0-error:0.11500	validation_1-error:0.21313
[34]	validation_0-error:0.11109	validation_1-error:0.20938
[35]	validation_0-error:0.10922	validation_1-error:0.21000
[36]	validation_0-error:0.10547	validation_1-error:0.20938
[37]	validation_0-error:0.10422	validation_1-error:0.21125
[38]	validation_0-error:0.10328	validation_1-error:0.21000
[39]	validation_0-error:0.10281	validation_1-error:0.21188
[40]	validation_0-error:0.10172	validation_1-error:0.21125
[41]	validation_0-error:0.09828	validation_1-error:0.20813
[42]	validation_0-error:0.09625	validation_1-error:0.20750
[43]	validation_0-error:0.09563	validation_1-error:0.20813
[44]	validation_0-error:0.09312	validation_1-error:0.21000
[45]	validation_0-error:0.09266	validation_1-error:0.20938
[46]	validation_0-error:0.09203	validation_1-error:0.20938
[47]	validation_0-error:0.09109	validation_1-error:0.20875
[48]	validation_0-error:0.09000	validation_1-error:0.209

[170]	validation_0-error:0.00375	validation_1-error:0.20750
[171]	validation_0-error:0.00375	validation_1-error:0.20688
[172]	validation_0-error:0.00375	validation_1-error:0.20750
[173]	validation_0-error:0.00375	validation_1-error:0.20688
[174]	validation_0-error:0.00375	validation_1-error:0.20938
[175]	validation_0-error:0.00375	validation_1-error:0.20813
[176]	validation_0-error:0.00344	validation_1-error:0.20938
[177]	validation_0-error:0.00297	validation_1-error:0.20750
[178]	validation_0-error:0.00281	validation_1-error:0.21000
[179]	validation_0-error:0.00281	validation_1-error:0.21000
[180]	validation_0-error:0.00266	validation_1-error:0.21125
[181]	validation_0-error:0.00266	validation_1-error:0.21125
[182]	validation_0-error:0.00219	validation_1-error:0.21250
[183]	validation_0-error:0.00203	validation_1-error:0.21125
[184]	validation_0-error:0.00203	validation_1-error:0.21188
[185]	validation_0-error:0.00203	validation_1-error:0.21188
[186]	validation_0-error:0.00203	validat

[16]	validation_0-error:0.14609	validation_1-error:0.19000
[17]	validation_0-error:0.14422	validation_1-error:0.19062
[18]	validation_0-error:0.14297	validation_1-error:0.18812
[19]	validation_0-error:0.14125	validation_1-error:0.18812
[20]	validation_0-error:0.13875	validation_1-error:0.19000
[21]	validation_0-error:0.13828	validation_1-error:0.18812
[22]	validation_0-error:0.13547	validation_1-error:0.18812
[23]	validation_0-error:0.13500	validation_1-error:0.18812
[24]	validation_0-error:0.13375	validation_1-error:0.18750
[25]	validation_0-error:0.13297	validation_1-error:0.18812
[26]	validation_0-error:0.13187	validation_1-error:0.18812
[27]	validation_0-error:0.13156	validation_1-error:0.18875
[28]	validation_0-error:0.12922	validation_1-error:0.18625
[29]	validation_0-error:0.12625	validation_1-error:0.18875
[30]	validation_0-error:0.12562	validation_1-error:0.18937
[31]	validation_0-error:0.12469	validation_1-error:0.18937
[32]	validation_0-error:0.12375	validation_1-error:0.190

[154]	validation_0-error:0.00547	validation_1-error:0.20563
[155]	validation_0-error:0.00531	validation_1-error:0.20563
[156]	validation_0-error:0.00438	validation_1-error:0.20312
[157]	validation_0-error:0.00438	validation_1-error:0.20500
[158]	validation_0-error:0.00438	validation_1-error:0.20438
[159]	validation_0-error:0.00422	validation_1-error:0.20500
[160]	validation_0-error:0.00422	validation_1-error:0.20500
[161]	validation_0-error:0.00422	validation_1-error:0.20438
[162]	validation_0-error:0.00438	validation_1-error:0.20563
[163]	validation_0-error:0.00406	validation_1-error:0.20500
[164]	validation_0-error:0.00406	validation_1-error:0.20375
[165]	validation_0-error:0.00375	validation_1-error:0.20563
[166]	validation_0-error:0.00375	validation_1-error:0.20563
[167]	validation_0-error:0.00375	validation_1-error:0.20500
[168]	validation_0-error:0.00313	validation_1-error:0.20625
[169]	validation_0-error:0.00313	validation_1-error:0.20750
[170]	validation_0-error:0.00313	validat

[62]	validation_0-error:0.06359	validation_1-error:0.21125
[63]	validation_0-error:0.06250	validation_1-error:0.21000
[64]	validation_0-error:0.06094	validation_1-error:0.20750
[65]	validation_0-error:0.06078	validation_1-error:0.20875
[66]	validation_0-error:0.06031	validation_1-error:0.20750
[67]	validation_0-error:0.05984	validation_1-error:0.20688
[68]	validation_0-error:0.05953	validation_1-error:0.20625
[69]	validation_0-error:0.05937	validation_1-error:0.20688
[70]	validation_0-error:0.05672	validation_1-error:0.20563
[71]	validation_0-error:0.05437	validation_1-error:0.20500
[72]	validation_0-error:0.05359	validation_1-error:0.20563
[73]	validation_0-error:0.05125	validation_1-error:0.20750
[74]	validation_0-error:0.04938	validation_1-error:0.20625
[75]	validation_0-error:0.04906	validation_1-error:0.20625
[76]	validation_0-error:0.04875	validation_1-error:0.20625
[77]	validation_0-error:0.04734	validation_1-error:0.20625
[78]	validation_0-error:0.04516	validation_1-error:0.206

[200]	validation_0-error:0.00047	validation_1-error:0.21937
[201]	validation_0-error:0.00047	validation_1-error:0.22000
[202]	validation_0-error:0.00047	validation_1-error:0.22000
[203]	validation_0-error:0.00047	validation_1-error:0.21937
[204]	validation_0-error:0.00047	validation_1-error:0.21875
[205]	validation_0-error:0.00047	validation_1-error:0.21875
[206]	validation_0-error:0.00047	validation_1-error:0.21875
[207]	validation_0-error:0.00047	validation_1-error:0.21937
[208]	validation_0-error:0.00031	validation_1-error:0.22000
[209]	validation_0-error:0.00031	validation_1-error:0.22187
[210]	validation_0-error:0.00031	validation_1-error:0.22062
[211]	validation_0-error:0.00031	validation_1-error:0.22250
[212]	validation_0-error:0.00031	validation_1-error:0.22250
[213]	validation_0-error:0.00031	validation_1-error:0.22062
[214]	validation_0-error:0.00031	validation_1-error:0.22062
[215]	validation_0-error:0.00031	validation_1-error:0.21937
[216]	validation_0-error:0.00031	validat

[65]	validation_0-error:0.05875	validation_1-error:0.19750
[66]	validation_0-error:0.05875	validation_1-error:0.19812
[67]	validation_0-error:0.05828	validation_1-error:0.19687
[68]	validation_0-error:0.05734	validation_1-error:0.19625
[69]	validation_0-error:0.05625	validation_1-error:0.19562
[70]	validation_0-error:0.05281	validation_1-error:0.19625
[71]	validation_0-error:0.05203	validation_1-error:0.19625
[72]	validation_0-error:0.04984	validation_1-error:0.19750
[73]	validation_0-error:0.04875	validation_1-error:0.19750
[74]	validation_0-error:0.04672	validation_1-error:0.20062
[75]	validation_0-error:0.04484	validation_1-error:0.20000
[76]	validation_0-error:0.04375	validation_1-error:0.19937
[77]	validation_0-error:0.04297	validation_1-error:0.20187
[78]	validation_0-error:0.04031	validation_1-error:0.19875
[79]	validation_0-error:0.03844	validation_1-error:0.20125
[80]	validation_0-error:0.03812	validation_1-error:0.20375
[81]	validation_0-error:0.03797	validation_1-error:0.204

[203]	validation_0-error:0.00063	validation_1-error:0.20563
[204]	validation_0-error:0.00063	validation_1-error:0.20563
[205]	validation_0-error:0.00063	validation_1-error:0.20125
[206]	validation_0-error:0.00063	validation_1-error:0.20250
[207]	validation_0-error:0.00063	validation_1-error:0.20375
[208]	validation_0-error:0.00063	validation_1-error:0.20375
[209]	validation_0-error:0.00047	validation_1-error:0.20500
[210]	validation_0-error:0.00047	validation_1-error:0.20312
[211]	validation_0-error:0.00031	validation_1-error:0.20563
[212]	validation_0-error:0.00031	validation_1-error:0.20688
[213]	validation_0-error:0.00031	validation_1-error:0.20625
[214]	validation_0-error:0.00031	validation_1-error:0.20500
[215]	validation_0-error:0.00031	validation_1-error:0.20563
[216]	validation_0-error:0.00031	validation_1-error:0.20563
[217]	validation_0-error:0.00031	validation_1-error:0.20625
[218]	validation_0-error:0.00031	validation_1-error:0.20688
[219]	validation_0-error:0.00047	validat

In [6]:
# CAT模型五折交叉验证
import catboost
cat_params = {
    'eval_metric': 'Accuracy',
    'random_seed': 666,
    'logging_level': 'Verbose',
    'use_best_model': True,
    'loss_function': 'Logloss',
    'task_type':'GPU',
    'learning_rate' : 0.1
}
oof_cat = np.zeros(len(X_train))
predictions_cat = np.zeros((len(X_test)))

for fold_, (trn_idx, val_idx) in enumerate(KF.split(X_train.values, y.values)):
    print("fold n°{}".format(fold_))
    print('trn_idx:',trn_idx)
    print('val_idx:',val_idx)
    trn_data = catboost.Pool(X_train.iloc[trn_idx],label=y.iloc[trn_idx])    
    val_data = catboost.Pool(X_train.iloc[val_idx],label=y.iloc[val_idx])
    num_round = 10000
    clf = catboost.train(
                    params=cat_params,
                    pool=trn_data,
                    iterations=num_round,
                    eval_set=val_data,
                    verbose_eval=500,
                    early_stopping_rounds=200,
    )
        
    oof_cat[val_idx] = [i[1] for i in clf.predict(X_train.iloc[val_idx], prediction_type='Probability')]
    predictions_cat[:] += [i[1] / 5 for i in clf.predict(X_test, prediction_type='Probability')]
print("AUC score: {}".format(roc_auc_score(y, oof_cat)))
print("ACC score: {}".format(accuracy_score(y, [1 if i >= 0.5 else 0 for i in oof_cat])))

fold n°0
trn_idx: [   2    3    4 ... 7996 7997 7998]
val_idx: [   0    1   44 ... 7990 7995 7999]
0:	learn: 0.7650000	test: 0.7462500	best: 0.7462500 (0)	total: 46.7ms	remaining: 7m 46s
bestTest = 0.794375
bestIteration = 256
Shrink model to first 257 iterations.
fold n°1
trn_idx: [   0    1    2 ... 7997 7998 7999]
val_idx: [  15   28   35 ... 7986 7993 7996]
0:	learn: 0.7634375	test: 0.7537500	best: 0.7537500 (0)	total: 45.3ms	remaining: 7m 33s
bestTest = 0.8
bestIteration = 185
Shrink model to first 186 iterations.
fold n°2
trn_idx: [   0    1    2 ... 7997 7998 7999]
val_idx: [   3    4   14 ... 7977 7978 7994]
0:	learn: 0.7528125	test: 0.7731250	best: 0.7731250 (0)	total: 43.8ms	remaining: 7m 18s
bestTest = 0.81375
bestIteration = 294
Shrink model to first 295 iterations.
fold n°3
trn_idx: [   0    1    3 ... 7995 7996 7999]
val_idx: [   2    6    8 ... 7992 7997 7998]
0:	learn: 0.7598438	test: 0.7512500	best: 0.7512500 (0)	total: 44.4ms	remaining: 7m 23s
bestTest = 0.795
bestIte

In [7]:
# 三个模型概率均值融合
submit = pd.read_csv('submit_example.csv')
result = (predictions_lgb + predictions_xgb + predictions_cat) / 3
submit['y'] = [1 if i >= 0.5 else 0 for i in result]
submit.to_csv('submit_combine.csv', index=False)