In [9]:
#!/usr/bin/env python
# coding: utf-8


# 导入第三方包
import pandas as pd
import numpy as np

import lightgbm as lgb

from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import OneHotEncoder
import warnings
warnings.filterwarnings('ignore')

In [13]:
# Load the data sets; see the operation manual for how to download them.
train = pd.read_csv('dataset/train.csv')
test = pd.read_csv('dataset/test.csv')

# Submission template read from the working directory.
sample_submit = pd.read_csv('submit.csv')


# Prepare the training and test data. Every column of `train` is used as a
# feature except the four names excluded below.
# Column names kept from the original note (presumably further candidates for
# exclusion -- confirm):
# 'year_of_birth','disbursed_date','employee_code_id','manufacturer_id','supplier_id', 'branch_id', 'area_id'
all_cols = [
    col
    for col in train.columns
    if col not in ('customer_id', 'loan_default', 'year_of_birth', 'disbursed_date')
]

# The same feature columns are selected from both frames.
x_train, x_test = train[all_cols], test[all_cols]

# Target column.
y_train = train['loan_default']

# 调参

In [14]:
# Hold out 20% of the training rows for use during parameter tuning.
# The labels are read from `train` directly instead of from `y_train`: this
# statement rebinds `y_train` to the 80% subset, so splitting `y_train` itself
# would fail with a length mismatch against `x_train` when the cell is re-run.
X_train, X_test, y_train, y_test = train_test_split(
    x_train, train['loan_default'], random_state=0, test_size=0.2
)

In [19]:
# Baseline LightGBM parameters passed to lgb.cv when choosing the number of
# boosting rounds.
params = dict(
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    nthread=4,
    learning_rate=0.1,
    num_leaves=30,
    max_depth=5,
    # NOTE(review): no bagging frequency is set alongside `subsample`; the
    # LightGBM docs say row subsampling only applies when the bagging
    # frequency is non-zero -- confirm this setting has the intended effect.
    subsample=0.8,
    colsample_bytree=0.8,
)


# 获取最好的n_estimators
## dart
- lr=0.1, n_estimators = 1000

## gbdt
- lr=0.1, n_estimators = 158
- lr=0.01, n_estimators = 1000

In [20]:

# Cross-validate the baseline parameters on the tuning split to choose the
# number of boosting rounds.
data_train = lgb.Dataset(X_train, label=y_train)
cv_results = lgb.cv(
    params,
    data_train,
    num_boost_round=1000,
    nfold=5,
    stratified=False,
    shuffle=True,
    metrics='auc',
    early_stopping_rounds=50,
    seed=0,
)
# The length of the recorded mean-AUC history is reported as the round count
# (presumably truncated at the best round when early stopping fires -- confirm
# for the installed LightGBM version).
print('best n_estimators:', len(cv_results['auc-mean']))
print('best cv score:', pd.Series(cv_results['auc-mean']).max())


[LightGBM] [Info] Number of positive: 16894, number of negative: 79106
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6613
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 47
[LightGBM] [Info] Number of positive: 16923, number of negative: 79077
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6613
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 47
[LightGBM] [Info] Number of positive: 16905, number of negative: 79095
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6613
[LightGBM] [Info] Number of data points in the train set: 96000, number of used features: 47
[LightGBM] [Info] Number of positive: 17054, number of negative: 78946
You can set `force_row_wise=true`

















































best n_estimators: 1000
best cv score: 0.6586649930796458


# 确定max_depth和num_leaves
{'max_depth': 6, 'num_leaves': 30},
 0.6588262632245319,
 LGBMClassifier(bagging_fraction=0.8, feature_fraction=0.8, max_depth=6,
                metrics='auc', n_estimators=158, num_leaves=30,
                objective='binary'))

In [26]:

from sklearn.model_selection import GridSearchCV

# Step 1: grid over tree shape -- max_depth 3..7 and num_leaves 5..95 (step 5).
params_test1 = dict(max_depth=range(3, 8), num_leaves=range(5, 100, 5))

# Base estimator: gbdt at learning_rate 0.1 with 158 trees; every combination
# in the grid is scored by 5-fold ROC AUC using all available cores.
gsearch1 = GridSearchCV(
    estimator=lgb.LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        metrics='auc',
        learning_rate=0.1,
        n_estimators=158,
        max_depth=6,
        bagging_fraction=0.8,
        feature_fraction=0.8,
    ),
    param_grid=params_test1,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)
gsearch1.fit(X_train, y_train)
# Cell output: scorer, best parameters, best score and the refitted estimator.
gsearch1.scorer_, gsearch1.best_params_, gsearch1.best_score_, gsearch1.best_estimator_


KeyboardInterrupt: 

# 确定min_data_in_leaf和max_bin
<!-- (make_scorer(roc_auc_score, needs_threshold=True),
 {'max_bin': 95, 'min_data_in_leaf': 91},
 0.6578919194138838) -->

In [27]:
# Step 2: grid over histogram resolution (max_bin 5..255, step 10) and minimum
# leaf size (min_data_in_leaf 1..101, step 10), with max_depth=6 and
# num_leaves=30 fixed on the estimator.
params_test2 = dict(max_bin=range(5, 256, 10), min_data_in_leaf=range(1, 102, 10))

gsearch2 = GridSearchCV(
    estimator=lgb.LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        metrics='auc',
        learning_rate=0.1,
        n_estimators=158,
        max_depth=6,
        num_leaves=30,
        bagging_fraction=0.8,
        feature_fraction=0.8,
    ),
    param_grid=params_test2,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)
gsearch2.fit(X_train, y_train)
# Cell output: scorer, best parameters and best 5-fold ROC AUC.
gsearch2.scorer_, gsearch2.best_params_, gsearch2.best_score_


(make_scorer(roc_auc_score, needs_threshold=True),
 {'max_bin': 115, 'min_data_in_leaf': 51},
 0.6600317671754254)

# 确定feature_fraction、bagging_fraction、bagging_freq

In [30]:
# Step 3: grid over column sampling, row sampling and the bagging frequency,
# with max_bin=115 and min_data_in_leaf=51 fixed on the estimator.
# NOTE(review): the bagging_freq grid includes 0; the LightGBM docs say bagging
# is disabled at 0, in which case bagging_fraction would have no effect for
# those candidates -- confirm.
params_test3 = dict(
    feature_fraction=[0.6, 0.7, 0.8, 0.9, 1.0],
    bagging_fraction=[0.6, 0.7, 0.8, 0.9, 1.0],
    bagging_freq=range(0, 81, 5),
)

gsearch3 = GridSearchCV(
    estimator=lgb.LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        metrics='auc',
        learning_rate=0.1,
        n_estimators=158,
        max_depth=6,
        num_leaves=30,
        max_bin=115,
        min_data_in_leaf=51,
    ),
    param_grid=params_test3,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)
gsearch3.fit(X_train, y_train)
# Cell output: scorer, best parameters and best 5-fold ROC AUC.
gsearch3.scorer_, gsearch3.best_params_, gsearch3.best_score_


(make_scorer(roc_auc_score, needs_threshold=True),
 {'bagging_fraction': 0.6, 'bagging_freq': 0, 'feature_fraction': 0.8},
 0.6600317671754254)

# 确定lambda_l1和lambda_l2

In [32]:
# Step 4: grid over the L1 and L2 regularisation strengths; both parameters
# are searched over the same list of candidate values.
params_test4 = {
    reg_name: [1e-5, 1e-3, 1e-1, 0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0,
               2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
    for reg_name in ('lambda_l1', 'lambda_l2')
}

# Sampling settings are fixed on the estimator for this step.
gsearch4 = GridSearchCV(
    estimator=lgb.LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        metrics='auc',
        learning_rate=0.1,
        n_estimators=158,
        max_depth=6,
        num_leaves=30,
        max_bin=115,
        min_data_in_leaf=51,
        bagging_fraction=0.6,
        bagging_freq=0,
        feature_fraction=0.8,
    ),
    param_grid=params_test4,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)
gsearch4.fit(X_train, y_train)
# Cell output: scorer, best parameters and best 5-fold ROC AUC.
gsearch4.scorer_, gsearch4.best_params_, gsearch4.best_score_

(make_scorer(roc_auc_score, needs_threshold=True),
 {'lambda_l1': 10.0, 'lambda_l2': 8.0},
 0.661781070429582)

In [28]:
# 确定 min_split_gain

In [33]:
# Step 5: grid over the minimum gain required to split, 0.0..1.0 in steps of 0.1.
params_test5 = dict(
    min_split_gain=[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
)

# Regularisation strengths are fixed on the estimator for this step.
gsearch5 = GridSearchCV(
    estimator=lgb.LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        metrics='auc',
        learning_rate=0.1,
        n_estimators=158,
        max_depth=6,
        num_leaves=30,
        max_bin=115,
        min_data_in_leaf=51,
        bagging_fraction=0.6,
        bagging_freq=0,
        feature_fraction=0.8,
        lambda_l1=10.0,
        lambda_l2=8.0,
        min_split_gain=0,
    ),
    param_grid=params_test5,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)
gsearch5.fit(X_train, y_train)
# Cell output: scorer, best parameters and best 5-fold ROC AUC.
gsearch5.scorer_, gsearch5.best_params_, gsearch5.best_score_

(make_scorer(roc_auc_score, needs_threshold=True),
 {'min_split_gain': 0.0},
 0.661781070429582)

In [35]:
# Load the data sets again; see the operation manual for how to download them.
# The tuning split earlier in the notebook rebinds `y_train`, so this cell
# rebuilds `x_train` / `y_train` from the full training file.
train = pd.read_csv('dataset/train.csv')
test = pd.read_csv('dataset/test.csv')

# Submission template read from the working directory.
sample_submit = pd.read_csv('submit.csv')


# Prepare the training and test data. Every column of `train` is used as a
# feature except the four names excluded below.
# Column names kept from the original note (presumably further candidates for
# exclusion -- confirm):
# 'year_of_birth','disbursed_date','employee_code_id','manufacturer_id','supplier_id', 'branch_id', 'area_id'
all_cols = [
    col
    for col in train.columns
    if col not in ('customer_id', 'loan_default', 'year_of_birth', 'disbursed_date')
]

# The same feature columns are selected from both frames.
x_train, x_test = train[all_cols], test[all_cols]

# Target column.
y_train = train['loan_default']

In [40]:

# This baseline only uses the classic **LightGBM** as the training model; **XGBoost, CatBoost and NN (neural networks)** could be tried as well.
def cv_model(clf, train_x, train_y, test_x, clf_name='lgb'):
    """Train one model per K-fold split and collect out-of-fold and test predictions.

    Parameters
    ----------
    clf : object
        Object exposing ``Dataset(data, label=...)`` and ``train(...)``; the
        notebook passes the ``lightgbm`` module.
    train_x : pandas.DataFrame
        Training features; rows are selected by position with ``.iloc``.
    train_y : array-like
        Training labels, aligned with the rows of ``train_x`` by position.
    test_x : object with ``.shape``
        Features to predict with every fold model.
    clf_name : str, default 'lgb'
        Prefix used in the printed score summary.

    Returns
    -------
    tuple
        ``(oof_pred, test_pred)``: the validation prediction of every training
        row, and the test predictions averaged over the folds.
    """
    folds = 5
    seed = 2021
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    # KFold yields positional indices. Indexing a Series with ``[]`` looks the
    # values up by label, which only matches positions for a default
    # RangeIndex, so the labels are converted to an array and indexed by
    # position instead.
    labels = np.asarray(train_y)

    oof_pred = np.zeros(train_x.shape[0])
    test_pred_avg = np.zeros(test_x.shape[0])

    cv_scores = []

    # Settings recorded from the grid searches (kept from the original note):
    # estimator = lgb.LGBMClassifier(boosting_type='gbdt',objective='binary',metrics='auc',learning_rate=0.1,
    # n_estimators=158, max_depth=6, num_leaves=30,max_bin=115,min_data_in_leaf=51,bagging_fraction=0.6,
    # bagging_freq= 0, feature_fraction= 0.8,lambda_l1=10.0,lambda_l2=8.0, min_split_gain=0
    #
    # The parameters are identical for every fold, so they are built once.
    params = {
        'boosting_type': 'dart',
        'objective': 'binary',
        'metric': 'auc',
        # NOTE(review): the recorded training log stops at round 1000, which
        # suggests this value rather than the 20000 passed to train() bounds
        # the number of rounds -- confirm.
        'n_estimators': 1000,
        'min_child_weight': 5,
        # 'num_leaves': 2 ** 7,
        'num_leaves': 30,
        'max_depth': 5,
        'lambda_l1': 10,
        'lambda_l2': 8,
        'max_bin': 115,
        'min_data_in_leaf': 10,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.9,
        'bagging_freq': 6,
        'learning_rate': 0.01,
        'seed': 2021,
        # NOTE(review): 'num_threads' and 'n_jobs' look like aliases of the
        # same LightGBM setting -- confirm which one takes effect.
        'num_threads': 16,
        'n_jobs': -1,
        'silent': False,
        'verbose': -1,
        'min_split_gain': 2
    }

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, val_x = train_x.iloc[train_index], train_x.iloc[valid_index]
        trn_y, val_y = labels[train_index], labels[valid_index]

        train_matrix = clf.Dataset(trn_x, label=trn_y)
        valid_matrix = clf.Dataset(val_x, label=val_y)

        # A copy of the shared dict is passed so that no fold can see changes
        # made to it by an earlier call.
        # NOTE(review): early stopping is reportedly not applied by LightGBM
        # in dart mode -- confirm whether `early_stopping_rounds` has any
        # effect with these parameters.
        model = clf.train(dict(params), train_matrix, 20000, valid_sets=[train_matrix, valid_matrix], verbose_eval=500, early_stopping_rounds=200)
        val_pred = model.predict(val_x, num_iteration=model.best_iteration)
        test_pred = model.predict(test_x, num_iteration=model.best_iteration)

        # Each training row is predicted by the one fold model that did not
        # train on it; test predictions are averaged over the folds.
        oof_pred[valid_index] = val_pred
        test_pred_avg += test_pred / kf.n_splits
        cv_scores.append(roc_auc_score(val_y, val_pred))

        print(cv_scores)

    print("%s_score_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return oof_pred, test_pred_avg



# Run the cross-validated LightGBM training on the full training set.
lgb_train, lgb_test = cv_model(lgb, x_train, y_train, x_test)

# Disabled experiment: scan candidate thresholds and keep the one with the
# best macro F1 on the out-of-fold predictions.
# score_max, limit_best = 0, 0
# for thr in [0.2,0.25,0.275,0.3,0.325,0.35,0.4]:
#     y_true = y_train
#     y_pred = pd.DataFrame(lgb_train)[0].apply(lambda x:1 if x>thr else 0).values
#     score = f1_score(y_true, y_pred, average='macro')
#     if score>=score_max:
#         score_max = score
#         limit_best = thr
#     print(thr, score)

# Prediction results: store the averaged test predictions, convert them to
# 0/1 labels with a fixed 0.25 threshold, and write the submission file.
sample_submit['loan_default'] = lgb_test
sample_submit['loan_default'] = [
    1 if prob > 0.25 else 0 for prob in sample_submit['loan_default']
]
sample_submit.to_csv('loan_just_para_dart_v1.csv', index=False)

************************************ 1 ************************************
[500]	training's auc: 0.641672	valid_1's auc: 0.635224
[1000]	training's auc: 0.655078	valid_1's auc: 0.645994
[0.6459939083224703]
************************************ 2 ************************************
[500]	training's auc: 0.642143	valid_1's auc: 0.634223
[1000]	training's auc: 0.655216	valid_1's auc: 0.644539
[0.6459939083224703, 0.6445386810688688]
************************************ 3 ************************************
[500]	training's auc: 0.641863	valid_1's auc: 0.638219
[1000]	training's auc: 0.655317	valid_1's auc: 0.64743
[0.6459939083224703, 0.6445386810688688, 0.6474296219292458]
************************************ 4 ************************************
[500]	training's auc: 0.645505	valid_1's auc: 0.62268
[1000]	training's auc: 0.658906	valid_1's auc: 0.631475
[0.6459939083224703, 0.6445386810688688, 0.6474296219292458, 0.6314752783201047]
************************************ 5 ***********