In [3]:
import re
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
import warnings
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
warnings.filterwarnings('ignore')


In [32]:
def cv_model(clf, train_x, train_y, test_x, clf_name):
    folds = 5
    seed = 2020
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    train = np.zeros(train_x.shape[0])
    test = np.zeros(test_x.shape[0])

    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index]

        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)

            params = {
                'boosting_type': 'gbdt',
                'is_unbalanced':True,
                'objective': 'binary',
                'metric': 'auc',
                'min_child_weight': 7,
                'min_data_in_leaf': 32,
                'max_depth':6,
                'num_leaves': 40,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'pos_bagging_fraction':0.64,
                'neg_bagging_fraction':0.16,
                'bagging_freq': 5,
                'learning_rate': 0.02,
                'seed': 2020,
                'nthread': 28,
                'n_jobs':24,
                #'silent': True,
                'verbose': -1,
            }

            model = clf.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix], verbose_eval=200,early_stopping_rounds=200)
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)
            
            # print(list(sorted(zip(features, model.feature_importance("gain")), key=lambda x: x[1], reverse=True))[:20])
                
        if clf_name == "xgb":
            train_matrix = clf.DMatrix(trn_x , label=trn_y)
            valid_matrix = clf.DMatrix(val_x , label=val_y)
            test_matrix = clf.DMatrix(test_x)
            
            params = {'booster': 'gbtree',
                      'objective': 'binary:logistic',
                      'eval_metric': 'auc',
                      'gamma': 1,
                      'min_child_weight': 2,
                      'max_depth': 6,
                      'lambda': 10,
                      'subsample': 0.8,
                      'colsample_bytree': 0.8,
                      'colsample_bylevel': 0.8,
                      'scale_pos_weight' : 4,
                      'eta': 0.05,
                      'tree_method': 'exact',
                      'seed': 2020,
                      'nthread': 36,
                      "silent": True
                      }
            
            watchlist = [(train_matrix, 'train'),(valid_matrix, 'eval')]
            
            model = clf.train(params, train_matrix, num_boost_round=50000, evals=watchlist, verbose_eval=200, early_stopping_rounds=200)
            val_pred  = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
            test_pred = model.predict(test_matrix , ntree_limit=model.best_ntree_limit)
                 
        if clf_name == "cat":
            params = {'learning_rate': 0.05, 'depth': 5, 'eval_metric':'AUC','l2_leaf_reg': 10, 'bootstrap_type': 'Bernoulli',
                      'od_type': 'Iter', 'od_wait': 50, 'random_seed': 11, 'allow_writing_files': False}
            
            model = clf(iterations=20000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      cat_features=[], use_best_model=True, verbose=500)
            
            val_pred  = model.predict(val_x)
            test_pred = model.predict(test_x)
            
        train[valid_index] = val_pred
        test = test_pred / kf.n_splits
        cv_scores.append(roc_auc_score(val_y, val_pred))
        
        print(cv_scores)
        
    print("%s_scotrainre_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return train, test

In [33]:
def lgb_model(x_train, y_train, x_test):
    lgb_train, lgb_test = cv_model(lgb, x_train, y_train, x_test, "lgb")
    return lgb_train, lgb_test

def xgb_model(x_train, y_train, x_test):
    xgb_train, xgb_test = cv_model(xgb, x_train, y_train, x_test, "xgb")
    return xgb_train, xgb_test

def cat_model(x_train, y_train, x_test):
    cat_train, cat_test = cv_model(CatBoostRegressor, x_train, y_train, x_test, "cat")
    return cat_train, cat_test

In [24]:
data_train =pd.read_csv('../01_inter_data/train_cat_cleaned.csv')
data_test_a = pd.read_csv('../01_inter_data/test_a_cat_cleaned.csv')

In [25]:
data_test_a_id = pd.read_csv('../00_data/testA.csv')
test_id = data_test_a_id['id']

In [26]:
from category_encoders import OrdinalEncoder
cate_features = ['grade', 'subGrade']

for i in cate_features:
    Le = LabelEncoder()
    data = pd.concat([data_train,data_test_a])
    Le.fit(data[i])
    data_train[i] = Le.transform(data_train[i])
    data_test_a[i] = Le.transform(data_test_a[i])
#Te.fit(data_train[cate_features],data_train['isDefault'])
#te_cols = Te.transform(data[cate_features])

In [28]:
features = [f for f in data_train.columns if f not in ['id','isDefault']]
x_train = data_train[features]
x_test = data_test_a[features]
y_train = data_train['isDefault']

In [29]:
data_train.info(max_cols=1000)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Data columns (total 46 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   id                  800000 non-null  int64  
 1   loanAmnt            800000 non-null  float64
 2   term                800000 non-null  int64  
 3   interestRate        800000 non-null  float64
 4   installment         800000 non-null  float64
 5   grade               800000 non-null  int32  
 6   subGrade            800000 non-null  int32  
 7   employmentTitle     800000 non-null  float64
 8   employmentLength    800000 non-null  float64
 9   homeOwnership       800000 non-null  int64  
 10  annualIncome        800000 non-null  float64
 11  verificationStatus  800000 non-null  int64  
 12  issueDate           800000 non-null  int64  
 13  isDefault           800000 non-null  int64  
 14  purpose             800000 non-null  int64  
 15  postCode            800000 non-nul

In [30]:
lgb_train,lgb_test = lgb_model(x_train, y_train, x_test)

************************************ 1 ************************************
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.72971	valid_1's auc: 0.728054
[400]	training's auc: 0.737012	valid_1's auc: 0.732556
[600]	training's auc: 0.741763	valid_1's auc: 0.734438
[800]	training's auc: 0.74577	valid_1's auc: 0.735611
[1000]	training's auc: 0.749158	valid_1's auc: 0.736342
[1200]	training's auc: 0.75212	valid_1's auc: 0.736772
[1400]	training's auc: 0.754871	valid_1's auc: 0.737101
[1600]	training's auc: 0.757465	valid_1's auc: 0.73739
[1800]	training's auc: 0.759803	valid_1's auc: 0.737458
[2000]	training's auc: 0.762203	valid_1's auc: 0.737604
Early stopping, best iteration is:
[1933]	training's auc: 0.761414	valid_1's auc: 0.737624
[0.7376242733761404]
************************************ 2 ************************************
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.730748	valid_1's auc: 0.724233
[400]

In [34]:
xgb_train, xgb_test= xgb_model(x_train, y_train, x_test)

************************************ 1 ************************************
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-auc:0.70693	eval-auc:0.70742
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 200 rounds.
[200]	train-auc:0.74362	eval-auc:0.73375
[400]	train-auc:0.75565	eval-auc:0.73643
[600]	train-auc:0.76476	eval-auc:0.73720
[800]	train-auc:0.77253	eval-auc:0.73759
[1000]	train-auc:0.77977	eval-auc:0.73776
Stopping. Best iteration:
[972]	train-auc:0.77885	eval-auc:0.73782

[0.7378169771878842]
************************************ 2 ************************************
Parameters: { silent } might not be used.

  This may not be accurate due to some paramete

In [34]:
#将三个模型的output做voting
test_result =pd.concat([test_id,pd.DataFrame(lgb_test),pd.DataFrame(cat_test)],axis = 1)
test_result.columns = ['id','isDefault_lgb','isDefault_cat']

In [35]:
test_result.to_csv('../03_result/testA_pred_2.csv',index = False)

In [38]:
lgb_test_result = pd.concat([test_id,pd.DataFrame(lgb_test)],axis = 1)
lgb_test_result.to_csv('../03_result/testA_pred_lgb7370.csv')
lgb_train_result = pd.concat([data_train.id,pd.DataFrame(lgb_train),data_train.isDefault],axis = 1)
lgb_train_result.to_csv('../03_result/trainA_pred_lgb7370.csv')

In [43]:
# voting
#xgb_result = pd.read_csv('../03_result/testA_pred.csv')

In [44]:
total_result = xgb_result.merge(test_result,on = 'id')

In [45]:
total_result['isDefault_xgb'] =  total_result['isDefault']

In [50]:
total_result['isDefault'] = total_result['isDefault_xgb']*0.4 + total_result['isDefault_lgb']*0.3 +total_result['isDefault_cat']*0.3

In [55]:
total_result[['id','isDefault']].to_csv('../03_result/testA_pred_stack.csv',index = False)

In [58]:
test_result[['id','isDefault_lgb']].to_csv('../03_result/testA_pred_2.csv',index = False)

In [59]:
test_result[['id','isDefault_cat']].to_csv('../03_result/testA_pred_3.csv',index = False)

In [54]:
test_result

Unnamed: 0,id,isDefault_lgb,isDefault_cat
0,800000,0.012237,0.012668
1,800001,0.057759,0.066223
2,800002,0.119079,0.113430
3,800003,0.060883,0.062080
4,800004,0.075759,0.071827
...,...,...,...
199995,999995,0.033294,0.033081
199996,999996,0.005441,0.006606
199997,999997,0.028636,0.035593
199998,999998,0.053191,0.046561


In [56]:
total_result

Unnamed: 0,id,isDefault,isDefault_lgb,isDefault_cat,isDefault_xgb
0,800000,0.013324,0.012237,0.012668,0.014630
1,800001,0.063443,0.057759,0.066223,0.065620
2,800002,0.119657,0.119079,0.113430,0.124761
3,800003,0.061649,0.060883,0.062080,0.061900
4,800004,0.068613,0.075759,0.071827,0.060845
...,...,...,...,...,...
199995,999995,0.031765,0.033294,0.033081,0.029632
199996,999996,0.006010,0.005441,0.006606,0.005989
199997,999997,0.032861,0.028636,0.035593,0.033981
199998,999998,0.049142,0.053191,0.046561,0.048041
