In [1]:
from catboost import CatBoostClassifier, Pool, cv
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [2]:
train_data = pd.read_csv('data/dk/train.csv')[:1000]
test_data = pd.read_csv('data/dk/testA.csv')[:100]

In [16]:
grade_dict = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6}

employmentLength_dict = {'1 year': 1,
                         '10+ years': 10,
                         '2 years': 2,
                         '3 years': 3,
                         '4 years': 4,
                         '5 years': 5,
                         '6 years': 6,
                         '7 years': 7,
                         '8 years': 8,
                         '9 years': 9,
                         '< 1 year': 0}


def get_sub_grade(grade, sub):
    return grade * 10 + int(sub[1])


def trans_issueDate(issueDate):
    year, month, day = issueDate.split('-')
    return int(year) * 12 + int(month) - 1


def trans_earliesCreditLine(earliesCreditLine):
    month_dict = {"Jan": 1, "Feb": 2, "Mar": 3, "Apr": 4, "May": 5, "Jun": 6, "Jul": 7, "Aug": 8, "Sep": 9, "Oct": 10,
                  "Nov": 11, "Dec": 12}
    month, year = earliesCreditLine.split('-')
    month = month_dict[month]
    return int(year) * 12 + month - 1


# 类别特征
cate_features = ['employmentTitle', 'employmentLength_bin', 'purpose', 'postCode', 'subGrade', 'earliesCreditLine_bin',
                 'regionCode', 'title', 'issueDate_bin', 'term_bin', 'interestRate_bin', 'annualIncome_bin',
                 'loanAmnt_bin', 'homeOwnership_bin', 'revolBal_bin', 'dti_bin', 'installment_bin', 'revolBal_bin',
                 'revolUtil_bin']



def process(dfs):
    for df in dfs:
        print(df.shape)
        df['grade'] = df['grade'].apply(lambda x: x if x not in grade_dict else grade_dict[x])
        df['subGrade'] = df.apply(lambda row: get_sub_grade(row['grade'], row['subGrade']), axis=1)
        df['employmentLength'] = df['employmentLength'].apply(
            lambda x: x if x not in employmentLength_dict else employmentLength_dict[x])
        df['issueDate'] = df['issueDate'].apply(lambda x: trans_issueDate(x))
        df['earliesCreditLine'] = df['earliesCreditLine'].apply(lambda x: trans_earliesCreditLine(x))
        df['date_Diff'] = df['issueDate'] - df['earliesCreditLine']
        df['installment_term_revolBal'] = df['installment'] * 12 * df['term'] / (df['revolBal'] + 0.1)
        df['revolUtil_revolBal'] = df['revolUtil'] / (df['revolBal'] + 0.1)
        df['openAcc_totalAcc'] = df['openAcc'] / df['totalAcc']
        df['dti'] = np.abs(df['dti'].fillna(1000))
        df['loanAmnt_dti_annualIncome'] = df['loanAmnt'] / (np.abs(df['dti']) * df['annualIncome'] + 0.1)
        df['employmentLength_bin'] = df['employmentLength']
        df['issueDate_bin'] = df['issueDate']
        df['earliesCreditLine_bin'] = df['earliesCreditLine']
        df['term_bin'] = df['term']
        df['homeOwnership_bin'] = df['homeOwnership']
        df['annualIncome_loanAmnt'] = df['annualIncome'] / (df['loanAmnt'] + 0.1)
        df['revolBal_loanAmnt'] = df['revolBal'] / (df['loanAmnt'] + 0.1)
        df['revolBal_installment'] = df['revolBal'] / (df['installment'] + 0.1)
        df['annualIncome_installment'] = df['annualIncome'] / (df['installment'] + 0.1)
    concated_df = pd.concat(dfs)
    label_lst = []
    # 把分箱后的特征做为类别特征处理
    bin_number = 10
    for i in range(bin_number):
        label_lst.append(i)
    dfs[0]['annualIncome_bin'] = pd.qcut(concated_df['annualIncome'], bin_number, labels=label_lst, duplicates='drop')[
                                 :dfs[0].shape[0]]
    dfs[0]['loanAmnt_bin'] = pd.qcut(concated_df['loanAmnt'], bin_number, labels=label_lst, duplicates='drop')[
                             :dfs[0].shape[0]]
    dfs[1]['annualIncome_bin'] = pd.qcut(concated_df['annualIncome'], bin_number, labels=label_lst, duplicates='drop')[
                                 dfs[0].shape[0]:]
    dfs[1]['loanAmnt_bin'] = pd.qcut(concated_df['loanAmnt'], bin_number, labels=label_lst, duplicates='drop')[
                             dfs[0].shape[0]:]

    label_lst = []
    bin_number = 100
    for i in range(bin_number):
        label_lst.append(i)
    dfs[0]['interestRate_bin'] = pd.qcut(concated_df['revolBal'], bin_number, labels=label_lst, duplicates='drop')[
                                 :dfs[0].shape[0]]
    dfs[0]['dti_bin'] = pd.qcut(concated_df['dti'], bin_number, labels=label_lst, duplicates='drop')[:dfs[0].shape[0]]
    dfs[0]['installment_bin'] = pd.qcut(concated_df['installment'], bin_number, labels=label_lst, duplicates='drop')[
                                :dfs[0].shape[0]]
    dfs[0]['revolBal_bin'] = pd.qcut(concated_df['revolBal'], bin_number, labels=label_lst, duplicates='drop')[
                             :dfs[0].shape[0]]
    dfs[0]['revolUtil_bin'] = pd.qcut(concated_df['revolUtil'], bin_number, labels=label_lst, duplicates='drop')[
                              :dfs[0].shape[0]]

    dfs[1]['interestRate_bin'] = pd.qcut(concated_df['revolBal'], bin_number, labels=label_lst, duplicates='drop')[
                                 dfs[0].shape[0]:]
    dfs[1]['dti_bin'] = pd.qcut(concated_df['dti'], bin_number, labels=label_lst, duplicates='drop')[dfs[0].shape[0]:]
    dfs[1]['installment_bin'] = pd.qcut(concated_df['installment'], bin_number, labels=label_lst, duplicates='drop')[
                                dfs[0].shape[0]:]
    dfs[1]['revolBal_bin'] = pd.qcut(concated_df['revolBal'], bin_number, labels=label_lst, duplicates='drop')[
                             dfs[0].shape[0]:]
    dfs[1]['revolUtil_bin'] = pd.qcut(concated_df['revolUtil'], bin_number, labels=label_lst, duplicates='drop')[
                              dfs[0].shape[0]:]

    for df in dfs:
        for cate in cate_features:
            df[cate] = df[cate].fillna(0).astype('int')
    issueDate_lst = list(set(concated_df['issueDate']))
    ratio_feat_lst = ['loanAmnt', 'installment', 'interestRate', 'annualIncome', 'dti', 'openAcc', \
                      'revolBal', 'revolUtil', 'totalAcc']
    issueDate_lst = list(set(concated_df['issueDate']))
    employmentLength_lst = list(set(concated_df['employmentLength']))
    purpose_lst = list(set(concated_df['purpose']))
    homeOwnership_lst = list(set(concated_df['homeOwnership']))
    for feat in ratio_feat_lst:
        issueDate_median = {}
        issueDate_item_rank = {}
        issueDate_label_mean = {}
        for dt in issueDate_lst:
            # 取最近6个月
            mask = (concated_df['issueDate'] >= dt - 3) & (concated_df['issueDate'] <= dt + 3)
            # 取最近6个月除去当前月份
            mask_1 = (concated_df['issueDate'] >= dt - 3) & (concated_df['issueDate'] <= dt + 3) & (
                    concated_df['issueDate'] != dt)
            item_series = concated_df.loc[mask, feat]
            label_series = concated_df.loc[mask_1, 'isDefault']
            # 取最近6个月的中位数
            issueDate_median[dt] = item_series.median()
            issueDate_label_mean[dt] = label_series.mean()
            item_rank = item_series.rank() / len(item_series)
            issueDate_item_rank[dt] = {}
            for item, rank in zip(item_series, item_rank):
                issueDate_item_rank[dt][item] = rank
        employmentLength_median = {}
        for et in employmentLength_lst:
            mask = concated_df['employmentLength'] == et
            item_series = concated_df.loc[mask, feat]
            employmentLength_median[et] = item_series.median()
        purpose_median = {}
        for pp in purpose_lst:
            mask = concated_df['purpose'] == pp
            item_series = concated_df.loc[mask, feat]
            purpose_median[pp] = item_series.median()
        homeOwnership_median = {}
        for ho in homeOwnership_lst:
            mask = concated_df['homeOwnership'] == ho
            item_series = concated_df.loc[mask, feat]
            homeOwnership_median[ho] = item_series.median()
        for df in dfs:
            print(feat, df.shape)
            df['label_issueDate_mean'] = df['issueDate'].apply(lambda x: issueDate_label_mean[x])
            df[feat + '_issueDate_median'] = df['issueDate'].apply(lambda x: issueDate_median[x])
            df[feat + '_issueDate_ratio'] = df.fillna(0).apply(lambda r: issueDate_item_rank[r['issueDate']][r[feat]],
                                                               axis=1)
            df[feat + '_employmentLength_ratio'] = df.fillna(0).apply(
                lambda r: r[feat] / employmentLength_median[r['employmentLength']], axis=1)
            df[feat + '_purpose_ratio'] = df.fillna(0).apply(lambda r: r[feat] / purpose_median[r['purpose']], axis=1)
            df[feat + '_homeOwnership_ratio'] = df.fillna(0).apply(
                lambda r: r[feat] / homeOwnership_median[r['homeOwnership']], axis=1)
            print(feat, df.shape)
    return dfs[0], dfs[1]

In [17]:
train_data, test_data = process([train_data, test_data])

(1000, 47)
(100, 46)
loanAmnt (1000, 68)
loanAmnt (1000, 74)
loanAmnt (100, 67)
loanAmnt (100, 73)
installment (1000, 74)
installment (1000, 79)
installment (100, 73)
installment (100, 78)
interestRate (1000, 79)
interestRate (1000, 84)
interestRate (100, 78)
interestRate (100, 83)
annualIncome (1000, 84)
annualIncome (1000, 89)
annualIncome (100, 83)
annualIncome (100, 88)
dti (1000, 89)
dti (1000, 94)
dti (100, 88)
dti (100, 93)
openAcc (1000, 94)
openAcc (1000, 99)
openAcc (100, 93)
openAcc (100, 98)
revolBal (1000, 99)
revolBal (1000, 104)
revolBal (100, 98)
revolBal (100, 103)
revolUtil (1000, 104)
revolUtil (1000, 109)
revolUtil (100, 103)
revolUtil (100, 108)
totalAcc (1000, 109)
totalAcc (1000, 114)
totalAcc (100, 108)
totalAcc (100, 113)


In [19]:
train_data['interestRate_bin'].value_counts()

19    12
49    11
34    11
62    11
61    11
      ..
6      8
66     8
68     8
46     8
96     7
Name: interestRate_bin, Length: 100, dtype: int64

In [20]:
# 删除不稳定特征

import toad

feat_lst = list(test_data.columns[1:])

# installment_homeOwnership_ratio        2.633922e-01
# installment_purpose_ratio              2.703396e-01
# revolBal_issueDate_ratio               2.992158e-01
# revolBal_loanAmnt                      3.035800e-01
# annualIncome_installment               3.122066e-01
# installment_issueDate_ratio            3.255245e-01
# installment_employmentLength_ratio     3.367582e-01
# revolUtil_issueDate_ratio              3.394691e-01
# revolBal_purpose_ratio                 3.754704e-01
# revolBal_homeOwnership_ratio           3.816513e-01
# revolBal_employmentLength_ratio        4.892058e-01
# dti_issueDate_ratio                    5.064817e-01

feat_lst.remove('installment_homeOwnership_ratio')
feat_lst.remove('installment_purpose_ratio')
feat_lst.remove('revolBal_issueDate_ratio')
feat_lst.remove('revolBal_loanAmnt')
feat_lst.remove('annualIncome_installment')
feat_lst.remove('installment_issueDate_ratio')
feat_lst.remove('installment_employmentLength_ratio')
feat_lst.remove('revolUtil_issueDate_ratio')
feat_lst.remove('revolBal_purpose_ratio')
feat_lst.remove('revolBal_homeOwnership_ratio')
feat_lst.remove('revolBal_employmentLength_ratio')
feat_lst.remove('dti_issueDate_ratio')

psi_df = toad.metrics.PSI(train_data[feat_lst], test_data[feat_lst]).sort_values(0)
psi_df

loanAmnt_dti_annualIncome           0.000000
installment_term_revolBal           0.000000
policyCode                          0.000000
revolBal_installment                0.000000
initialListStatus                   0.000016
                                      ...   
earliesCreditLine_bin               0.887026
openAcc_issueDate_ratio             0.976078
interestRate_homeOwnership_ratio    0.976104
postCode                            1.004782
revolUtil                           1.064223
Length: 100, dtype: float64

In [21]:
n_feat_lst = ['n0','n1','n2','n3','n4','n5','n6','n7','n8','n9','n10','n11','n12','n13','n14']
for col in n_feat_lst:
    feat_lst.remove(col)

In [22]:
def model_test(model, X_test, Y_test):
    test_pred_proba = model.predict_proba(X_test)
    roc_auc_test = roc_auc_score(Y_test,test_pred_proba[:,1])
    get_ks = lambda y_pred,y_true: ks_2samp(y_pred[y_true==1], y_pred[y_true!=1]).statistic
    ks_test = get_ks(test_pred_proba[:,1], Y_test)
    print('roc_auc_test =', roc_auc_test)
    print('ks_test =', ks_test)

In [23]:
def Ordered_TS(train_data, test_data, target, cate_features):
#     print(train_data.head())
    train_data['reindex'] = np.random.permutation(train_data.shape[0])
    train_data['istest'] = 0
    test_data['reindex'] = train_data.shape[0] + np.random.permutation(test_data.shape[0])
    test_data['istest'] = 1
    data_df = pd.concat([train_data, test_data])
    a = 1
    p = train_data[target].mean()
    numerial_feat_name_lst = []
    for feat in cate_features:
        print(feat)
        numerial_feat_name = 'numerial_'+feat
        numerial_feat_name_lst.append(numerial_feat_name)
        numerial_feat_value_lst = []
        cate_feat_df = data_df.sort_values(by=[feat, 'reindex'])[[feat, target, 'reindex', 'istest']]
        y_sum = 0
        y_cnt = 0
        elem_pre = -1e9
        for indx in range(cate_feat_df.shape[0]):
            elem = cate_feat_df.iloc[indx, 0]
            y = cate_feat_df.iloc[indx, 1]
            reindex = cate_feat_df.iloc[indx, 2]
            istest = cate_feat_df.iloc[indx, 3]
            if elem != elem_pre:
                y_sum = 0
                y_cnt = 0
            val = (y_sum+a*p)/(y_cnt+a)
            if istest == 0:
                y_sum += y
                y_cnt += 1
            numerial_feat_value_lst.append(val)
            elem_pre = elem
        cate_feat_df[numerial_feat_name] = numerial_feat_value_lst
        if numerial_feat_name in train_data.columns:
            del train_data[numerial_feat_name]
        train_data = train_data.merge(cate_feat_df.loc[cate_feat_df['istest']==0, ['reindex', numerial_feat_name]], on='reindex', how='left')
        if numerial_feat_name in test_data.columns:
            del test_data[numerial_feat_name]
        test_data = test_data.merge(cate_feat_df.loc[cate_feat_df['istest']==1, ['reindex', numerial_feat_name]], on='reindex', how='left')
    return train_data[["id"]+cate_features+numerial_feat_name_lst], \
        test_data[["id"]+cate_features+numerial_feat_name_lst], \
        numerial_feat_name_lst#, data_df, cate_feat_df, train_data, test_data

def Ordered_TS_Transform(splited_train_data, splited_test_data, target, cate_features, s):
#     X_train, X_validation, y_train, y_validation = train_test_split(mt_data_201909_df.loc[:, ['id', 'target']+feat_lst].fillna(0),
#                                                                     mt_data_201909_df.loc[:, 'target'],
#                                                                     test_size=0.2 , random_state=i*1000)
#     splited_train_data = X_train
#     splited_test_data = X_validation
#     target = 'target'
#     cate_features = ['2000040013']
#     s = 1
    ordered_cate_feat_dfs = []
    test_cate_feat_dfs = []
    numerial_cate_features_dict = {}
    for i in range(s): 
        print(i)
        ordered_cate_feat_df, test_cate_feat_df, numerial_cate_features = \
            Ordered_TS(splited_train_data, splited_test_data, target, cate_features)
        ordered_cate_feat_dfs.append(ordered_cate_feat_df)
        test_cate_feat_dfs.append(test_cate_feat_df)
    splited_train_data.reset_index(drop=True, inplace=True)
    splited_test_data.reset_index(drop=True, inplace=True)
    for feat in numerial_cate_features:
        splited_train_data[feat] = 0
        splited_test_data[feat] = 0
        for r in range(s):
            splited_train_data[feat] += ordered_cate_feat_dfs[r][feat]/s
            splited_test_data[feat] += test_cate_feat_dfs[r][feat]/s
        numerial_cate_features_dict[feat] = [splited_train_data[feat].mean(), splited_train_data[feat].std()]
    return numerial_cate_features, numerial_cate_features_dict  #,splited_train_data,splited_test_data

def Get_Noised_Test_Data(X_train, X_validation):
    noised_numerial_cate_features = []
    numerial_cate_features_dict = {}
    for feat in cate_features:
        numerial_feat = "numerial_" + feat
        noised_feat = "noised_" + numerial_feat
        noised_numerial_cate_features.append(noised_feat)
        X_train[noised_feat] = X_train[numerial_feat]
        X_validation[noised_feat] = X_validation[numerial_feat]
        numerial_cate_features_dict[numerial_feat] \
            = X_train[[feat,numerial_feat]].groupby(by=feat).agg(['mean', 'std']).to_dict()
        mean_key = (numerial_feat, 'mean')
        std_key = (numerial_feat, 'std')
        for k in numerial_cate_features_dict[numerial_feat][mean_key]:
            mu = numerial_cate_features_dict[numerial_feat][mean_key][k]
            sigma = numerial_cate_features_dict[numerial_feat][std_key][k]
            if np.isnan(sigma):
                sigma = 0.0001
            sz = X_validation.loc[X_validation[feat]==k, noised_feat].shape
            X_validation.loc[X_validation[feat]==k, noised_feat] = np.random.normal(mu, sigma, sz)
    return noised_numerial_cate_features

In [24]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from lightgbm.sklearn import LGBMClassifier, LGBMRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
# import xgboost as xgb
from scipy.stats import ks_2samp
from sklearn.metrics import roc_auc_score, f1_score
import numpy as np

def model_test(model, X_test, Y_test):
    test_pred_proba = model.predict_proba(X_test)
    roc_auc_test = roc_auc_score(Y_test,test_pred_proba[:,1])
    get_ks = lambda y_pred,y_true: ks_2samp(y_pred[y_true==1], y_pred[y_true!=1]).statistic
    ks_test = get_ks(test_pred_proba[:,1], Y_test)
    print('roc_auc_test =', roc_auc_test)
    print('ks_test =', ks_test)
    
def get_lr_model(name, train_df, test_df, columns_selected, target_col, early_stop=False, params={}):
    X_train = train_df[columns_selected].fillna(0)
    Y_train = train_df[target_col]
    X_test = test_df[columns_selected].fillna(0)
    Y_test = test_df[target_col]

    print(X_train.shape, X_test.shape, np.sum(Y_train), np.sum(Y_test))
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    if early_stop == False:
        model = LogisticRegression(**params).fit(X_train,Y_train)
    else:
        model = LogisticRegression(**params).fit(X_train,Y_train, eval_metric=ks_metric, early_stopping_rounds=100, eval_set=[(X_test, Y_test)])
    
    train_pred_proba = model.predict_proba(X_train)
    test_pred_proba = model.predict_proba(X_test)
    roc_auc_test = roc_auc_score(Y_test,test_pred_proba[:,1])
    roc_auc_train = roc_auc_score(Y_train,train_pred_proba[:,1])
    get_ks = lambda y_pred,y_true: ks_2samp(y_pred[y_true==1], y_pred[y_true!=1]).statistic
    ks_train = get_ks(train_pred_proba[:,1], Y_train)
    ks_test = get_ks(test_pred_proba[:,1], Y_test)
    print('task name =', name)
    print('roc_auc_train =', roc_auc_train)
    print('roc_auc_test =', roc_auc_test)
    print('ks_train =', ks_train)
    print('ks_test =', ks_test)
    return model

def get_lr_model_with_woe(name, train_df, test_df, woe_transform, step_wise, columns_selected, target_col, params={}):
    # initialise
    c = toad.transform.Combiner()

    to_drop=[]
    
    if woe_transform == False:
        scaler = StandardScaler().fit(train_df[columns_selected])
        train_df[columns_selected] = scaler.transform(train_df[columns_selected])
        test_df[columns_selected] = scaler.transform(test_df[columns_selected])
        train_df[columns_selected] = np.nan_to_num(train_df[columns_selected])
        test_df[columns_selected] = np.nan_to_num(test_df[columns_selected])
    
    columns_selected.append(target_col)
    if woe_transform == True:
        # Train binning with the selected features from previous; use reliable Chi-squared binning, and control that each bucket has at least 5% sample.
        c.fit(train_df[columns_selected], y = target_col, method = 'chi', min_samples = 0.05, exclude = to_drop) #empty_separate = False
        # Initialise
        transer = toad.transform.WOETransformer()

        # transer.fit_transform() & combiner.transform(). Remember to exclude target
        train_woe = transer.fit_transform(c.transform(train_df[columns_selected]), train_df[target_col], exclude=to_drop+[target_col])
        test_woe = transer.transform(c.transform(test_df[columns_selected]))
    else:
        train_woe = train_df[columns_selected]
        test_woe = test_df[columns_selected]
    
    print(train_woe.shape)
    #print(train_woe.describe())
    
    if step_wise == True:
        # Apply stepwise regression on the WOE-transformed data
        final_data = toad.selection.stepwise(train_woe,target = target_col, estimator='ols', direction = 'both', criterion = 'aic', exclude = to_drop)
        #  Place the selected features to test / OOT sample
        final_test = test_woe[final_data.columns]
        columns_selected = list(final_data.drop(to_drop+[target_col],axis=1).columns)
    else:
        final_data = train_woe
        final_test = test_woe
        columns_selected.remove(target_col)
    print(columns_selected)
    X_train, Y_train = final_data[columns_selected], final_data[target_col]
    X_test, Y_test = final_test[columns_selected], final_test[target_col]
    
    print(final_data.shape) #  Out of 31 features, stepwise regression selected 10 of them.

    model = LogisticRegression(**params).fit(X_train,Y_train)

    train_pred_proba = model.predict_proba(X_train)
    test_pred_proba = model.predict_proba(X_test)
    roc_auc_test = roc_auc_score(Y_test,test_pred_proba[:,1])
    roc_auc_train = roc_auc_score(Y_train,train_pred_proba[:,1])
    get_ks = lambda y_pred,y_true: ks_2samp(y_pred[y_true==1], y_pred[y_true!=1]).statistic
    ks_train = get_ks(train_pred_proba[:,1], Y_train)
    ks_test = get_ks(test_pred_proba[:,1], Y_test)
    print('task name =', name)
    print('roc_auc_train =', roc_auc_train)
    print('roc_auc_test =', roc_auc_test)
    print('ks_train =', ks_train)
    print('ks_test =', ks_test)
    return model

In [25]:
from catboost import CatBoostClassifier
from lightgbm.sklearn import LGBMClassifier, LGBMRegressor
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

cate_features = ['employmentTitle', 'employmentLength_bin', 'purpose', 'postCode', 'earliesCreditLine_bin', \
                 'regionCode', 'title', 'issueDate_bin', 'term_bin', 'homeOwnership_bin']
for col in cate_features:
    print(col)
    train_data[col] = train_data[col].fillna(0).astype("int")

lgb_params = {'metric': 'auc', 
              'lambda_l1': 0,
              'lambda_l2': 3,
              'num_leaves': 10,
              'feature_fraction': 0.7,
              'bagging_fraction': 0.7,
              'bagging_freq': 3,
              'min_child_samples': 50,
              'learning_rate': 0.1,
              'num_round': 1000}

target_col = 'isDefault'

for i in range(1,3):
    X_train, X_validation, y_train, y_validation = train_test_split(train_data.loc[:, ['id', target_col]+feat_lst].fillna(0),
                                                                    train_data.loc[:, target_col],
                                                                    test_size=0.125 , random_state=i*1000)
    lgb_model = LGBMClassifier(**lgb_params).fit(X_train[cate_features].fillna(0),y_train, eval_metric='AUC', early_stopping_rounds=100, 
                                             eval_set=[(X_validation[cate_features].fillna(0), y_validation)])
    model_test(lgb_model, X_validation[cate_features].fillna(0), y_validation)
    cat_model = CatBoostClassifier(iterations=1000, cat_features=cate_features,eval_metric='AUC',logging_level='Verbose',
                                 learning_rate=0.1, depth=5, l2_leaf_reg=3, loss_function='CrossEntropy')
    cat_model.fit(X_train.loc[:, cate_features].fillna(0),y_train, eval_set=(X_validation.loc[:, cate_features].fillna(0), y_validation), plot=False)
    model_test(cat_model, X_validation[cate_features].fillna(0), y_validation)
    numerial_cate_features, numerial_cate_features_dict = Ordered_TS_Transform(X_train, X_validation, target_col, cate_features, 10)
    lgb_model = LGBMClassifier(**lgb_params).fit(X_train[numerial_cate_features].fillna(0),y_train, eval_metric='AUC', early_stopping_rounds=100, 
                                             eval_set=[(X_validation[numerial_cate_features].fillna(0), y_validation)])
    model_test(lgb_model, X_validation[numerial_cate_features].fillna(0), y_validation)
    lr_params = {'class_weight':'balanced', 'max_iter':1000, 'random_state':1, 'solver':'lbfgs'}
    get_lr_model("normal LR", X_train, X_validation, cate_features, target_col, early_stop=False, params=lr_params)
    get_lr_model_with_woe("woe + LR", X_train, X_validation, True, False, cate_features, target_col, params=lr_params)
    get_lr_model("ordered_TS + LR", X_train, X_validation, numerial_cate_features, target_col, early_stop=False, params=lr_params)
    noised_numerial_cate_features = Get_Noised_Test_Data(X_train, X_validation)
    model_test(lgb_model, X_validation[noised_numerial_cate_features].fillna(0), y_validation)
    get_lr_model("noised_ordered_TS + LR", X_train, X_validation, noised_numerial_cate_features, target_col, early_stop=False, params=lr_params)
    

employmentTitle
employmentLength_bin
purpose
postCode
earliesCreditLine_bin
regionCode
title
issueDate_bin
term_bin
homeOwnership_bin
[1]	valid_0's auc: 0.7158
Training until validation scores don't improve for 100 rounds
[2]	valid_0's auc: 0.6756
[3]	valid_0's auc: 0.6606
[4]	valid_0's auc: 0.7256
[5]	valid_0's auc: 0.7364
[6]	valid_0's auc: 0.7688
[7]	valid_0's auc: 0.7724
[8]	valid_0's auc: 0.7652
[9]	valid_0's auc: 0.7772
[10]	valid_0's auc: 0.7796
[11]	valid_0's auc: 0.7852
[12]	valid_0's auc: 0.7928
[13]	valid_0's auc: 0.8008
[14]	valid_0's auc: 0.79
[15]	valid_0's auc: 0.7912
[16]	valid_0's auc: 0.7896
[17]	valid_0's auc: 0.7832
[18]	valid_0's auc: 0.788
[19]	valid_0's auc: 0.7808
[20]	valid_0's auc: 0.774
[21]	valid_0's auc: 0.7668
[22]	valid_0's auc: 0.7748
[23]	valid_0's auc: 0.7812
[24]	valid_0's auc: 0.7756
[25]	valid_0's auc: 0.7808
[26]	valid_0's auc: 0.7828
[27]	valid_0's auc: 0.7852
[28]	valid_0's auc: 0.7884
[29]	valid_0's auc: 0.792
[30]	valid_0's auc: 0.796
[31]	vali



2:	test: 0.7478000	best: 0.7512000 (1)	total: 76.3ms	remaining: 25.4s
3:	test: 0.7476000	best: 0.7512000 (1)	total: 81.6ms	remaining: 20.3s
4:	test: 0.7492000	best: 0.7512000 (1)	total: 88.1ms	remaining: 17.5s
5:	test: 0.7468000	best: 0.7512000 (1)	total: 93.4ms	remaining: 15.5s
6:	test: 0.7468000	best: 0.7512000 (1)	total: 94.8ms	remaining: 13.4s
7:	test: 0.7528000	best: 0.7528000 (7)	total: 101ms	remaining: 12.5s
8:	test: 0.7470000	best: 0.7528000 (7)	total: 109ms	remaining: 12s
9:	test: 0.7672000	best: 0.7672000 (9)	total: 117ms	remaining: 11.5s
10:	test: 0.7672000	best: 0.7672000 (9)	total: 118ms	remaining: 10.6s
11:	test: 0.7688000	best: 0.7688000 (11)	total: 128ms	remaining: 10.5s
12:	test: 0.7748000	best: 0.7748000 (12)	total: 135ms	remaining: 10.3s
13:	test: 0.7724000	best: 0.7748000 (12)	total: 142ms	remaining: 10s
14:	test: 0.7680000	best: 0.7748000 (12)	total: 149ms	remaining: 9.79s
15:	test: 0.7736000	best: 0.7748000 (12)	total: 155ms	remaining: 9.53s
16:	test: 0.7704000	be

159:	test: 0.7820000	best: 0.8072000 (87)	total: 645ms	remaining: 3.39s
160:	test: 0.7828000	best: 0.8072000 (87)	total: 648ms	remaining: 3.38s
161:	test: 0.7828000	best: 0.8072000 (87)	total: 651ms	remaining: 3.37s
162:	test: 0.7824000	best: 0.8072000 (87)	total: 654ms	remaining: 3.36s
163:	test: 0.7724000	best: 0.8072000 (87)	total: 659ms	remaining: 3.36s
164:	test: 0.7720000	best: 0.8072000 (87)	total: 663ms	remaining: 3.36s
165:	test: 0.7712000	best: 0.8072000 (87)	total: 668ms	remaining: 3.36s
166:	test: 0.7716000	best: 0.8072000 (87)	total: 672ms	remaining: 3.35s
167:	test: 0.7680000	best: 0.8072000 (87)	total: 676ms	remaining: 3.35s
168:	test: 0.7664000	best: 0.8072000 (87)	total: 679ms	remaining: 3.34s
169:	test: 0.7672000	best: 0.8072000 (87)	total: 683ms	remaining: 3.33s
170:	test: 0.7668000	best: 0.8072000 (87)	total: 687ms	remaining: 3.33s
171:	test: 0.7668000	best: 0.8072000 (87)	total: 690ms	remaining: 3.32s
172:	test: 0.7660000	best: 0.8072000 (87)	total: 693ms	remaining

307:	test: 0.6976000	best: 0.8072000 (87)	total: 1.22s	remaining: 2.73s
308:	test: 0.6980000	best: 0.8072000 (87)	total: 1.22s	remaining: 2.73s
309:	test: 0.6944000	best: 0.8072000 (87)	total: 1.22s	remaining: 2.73s
310:	test: 0.6952000	best: 0.8072000 (87)	total: 1.23s	remaining: 2.72s
311:	test: 0.6944000	best: 0.8072000 (87)	total: 1.23s	remaining: 2.72s
312:	test: 0.6948000	best: 0.8072000 (87)	total: 1.24s	remaining: 2.71s
313:	test: 0.6952000	best: 0.8072000 (87)	total: 1.24s	remaining: 2.71s
314:	test: 0.6924000	best: 0.8072000 (87)	total: 1.24s	remaining: 2.7s
315:	test: 0.6924000	best: 0.8072000 (87)	total: 1.25s	remaining: 2.69s
316:	test: 0.6928000	best: 0.8072000 (87)	total: 1.25s	remaining: 2.69s
317:	test: 0.6932000	best: 0.8072000 (87)	total: 1.25s	remaining: 2.69s
318:	test: 0.6932000	best: 0.8072000 (87)	total: 1.25s	remaining: 2.68s
319:	test: 0.6952000	best: 0.8072000 (87)	total: 1.26s	remaining: 2.67s
320:	test: 0.6952000	best: 0.8072000 (87)	total: 1.26s	remaining:

464:	test: 0.6756000	best: 0.8072000 (87)	total: 1.78s	remaining: 2.05s
465:	test: 0.6760000	best: 0.8072000 (87)	total: 1.78s	remaining: 2.05s
466:	test: 0.6764000	best: 0.8072000 (87)	total: 1.79s	remaining: 2.04s
467:	test: 0.6748000	best: 0.8072000 (87)	total: 1.79s	remaining: 2.04s
468:	test: 0.6748000	best: 0.8072000 (87)	total: 1.8s	remaining: 2.04s
469:	test: 0.6752000	best: 0.8072000 (87)	total: 1.8s	remaining: 2.03s
470:	test: 0.6736000	best: 0.8072000 (87)	total: 1.81s	remaining: 2.03s
471:	test: 0.6696000	best: 0.8072000 (87)	total: 1.81s	remaining: 2.03s
472:	test: 0.6720000	best: 0.8072000 (87)	total: 1.81s	remaining: 2.02s
473:	test: 0.6732000	best: 0.8072000 (87)	total: 1.82s	remaining: 2.02s
474:	test: 0.6736000	best: 0.8072000 (87)	total: 1.82s	remaining: 2.01s
475:	test: 0.6748000	best: 0.8072000 (87)	total: 1.83s	remaining: 2.01s
476:	test: 0.6744000	best: 0.8072000 (87)	total: 1.83s	remaining: 2.01s
477:	test: 0.6724000	best: 0.8072000 (87)	total: 1.83s	remaining: 

622:	test: 0.6376000	best: 0.8072000 (87)	total: 2.35s	remaining: 1.42s
623:	test: 0.6372000	best: 0.8072000 (87)	total: 2.35s	remaining: 1.42s
624:	test: 0.6388000	best: 0.8072000 (87)	total: 2.36s	remaining: 1.42s
625:	test: 0.6388000	best: 0.8072000 (87)	total: 2.36s	remaining: 1.41s
626:	test: 0.6376000	best: 0.8072000 (87)	total: 2.37s	remaining: 1.41s
627:	test: 0.6384000	best: 0.8072000 (87)	total: 2.37s	remaining: 1.41s
628:	test: 0.6372000	best: 0.8072000 (87)	total: 2.38s	remaining: 1.4s
629:	test: 0.6368000	best: 0.8072000 (87)	total: 2.38s	remaining: 1.4s
630:	test: 0.6364000	best: 0.8072000 (87)	total: 2.38s	remaining: 1.39s
631:	test: 0.6376000	best: 0.8072000 (87)	total: 2.39s	remaining: 1.39s
632:	test: 0.6396000	best: 0.8072000 (87)	total: 2.39s	remaining: 1.39s
633:	test: 0.6396000	best: 0.8072000 (87)	total: 2.4s	remaining: 1.38s
634:	test: 0.6384000	best: 0.8072000 (87)	total: 2.4s	remaining: 1.38s
635:	test: 0.6384000	best: 0.8072000 (87)	total: 2.4s	remaining: 1.3

777:	test: 0.6036000	best: 0.8072000 (87)	total: 2.92s	remaining: 834ms
778:	test: 0.6028000	best: 0.8072000 (87)	total: 2.93s	remaining: 831ms
779:	test: 0.6036000	best: 0.8072000 (87)	total: 2.93s	remaining: 827ms
780:	test: 0.6044000	best: 0.8072000 (87)	total: 2.93s	remaining: 823ms
781:	test: 0.6040000	best: 0.8072000 (87)	total: 2.94s	remaining: 819ms
782:	test: 0.6040000	best: 0.8072000 (87)	total: 2.94s	remaining: 816ms
783:	test: 0.6064000	best: 0.8072000 (87)	total: 2.95s	remaining: 812ms
784:	test: 0.6060000	best: 0.8072000 (87)	total: 2.95s	remaining: 808ms
785:	test: 0.6068000	best: 0.8072000 (87)	total: 2.96s	remaining: 805ms
786:	test: 0.6052000	best: 0.8072000 (87)	total: 2.96s	remaining: 801ms
787:	test: 0.6076000	best: 0.8072000 (87)	total: 2.96s	remaining: 797ms
788:	test: 0.6088000	best: 0.8072000 (87)	total: 2.97s	remaining: 794ms
789:	test: 0.6084000	best: 0.8072000 (87)	total: 2.97s	remaining: 790ms
790:	test: 0.6088000	best: 0.8072000 (87)	total: 2.97s	remaining

936:	test: 0.6216000	best: 0.8072000 (87)	total: 3.49s	remaining: 235ms
937:	test: 0.6236000	best: 0.8072000 (87)	total: 3.5s	remaining: 231ms
938:	test: 0.6228000	best: 0.8072000 (87)	total: 3.5s	remaining: 228ms
939:	test: 0.6224000	best: 0.8072000 (87)	total: 3.51s	remaining: 224ms
940:	test: 0.6220000	best: 0.8072000 (87)	total: 3.51s	remaining: 220ms
941:	test: 0.6200000	best: 0.8072000 (87)	total: 3.51s	remaining: 216ms
942:	test: 0.6212000	best: 0.8072000 (87)	total: 3.52s	remaining: 213ms
943:	test: 0.6204000	best: 0.8072000 (87)	total: 3.52s	remaining: 209ms
944:	test: 0.6204000	best: 0.8072000 (87)	total: 3.52s	remaining: 205ms
945:	test: 0.6184000	best: 0.8072000 (87)	total: 3.53s	remaining: 201ms
946:	test: 0.6196000	best: 0.8072000 (87)	total: 3.53s	remaining: 198ms
947:	test: 0.6192000	best: 0.8072000 (87)	total: 3.54s	remaining: 194ms
948:	test: 0.6184000	best: 0.8072000 (87)	total: 3.54s	remaining: 190ms
949:	test: 0.6192000	best: 0.8072000 (87)	total: 3.54s	remaining: 



[1]	valid_0's auc: 0.6938
Training until validation scores don't improve for 100 rounds
[2]	valid_0's auc: 0.674
[3]	valid_0's auc: 0.6144
[4]	valid_0's auc: 0.6302
[5]	valid_0's auc: 0.608
[6]	valid_0's auc: 0.5926
[7]	valid_0's auc: 0.6732
[8]	valid_0's auc: 0.6772
[9]	valid_0's auc: 0.677
[10]	valid_0's auc: 0.708
[11]	valid_0's auc: 0.6864
[12]	valid_0's auc: 0.7052
[13]	valid_0's auc: 0.6948
[14]	valid_0's auc: 0.6928
[15]	valid_0's auc: 0.6716
[16]	valid_0's auc: 0.6844
[17]	valid_0's auc: 0.6912
[18]	valid_0's auc: 0.7188
[19]	valid_0's auc: 0.7132
[20]	valid_0's auc: 0.6908
[21]	valid_0's auc: 0.688
[22]	valid_0's auc: 0.6848
[23]	valid_0's auc: 0.6828
[24]	valid_0's auc: 0.674
[25]	valid_0's auc: 0.67
[26]	valid_0's auc: 0.6856
[27]	valid_0's auc: 0.684
[28]	valid_0's auc: 0.6804
[29]	valid_0's auc: 0.6788
[30]	valid_0's auc: 0.678
[31]	valid_0's auc: 0.6808
[32]	valid_0's auc: 0.68
[33]	valid_0's auc: 0.6792
[34]	valid_0's auc: 0.6916
[35]	valid_0's auc: 0.6944
[36]	valid_0's



21:	test: 0.6348039	best: 0.6405229 (19)	total: 91.8ms	remaining: 4.08s
22:	test: 0.6358932	best: 0.6405229 (19)	total: 99.4ms	remaining: 4.22s
23:	test: 0.6451525	best: 0.6451525 (23)	total: 106ms	remaining: 4.29s
24:	test: 0.6337146	best: 0.6451525 (23)	total: 113ms	remaining: 4.39s
25:	test: 0.6546841	best: 0.6546841 (25)	total: 119ms	remaining: 4.45s
26:	test: 0.6437908	best: 0.6546841 (25)	total: 127ms	remaining: 4.57s
27:	test: 0.6432462	best: 0.6546841 (25)	total: 132ms	remaining: 4.58s
28:	test: 0.6503268	best: 0.6546841 (25)	total: 137ms	remaining: 4.59s
29:	test: 0.6595861	best: 0.6595861 (29)	total: 142ms	remaining: 4.6s
30:	test: 0.6633987	best: 0.6633987 (30)	total: 147ms	remaining: 4.6s
31:	test: 0.6655773	best: 0.6655773 (31)	total: 152ms	remaining: 4.59s
32:	test: 0.6693900	best: 0.6693900 (32)	total: 156ms	remaining: 4.56s
33:	test: 0.6666667	best: 0.6693900 (32)	total: 157ms	remaining: 4.46s
34:	test: 0.6595861	best: 0.6693900 (32)	total: 162ms	remaining: 4.47s
35:	te

136:	test: 0.6356209	best: 0.6737473 (103)	total: 474ms	remaining: 2.98s
137:	test: 0.6367102	best: 0.6737473 (103)	total: 478ms	remaining: 2.98s
138:	test: 0.6388889	best: 0.6737473 (103)	total: 481ms	remaining: 2.98s
139:	test: 0.6350763	best: 0.6737473 (103)	total: 486ms	remaining: 2.98s
140:	test: 0.6318083	best: 0.6737473 (103)	total: 490ms	remaining: 2.98s
141:	test: 0.6307190	best: 0.6737473 (103)	total: 493ms	remaining: 2.98s
142:	test: 0.6312636	best: 0.6737473 (103)	total: 497ms	remaining: 2.98s
143:	test: 0.6361656	best: 0.6737473 (103)	total: 500ms	remaining: 2.98s
144:	test: 0.6356209	best: 0.6737473 (103)	total: 504ms	remaining: 2.97s
145:	test: 0.6339869	best: 0.6737473 (103)	total: 507ms	remaining: 2.96s
146:	test: 0.6350763	best: 0.6737473 (103)	total: 510ms	remaining: 2.96s
147:	test: 0.6345316	best: 0.6737473 (103)	total: 515ms	remaining: 2.96s
148:	test: 0.6350763	best: 0.6737473 (103)	total: 518ms	remaining: 2.96s
149:	test: 0.6350763	best: 0.6737473 (103)	total: 5

251:	test: 0.6138344	best: 0.6737473 (103)	total: 857ms	remaining: 2.54s
252:	test: 0.6138344	best: 0.6737473 (103)	total: 861ms	remaining: 2.54s
253:	test: 0.6122004	best: 0.6737473 (103)	total: 866ms	remaining: 2.54s
254:	test: 0.6149237	best: 0.6737473 (103)	total: 871ms	remaining: 2.54s
255:	test: 0.6132898	best: 0.6737473 (103)	total: 876ms	remaining: 2.55s
256:	test: 0.6132898	best: 0.6737473 (103)	total: 880ms	remaining: 2.54s
257:	test: 0.6105664	best: 0.6737473 (103)	total: 883ms	remaining: 2.54s
258:	test: 0.6078431	best: 0.6737473 (103)	total: 886ms	remaining: 2.53s
259:	test: 0.6083878	best: 0.6737473 (103)	total: 890ms	remaining: 2.53s
260:	test: 0.6116558	best: 0.6737473 (103)	total: 894ms	remaining: 2.53s
261:	test: 0.6171024	best: 0.6737473 (103)	total: 898ms	remaining: 2.53s
262:	test: 0.6149237	best: 0.6737473 (103)	total: 901ms	remaining: 2.52s
263:	test: 0.6160131	best: 0.6737473 (103)	total: 905ms	remaining: 2.52s
264:	test: 0.6198257	best: 0.6737473 (103)	total: 9

413:	test: 0.6040305	best: 0.6737473 (103)	total: 1.43s	remaining: 2.02s
414:	test: 0.6034858	best: 0.6737473 (103)	total: 1.43s	remaining: 2.02s
415:	test: 0.6051198	best: 0.6737473 (103)	total: 1.44s	remaining: 2.02s
416:	test: 0.6062092	best: 0.6737473 (103)	total: 1.45s	remaining: 2.02s
417:	test: 0.6078431	best: 0.6737473 (103)	total: 1.45s	remaining: 2.02s
418:	test: 0.6083878	best: 0.6737473 (103)	total: 1.45s	remaining: 2.02s
419:	test: 0.6105664	best: 0.6737473 (103)	total: 1.46s	remaining: 2.01s
420:	test: 0.6083878	best: 0.6737473 (103)	total: 1.46s	remaining: 2.01s
421:	test: 0.6078431	best: 0.6737473 (103)	total: 1.46s	remaining: 2s
422:	test: 0.6072985	best: 0.6737473 (103)	total: 1.47s	remaining: 2s
423:	test: 0.6100218	best: 0.6737473 (103)	total: 1.47s	remaining: 2s
424:	test: 0.6105664	best: 0.6737473 (103)	total: 1.47s	remaining: 1.99s
425:	test: 0.6100218	best: 0.6737473 (103)	total: 1.48s	remaining: 1.99s
426:	test: 0.6105664	best: 0.6737473 (103)	total: 1.48s	rema

527:	test: 0.6138344	best: 0.6737473 (103)	total: 1.81s	remaining: 1.62s
528:	test: 0.6127451	best: 0.6737473 (103)	total: 1.81s	remaining: 1.61s
529:	test: 0.6160131	best: 0.6737473 (103)	total: 1.82s	remaining: 1.61s
530:	test: 0.6165577	best: 0.6737473 (103)	total: 1.82s	remaining: 1.61s
531:	test: 0.6149237	best: 0.6737473 (103)	total: 1.83s	remaining: 1.61s
532:	test: 0.6149237	best: 0.6737473 (103)	total: 1.83s	remaining: 1.6s
533:	test: 0.6176471	best: 0.6737473 (103)	total: 1.83s	remaining: 1.6s
534:	test: 0.6176471	best: 0.6737473 (103)	total: 1.84s	remaining: 1.6s
535:	test: 0.6165577	best: 0.6737473 (103)	total: 1.84s	remaining: 1.59s
536:	test: 0.6149237	best: 0.6737473 (103)	total: 1.84s	remaining: 1.59s
537:	test: 0.6149237	best: 0.6737473 (103)	total: 1.85s	remaining: 1.59s
538:	test: 0.6165577	best: 0.6737473 (103)	total: 1.85s	remaining: 1.58s
539:	test: 0.6165577	best: 0.6737473 (103)	total: 1.85s	remaining: 1.58s
540:	test: 0.6143791	best: 0.6737473 (103)	total: 1.85

690:	test: 0.6214597	best: 0.6737473 (103)	total: 2.38s	remaining: 1.06s
691:	test: 0.6225490	best: 0.6737473 (103)	total: 2.38s	remaining: 1.06s
692:	test: 0.6214597	best: 0.6737473 (103)	total: 2.39s	remaining: 1.06s
693:	test: 0.6225490	best: 0.6737473 (103)	total: 2.39s	remaining: 1.05s
694:	test: 0.6220044	best: 0.6737473 (103)	total: 2.4s	remaining: 1.05s
695:	test: 0.6225490	best: 0.6737473 (103)	total: 2.4s	remaining: 1.05s
696:	test: 0.6203704	best: 0.6737473 (103)	total: 2.4s	remaining: 1.04s
697:	test: 0.6192810	best: 0.6737473 (103)	total: 2.41s	remaining: 1.04s
698:	test: 0.6203704	best: 0.6737473 (103)	total: 2.41s	remaining: 1.04s
699:	test: 0.6209150	best: 0.6737473 (103)	total: 2.42s	remaining: 1.03s
700:	test: 0.6225490	best: 0.6737473 (103)	total: 2.42s	remaining: 1.03s
701:	test: 0.6220044	best: 0.6737473 (103)	total: 2.42s	remaining: 1.03s
702:	test: 0.6220044	best: 0.6737473 (103)	total: 2.42s	remaining: 1.02s
703:	test: 0.6192810	best: 0.6737473 (103)	total: 2.43

849:	test: 0.6072985	best: 0.6737473 (103)	total: 2.95s	remaining: 521ms
850:	test: 0.6072985	best: 0.6737473 (103)	total: 2.96s	remaining: 518ms
851:	test: 0.6078431	best: 0.6737473 (103)	total: 2.96s	remaining: 515ms
852:	test: 0.6072985	best: 0.6737473 (103)	total: 2.97s	remaining: 511ms
853:	test: 0.6078431	best: 0.6737473 (103)	total: 2.97s	remaining: 508ms
854:	test: 0.6083878	best: 0.6737473 (103)	total: 2.98s	remaining: 505ms
855:	test: 0.6094771	best: 0.6737473 (103)	total: 2.98s	remaining: 501ms
856:	test: 0.6100218	best: 0.6737473 (103)	total: 2.98s	remaining: 498ms
857:	test: 0.6100218	best: 0.6737473 (103)	total: 2.99s	remaining: 494ms
858:	test: 0.6078431	best: 0.6737473 (103)	total: 2.99s	remaining: 491ms
859:	test: 0.6083878	best: 0.6737473 (103)	total: 3s	remaining: 488ms
860:	test: 0.6083878	best: 0.6737473 (103)	total: 3s	remaining: 484ms
861:	test: 0.6094771	best: 0.6737473 (103)	total: 3s	remaining: 481ms
862:	test: 0.6089325	best: 0.6737473 (103)	total: 3s	remaini

roc_auc_test = 0.6737472766884531
ks_test = 0.4471677559912854
0
employmentTitle
employmentLength_bin
purpose
postCode
earliesCreditLine_bin
regionCode
title
issueDate_bin
term_bin
homeOwnership_bin
1
employmentTitle
employmentLength_bin
purpose
postCode
earliesCreditLine_bin
regionCode
title
issueDate_bin
term_bin
homeOwnership_bin
2
employmentTitle
employmentLength_bin
purpose
postCode
earliesCreditLine_bin
regionCode
title
issueDate_bin
term_bin
homeOwnership_bin
3
employmentTitle
employmentLength_bin
purpose
postCode
earliesCreditLine_bin
regionCode
title
issueDate_bin
term_bin
homeOwnership_bin
4
employmentTitle
employmentLength_bin
purpose
postCode
earliesCreditLine_bin
regionCode
title
issueDate_bin
term_bin
homeOwnership_bin
5
employmentTitle
employmentLength_bin
purpose
postCode
earliesCreditLine_bin
regionCode
title
issueDate_bin
term_bin
homeOwnership_bin
6
employmentTitle
employmentLength_bin
purpose
postCode
earliesCreditLine_bin
regionCode
title
issueDate_bin
term_bin
hom



[102]	valid_0's auc: 0.662309
[103]	valid_0's auc: 0.66122
[104]	valid_0's auc: 0.663399
[105]	valid_0's auc: 0.663943
[106]	valid_0's auc: 0.668845
[107]	valid_0's auc: 0.666667
[108]	valid_0's auc: 0.669935
[109]	valid_0's auc: 0.668301
[110]	valid_0's auc: 0.668301
[111]	valid_0's auc: 0.667756
[112]	valid_0's auc: 0.671024
[113]	valid_0's auc: 0.670479
[114]	valid_0's auc: 0.671569
[115]	valid_0's auc: 0.673747
[116]	valid_0's auc: 0.675381
[117]	valid_0's auc: 0.67756
[118]	valid_0's auc: 0.680283
[119]	valid_0's auc: 0.678105
[120]	valid_0's auc: 0.678105
[121]	valid_0's auc: 0.681917
[122]	valid_0's auc: 0.686275
[123]	valid_0's auc: 0.686819
[124]	valid_0's auc: 0.690632
[125]	valid_0's auc: 0.691721
[126]	valid_0's auc: 0.692266
[127]	valid_0's auc: 0.69281
[128]	valid_0's auc: 0.692266
[129]	valid_0's auc: 0.6939
[130]	valid_0's auc: 0.692266
[131]	valid_0's auc: 0.687364
[132]	valid_0's auc: 0.685185
[133]	valid_0's auc: 0.690087
[134]	valid_0's auc: 0.687364
[135]	valid_0's

In [26]:
    noised_numerial_cate_features = Get_Noised_Test_Data(X_train, X_validation)
    model_test(lgb_model, X_validation[noised_numerial_cate_features].fillna(0), y_validation)
    get_lr_model("noised_ordered_TS + LR", X_train, X_validation, noised_numerial_cate_features, target_col, early_stop=False, params=lr_params)

roc_auc_test = 0.5893246187363834
ks_test = 0.28050108932461876
(875, 10) (125, 10) 188 17
task name = noised_ordered_TS + LR
roc_auc_train = 0.6827634798228499
roc_auc_test = 0.5190631808278867
ks_train = 0.3117780048933073
ks_test = 0.11220043572984749


LogisticRegression(class_weight='balanced', max_iter=1000, random_state=1)

In [27]:
import random

# noised_numerial_cate_features = []

# for feat in cate_features:
#     numerial_feat = "numerial_" + feat
#     noised_feat = "noised_" + numerial_feat
#     noised_numerial_cate_features.append(noised_feat)
#     X_train[noised_feat] = X_train[numerial_feat]
#     X_validation[noised_feat] = X_validation[numerial_feat]
#     mean_key = (numerial_feat, 'mean')
#     std_key = (numerial_feat, 'std')
#     for k in numerial_cate_features_dict[numerial_feat][mean_key]:
#         mu = numerial_cate_features_dict[numerial_feat][mean_key][k]
#         sigma = numerial_cate_features_dict[numerial_feat][std_key][k]
#         if np.isnan(sigma):
#             sigma = 0.0001
#         sz = X_validation.loc[X_validation[feat]==k, noised_feat].shape
#         X_validation.loc[X_validation[feat]==k, noised_feat] = np.random.normal(mu, sigma, sz)
        
    
lgb_model = LGBMClassifier(**lgb_params).fit(X_train[noised_numerial_cate_features].fillna(0),y_train, eval_metric='AUC', early_stopping_rounds=100, 
                                         eval_set=[(X_validation[noised_numerial_cate_features].fillna(0), y_validation)])
model_test(lgb_model, X_validation[noised_numerial_cate_features].fillna(0), y_validation)
lr_params = {'class_weight':'balanced', 'max_iter':1000, 'random_state':1, 'solver':'lbfgs'}
get_lr_model("normal LR", X_train, X_validation, cate_features, target_col, early_stop=False, params=lr_params)
# get_lr_model_with_woe("woe + LR", X_train, X_validation, True, False, cate_features, target_col, params=lr_params)
get_lr_model("ordered_TS + LR", X_train, X_validation, noised_numerial_cate_features, target_col, early_stop=False, params=lr_params)

[1]	valid_0's auc: 0.646242
Training until validation scores don't improve for 100 rounds
[2]	valid_0's auc: 0.638889
[3]	valid_0's auc: 0.606754
[4]	valid_0's auc: 0.590959
[5]	valid_0's auc: 0.633987
[6]	valid_0's auc: 0.62854
[7]	valid_0's auc: 0.626362
[8]	valid_0's auc: 0.629085
[9]	valid_0's auc: 0.627996
[10]	valid_0's auc: 0.616558
[11]	valid_0's auc: 0.61329
[12]	valid_0's auc: 0.609477
[13]	valid_0's auc: 0.594771
[14]	valid_0's auc: 0.590959
[15]	valid_0's auc: 0.592593
[16]	valid_0's auc: 0.589325
[17]	valid_0's auc: 0.593682
[18]	valid_0's auc: 0.60512
[19]	valid_0's auc: 0.610566
[20]	valid_0's auc: 0.604575
[21]	valid_0's auc: 0.602941
[22]	valid_0's auc: 0.607298
[23]	valid_0's auc: 0.61329
[24]	valid_0's auc: 0.608932
[25]	valid_0's auc: 0.618736
[26]	valid_0's auc: 0.6122
[27]	valid_0's auc: 0.622004
[28]	valid_0's auc: 0.62037
[29]	valid_0's auc: 0.622549
[30]	valid_0's auc: 0.620915
[31]	valid_0's auc: 0.625817
[32]	valid_0's auc: 0.619281
[33]	valid_0's auc: 0.6181



LogisticRegression(class_weight='balanced', max_iter=1000, random_state=1)

In [28]:
cate_features, noised_numerial_cate_features

(['employmentTitle',
  'employmentLength_bin',
  'purpose',
  'postCode',
  'earliesCreditLine_bin',
  'regionCode',
  'title',
  'issueDate_bin',
  'term_bin',
  'homeOwnership_bin'],
 ['noised_numerial_employmentTitle',
  'noised_numerial_employmentLength_bin',
  'noised_numerial_purpose',
  'noised_numerial_postCode',
  'noised_numerial_earliesCreditLine_bin',
  'noised_numerial_regionCode',
  'noised_numerial_title',
  'noised_numerial_issueDate_bin',
  'noised_numerial_term_bin',
  'noised_numerial_homeOwnership_bin'])

In [29]:
splited_train_data.reset_index(drop=True, inplace=True)
splited_test_data.reset_index(drop=True, inplace=True)

for feat in numerial_cate_features:
    splited_train_data[feat] = 0
    splited_test_data[feat] = 0
    for r in range(s):
        splited_train_data[feat] += ordered_cate_feat_dfs[r][feat]/s
        splited_test_data[feat] += test_cate_feat_dfs[r][feat]/s

NameError: name 'splited_train_data' is not defined

In [None]:
model_lst = []

# n1	0.016240	NaN	NaN	34.0
# employmentLength	0.012264	NaN	NaN	12.0
# n7	0.012249	NaN	NaN	71.0
# n10	0.010291	NaN	NaN	77.0
# n5	0.009016	NaN	NaN	66.0
# n0	0.008828	NaN	NaN	40.0
# n4	0.007679	NaN	NaN	47.0
# n13	0.007130	NaN	NaN	29.0
# n6	0.007099	NaN	NaN	108.0
# pubRec	0.006602	NaN	NaN	32.0
# n8	0.006585	NaN	NaN	103.0
# n12	0.005863	NaN	NaN	6.0
# revolBal	0.005592	NaN	NaN	71116.0

# cate_features = ['employmentTitle', 'employmentLength_bin']
# train_data['employmentTitle_bin'] = train_data['employmentTitle']
# test_data['employmentTitle_bin'] = test_data['employmentTitle']
# feat_lst.remove('employmentTitle')
# train_data['postCode_bin'] = train_data['postCode']
# test_data['postCode_bin'] = test_data['postCode']
# feat_lst.remove('postCode')
cate_features = ['employmentTitle', 'employmentLength_bin', 'purpose', 'postCode', 'subGrade', 'earliesCreditLine_bin', \
                 'regionCode', 'title', 'issueDate_bin', 'term_bin',\
                 'interestRate_bin', 'annualIncome_bin', 'loanAmnt_bin','homeOwnership_bin',\
                 'dti_bin','installment_bin','revolBal_bin','revolUtil_bin']

#feat_lst = list(set(feat_lst))

for cate in cate_features:
    train_data[cate] = train_data[cate].fillna(0).astype('int')
    test_data[cate] = test_data[cate].fillna(0).astype('int')

train_data = train_data.sort_values(by='issueDate')
    
for i in range(3):
    X_train, X_validation, y_train, y_validation = train_test_split(train_data.loc[:, feat_lst],
                                                                    train_data.loc[:, 'isDefault'],
                                                                    test_size=0.125 , random_state=i*2000)
    model = CatBoostClassifier(iterations=2500, cat_features=cate_features,eval_metric='AUC',logging_level='Verbose', 
                            learning_rate=0.05, depth=6, l2_leaf_reg=5, loss_function='Logloss')
    print(X_train.shape,
          y_train.shape, 
          X_validation.shape, 
          y_validation.shape)
    model.fit(X_train,y_train, eval_set=(X_validation, y_validation), plot=True)
    model_lst.append(model)

In [None]:
full_model = CatBoostClassifier(iterations=3500, depth=6, cat_features=cate_features, learning_rate=0.05, loss_function='CrossEntropy',
                                eval_metric='AUC',logging_level='Verbose', l2_leaf_reg=5)
test_data['isDefault'] = 0
full_model.fit(train_data[feat_lst],train_data['isDefault'],plot=False)
preds = full_model.predict_proba(test_data[feat_lst])
test_data['isDefault'] = preds[:, 1]
test_data[['id','isDefault']].to_csv('submit.csv', index=False)

In [None]:
set1 = set(train_data['employmentTitle']) - set(test_data['employmentTitle'])
set2 = set(train_data['earliesCreditLine']) - set(test_data['earliesCreditLine'])
train_data['flag1'] = train_data['employmentTitle'].apply(lambda x: 1 if x in set1 else 0)
train_data['flag2'] = train_data['earliesCreditLine'].apply(lambda x: 1 if x in set2 else 0)

In [None]:
train_mask = (train_data['flag1']==0)|(train_data['id']>=200000)
valid_mask = (train_data['flag1']==1)&(train_data['id']<200000)

sum(train_mask), sum(valid_mask)

In [None]:
cate_features = ['employmentTitle_bin', 'employmentLength_bin', 'purpose', 'postCode_bin', 'subGrade', 'earliesCreditLine_bin', \
                 'regionCode', 'title', 'issueDate_bin', 'term_bin',\
                 'interestRate_bin', 'annualIncome_bin', 'loanAmnt_bin','homeOwnership_bin',\
                 'dti_bin','installment_bin','revolBal_bin','revolUtil_bin']

feat_lst = list(set(feat_lst))

for cate in cate_features:
    train_data[cate] = train_data[cate].fillna(0).astype('int')
    test_data[cate] = test_data[cate].fillna(0).astype('int')


X_train, y_train, X_validation, y_validation = train_data.loc[train_mask, feat_lst], \
                                                train_data.loc[train_mask, 'isDefault'], \
                                                train_data.loc[valid_mask, feat_lst], \
                                                train_data.loc[valid_mask, 'isDefault']
model = CatBoostClassifier(iterations=3500, cat_features=cate_features,eval_metric='AUC',logging_level='Verbose',
                          learning_rate=0.06, depth=6, l2_leaf_reg=5, loss_function='CrossEntropy')
print(X_train.shape,
      y_train.shape, 
      X_validation.shape, 
      y_validation.shape)
model.fit(X_train,y_train, eval_set=(X_validation, y_validation), plot=True)

In [None]:
test_data['isDefault'] = 0
preds = model.predict_proba(test_data[feat_lst])
test_data['isDefault'] = preds[:, 1]
test_data[['id','isDefault']].to_csv('submit.csv', index=False)

In [None]:
model_lst = []

# cate_features = ['employmentTitle', 'employmentLength_bin', 'purpose', 'postCode', 'subGrade', 'earliesCreditLine_bin', \
#                  'regionCode', 'applicationType', 'initialListStatus', 'title', 'issueDate_bin', 'term_bin', \
#                  'ficoRangeHigh', 'annualIncome_bin','dti_bin','loanAmnt_bin','installment_bin','revolBal_bin','revolUtil_bin']
                 #'interestRate_bin']
cate_features = ['employmentTitle', 'employmentLength_bin', 'purpose', 'postCode', 'subGrade', 'earliesCreditLine_bin', \
                 'regionCode', 'title', 'issueDate_bin', 'term_bin',\
                 'interestRate_bin', 'annualIncome_bin', 'loanAmnt_bin','homeOwnership_bin',\
                 'dti_bin','installment_bin','revolBal_bin','revolUtil_bin']

# for i in range(3):
#     X_train, X_validation, y_train, y_validation = train_test_split(splited_train_data.loc[:, nume_features+numerial_cate_features],
#                                                                     train_data.loc[:, 'isDefault'],
#                                                                     test_size=0.125 , random_state=i*2000)
model = CatBoostClassifier(iterations=2500,eval_metric='AUC',logging_level='Verbose',
                          depth=6, loss_function='Logloss')
print(X_train.shape,
      y_train.shape, 
      X_validation.shape, 
      y_validation.shape)
model.fit(splited_train_data[nume_features+numerial_cate_features], splited_train_data['isDefault'], 
          eval_set=(splited_test_data[nume_features+numerial_cate_features], splited_test_data['isDefault']), plot=True)
model_lst.append(model)

In [None]:
from lightgbm.sklearn import LGBMClassifier, LGBMRegressor

# cate_features = ['employmentTitle', 'employmentLength_bin', 'purpose', 'postCode', 'subGrade', 'earliesCreditLine_bin', \
#                  'regionCode', 'title', 'issueDate_bin', 'term_bin',\
#                  'interestRate_bin', 'annualIncome_bin', 'loanAmnt_bin','homeOwnership_bin',\
#                  'dti_bin','installment_bin','revolBal_bin','revolUtil_bin']
cate_features = ['employmentTitle', 'issueDate_bin', 'subGrade']
# nume_features = []

# for feat in feat_lst:
#     if feat not in cate_features:
#         nume_features.append(feat)

# print(len(nume_features))
        
lgb_params = {'metric': 'auc', 
              'lambda_l1': 0,
              'lambda_l2': 5,
              'num_leaves': 64,
              'feature_fraction': 0.7,
              'bagging_fraction': 0.7,
              'bagging_freq': 3,
              'min_child_samples': 50,
              'learning_rate': 0.05,
              'num_round': 1000}
        

# X_train, X_test, y_train, y_test = train_test_split(train_data.loc[:, ['id', 'isDefault']+feat_lst],
#                                                     train_data.loc[:, 'isDefault'],
#                                                     test_size=0.125 , random_state=2000)
# splited_train_data = X_train
# splited_test_data = X_test
print(splited_train_data.shape, splited_test_data.shape)

model.fit(splited_train_data[nume_features+numerial_cate_features], splited_train_data['isDefault'], 
          eval_set=(splited_test_data[nume_features+numerial_cate_features], splited_test_data['isDefault']))

In [None]:
train_data['reindex'] = np.random.permutation(train_data.shape[0])
cate_feat_df = train_data.sort_values(by=['purpose', 'reindex'])[['purpose', 'isDefault']]

In [None]:
target = 'isDefault'
numerial_cate_feat_lst = []
a = 0.3
p = cate_feat_df[target].mean()
y_sum = 0
y_cnt = 0
elem_pre = -1e9
for indx in range(cate_feat_df.shape[0]):
    elem = cate_feat_df.iloc[indx, 0]
    y = cate_feat_df.iloc[indx, 1]
    if elem != elem_pre:
        y_sum = 0
        y_cnt = 0
    val = (y_sum+a*p)/(y_cnt+a)
    y_sum += y
    y_cnt += 1
    elem_pre = elem
    numerial_cate_feat_lst.append(val)
len(numerial_cate_feat_lst)

In [None]:
splited_train_data[numerial_cate_features].head()

In [None]:
# s = 1
# for feat in numerial_cate_features:
#     splited_test_data[feat] = 0
#     for r in range(s):
#         splited_test_data[feat] += test_cate_feat_dfs[r][feat]/s

for feat in numerial_cate_features:
    splited_test_data[feat] = test_cate_feat_dfs[r][feat]
    
splited_test_data.reindex()

In [None]:
import toad
from toad.plot import bin_plot, badrate_plot

bin_num = 10
train_data['tmp_bin'] = pd.qcut(train_data['revolUtil'], bin_num, labels=np.arange(bin_num), duplicates='drop')
# train_df['tmp_bin'] = train_df['homeOwnership'].apply(lambda x: 11 if x > 10 else x)
bin_plot(train_data[['tmp_bin','isDefault']], x='tmp_bin', target='isDefault', iv=True, annotate_format='.3f')
del train_data['tmp_bin']

In [None]:
X_train[['employmentLength_bin', 'numerial_employmentLength_bin']].groupby(by='employmentLength_bin').agg(['mean', 'count'])

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set(style="white", palette="muted", color_codes=True)

feat = 'employmentLength_bin'
x_set = set(X_train[feat])

for e in x_set:
    x_name = feat+'='+str(e)
    x = X_train.loc[X_train[feat]==e, 'numerial_'+feat]
    x = x.rename(columns={'numerial_'+feat:x_name})
    ax1=sns.kdeplot(x)

x_name = 'guest received'
# x = X_train.loc[:, ['numerial_'+feat]]
# x = x.rename(columns={'numerial_'+feat:x_name})[x_name]
# ax1=sns.distplot(x)

In [None]:
import random

noised_numerial_cate_features = []
count = 0

for feat in cate_features:
    numerial_feat = "numerial_" + feat
    noised_feat = "noised_" + numerial_feat
    noised_numerial_cate_features.append(noised_feat)
    X_train[noised_feat] = X_train[numerial_feat]
    X_validation[noised_feat] = X_validation[numerial_feat]
    mean_key = (numerial_feat, 'mean')
    std_key = (numerial_feat, 'std')
    for k in numerial_cate_features_dict[numerial_feat][mean_key]:
        count += 1
        if count % 10000 == 0:
            print(count, feat, k)
        mu = numerial_cate_features_dict[numerial_feat][mean_key][k]
        sigma = numerial_cate_features_dict[numerial_feat][std_key][k]
        if np.isnan(sigma):
            sigma = 0.0001
        sz = X_validation.loc[X_validation[feat]==k, noised_feat].shape
        X_validation.loc[X_validation[feat]==k, noised_feat] = np.random.normal(mu, sigma, sz)

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set(style="white", palette="muted", color_codes=True)

feat = 'employmentLength_bin'
x_set = set(X_validation[feat])

# for e in x_set:
#     x_name = feat+'='+str(e)
#     x = X_validation.loc[X_validation[feat]==e, 'numerial_'+feat]
#     #x = x.rename(columns={'numerial_'+feat:x_name})
#     ax1=sns.distplot(x)

x_name = 'guest received'
x = X_validation.loc[:, ['noised_numerial_'+feat]]
x = x.rename(columns={'noised_numerial_'+feat:x_name})[x_name]
ax1=sns.distplot(x)

In [None]:
X_validation.