In [1]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import pandas as pd
import gc
import time
from contextlib import contextmanager
import os
from datetime import date
import random 

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

import sklearn
import xgboost as xgb
import lightgbm as lgb
from xgboost.sklearn import XGBClassifier 
from lightgbm import LGBMClassifier
from sklearn import metrics  
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import train_test_split  
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import Imputer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
pd.set_option('max_rows', 500)
pd.set_option('max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('expand_frame_repr', True)

In [3]:
def load_kaggle_csv(filepath, num_rows, root="D:/wangh/Kaggle/HomeCredit/Data"):
    csv_path = os.path.join(root, filepath)
    
    if not os.path.isfile(csv_path) :
            csv_path = os.path.join("E:/SparkExerciseData/Kaggle", filepath)
        
    return pd.read_csv(csv_path, nrows = num_rows)

In [4]:
@contextmanager
def timer(title):
    t0 = time.time()
    yield
    print("{} - done in {:.0f}s".format(title, time.time() - t0))

In [5]:
# One-hot encoding for categorical columns with get_dummies
def one_hot_encoder(df, nan_as_category = True):
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns= categorical_columns, dummy_na= nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns

In [6]:
# Preprocess application_train.csv and application_test.csv
def application_train_test(num_rows = None, nan_as_category = True):
    # Read data and merge
    df = load_kaggle_csv('application_train.csv', num_rows)
    test_df = load_kaggle_csv('application_test.csv', num_rows)
    print("Train samples: {}, test samples: {}".format(len(df), len(test_df)))
    df = df.append(test_df).reset_index()
    
    # Categorical features: Binary features and One-Hot encoding
    for bin_feature in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']:
        df[bin_feature], uniques = pd.factorize(df[bin_feature])
    df, cat_cols = one_hot_encoder(df, nan_as_category)
    # NaN values for DAYS_EMPLOYED: 365.243 -> nan
    df['DAYS_EMPLOYED'].replace(365243, np.nan, inplace= True)
    
    #AMT_INCOME_TOTAL 117000000 -> nan
    df['AMT_INCOME_TOTAL'].replace(117000000, np.nan, inplace= True)
    
    # Some simple new features (percentages)
    df['DAYS_EMPLOYED_PERC'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
    df['INCOME_CREDIT_PERC'] = df['AMT_INCOME_TOTAL'] / df['AMT_CREDIT']
    df['INCOME_PER_PERSON'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS']
    df['ANNUITY_INCOME_PERC'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
    del test_df
    gc.collect()
    return df

In [7]:
# Preprocess bureau.csv and bureau_balance.csv
def bureau_and_balance(num_rows = None, nan_as_category = True):
    bureau = load_kaggle_csv('bureau.csv', num_rows)
    bb = load_kaggle_csv('bureau_balance.csv', num_rows)
    bb, bb_cat = one_hot_encoder(bb, nan_as_category)
    bureau, bureau_cat = one_hot_encoder(bureau, nan_as_category)
    
    # Bureau balance: Perform aggregations and merge with bureau.csv
    bb_aggregations = {'MONTHS_BALANCE': ['min', 'max', 'size']}
    for col in bb_cat:
        bb_aggregations[col] = ['mean']
    bb_agg = bb.groupby('SK_ID_BUREAU').agg(bb_aggregations)
    bb_agg.columns = pd.Index([e[0] + "_" + e[1].upper() for e in bb_agg.columns.tolist()])
    bureau = bureau.join(bb_agg, how='left', on='SK_ID_BUREAU')
    bureau.drop(columns= 'SK_ID_BUREAU', inplace= True)
    del bb, bb_agg
    gc.collect()
    
    # Bureau and bureau_balance numeric features
    num_aggregations = {
        'DAYS_CREDIT': ['min', 'max', 'mean', 'var'],
        'CREDIT_DAY_OVERDUE': ['max', 'mean'],
        'DAYS_CREDIT_ENDDATE': ['min', 'max', 'mean'],
        'AMT_CREDIT_MAX_OVERDUE': ['mean'],
        'CNT_CREDIT_PROLONG': ['sum'],
        'AMT_CREDIT_SUM': ['max', 'mean', 'sum'],
        'AMT_CREDIT_SUM_DEBT': ['max', 'mean', 'sum'],
        'AMT_CREDIT_SUM_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'],
        'DAYS_CREDIT_UPDATE': ['min', 'max', 'mean'],
        'AMT_ANNUITY': ['max', 'mean'],
        'MONTHS_BALANCE_MIN': ['min'],
        'MONTHS_BALANCE_MAX': ['max'],
        'MONTHS_BALANCE_SIZE': ['mean', 'sum']
    }
    # Bureau and bureau_balance categorical features
    cat_aggregations = {}
    for cat in bureau_cat: cat_aggregations[cat] = ['mean']
    for cat in bb_cat: cat_aggregations[cat + "_MEAN"] = ['mean']
    
    bureau_agg = bureau.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
    bureau_agg.columns = pd.Index(['BURO_' + e[0] + "_" + e[1].upper() for e in bureau_agg.columns.tolist()])
    # Bureau: Active credits - using only numerical aggregations
    active = bureau[bureau['CREDIT_ACTIVE_Active'] == 1]
    active_agg = active.groupby('SK_ID_CURR').agg(num_aggregations)
    active_agg.columns = pd.Index(['ACT_' + e[0] + "_" + e[1].upper() for e in active_agg.columns.tolist()])
    bureau_agg = bureau_agg.join(active_agg, how='left', on='SK_ID_CURR')
    del active, active_agg
    gc.collect()
    # Bureau: Closed credits - using only numerical aggregations
    closed = bureau[bureau['CREDIT_ACTIVE_Closed'] == 1]
    closed_agg = closed.groupby('SK_ID_CURR').agg(num_aggregations)
    closed_agg.columns = pd.Index(['CLS_' + e[0] + "_" + e[1].upper() for e in closed_agg.columns.tolist()])
    bureau_agg = bureau_agg.join(closed_agg, how='left', on='SK_ID_CURR')
    del closed, closed_agg, bureau
    gc.collect()
    return bureau_agg

In [8]:
# Preprocess previous_applications.csv
def previous_applications(num_rows = None, nan_as_category = True):
    prev = load_kaggle_csv('previous_application.csv', num_rows)
    prev, cat_cols = one_hot_encoder(prev, nan_as_category= True)
    # Days 365.243 values -> nan
    prev['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace= True)
    prev['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace= True)
    prev['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan, inplace= True)
    prev['DAYS_LAST_DUE'].replace(365243, np.nan, inplace= True)
    prev['DAYS_TERMINATION'].replace(365243, np.nan, inplace= True)
    # Add feature: value ask / value received percentage
    prev['APP_CREDIT_PERC'] = prev['AMT_APPLICATION'] / prev['AMT_CREDIT']
    # Previous applications numeric features
    num_aggregations = {
        'AMT_ANNUITY': ['min', 'max', 'mean'],
        'AMT_APPLICATION': ['min', 'max', 'mean'],
        'AMT_CREDIT': ['min', 'max', 'mean'],
        'APP_CREDIT_PERC': ['min', 'max', 'mean', 'var'],
        'AMT_DOWN_PAYMENT': ['min', 'max', 'mean'],
        'AMT_GOODS_PRICE': ['min', 'max', 'mean'],
        'HOUR_APPR_PROCESS_START': ['min', 'max', 'mean'],
        'RATE_DOWN_PAYMENT': ['min', 'max', 'mean'],
        'DAYS_DECISION': ['min', 'max', 'mean'],
        'CNT_PAYMENT': ['mean', 'sum'],
    }
    # Previous applications categorical features
    cat_aggregations = {}
    for cat in cat_cols:
        cat_aggregations[cat] = ['mean']
    
    prev_agg = prev.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
    prev_agg.columns = pd.Index(['PREV_' + e[0] + "_" + e[1].upper() for e in prev_agg.columns.tolist()])
    # Previous Applications: Approved Applications - only numerical features
    approved = prev[prev['NAME_CONTRACT_STATUS_Approved'] == 1]
    approved_agg = approved.groupby('SK_ID_CURR').agg(num_aggregations)
    approved_agg.columns = pd.Index(['APR_' + e[0] + "_" + e[1].upper() for e in approved_agg.columns.tolist()])
    prev_agg = prev_agg.join(approved_agg, how='left', on='SK_ID_CURR')
    # Previous Applications: Refused Applications - only numerical features
    refused = prev[prev['NAME_CONTRACT_STATUS_Refused'] == 1]
    refused_agg = refused.groupby('SK_ID_CURR').agg(num_aggregations)
    refused_agg.columns = pd.Index(['REF_' + e[0] + "_" + e[1].upper() for e in refused_agg.columns.tolist()])
    prev_agg = prev_agg.join(refused_agg, how='left', on='SK_ID_CURR')
    del refused, refused_agg, approved, approved_agg, prev
    gc.collect()
    return prev_agg

In [9]:
# Preprocess POS_CASH_balance.csv
def pos_cash(num_rows = None, nan_as_category = True):
    pos = load_kaggle_csv('POS_CASH_balance.csv', num_rows)
    pos, cat_cols = one_hot_encoder(pos, nan_as_category= True)
    # Features
    aggregations = {
        'MONTHS_BALANCE': ['max', 'mean', 'size'],
        'SK_DPD': ['max', 'mean'],
        'SK_DPD_DEF': ['max', 'mean']
    }
    for cat in cat_cols:
        aggregations[cat] = ['mean']
    
    pos_agg = pos.groupby('SK_ID_CURR').agg(aggregations)
    pos_agg.columns = pd.Index(['POS_' + e[0] + "_" + e[1].upper() for e in pos_agg.columns.tolist()])
    # Count pos cash accounts
    pos_agg['POS_COUNT'] = pos.groupby('SK_ID_CURR').size()
    del pos
    gc.collect()
    return pos_agg

In [10]:
# Preprocess installments_payments.csv
def installments_payments(num_rows = None, nan_as_category = True):
    ins = load_kaggle_csv('installments_payments.csv', num_rows)
    ins, cat_cols = one_hot_encoder(ins, nan_as_category= True)
    # Percentage and difference paid in each installment (amount paid and installment value)
    ins['PAYMENT_PERC'] = ins['AMT_PAYMENT'] / ins['AMT_INSTALMENT']
    ins['PAYMENT_DIFF'] = ins['AMT_INSTALMENT'] - ins['AMT_PAYMENT']
    # Days past due and days before due (no negative values)
    ins['DPD'] = ins['DAYS_ENTRY_PAYMENT'] - ins['DAYS_INSTALMENT']
    ins['DBD'] = ins['DAYS_INSTALMENT'] - ins['DAYS_ENTRY_PAYMENT']
    ins['DPD'] = ins['DPD'].apply(lambda x: x if x > 0 else 0)
    ins['DBD'] = ins['DBD'].apply(lambda x: x if x > 0 else 0)
    # Features: Perform aggregations
    aggregations = {
        'NUM_INSTALMENT_VERSION': ['nunique'],
        'DPD': ['max', 'mean', 'sum'],
        'DBD': ['max', 'mean', 'sum'],
        'PAYMENT_PERC': ['max', 'mean', 'sum', 'var'],
        'PAYMENT_DIFF': ['max', 'mean', 'sum', 'var'],
        'AMT_INSTALMENT': ['max', 'mean', 'sum'],
        'AMT_PAYMENT': ['min', 'max', 'mean', 'sum'],
        'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'sum']
    }
    for cat in cat_cols:
        aggregations[cat] = ['mean']
    ins_agg = ins.groupby('SK_ID_CURR').agg(aggregations)
    ins_agg.columns = pd.Index(['INS_' + e[0] + "_" + e[1].upper() for e in ins_agg.columns.tolist()])
    # Count installments accounts
    ins_agg['INS_COUNT'] = ins.groupby('SK_ID_CURR').size()
    del ins
    gc.collect()
    return ins_agg

In [11]:
# Preprocess credit_card_balance.csv
def credit_card_balance(num_rows = None, nan_as_category = True):
    cc = load_kaggle_csv('credit_card_balance.csv', num_rows)
    cc, cat_cols = one_hot_encoder(cc, nan_as_category= True)
    # General aggregations
    cc.drop(columns = ['SK_ID_PREV'], inplace = True)
    cc_agg = cc.groupby('SK_ID_CURR').agg(['min', 'max', 'mean', 'sum', 'var'])
    cc_agg.columns = pd.Index(['CC_' + e[0] + "_" + e[1].upper() for e in cc_agg.columns.tolist()])
    # Count credit card lines
    cc_agg['CC_COUNT'] = cc.groupby('SK_ID_CURR').size()
    del cc
    gc.collect()
    return cc_agg

In [12]:
def print_AUC(alg, input_train, input_Y, input_test, input_test_Y) :
    predictions_train = alg.predict(input_train)
    predictions_prob_train = alg.predict_proba(input_train)[:,1]
    print("Train Accuracy Score:", metrics.accuracy_score(input_Y, predictions_train))
    print("Train AUC Score:", metrics.roc_auc_score(input_Y, predictions_prob_train))

    predictions = alg.predict(input_test)
    predictions_prob = alg.predict_proba(input_test)[:,1]

    print("Accuracy Score:", metrics.accuracy_score(input_test_Y, predictions))
    print("AUC Score:", metrics.roc_auc_score(input_test_Y, predictions_prob))

In [13]:
def cv_alg(alg, cv_df, num_folds=5, stratified = True):
    print("Starting CV {}. CV shape: {}".format(alg.__class__.__name__,cv_df.shape))
    
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=1001)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=1001)
        
    feats = [f for f in cv_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV']]
    
    cv_df_feats = cv_df[feats]
    cv_df_TARGET = cv_df['TARGET']
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(cv_df_feats, cv_df_TARGET)):
        train_x, train_y = cv_df_feats.iloc[train_idx], cv_df_TARGET.iloc[train_idx]
        valid_x, valid_y = cv_df_feats.iloc[valid_idx], cv_df_TARGET.iloc[valid_idx]   
        alg.fit(train_x, train_y)
        print_AUC(alg, train_x, train_y, valid_x,  valid_y)


In [14]:
from sklearn.model_selection import StratifiedShuffleSplit

def fit_alg(alg, train_df, val_percent=0.2):
    print("Starting fit {}. fit shape: {}".format(alg.__class__.__name__,train_df.shape))

    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV']]
    
    X = train_df[feats]
    y = train_df['TARGET']
    
    if val_percent == 0 :
        alg.fit(X, y)
    else :
        split = StratifiedShuffleSplit(n_splits=1, test_size= val_percent, random_state=1001)
        for train_index, test_index in split.split(X, y):
            X_train, X_test = X.loc[train_index], X.loc[test_index]
            y_train, y_test = y.loc[train_index], y.loc[test_index]

        alg.fit(X_train, y_train)

        print_AUC(alg, X_train, y_train, X_test,  y_test)

In [15]:
# LightGBM GBDT with KFold or Stratified KFold
# Parameters from Tilii kernel: https://www.kaggle.com/tilii7/olivier-lightgbm-parameters-by-bayesian-opt/code
def kfold_lightgbm(df, clf, num_folds, stratified = False):
    # Divide in training/validation and test data
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=1001)
    else:
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=1001)
    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV']]
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        # LightGBM parameters found by Bayesian optimization
       # clf = LGBMClassifier(
       #     nthread=4,
       #     n_estimators=10000,
       #     learning_rate=0.02,
       #     num_leaves=34,
       #     colsample_bytree=0.9497036,
       #     subsample=0.8715623,
       #     max_depth=8,
       #     reg_alpha=0.041545473,
       #     reg_lambda=0.0735294,
       #     min_split_gain=0.0222415,
       #     min_child_weight=39.3259775,
       #    silent=-1,
       #    verbose=-1, )

        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], 
            eval_metric= 'auc', verbose= 100, early_stopping_rounds= 100)

        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
    # Write submission file and plot feature importance
    test_df['TARGET'] = sub_preds
    test_df[['SK_ID_CURR', 'TARGET']].to_csv(submission_file_name, index= False)
    display_importances(feature_importance_df)
    return feature_importance_df

In [16]:
# Display/plot feature importance
def display_importances(feature_importance_df_):
    cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:40].index
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]
    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances-01.png')

# Data preparation

In [17]:
debug = False

In [18]:
#def prepare_data(debug = False):
num_rows = 10000 if debug else None
df = application_train_test(num_rows)
with timer("Process bureau and bureau_balance"):
    bureau = bureau_and_balance(num_rows)
    print("Bureau df shape:", bureau.shape)
    df = df.join(bureau, how='left', on='SK_ID_CURR')

with timer("Process previous_applications"):
    prev = previous_applications(num_rows)
    print("Previous applications df shape:", prev.shape)
    df = df.join(prev, how='left', on='SK_ID_CURR')

with timer("Process POS-CASH balance"):
    pos = pos_cash(num_rows)
    print("Pos-cash balance df shape:", pos.shape)
    df = df.join(pos, how='left', on='SK_ID_CURR')

with timer("Process installments payments"):
    ins = installments_payments(num_rows)
    print("Installments payments df shape:", ins.shape)
    df = df.join(ins, how='left', on='SK_ID_CURR')

with timer("Process credit card balance"):
    cc = credit_card_balance(num_rows)
    print("Credit card balance df shape:", cc.shape)
    df = df.join(cc, how='left', on='SK_ID_CURR')
    
del bureau
del prev
del pos
del ins  
del cc
gc.collect()
#with timer("Run LightGBM with kfold"):
    #feat_importance = kfold_lightgbm(df, num_folds= 5, stratified = False)



Train samples: 307511, test samples: 48744
Bureau df shape: (305811, 122)
Process bureau and bureau_balance - done in 27s
Previous applications df shape: (338857, 249)
Process previous_applications - done in 31s
Pos-cash balance df shape: (337252, 18)
Process POS-CASH balance - done in 16s
Installments payments df shape: (339587, 26)
Process installments payments - done in 35s
Credit card balance df shape: (103558, 141)
Process credit card balance - done in 23s


21

In [19]:
train_df = df[df['TARGET'].notnull()]
test_df = df[df['TARGET'].isnull()]

In [82]:
ignore_features_median =  ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV']
relevant_features_median = [col for col in df.columns if col not in ignore_features_imputer]
df_imp = df.replace([np.inf, -np.inf], np.nan)

for col in relevant_features_median :
    df_imp[col].fillna(df[col].median(), inplace=True)
train_df_imp = df_imp[df_imp['TARGET'].notnull()]
test_df_imp = df_imp[df_imp['TARGET'].isnull()]

In [None]:
split = StratifiedShuffleSplit(n_splits=1, test_size= 0.3, random_state=1001)
for train_index, val_index in split.split(X, y):
    X_train, X_val = train_df.loc[train_index], train_df.loc[val_index]
    X_train_imp, X_val_imp = train_df_imp.loc[train_index], train_df_imp.loc[val_index]

In [84]:
df.shape, X_train.shape, X_val.shape, X_train_imp.shape, X_val_imp.shape

((356255, 816), (307511, 816), (48744, 816), (307511, 816), (48744, 816))

In [87]:
train_df[relevant_features_imputer].iloc[5687:5689, 654:657].head()

Unnamed: 0,INS_PAYMENT_PERC_MAX,INS_PAYMENT_PERC_MEAN,INS_PAYMENT_PERC_SUM
5687,inf,inf,inf
5688,1.0,0.857143,12.0


In [88]:
train_df_imp[relevant_features_imputer].iloc[5687:5689, 654:657].head()

Unnamed: 0,INS_PAYMENT_PERC_MAX,INS_PAYMENT_PERC_MEAN,INS_PAYMENT_PERC_SUM
5687,1.0,1.0,24.638587
5688,1.0,0.857143,12.0


In [86]:
train_df_imp.head()

Unnamed: 0,index,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_YEAR,APARTMENTS_AVG,APARTMENTS_MEDI,APARTMENTS_MODE,BASEMENTAREA_AVG,BASEMENTAREA_MEDI,BASEMENTAREA_MODE,CNT_CHILDREN,CNT_FAM_MEMBERS,CODE_GENDER,COMMONAREA_AVG,COMMONAREA_MEDI,COMMONAREA_MODE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_ID_PUBLISH,DAYS_LAST_PHONE_CHANGE,DAYS_REGISTRATION,DEF_30_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,ELEVATORS_AVG,ELEVATORS_MEDI,ELEVATORS_MODE,ENTRANCES_AVG,ENTRANCES_MEDI,ENTRANCES_MODE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,FLAG_CONT_MOBILE,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_2,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_EMAIL,FLAG_EMP_PHONE,FLAG_MOBIL,FLAG_OWN_CAR,FLAG_OWN_REALTY,FLAG_PHONE,FLAG_WORK_PHONE,FLOORSMAX_AVG,FLOORSMAX_MEDI,FLOORSMAX_MODE,FLOORSMIN_AVG,FLOORSMIN_MEDI,FLOORSMIN_MODE,HOUR_APPR_PROCESS_START,LANDAREA_AVG,LANDAREA_MEDI,LANDAREA_MODE,LIVE_CITY_NOT_WORK_CITY,LIVE_REGION_NOT_WORK_REGION,LIVINGAPARTMENTS_AVG,LIVINGAPARTMENTS_MEDI,LIVINGAPARTMENTS_MODE,LIVINGAREA_AVG,LIVINGAREA_MEDI,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_AVG,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_AVG,NONLIVINGAREA_MEDI,NONLIVINGAREA_MODE,OBS_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,OWN_CAR_AGE,REGION_POPULATION_RELATIVE,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,SK_ID_CURR,TARGET,TOTALAREA_MODE,YEARS_BEGINEXPLUATATION_AVG,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_AVG,YEARS_BUILD_MEDI,YEARS_BUILD_MODE,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes,EMERGENCYSTATE_MODE_nan,FONDKAPREMONT_MODE_not specified,FONDKAPREMONT_MODE_org spec account,FONDKAPREMONT_MODE_reg oper account,FONDKAPREMONT_MODE_reg oper spec account,FONDKAPREMONT_MODE_nan,HOUSETYPE_MODE_block of flats,HOUSETYPE_MODE_specific housing,HOUSETYPE_MODE_terraced house,HOUSETYPE_MODE_nan,NAME_CONTRACT_TYPE_Cash loans,NAME_CONTRACT_TYPE_Revolving loans,NAME_CONTRACT_TYPE_nan,NAME_EDUCATION_TYPE_Academic degree,NAME_EDUCATION_TYPE_Higher education,NAME_EDUCATION_TYPE_Incomplete higher,NAME_EDUCATION_TYPE_Lower secondary,NAME_EDUCATION_TYPE_Secondary / secondary special,NAME_EDUCATION_TYPE_nan,NAME_FAMILY_STATUS_Civil marriage,NAME_FAMILY_STATUS_Married,NAME_FAMILY_STATUS_Separated,NAME_FAMILY_STATUS_Single / not married,NAME_FAMILY_STATUS_Unknown,NAME_FAMILY_STATUS_Widow,NAME_FAMILY_STATUS_nan,NAME_HOUSING_TYPE_Co-op apartment,NAME_HOUSING_TYPE_House / apartment,NAME_HOUSING_TYPE_Municipal apartment,NAME_HOUSING_TYPE_Office apartment,NAME_HOUSING_TYPE_Rented apartment,NAME_HOUSING_TYPE_With parents,NAME_HOUSING_TYPE_nan,NAME_INCOME_TYPE_Businessman,NAME_INCOME_TYPE_Commercial associate,NAME_INCOME_TYPE_Maternity leave,NAME_INCOME_TYPE_Pensioner,NAME_INCOME_TYPE_State servant,NAME_INCOME_TYPE_Student,NAME_INCOME_TYPE_Unemployed,NAME_INCOME_TYPE_Working,NAME_INCOME_TYPE_nan,NAME_TYPE_SUITE_Children,NAME_TYPE_SUITE_Family,NAME_TYPE_SUITE_Group of people,NAME_TYPE_SUITE_Other_A,NAME_TYPE_SUITE_Other_B,"NAME_TYPE_SUITE_Spouse, partner",NAME_TYPE_SUITE_Unaccompanied,NAME_TYPE_SUITE_nan,OCCUPATION_TYPE_Accountants,OCCUPATION_TYPE_Cleaning staff,OCCUPATION_TYPE_Cooking staff,OCCUPATION_TYPE_Core staff,OCCUPATION_TYPE_Drivers,OCCUPATION_TYPE_HR staff,OCCUPATION_TYPE_High skill tech staff,OCCUPATION_TYPE_IT staff,OCCUPATION_TYPE_Laborers,OCCUPATION_TYPE_Low-skill Laborers,OCCUPATION_TYPE_Managers,OCCUPATION_TYPE_Medicine staff,OCCUPATION_TYPE_Private service staff,OCCUPATION_TYPE_Realty agents,OCCUPATION_TYPE_Sales staff,OCCUPATION_TYPE_Secretaries,OCCUPATION_TYPE_Security staff,OCCUPATION_TYPE_Waiters/barmen staff,OCCUPATION_TYPE_nan,ORGANIZATION_TYPE_Advertising,ORGANIZATION_TYPE_Agriculture,ORGANIZATION_TYPE_Bank,ORGANIZATION_TYPE_Business Entity Type 1,ORGANIZATION_TYPE_Business Entity Type 2,ORGANIZATION_TYPE_Business Entity Type 3,ORGANIZATION_TYPE_Cleaning,ORGANIZATION_TYPE_Construction,ORGANIZATION_TYPE_Culture,ORGANIZATION_TYPE_Electricity,ORGANIZATION_TYPE_Emergency,ORGANIZATION_TYPE_Government,ORGANIZATION_TYPE_Hotel,ORGANIZATION_TYPE_Housing,ORGANIZATION_TYPE_Industry: type 1,ORGANIZATION_TYPE_Industry: type 10,ORGANIZATION_TYPE_Industry: type 11,ORGANIZATION_TYPE_Industry: type 12,ORGANIZATION_TYPE_Industry: type 13,ORGANIZATION_TYPE_Industry: type 2,ORGANIZATION_TYPE_Industry: type 3,ORGANIZATION_TYPE_Industry: type 4,ORGANIZATION_TYPE_Industry: type 5,ORGANIZATION_TYPE_Industry: type 6,ORGANIZATION_TYPE_Industry: type 7,ORGANIZATION_TYPE_Industry: type 8,ORGANIZATION_TYPE_Industry: type 9,ORGANIZATION_TYPE_Insurance,ORGANIZATION_TYPE_Kindergarten,ORGANIZATION_TYPE_Legal Services,ORGANIZATION_TYPE_Medicine,ORGANIZATION_TYPE_Military,ORGANIZATION_TYPE_Mobile,ORGANIZATION_TYPE_Other,ORGANIZATION_TYPE_Police,ORGANIZATION_TYPE_Postal,ORGANIZATION_TYPE_Realtor,ORGANIZATION_TYPE_Religion,ORGANIZATION_TYPE_Restaurant,ORGANIZATION_TYPE_School,ORGANIZATION_TYPE_Security,ORGANIZATION_TYPE_Security Ministries,ORGANIZATION_TYPE_Self-employed,ORGANIZATION_TYPE_Services,ORGANIZATION_TYPE_Telecom,ORGANIZATION_TYPE_Trade: type 1,ORGANIZATION_TYPE_Trade: type 2,ORGANIZATION_TYPE_Trade: type 3,ORGANIZATION_TYPE_Trade: type 4,ORGANIZATION_TYPE_Trade: type 5,ORGANIZATION_TYPE_Trade: type 6,ORGANIZATION_TYPE_Trade: type 7,ORGANIZATION_TYPE_Transport: type 1,ORGANIZATION_TYPE_Transport: type 2,ORGANIZATION_TYPE_Transport: type 3,ORGANIZATION_TYPE_Transport: type 4,ORGANIZATION_TYPE_University,ORGANIZATION_TYPE_XNA,ORGANIZATION_TYPE_nan,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,WALLSMATERIAL_MODE_nan,WEEKDAY_APPR_PROCESS_START_FRIDAY,WEEKDAY_APPR_PROCESS_START_MONDAY,...,PREV_PRODUCT_COMBINATION_POS mobile with interest_MEAN,PREV_PRODUCT_COMBINATION_POS mobile without interest_MEAN,PREV_PRODUCT_COMBINATION_POS other with interest_MEAN,PREV_PRODUCT_COMBINATION_POS others without interest_MEAN,PREV_PRODUCT_COMBINATION_nan_MEAN,APR_AMT_ANNUITY_MIN,APR_AMT_ANNUITY_MAX,APR_AMT_ANNUITY_MEAN,APR_AMT_APPLICATION_MIN,APR_AMT_APPLICATION_MAX,APR_AMT_APPLICATION_MEAN,APR_AMT_CREDIT_MIN,APR_AMT_CREDIT_MAX,APR_AMT_CREDIT_MEAN,APR_APP_CREDIT_PERC_MIN,APR_APP_CREDIT_PERC_MAX,APR_APP_CREDIT_PERC_MEAN,APR_APP_CREDIT_PERC_VAR,APR_AMT_DOWN_PAYMENT_MIN,APR_AMT_DOWN_PAYMENT_MAX,APR_AMT_DOWN_PAYMENT_MEAN,APR_AMT_GOODS_PRICE_MIN,APR_AMT_GOODS_PRICE_MAX,APR_AMT_GOODS_PRICE_MEAN,APR_HOUR_APPR_PROCESS_START_MIN,APR_HOUR_APPR_PROCESS_START_MAX,APR_HOUR_APPR_PROCESS_START_MEAN,APR_RATE_DOWN_PAYMENT_MIN,APR_RATE_DOWN_PAYMENT_MAX,APR_RATE_DOWN_PAYMENT_MEAN,APR_DAYS_DECISION_MIN,APR_DAYS_DECISION_MAX,APR_DAYS_DECISION_MEAN,APR_CNT_PAYMENT_MEAN,APR_CNT_PAYMENT_SUM,REF_AMT_ANNUITY_MIN,REF_AMT_ANNUITY_MAX,REF_AMT_ANNUITY_MEAN,REF_AMT_APPLICATION_MIN,REF_AMT_APPLICATION_MAX,REF_AMT_APPLICATION_MEAN,REF_AMT_CREDIT_MIN,REF_AMT_CREDIT_MAX,REF_AMT_CREDIT_MEAN,REF_APP_CREDIT_PERC_MIN,REF_APP_CREDIT_PERC_MAX,REF_APP_CREDIT_PERC_MEAN,REF_APP_CREDIT_PERC_VAR,REF_AMT_DOWN_PAYMENT_MIN,REF_AMT_DOWN_PAYMENT_MAX,REF_AMT_DOWN_PAYMENT_MEAN,REF_AMT_GOODS_PRICE_MIN,REF_AMT_GOODS_PRICE_MAX,REF_AMT_GOODS_PRICE_MEAN,REF_HOUR_APPR_PROCESS_START_MIN,REF_HOUR_APPR_PROCESS_START_MAX,REF_HOUR_APPR_PROCESS_START_MEAN,REF_RATE_DOWN_PAYMENT_MIN,REF_RATE_DOWN_PAYMENT_MAX,REF_RATE_DOWN_PAYMENT_MEAN,REF_DAYS_DECISION_MIN,REF_DAYS_DECISION_MAX,REF_DAYS_DECISION_MEAN,REF_CNT_PAYMENT_MEAN,REF_CNT_PAYMENT_SUM,POS_MONTHS_BALANCE_MAX,POS_MONTHS_BALANCE_MEAN,POS_MONTHS_BALANCE_SIZE,POS_SK_DPD_MAX,POS_SK_DPD_MEAN,POS_SK_DPD_DEF_MAX,POS_SK_DPD_DEF_MEAN,POS_NAME_CONTRACT_STATUS_Active_MEAN,POS_NAME_CONTRACT_STATUS_Amortized debt_MEAN,POS_NAME_CONTRACT_STATUS_Approved_MEAN,POS_NAME_CONTRACT_STATUS_Canceled_MEAN,POS_NAME_CONTRACT_STATUS_Completed_MEAN,POS_NAME_CONTRACT_STATUS_Demand_MEAN,POS_NAME_CONTRACT_STATUS_Returned to the store_MEAN,POS_NAME_CONTRACT_STATUS_Signed_MEAN,POS_NAME_CONTRACT_STATUS_XNA_MEAN,POS_NAME_CONTRACT_STATUS_nan_MEAN,POS_COUNT,INS_NUM_INSTALMENT_VERSION_NUNIQUE,INS_DPD_MAX,INS_DPD_MEAN,INS_DPD_SUM,INS_DBD_MAX,INS_DBD_MEAN,INS_DBD_SUM,INS_PAYMENT_PERC_MAX,INS_PAYMENT_PERC_MEAN,INS_PAYMENT_PERC_SUM,INS_PAYMENT_PERC_VAR,INS_PAYMENT_DIFF_MAX,INS_PAYMENT_DIFF_MEAN,INS_PAYMENT_DIFF_SUM,INS_PAYMENT_DIFF_VAR,INS_AMT_INSTALMENT_MAX,INS_AMT_INSTALMENT_MEAN,INS_AMT_INSTALMENT_SUM,INS_AMT_PAYMENT_MIN,INS_AMT_PAYMENT_MAX,INS_AMT_PAYMENT_MEAN,INS_AMT_PAYMENT_SUM,INS_DAYS_ENTRY_PAYMENT_MAX,INS_DAYS_ENTRY_PAYMENT_MEAN,INS_DAYS_ENTRY_PAYMENT_SUM,INS_COUNT,CC_MONTHS_BALANCE_MIN,CC_MONTHS_BALANCE_MAX,CC_MONTHS_BALANCE_MEAN,CC_MONTHS_BALANCE_SUM,CC_MONTHS_BALANCE_VAR,CC_AMT_BALANCE_MIN,CC_AMT_BALANCE_MAX,CC_AMT_BALANCE_MEAN,CC_AMT_BALANCE_SUM,CC_AMT_BALANCE_VAR,CC_AMT_CREDIT_LIMIT_ACTUAL_MIN,CC_AMT_CREDIT_LIMIT_ACTUAL_MAX,CC_AMT_CREDIT_LIMIT_ACTUAL_MEAN,CC_AMT_CREDIT_LIMIT_ACTUAL_SUM,CC_AMT_CREDIT_LIMIT_ACTUAL_VAR,CC_AMT_DRAWINGS_ATM_CURRENT_MIN,CC_AMT_DRAWINGS_ATM_CURRENT_MAX,CC_AMT_DRAWINGS_ATM_CURRENT_MEAN,CC_AMT_DRAWINGS_ATM_CURRENT_SUM,CC_AMT_DRAWINGS_ATM_CURRENT_VAR,CC_AMT_DRAWINGS_CURRENT_MIN,CC_AMT_DRAWINGS_CURRENT_MAX,CC_AMT_DRAWINGS_CURRENT_MEAN,CC_AMT_DRAWINGS_CURRENT_SUM,CC_AMT_DRAWINGS_CURRENT_VAR,CC_AMT_DRAWINGS_OTHER_CURRENT_MIN,CC_AMT_DRAWINGS_OTHER_CURRENT_MAX,CC_AMT_DRAWINGS_OTHER_CURRENT_MEAN,CC_AMT_DRAWINGS_OTHER_CURRENT_SUM,CC_AMT_DRAWINGS_OTHER_CURRENT_VAR,CC_AMT_DRAWINGS_POS_CURRENT_MIN,CC_AMT_DRAWINGS_POS_CURRENT_MAX,CC_AMT_DRAWINGS_POS_CURRENT_MEAN,CC_AMT_DRAWINGS_POS_CURRENT_SUM,CC_AMT_DRAWINGS_POS_CURRENT_VAR,CC_AMT_INST_MIN_REGULARITY_MIN,CC_AMT_INST_MIN_REGULARITY_MAX,CC_AMT_INST_MIN_REGULARITY_MEAN,CC_AMT_INST_MIN_REGULARITY_SUM,CC_AMT_INST_MIN_REGULARITY_VAR,CC_AMT_PAYMENT_CURRENT_MIN,CC_AMT_PAYMENT_CURRENT_MAX,CC_AMT_PAYMENT_CURRENT_MEAN,CC_AMT_PAYMENT_CURRENT_SUM,CC_AMT_PAYMENT_CURRENT_VAR,CC_AMT_PAYMENT_TOTAL_CURRENT_MIN,CC_AMT_PAYMENT_TOTAL_CURRENT_MAX,CC_AMT_PAYMENT_TOTAL_CURRENT_MEAN,CC_AMT_PAYMENT_TOTAL_CURRENT_SUM,CC_AMT_PAYMENT_TOTAL_CURRENT_VAR,CC_AMT_RECEIVABLE_PRINCIPAL_MIN,CC_AMT_RECEIVABLE_PRINCIPAL_MAX,CC_AMT_RECEIVABLE_PRINCIPAL_MEAN,CC_AMT_RECEIVABLE_PRINCIPAL_SUM,CC_AMT_RECEIVABLE_PRINCIPAL_VAR,CC_AMT_RECIVABLE_MIN,CC_AMT_RECIVABLE_MAX,CC_AMT_RECIVABLE_MEAN,CC_AMT_RECIVABLE_SUM,CC_AMT_RECIVABLE_VAR,CC_AMT_TOTAL_RECEIVABLE_MIN,CC_AMT_TOTAL_RECEIVABLE_MAX,CC_AMT_TOTAL_RECEIVABLE_MEAN,CC_AMT_TOTAL_RECEIVABLE_SUM,CC_AMT_TOTAL_RECEIVABLE_VAR,CC_CNT_DRAWINGS_ATM_CURRENT_MIN,CC_CNT_DRAWINGS_ATM_CURRENT_MAX,CC_CNT_DRAWINGS_ATM_CURRENT_MEAN,CC_CNT_DRAWINGS_ATM_CURRENT_SUM,CC_CNT_DRAWINGS_ATM_CURRENT_VAR,CC_CNT_DRAWINGS_CURRENT_MIN,CC_CNT_DRAWINGS_CURRENT_MAX,CC_CNT_DRAWINGS_CURRENT_MEAN,CC_CNT_DRAWINGS_CURRENT_SUM,CC_CNT_DRAWINGS_CURRENT_VAR,CC_CNT_DRAWINGS_OTHER_CURRENT_MIN,CC_CNT_DRAWINGS_OTHER_CURRENT_MAX,CC_CNT_DRAWINGS_OTHER_CURRENT_MEAN,CC_CNT_DRAWINGS_OTHER_CURRENT_SUM,CC_CNT_DRAWINGS_OTHER_CURRENT_VAR,CC_CNT_DRAWINGS_POS_CURRENT_MIN,CC_CNT_DRAWINGS_POS_CURRENT_MAX,CC_CNT_DRAWINGS_POS_CURRENT_MEAN,CC_CNT_DRAWINGS_POS_CURRENT_SUM,CC_CNT_DRAWINGS_POS_CURRENT_VAR,CC_CNT_INSTALMENT_MATURE_CUM_MIN,CC_CNT_INSTALMENT_MATURE_CUM_MAX,CC_CNT_INSTALMENT_MATURE_CUM_MEAN,CC_CNT_INSTALMENT_MATURE_CUM_SUM,CC_CNT_INSTALMENT_MATURE_CUM_VAR,CC_SK_DPD_MIN,CC_SK_DPD_MAX,CC_SK_DPD_MEAN,CC_SK_DPD_SUM,CC_SK_DPD_VAR,CC_SK_DPD_DEF_MIN,CC_SK_DPD_DEF_MAX,CC_SK_DPD_DEF_MEAN,CC_SK_DPD_DEF_SUM,CC_SK_DPD_DEF_VAR,CC_NAME_CONTRACT_STATUS_Active_MIN,CC_NAME_CONTRACT_STATUS_Active_MAX,CC_NAME_CONTRACT_STATUS_Active_MEAN,CC_NAME_CONTRACT_STATUS_Active_SUM,CC_NAME_CONTRACT_STATUS_Active_VAR,CC_NAME_CONTRACT_STATUS_Approved_MIN,CC_NAME_CONTRACT_STATUS_Approved_MAX,CC_NAME_CONTRACT_STATUS_Approved_MEAN,CC_NAME_CONTRACT_STATUS_Approved_SUM,CC_NAME_CONTRACT_STATUS_Approved_VAR,CC_NAME_CONTRACT_STATUS_Completed_MIN,CC_NAME_CONTRACT_STATUS_Completed_MAX,CC_NAME_CONTRACT_STATUS_Completed_MEAN,CC_NAME_CONTRACT_STATUS_Completed_SUM,CC_NAME_CONTRACT_STATUS_Completed_VAR,CC_NAME_CONTRACT_STATUS_Demand_MIN,CC_NAME_CONTRACT_STATUS_Demand_MAX,CC_NAME_CONTRACT_STATUS_Demand_MEAN,CC_NAME_CONTRACT_STATUS_Demand_SUM,CC_NAME_CONTRACT_STATUS_Demand_VAR,CC_NAME_CONTRACT_STATUS_Refused_MIN,CC_NAME_CONTRACT_STATUS_Refused_MAX,CC_NAME_CONTRACT_STATUS_Refused_MEAN,CC_NAME_CONTRACT_STATUS_Refused_SUM,CC_NAME_CONTRACT_STATUS_Refused_VAR,CC_NAME_CONTRACT_STATUS_Sent proposal_MIN,CC_NAME_CONTRACT_STATUS_Sent proposal_MAX,CC_NAME_CONTRACT_STATUS_Sent proposal_MEAN,CC_NAME_CONTRACT_STATUS_Sent proposal_SUM,CC_NAME_CONTRACT_STATUS_Sent proposal_VAR,CC_NAME_CONTRACT_STATUS_Signed_MIN,CC_NAME_CONTRACT_STATUS_Signed_MAX,CC_NAME_CONTRACT_STATUS_Signed_MEAN,CC_NAME_CONTRACT_STATUS_Signed_SUM,CC_NAME_CONTRACT_STATUS_Signed_VAR,CC_NAME_CONTRACT_STATUS_nan_MIN,CC_NAME_CONTRACT_STATUS_nan_MAX,CC_NAME_CONTRACT_STATUS_nan_MEAN,CC_NAME_CONTRACT_STATUS_nan_SUM,CC_NAME_CONTRACT_STATUS_nan_VAR,CC_COUNT
0,0,24700.5,406597.5,351000.0,202500.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0247,0.025,0.0252,0.0369,0.0369,0.0383,0,1.0,0,0.0143,0.0144,0.0144,-9461,-637.0,-2120,-1134.0,-3648.0,2.0,2.0,0.0,0.0,0.0,0.069,0.069,0.069,0.083037,0.262949,0.139376,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,1,0,0.0833,0.0833,0.0833,0.125,0.125,0.125,10,0.0369,0.0375,0.0377,0,0,0.0202,0.0205,0.022,0.019,0.0193,0.0198,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,9.0,0.018801,2,2,0,0,0,0,100002,1.0,0.0149,0.9722,0.9722,0.9722,0.6192,0.6243,0.6341,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,...,0.0,0.0,1.0,0.0,0.0,9251.775,9251.775,9251.775,179055.0,179055.0,179055.0,179055.0,179055.0,179055.0,1.0,1.0,1.0,0.010021,0.0,0.0,0.0,179055.0,179055.0,179055.0,9.0,9.0,9.0,0.0,0.0,0.0,-606.0,-606.0,-606.0,24.0,24.0,11240.865,17581.5675,14743.06875,90000.0,220500.0,157500.0,108643.5,225000.0,180000.0,0.924908,1.0,0.96049,0.003205,0.0,0.0,0.0,134957.0475,225000.0,191250.0,11.0,13.0,12.333333,0.0,0.0,0.0,-642.0,-403.0,-551.0,14.0,24.0,-1.0,-10.0,19.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.0,2.0,0.0,0.0,0.0,31.0,20.421053,388.0,1.0,1.0,19.0,0.0,0.0,0.0,0.0,0.0,53093.745,11559.247105,219625.695,9251.775,53093.745,11559.247105,219625.695,-49.0,-315.421053,-5993.0,19.0,-22.0,-1.0,-12.0,-253.0,42.166667,0.0,96107.175,24997.602995,701364.0825,753542600.0,45000.0,180000.0,149000.0,3825000.0,477396800.0,0.0,90000.0,4500.0,60750.0,258971100.0,0.0,67500.0,3329.348636,143100.0,191823200.0,0.0,0.0,0.0,0.0,0.0,0.0,7408.53,365.790547,0.0,2649883.0,0.0,4760.2575,1623.508258,37493.9325,2873010.0,0.0,65250.0,9856.811065,140431.2075,160963100.0,0.0,23436.0,3986.601378,118969.8525,35088930.0,0.0,90000.0,23914.47654,671808.1725,690497800.0,0.0,95477.3325,24765.001041,691906.0275,752354400.0,0.0,95513.49,24770.597235,692068.7475,752550700.0,0.0,3.0,0.260137,3.0,0.529088,0.0,3.0,0.205128,8.0,0.4674,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.052083,0.0,0.0625,0.0,7.0,3.8,48.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.0
1,1,35698.5,1293502.5,1129500.0,270000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0959,0.0968,0.0924,0.0529,0.0529,0.0538,0,2.0,1,0.0605,0.0608,0.0497,-16765,-1188.0,-291,-828.0,-1186.0,0.0,0.0,0.08,0.08,0.0806,0.0345,0.0345,0.0345,0.311267,0.622246,0.533482,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,1,1,0,0.2917,0.2917,0.2917,0.3333,0.3333,0.3333,11,0.013,0.0132,0.0128,0,0,0.0773,0.0787,0.079,0.0549,0.0558,0.0554,0.0039,0.0039,0.0,0.0098,0.01,0.0,1.0,1.0,9.0,0.003541,1,1,0,0,0,0,100003,0.0,0.0714,0.9851,0.9851,0.9851,0.796,0.7987,0.804,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,6737.31,98356.995,56553.99,68809.5,900000.0,435436.5,68053.5,1035882.0,484191.0,0.868825,1.011109,0.949329,0.005324,0.0,6885.0,3442.5,68809.5,900000.0,435436.5,12.0,17.0,14.666667,0.0,0.100061,0.05003,-2341.0,-746.0,-1305.0,10.0,30.0,11240.865,17581.5675,14743.06875,90000.0,220500.0,157500.0,108643.5,225000.0,180000.0,0.924908,1.0,0.96049,0.003205,0.0,0.0,0.0,134957.0475,225000.0,191250.0,11.0,13.0,12.333333,0.0,0.0,0.0,-642.0,-403.0,-551.0,14.0,24.0,-18.0,-43.785714,28.0,0.0,0.0,0.0,0.0,0.928571,0.0,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,28.0,2.0,0.0,0.0,0.0,14.0,7.16,179.0,1.0,1.0,25.0,0.0,0.0,0.0,0.0,0.0,560835.36,64754.586,1618864.65,6662.97,560835.36,64754.586,1618864.65,-544.0,-1385.32,-34633.0,25.0,-22.0,-1.0,-12.0,-253.0,42.166667,0.0,96107.175,24997.602995,701364.0825,753542600.0,45000.0,180000.0,149000.0,3825000.0,477396800.0,0.0,90000.0,4500.0,60750.0,258971100.0,0.0,67500.0,3329.348636,143100.0,191823200.0,0.0,0.0,0.0,0.0,0.0,0.0,7408.53,365.790547,0.0,2649883.0,0.0,4760.2575,1623.508258,37493.9325,2873010.0,0.0,65250.0,9856.811065,140431.2075,160963100.0,0.0,23436.0,3986.601378,118969.8525,35088930.0,0.0,90000.0,23914.47654,671808.1725,690497800.0,0.0,95477.3325,24765.001041,691906.0275,752354400.0,0.0,95513.49,24770.597235,692068.7475,752550700.0,0.0,3.0,0.260137,3.0,0.529088,0.0,3.0,0.205128,8.0,0.4674,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.052083,0.0,0.0625,0.0,7.0,3.8,48.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.0
2,2,6750.0,135000.0,135000.0,67500.0,0.0,0.0,0.0,0.0,0.0,0.0,0.088,0.0874,0.084,0.0765,0.0761,0.0749,0,1.0,0,0.0213,0.021,0.0192,-19046,-225.0,-2531,-815.0,-4260.0,0.0,0.0,0.0,0.0,0.0,0.1379,0.1379,0.1379,0.506155,0.555912,0.729567,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,1,0.1667,0.1667,0.1667,0.2083,0.2083,0.2083,9,0.0482,0.0487,0.0459,0,0,0.0756,0.077,0.0771,0.0749,0.0754,0.0733,0.0,0.0,0.0,0.0036,0.0031,0.0011,0.0,0.0,26.0,0.010032,2,2,0,0,0,0,100004,0.0,0.069,0.9816,0.9816,0.9816,0.7552,0.7585,0.7648,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,...,0.0,1.0,0.0,0.0,0.0,5357.25,5357.25,5357.25,24282.0,24282.0,24282.0,20106.0,20106.0,20106.0,1.207699,1.207699,1.207699,0.010021,4860.0,4860.0,4860.0,24282.0,24282.0,24282.0,5.0,5.0,5.0,0.212008,0.212008,0.212008,-815.0,-815.0,-815.0,4.0,4.0,11240.865,17581.5675,14743.06875,90000.0,220500.0,157500.0,108643.5,225000.0,180000.0,0.924908,1.0,0.96049,0.003205,0.0,0.0,0.0,134957.0475,225000.0,191250.0,11.0,13.0,12.333333,0.0,0.0,0.0,-642.0,-403.0,-551.0,14.0,24.0,-24.0,-25.5,4.0,0.0,0.0,0.0,0.0,0.75,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,4.0,2.0,0.0,0.0,0.0,11.0,7.666667,23.0,1.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,10573.965,7096.155,21288.465,5357.25,10573.965,7096.155,21288.465,-727.0,-761.666667,-2285.0,3.0,-22.0,-1.0,-12.0,-253.0,42.166667,0.0,96107.175,24997.602995,701364.0825,753542600.0,45000.0,180000.0,149000.0,3825000.0,477396800.0,0.0,90000.0,4500.0,60750.0,258971100.0,0.0,67500.0,3329.348636,143100.0,191823200.0,0.0,0.0,0.0,0.0,0.0,0.0,7408.53,365.790547,0.0,2649883.0,0.0,4760.2575,1623.508258,37493.9325,2873010.0,0.0,65250.0,9856.811065,140431.2075,160963100.0,0.0,23436.0,3986.601378,118969.8525,35088930.0,0.0,90000.0,23914.47654,671808.1725,690497800.0,0.0,95477.3325,24765.001041,691906.0275,752354400.0,0.0,95513.49,24770.597235,692068.7475,752550700.0,0.0,3.0,0.260137,3.0,0.529088,0.0,3.0,0.205128,8.0,0.4674,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.052083,0.0,0.0625,0.0,7.0,3.8,48.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.0
3,3,29686.5,312682.5,297000.0,135000.0,0.0,0.0,0.0,0.0,0.0,1.0,0.088,0.0874,0.084,0.0765,0.0761,0.0749,0,2.0,1,0.0213,0.021,0.0192,-19005,-3039.0,-2437,-617.0,-9833.0,0.0,0.0,0.0,0.0,0.0,0.1379,0.1379,0.1379,0.506155,0.650442,0.533482,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0.1667,0.1667,0.1667,0.2083,0.2083,0.2083,17,0.0482,0.0487,0.0459,0,0,0.0756,0.077,0.0771,0.0749,0.0754,0.0733,0.0,0.0,0.0,0.0036,0.0031,0.0011,2.0,2.0,9.0,0.008019,2,2,0,0,0,0,100006,0.0,0.069,0.9816,0.9816,0.9816,0.7552,0.7585,0.7648,0,0,1,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,2482.92,39954.51,21842.19,26912.34,675000.0,352265.868,24219.0,675000.0,343728.9,0.943934,1.250017,1.061032,0.014849,2693.34,66987.0,34840.17,26912.34,675000.0,352265.868,12.0,15.0,14.4,0.108994,0.21783,0.163412,-617.0,-181.0,-345.6,18.0,90.0,32696.1,32696.1,32696.1,688500.0,688500.0,688500.0,906615.0,906615.0,906615.0,0.759418,0.759418,0.759418,0.003205,0.0,0.0,0.0,688500.0,688500.0,688500.0,15.0,15.0,15.0,0.0,0.0,0.0,-181.0,-181.0,-181.0,48.0,48.0,-1.0,-9.619048,21.0,0.0,0.0,0.0,0.0,0.857143,0.0,0.0,0.0,0.095238,0.0,0.047619,0.0,0.0,0.0,21.0,2.0,0.0,0.0,0.0,77.0,19.375,310.0,1.0,1.0,16.0,0.0,0.0,0.0,0.0,0.0,691786.89,62947.088438,1007153.415,2482.92,691786.89,62947.088438,1007153.415,-12.0,-271.625,-4346.0,16.0,-6.0,-1.0,-3.5,-21.0,3.5,0.0,0.0,0.0,0.0,0.0,270000.0,270000.0,270000.0,1620000.0,0.0,0.0,90000.0,4500.0,0.0,258971100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7408.53,365.790547,0.0,2649883.0,0.0,0.0,0.0,0.0,0.0,0.0,65250.0,9856.811065,0.0,160963100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.260137,0.0,0.529088,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.052083,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
4,4,21865.5,513000.0,513000.0,121500.0,0.0,0.0,0.0,0.0,0.0,0.0,0.088,0.0874,0.084,0.0765,0.0761,0.0749,0,1.0,0,0.0213,0.021,0.0192,-19932,-3038.0,-3458,-1106.0,-4311.0,0.0,0.0,0.0,0.0,0.0,0.1379,0.1379,0.1379,0.506155,0.322738,0.533482,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0.1667,0.1667,0.1667,0.2083,0.2083,0.2083,11,0.0482,0.0487,0.0459,1,0,0.0756,0.077,0.0771,0.0749,0.0754,0.0733,0.0,0.0,0.0,0.0036,0.0031,0.0011,0.0,0.0,9.0,0.028663,2,2,0,1,0,0,100007,0.0,0.069,0.9816,0.9816,0.9816,0.7552,0.7585,0.7648,0,0,1,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,...,0.166667,0.0,0.0,0.0,0.0,1834.29,22678.785,12278.805,17176.5,247500.0,150530.25,14616.0,284400.0,166638.75,0.791139,1.175185,0.96965,0.016456,3105.0,3676.5,3390.75,17176.5,247500.0,150530.25,8.0,15.0,12.333333,0.100143,0.21889,0.159516,-2357.0,-374.0,-1222.833333,20.666667,124.0,11240.865,17581.5675,14743.06875,90000.0,220500.0,157500.0,108643.5,225000.0,180000.0,0.924908,1.0,0.96049,0.003205,0.0,0.0,0.0,134957.0475,225000.0,191250.0,11.0,13.0,12.333333,0.0,0.0,0.0,-642.0,-403.0,-551.0,14.0,24.0,-1.0,-33.636364,66.0,0.0,0.0,0.0,0.0,0.939394,0.0,0.0,0.0,0.045455,0.0,0.0,0.015152,0.0,0.0,66.0,2.0,12.0,0.954545,63.0,31.0,4.590909,303.0,1.0,0.954545,63.0,0.043995,22655.655,452.384318,29857.365,8084830.0,22678.785,12666.444545,835985.34,0.18,22678.785,12214.060227,806127.975,-14.0,-1032.242424,-68128.0,66.0,-22.0,-1.0,-12.0,-253.0,42.166667,0.0,96107.175,24997.602995,701364.0825,753542600.0,45000.0,180000.0,149000.0,3825000.0,477396800.0,0.0,90000.0,4500.0,60750.0,258971100.0,0.0,67500.0,3329.348636,143100.0,191823200.0,0.0,0.0,0.0,0.0,0.0,0.0,7408.53,365.790547,0.0,2649883.0,0.0,4760.2575,1623.508258,37493.9325,2873010.0,0.0,65250.0,9856.811065,140431.2075,160963100.0,0.0,23436.0,3986.601378,118969.8525,35088930.0,0.0,90000.0,23914.47654,671808.1725,690497800.0,0.0,95477.3325,24765.001041,691906.0275,752354400.0,0.0,95513.49,24770.597235,692068.7475,752550700.0,0.0,3.0,0.260137,3.0,0.529088,0.0,3.0,0.205128,8.0,0.4674,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.052083,0.0,0.0625,0.0,7.0,3.8,48.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.0


In [89]:
#train_df_imp.isnull().values

In [85]:
gc.collect()

383

# Final Models

In [24]:
lgb_clf = lgb.LGBMClassifier(
    boosting_type='gbdt', num_leaves=30, max_depth=8,
    learning_rate=0.01, n_estimators=5000,  
    objective='binary', min_split_gain=0.02, min_child_weight=1, 
    min_child_samples=300, subsample=0.85, subsample_freq=0, colsample_bytree=0.95,
    reg_alpha=0.1, reg_lambda=0.1, random_state=27, n_jobs=-1, silent=False
)

with timer("fit lgb.LGBMClassifier"):
    fit_alg(lgb_clf, X_train)
    fit_alg(lgb_clf, X_train,0)

Starting fit LGBMClassifier. fit shape: (307511, 816)


  if diff:


Train Accuracy Score: 0.9293925400799974
Train AUC Score: 0.9203207502573733


  if diff:


Accuracy Score: 0.920133977204364
AUC Score: 0.7830139463278216
Starting fit LGBMClassifier. fit shape: (307511, 816)
fit lgb.LGBMClassifier - done in 2424s


In [30]:
xgb_clf = XGBClassifier(
        learning_rate =0.05,
        n_estimators=2000,
        max_depth=5,
        min_child_weight=5,
        gamma=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.1,
        objective= 'binary:logistic',
        nthread=8,
        silent=False,
        scale_pos_weight=1,
        seed=27)

with timer("fit XGBClassifier"):
    fit_alg(xgb_clf, X_train)
    fit_alg(xgb_clf, X_train, 0)

Starting fit XGBClassifier. fit shape: (307511, 816)


  if diff:


Train Accuracy Score: 0.9361687424799193
Train AUC Score: 0.9445815886488865


  if diff:


Accuracy Score: 0.9195811586426679
AUC Score: 0.7814832199231687
Starting fit XGBClassifier. fit shape: (307511, 816)
fit XGBClassifier - done in 12205s


In [None]:
log_clf = LogisticRegression(penalty='l1', C=1, random_state=1001)

with timer("fit LogisticRegression"):
    cv_alg(log_clf, X_train_imp)
    fit_alg(log_clf, X_train_imp, 0)

Starting CV LogisticRegression. CV shape: (307511, 816)


In [None]:
rnd_clf = RandomForestClassifier(n_estimators=5000, 
                        max_leaf_nodes=30, 
                        max_features=0.1,
                        max_depth=5,
                        bootstrap=True, n_jobs=-1, random_state=27)
with timer("fit RandomForestClassifier"):
    fit_alg(rnd_clf, X_train_imp)
    fit_alg(rnd_clf, X_train_imp, 0)

In [25]:
gc.collect()

0

In [None]:
gbm_level2 = xgb.XGBClassifier(
 learning_rate = 0.01,
 n_estimators= 1000,
 max_depth=2,
 min_child_weight=1,
 gamma=0.1,  
 subsample=0.6, 
 colsample_bytree=1,
 reg_alpha = 0.2,
 objective= 'binary:logistic',
 nthread= -1,
 silent=False,
 scale_pos_weight=1)

seclvl_train_all = np.concatenate(( np.array([log_clf.predict_proba(strat_trainX_val_imp)[:,1]]).T,
                                np.array([rnd_clf.predict_proba(strat_trainX_val_imp)[:,1]]).T,
                                np.array([xgb_clf.predict_proba(strat_trainX_val)[:,1]]).T,
                                np.array([lgb_clf.predict_proba(strat_trainX_val)[:,1]]).T
                            ), axis=1)

cv_alg(log_clf, test_df)

# Generate the final result

In [31]:
feats = [f for f in test_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV']]

for clf in [lgb_clf, xgb_clf,log_clf, rnd_clf]: #[lgb_clf]: [lgb_clf, xgb_clf]:
    result_df=pd.DataFrame()
    result_df['SK_ID_CURR'] = test_df['SK_ID_CURR']
    result_df['TARGET'] = clf.predict_proba(test_df[feats])[:, 1]

    submissionFilePath = "D:/wangh/Kaggle/HomeCredit/result_submission7_{}.csv".format(clf.__class__.__name__)
    result_df.to_csv(submissionFilePath, index=False, float_format='%.6f')
    
    del result_df
    gc.collect()


# Tune Models

In [32]:
feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV']]

df_feats = train_df[feats]
df_TARGET = train_df['TARGET']

In [None]:
lgb_train = lgb.Dataset(data=df_feats, label=df_TARGET)
params = {'task': 'train', 'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'auc', 
          'learning_rate': 0.01, 'num_leaves': 48, 'num_iteration': 5000, 'verbose': 10 ,
          'colsample_bytree':.8, 'subsample':.9, 'max_depth':7, 'reg_alpha':.1, 'reg_lambda':.1, 
          'min_split_gain':.01, 'min_child_weight':1}
lgb_result=lgb.cv(params, lgb_train, stratified=True, metrics={'auc'}, num_boost_round=5000,
                      nfold=5, early_stopping_rounds=150, verbose_eval=20)
lgb_result



[20]	cv_agg's auc: 0.738788 + 0.0032822
[40]	cv_agg's auc: 0.739973 + 0.00340554


In [None]:
#grid search
lgb_param_test1 = {
    'max_depth':[8], #range(2,10,2),
    'num_leaves':[30], #range(30,60,10),
    'min_child_samples':range(200,1000,300)
    'min_child_weigh':range(20,100,20)
    
}
lgb_gsearch1 = GridSearchCV(estimator = lgb.LGBMClassifier(
    boosting_type='gbdt', num_leaves=48, max_depth=7,
    learning_rate=0.01, n_estimators=len(lgb_result['auc-mean']),  
    objective='binary', min_split_gain=0.02, min_child_weight=1, 
    min_child_samples=300, subsample=0.85, subsample_freq=0, colsample_bytree=0.95,
    reg_alpha=0.1, reg_lambda=0.1, random_state=1001, n_jobs=-1, silent=False
), param_grid = lgb_param_test1, scoring='roc_auc',n_jobs=4,verbose=10,iid=False, cv=5)


lgb_gsearch1.fit(df_feats, df_TARGET, eval_metric='auc', verbose = 50)
lgb_gsearch1.grid_scores_, lgb_gsearch1.best_params_, lgb_gsearch1.best_score_ 

In [None]:
def modelfit(alg, train, train_target, useTrainCV=True, cv_folds=5, early_stopping_rounds=150, val_percent=0.2):
    
    split = StratifiedShuffleSplit(n_splits=1, test_size= val_percent, random_state=1001)
    for train_index, test_index in split.split(train, train_target):
        dtrain, dtest = train.loc[train_index], train.loc[test_index]
        dtrain_target, dtest_target = train_target.loc[train_index], train_target.loc[test_index]
            
            
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain.values, label=dtrain_target.values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics={'auc'}, early_stopping_rounds=early_stopping_rounds)
        print("n_estimators use : %d" % cvresult.shape[0])
        alg.set_params(n_estimators=cvresult.shape[0])
    
    #Fit the algorithm on the data
    alg.fit(dtrain, dtrain_target,eval_metric='auc')
        
    #Predict training set:
    dtrain_predictions = alg.predict(dtrain)
    dtrain_predprob = alg.predict_proba(dtrain)[:,1]
        
    #Print model report:
    print("\ndtrain Model Report")
    print("dtrain Accuracy : %.4g" % metrics.accuracy_score(dtrain_target, dtrain_predictions))
    print("dtrain AUC Score: %f" % metrics.roc_auc_score(dtrain_target, dtrain_predprob)) 
    
    #     Predict on validation data:
    dtest_predictions = alg.predict(dtest)
    dtest_predprob = alg.predict_proba(dtest)[:,1]
        
    #Print model report:
    print("\ndtest Model Report")
    print("dtest Accuracy : %.4g" % metrics.accuracy_score(dtest_target, dtest_predictions))
    print("dtest AUC Score: %f" % metrics.roc_auc_score(dtest_target, dtest_predprob))

In [None]:
xgb1 = xgb.XGBClassifier(
        learning_rate=0.1,
        n_estimators=1000,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective= 'binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        silent=False,    
        seed=1001)
modelfit(xgb1, df_feats, df_TARGET)

In [None]:
param_test1 = {
    'max_depth':range(3,10,2),
    'min_child_weight':range(1,6,2)
}
gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, 
                                                  n_estimators=xgb1.get_params()['n_estimators'], 
                                        max_depth=5,
                                        min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                        objective= 'binary:logistic', nthread=4, scale_pos_weight=1, 
                                                         random_state=1001, silent=False,  seed=1001, verbose=10), 
                       param_grid = param_test1, scoring='roc_auc',n_jobs=4,verbose=10,iid=False, cv=5)
gsearch1.fit(df_feats, df_TARGET)

In [None]:
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

In [None]:
param_test3 = {
    'gamma':[i/10.0 for i in range(0,5)]
}
gsearch3 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=xgb1.get_params()['n_estimators'],
                                                  max_depth=gsearch1.best_params_['max_depth'],
                                        min_child_weight=gsearch1.best_params_['min_child_weight'],
                                                  gamma=0, subsample=0.8, colsample_bytree=0.8,
                                        objective= 'binary:logistic', nthread=4, scale_pos_weight=1,
                                                  random_state=27, silent=False, seed=27), 
                       param_grid = param_test3, scoring='roc_auc',n_jobs=4,iid=False,verbose=10, cv=5)
gsearch3.fit(df_feats, df_TARGET)

In [None]:
gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_

In [None]:
param_test4 = {
    'subsample':[i/10.0 for i in range(6,10)],
    'colsample_bytree':[i/10.0 for i in range(6,10)]
}
gsearch4 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=1000, 
                                                  max_depth=gsearch1.best_params_['max_depth'],
                                        min_child_weight=gsearch1.best_params_['min_child_weight'],
                                                  gamma=gsearch3.best_params_['gamma'], 
                                        subsample=0.8, colsample_bytree=0.8,
                                        objective= 'binary:logistic', nthread=2, verbose=10,silent=False,
                                                  scale_pos_weight=1,seed=27), 
                       param_grid = param_test4, scoring='roc_auc',n_jobs=4,iid=False, verbose=10,cv=5)
gsearch4.fit(df_feats, df_TARGET)

In [None]:
gsearch4.grid_scores_, gsearch4.best_params_, gsearch4.best_score_

In [None]:
param_test6 = {
    'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
}
gsearch6 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=232,
                                                  max_depth=gsearch1.best_params_['max_depth'],
                                        min_child_weight=gsearch1.best_params_['min_child_weight'],
                                                  gamma=gsearch3.best_params_['gamma'],  
                                                  subsample=gsearch4.best_params_["subsample"], 
                                        colsample_bytree=gsearch4.best_params_["colsample_bytree"],
                                                  silent=False,
                                        objective= 'binary:logistic', nthread=2, verbose=10,
                                                  scale_pos_weight=1,seed=27), 
                       param_grid = param_test6, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch6.fit(df_feats, df_TARGET)

In [None]:
gsearch6.grid_scores_, gsearch6.best_params_, gsearch6.best_score_

In [None]:
xgb3 = XGBClassifier(
        learning_rate =0.1,
        n_estimators=1000,
        max_depth==gsearch1.best_params_['max_depth'],
        min_child_weight=gsearch1.best_params_['min_child_weight'],
        gamma=gsearch3.best_params_['gamma'], 
        subsample=gsearch4.best_params_["subsample"],
        colsample_bytree=gsearch4.best_params_["colsample_bytree"],
        reg_alpha=gsearch6.best_params_["reg_alpha"],
        objective= 'binary:logistic',
        nthread=8,
        scale_pos_weight=1,
        silent=False,
        seed=27)
modelfit(xgb3, df_feats, df_TARGET)

In [None]:
xgb3

In [None]:
xgb4 = XGBClassifier(
        learning_rate =0.01,
        n_estimators=5000,
        max_depth=gsearch1.best_params_['max_depth'],
        min_child_weight=gsearch1.best_params_['min_child_weight'],
        gamma=gsearch3.best_params_['gamma'],
        subsample=gsearch4.best_params_["subsample"],
        colsample_bytree=gsearch4.best_params_["colsample_bytree"],
        reg_alpha=gsearch6.best_params_["reg_alpha"],
        objective= 'binary:logistic',
        nthread=8,
        silent=False,
        scale_pos_weight=1,
        seed=27)
modelfit(xgb4, df_feats, df_TARGET)

In [None]:
xgb4