In [9]:
# HOME CREDIT DEFAULT RISK COMPETITION
# Most features are created by applying min, max, mean, sum and var functions to grouped tables.
# Little feature selection is done and overfitting might be a problem since many features are related.
# The following key ideas were used:
# - Divide or subtract important features to get rates (like annuity and income)
# - In Bureau Data: create specific features for Active credits and Closed credits
# - In Previous Applications: create specific features for Approved and Refused applications
# - Modularity: one function for each table (except bureau_balance and application_test)
# - One-hot encoding for categorical features
# All tables are joined with the application DF using the SK_ID_CURR key (except bureau_balance).

# You can use LightGBM with KFold or Stratified KFold. Please upvote if you find usefull, thanks!

import numpy as np
import pandas as pd
import gc
import time
from contextlib import contextmanager
from lightgbm import LGBMClassifier
# import fancyimpute
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

PATH ='../../data'

@contextmanager
def timer(title):
    t0 = time.time()
    yield
    print("{} - done in {:.0f}s".format(title, time.time() - t0))


# LightGBM GBDT with KFold or Stratified KFold
# Parameters from Tilii kernel: https://www.kaggle.com/tilii7/olivier-lightgbm-parameters-by-bayesian-opt/code

def kfold_lightgbm(train_df, test_df, num_folds, stratified=False, debug=False):

    # Divide in training/validation and test data
    #train_df = df[df['TARGET'].notnull()]
    #test_df = df[df['TARGET'].isnull()]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    #del df
    gc.collect()
    
    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=1001)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=1001)
    
    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns 
             if f not in ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index']]

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        clf = LGBMClassifier(
            nthread=4,
            n_estimators=10000, 
            learning_rate=0.01,
            num_leaves=50,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.001, 
            reg_lambda=0.01,
            min_split_gain=0.0222415,
            min_child_weight=39.3259775,
            silent=-1,
            verbose=-1, )

        clf.fit(train_x, train_y, 
                eval_set=[(train_x, train_y), (valid_x, valid_y)],
                eval_metric='auc', 
                verbose=100, 
                early_stopping_rounds=200)

        oof_preds[valid_idx] = clf.predict_proba(valid_x, 
                                                 num_iteration=clf.best_iteration_)[:, 1]
        
        sub_preds += clf.predict_proba(test_df[feats], 
                                       num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits
        
        test_df['TARGET'] = sub_preds

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))

    return feature_importance_df, test_df[['SK_ID_CURR', 'TARGET']]


# Display/plot feature importance
def display_importances(feature_importance_df_, save=True):
    
    cols = feature_importance_df_[["feature", "importance"]]\
    .groupby("feature")\
    .mean()\
    .sort_values(by="importance",
                 ascending=False)[:40]\
    .index
    
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]
    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", 
                data=best_features.sort_values(by="importance", ascending=False))
    
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()

    if (save):
        dt = time.time()
        plt.savefig('lgbm_importances_{0}.png'.format(dt))


def main(debug=True, num_folds=2):

    df = pd.read_pickle('application_ext_all.pkl')
    # df_train = pd.read_csv('application_train.csv')
    # df_test = pd.read_csv('application_test.csv')
    df_train = df[df['TARGET'].notnull()]
    df_test  = df[df['TARGET'].isnull()]
    del df
    
    ## START DM ##
    
    # Train# 
    
    # NULL COLUMNS
    has_null = [col for col in df_train.columns 
                if sum(df_train[col].isnull())]
    
    # numerical
    has_null_num = [col for col in has_null 
                    if df_train[col].dtype == 'float']
    
    # categorical
    has_null_cat = [col for col in has_null 
                    if df_train[col].dtype == 'object']
    
    # IMPUTE
    # impute with mean
    for col in has_null_num:
        df_train[col] = df_train[col].fillna(df_train[col].mean())
    # impute with default value
    # for col in has_null_cat:
        # df[col] = df[col].fillna('unknown_'+col)
        
    # ENCODING
    col_cat = [col for col in df_train.columns
               if df_train[col].dtype == 'object']
    
    for col in col_cat:
        dummy = pd.get_dummies(df_train[col], 
                               prefix=col, 
                               drop_first=True)    

        df_train = df_train.join(dummy)
        del dummy
    gc.collect()
    df_train = df_train.drop(col_cat, axis=1)  
    
#    X = fancyimpute.MICE().complete(df_train)
#    df_train = pd.DataFrame(X, columns=df_train.columns)
#    del X
    
    # Test #  
    
    # NULL COLUMNS
    has_null = [col for col in df_test.columns 
                if sum(df_test[col].isnull())]
    
    # numerical
    has_null_num = [col for col in has_null 
                    if df_test[col].dtype == 'float']
    
    # categorical
    has_null_cat = [col for col in has_null 
                    if df_test[col].dtype == 'object']
    
    # IMPUTE
    # impute with mean
    for col in has_null_num:
        df_test[col] = df_test[col].fillna(df_test[col].mean())
    # impute with default value
    # for col in has_null_cat:
    #     df[col] = df[col].fillna('unknown_'+col)
        
    # ENCODING
    col_cat = [col for col in df_test.columns
               if df_test[col].dtype == 'object']
   
    for col in col_cat:
        dummy = pd.get_dummies(df_test[col], 
                               prefix=col, 
                               drop_first=True)    

        df_test = df_test.join(dummy)
        del dummy
    gc.collect()
    df_test = df_test.drop(col_cat, axis=1)
    
#    X = fancyimpute.MICE().complete(df_test)
#    df_test = pd.DataFrame(X, columns=df_test.columns)
#    del X
    
    df_test['TARGET'] = 0
    missing_list = list(set(df_train.columns) - set(df_test.columns))
    df_train = df_train.drop(missing_list, axis=1)
    
#    ones = (df_train['TARGET'] == 1).sum()
#    df_0 = df_train[df_train['TARGET'] == 0].sample(ones)
#    df_train_sub = df_0.append(df_train[df_train['TARGET'] == 1])
#    del df_0
    
    ## END DM ##
    
    with timer("Run LightGBM plus kfold {0}".format(num_folds)):
        feature_importance_df, test_preds = \
        kfold_lightgbm(df_train, df_test, num_folds=5, stratified=True, debug=debug)
        test_preds.to_csv('submission_{}.csv'.format(time.time()), index=False)
        display_importances(feature_importance_df)

if __name__ == "__main__":
    print("{0}".format(time.time()))
    with timer("Full model run"):
        main()

1534036011.477325
Starting LightGBM. Train shape: (307511, 108), test shape: (0, 108)
Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.74254	valid_1's auc: 0.734289
[200]	training's auc: 0.754817	valid_1's auc: 0.741823
[300]	training's auc: 0.764538	valid_1's auc: 0.746458
[400]	training's auc: 0.772089	valid_1's auc: 0.749356
[500]	training's auc: 0.77767	valid_1's auc: 0.75099
[600]	training's auc: 0.782607	valid_1's auc: 0.752241
[700]	training's auc: 0.787075	valid_1's auc: 0.753018
[800]	training's auc: 0.79096	valid_1's auc: 0.753426
[900]	training's auc: 0.794526	valid_1's auc: 0.753817
[1000]	training's auc: 0.797651	valid_1's auc: 0.754018
[1100]	training's auc: 0.800728	valid_1's auc: 0.754108
[1200]	training's auc: 0.803433	valid_1's auc: 0.754194
[1300]	training's auc: 0.806234	valid_1's auc: 0.754207
[1400]	training's auc: 0.80895	valid_1's auc: 0.754212
Early stopping, best iteration is:
[1234]	training's auc: 0.804379	valid_1's auc:

ValueError: Input data must be 2 dimensional and non empty.

In [6]:
df = pd.read_pickle('combined_data.pkl')

In [7]:
df.to_csv('combined_data_small.csv', index=False)