In [17]:
# HOME CREDIT DEFAULT RISK COMPETITION
# Most features are created by applying min, max, mean, sum and var functions to grouped tables. 
# Little feature selection is done and overfitting might be a problem since many features are related.
# The following key ideas were used:
# - Divide or subtract important features to get rates (like annuity and income)
# - In Bureau Data: create specific features for Active credits and Closed credits
# - In Previous Applications: create specific features for Approved and Refused applications
# - Modularity: one function for each table (except bureau_balance and application_test)
# - One-hot encoding for categorical features
# All tables are joined with the application DF using the SK_ID_CURR key (except bureau_balance).
# You can use LightGBM with KFold or Stratified KFold.

# Update 16/06/2018:
# - Added Payment Rate feature
# - Removed index from features
# - Use standard KFold CV (not stratified)

import numpy as np
import pandas as pd
import gc
#import time
from contextlib import contextmanager
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

'''
1. Removed the code that had been commented out with #.
2. kfold_lightgbm(): dropped the `stratified=False` option (plain KFold only).
   
3. Disabled the feature-importance plot.
   
4. Keep a single function for the aggregations (not for the categorical encoding).
    
5.    
'''




def application_train_test(num_rows = None):                         # nan_as_category=False
    """Load the application train/test tables, stack them and encode categoricals.

    The train and test CSVs are read from ``../data`` and stacked into one
    frame (test rows have no TARGET value, so they end up with NaN there).
    ``reset_index()`` keeps the original per-file row labels in a column
    named ``index``.

    Args:
        num_rows: forwarded to ``pd.read_csv(nrows=...)`` for BOTH files;
            ``None`` reads everything.

    Returns:
        The stacked DataFrame where CODE_GENDER, FLAG_OWN_CAR and
        FLAG_OWN_REALTY are replaced by integer codes and every remaining
        object-dtype column is one-hot encoded (no dummy column for NaN).
    """
    train_df = pd.read_csv('../data/application_train.csv', nrows= num_rows)
    test_df = pd.read_csv('../data/application_test.csv', nrows= num_rows)

    # DataFrame.append() was removed in pandas 2.0; pd.concat() is the
    # drop-in equivalent and also works on older pandas versions.
    df = pd.concat([train_df, test_df], sort=False).reset_index()
    del train_df, test_df
    gc.collect()

    # Two-valued text flags become integer codes instead of dummy columns.
    for bin_feature in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']:
        df[bin_feature], _ = pd.factorize(df[bin_feature])

    # Whatever is still object-dtype is treated as categorical.
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns= categorical_columns, dummy_na= False)

    return df



def bureau_and_balance(num_rows=None):
    """Summarise bureau.csv (enriched with bureau_balance.csv) per SK_ID_CURR.

    Steps:
      1. One-hot encode the object-dtype columns of both tables, with an
         extra dummy column for NaN.
      2. Collapse bureau_balance to one row per SK_ID_BUREAU and left-join
         it onto bureau; SK_ID_BUREAU is dropped afterwards.
      3. Collapse the enriched bureau table to one row per SK_ID_CURR.

    Args:
        num_rows: forwarded to ``pd.read_csv(nrows=...)`` for both files.

    Returns:
        DataFrame indexed by SK_ID_CURR whose columns are named
        ``BURO_<column>_<STAT>``.
    """
    bureau_df = pd.read_csv('../data/bureau.csv', nrows=num_rows)
    balance_df = pd.read_csv('../data/bureau_balance.csv', nrows=num_rows)

    # --- one-hot encoding: bureau_balance first, then bureau ---------------
    balance_raw = list(balance_df.columns)
    balance_text = [name for name in balance_raw if balance_df[name].dtype == 'object']
    balance_df = pd.get_dummies(balance_df, columns=balance_text, dummy_na=True)
    balance_dummies = [name for name in balance_df.columns if name not in balance_raw]

    bureau_raw = list(bureau_df.columns)
    bureau_text = [name for name in bureau_raw if bureau_df[name].dtype == 'object']
    bureau_df = pd.get_dummies(bureau_df, columns=bureau_text, dummy_na=True)
    bureau_dummies = [name for name in bureau_df.columns if name not in bureau_raw]

    # --- bureau_balance -> one row per SK_ID_BUREAU ------------------------
    balance_spec = {'MONTHS_BALANCE': ['min', 'max', 'size']}
    balance_spec.update({name: ['mean'] for name in balance_dummies})
    per_credit = balance_df.groupby('SK_ID_BUREAU').agg(balance_spec)
    per_credit.columns = pd.Index(
        [column + "_" + stat.upper() for column, stat in per_credit.columns.tolist()]
    )

    bureau_df = bureau_df.join(per_credit, how='left', on='SK_ID_BUREAU')
    bureau_df = bureau_df.drop(['SK_ID_BUREAU'], axis=1)

    del balance_df, per_credit
    gc.collect()

    # --- enriched bureau -> one row per SK_ID_CURR -------------------------
    bureau_spec = {
        'DAYS_CREDIT': ['min'],
        'DAYS_CREDIT_ENDDATE': ['min'],
        'DAYS_CREDIT_UPDATE': ['mean'],
        'CREDIT_DAY_OVERDUE': ['max'],
        'AMT_CREDIT_MAX_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM': ['max', 'mean'],
        'AMT_CREDIT_SUM_DEBT': ['max'],
        'AMT_CREDIT_SUM_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM_LIMIT': ['mean'],
        'AMT_ANNUITY': ['max', 'mean'],
        'CNT_CREDIT_PROLONG': ['sum'],
        'MONTHS_BALANCE_MIN': ['min'],
        'MONTHS_BALANCE_MAX': ['max'],
        'MONTHS_BALANCE_SIZE': ['mean'],
    }
    # Dummy columns are averaged; the balance dummies already carry the
    # "_MEAN" suffix from the per-credit aggregation above.
    for name in bureau_dummies:
        bureau_spec[name] = ['mean']
    for name in balance_dummies:
        bureau_spec[name + "_MEAN"] = ['mean']

    per_client = bureau_df.groupby('SK_ID_CURR').agg(bureau_spec)
    per_client.columns = pd.Index(
        ['BURO_' + column + "_" + stat.upper() for column, stat in per_client.columns.tolist()]
    )

    del bureau_df
    gc.collect()
    return per_client



def previous_applications(num_rows=None):
    """Summarise previous_application.csv as per-client means.

    Object-dtype columns are one-hot encoded (with a dummy column for NaN);
    the listed numeric columns and every dummy column are then averaged per
    SK_ID_CURR.

    Args:
        num_rows: forwarded to ``pd.read_csv(nrows=...)``.

    Returns:
        DataFrame indexed by SK_ID_CURR with columns ``PREV_<column>_MEAN``.
    """
    prev_df = pd.read_csv('../data/previous_application.csv', nrows=num_rows)

    raw_names = list(prev_df.columns)
    text_columns = [name for name in raw_names if prev_df[name].dtype == 'object']
    prev_df = pd.get_dummies(prev_df, columns=text_columns, dummy_na=True)
    dummy_names = [name for name in prev_df.columns if name not in raw_names]

    averaged_numerics = (
        'AMT_ANNUITY',
        'AMT_APPLICATION',
        'AMT_CREDIT',
        'AMT_DOWN_PAYMENT',
        'AMT_GOODS_PRICE',
        'HOUR_APPR_PROCESS_START',
        'RATE_DOWN_PAYMENT',
        'DAYS_DECISION',
        'CNT_PAYMENT',
    )
    # Numeric columns first, dummy columns after: this fixes the output order.
    spec = {name: ['mean'] for name in averaged_numerics}
    spec.update({name: ['mean'] for name in dummy_names})

    summary = prev_df.groupby('SK_ID_CURR').agg(spec)
    summary.columns = pd.Index(
        ['PREV_' + column + "_" + stat.upper() for column, stat in summary.columns.tolist()]
    )

    del prev_df
    gc.collect()
    return summary



def pos_cash(num_rows=None):
    """Summarise POS_CASH_balance.csv as per-client means.

    Object-dtype columns are one-hot encoded (with a dummy column for NaN);
    MONTHS_BALANCE, SK_DPD, SK_DPD_DEF and every dummy column are averaged
    per SK_ID_CURR.

    Args:
        num_rows: forwarded to ``pd.read_csv(nrows=...)``.

    Returns:
        DataFrame indexed by SK_ID_CURR with columns ``POS_<column>_MEAN``.
    """
    pos_df = pd.read_csv('../data/POS_CASH_balance.csv', nrows=num_rows)

    raw_names = list(pos_df.columns)
    text_columns = [name for name in raw_names if pos_df[name].dtype == 'object']
    pos_df = pd.get_dummies(pos_df, columns=text_columns, dummy_na=True)
    dummy_names = [name for name in pos_df.columns if name not in raw_names]

    # Fixed numeric columns first, then whatever dummies the data produced.
    averaged = ['MONTHS_BALANCE', 'SK_DPD', 'SK_DPD_DEF'] + dummy_names
    spec = {name: ['mean'] for name in averaged}

    summary = pos_df.groupby('SK_ID_CURR').agg(spec)
    summary.columns = pd.Index(
        ['POS_' + column + "_" + stat.upper() for column, stat in summary.columns.tolist()]
    )

    del pos_df
    gc.collect()
    return summary
    
    
    
def installments_payments(num_rows=None):
    """Summarise installments_payments.csv per client.

    Object-dtype columns are one-hot encoded (with a dummy column for NaN).
    Per SK_ID_CURR the function counts distinct NUM_INSTALMENT_VERSION
    values and averages AMT_INSTALMENT, AMT_PAYMENT, DAYS_ENTRY_PAYMENT and
    every dummy column.

    Args:
        num_rows: forwarded to ``pd.read_csv(nrows=...)``.

    Returns:
        DataFrame indexed by SK_ID_CURR with columns
        ``INSTAL_<column>_<STAT>``.
    """
    ins_df = pd.read_csv('../data/installments_payments.csv', nrows=num_rows)

    raw_names = list(ins_df.columns)
    text_columns = [name for name in raw_names if ins_df[name].dtype == 'object']
    ins_df = pd.get_dummies(ins_df, columns=text_columns, dummy_na=True)
    dummy_names = [name for name in ins_df.columns if name not in raw_names]

    spec = {'NUM_INSTALMENT_VERSION': ['nunique']}
    for name in ['AMT_INSTALMENT', 'AMT_PAYMENT', 'DAYS_ENTRY_PAYMENT'] + dummy_names:
        spec[name] = ['mean']

    summary = ins_df.groupby('SK_ID_CURR').agg(spec)
    summary.columns = pd.Index(
        ['INSTAL_' + column + "_" + stat.upper() for column, stat in summary.columns.tolist()]
    )

    del ins_df
    gc.collect()
    return summary



def credit_card_balance(num_rows=None):
    """Summarise credit_card_balance.csv per client.

    Object-dtype columns are one-hot encoded (with a dummy column for NaN),
    SK_ID_PREV is dropped, and every remaining column is aggregated per
    SK_ID_CURR with min, max, mean, sum and var.

    Args:
        num_rows: forwarded to ``pd.read_csv(nrows=...)``.

    Returns:
        DataFrame indexed by SK_ID_CURR with columns ``CC_<column>_<STAT>``.
    """
    cc_df = pd.read_csv('../data/credit_card_balance.csv', nrows=num_rows)

    text_columns = [name for name in cc_df.columns if cc_df[name].dtype == 'object']
    cc_df = pd.get_dummies(cc_df, columns=text_columns, dummy_na=True)

    # SK_ID_PREV is an identifier, so it must not be aggregated as a feature.
    cc_df = cc_df.drop(['SK_ID_PREV'], axis=1)

    stats = ['min', 'max', 'mean', 'sum', 'var']
    summary = cc_df.groupby('SK_ID_CURR').agg(stats)
    summary.columns = pd.Index(
        ['CC_' + column + "_" + stat.upper() for column, stat in summary.columns.tolist()]
    )

    del cc_df
    gc.collect()
    return summary



# LightGBM GBDT with plain (non-stratified) KFold cross-validation
def kfold_lightgbm(df, num_folds, debug= False, submission_path= None):           # , stratified = False
    """Train LightGBM with shuffled K-fold CV and optionally write a submission.

    Rows of ``df`` with a non-null TARGET form the training set; rows with a
    null TARGET form the test set. For every fold a classifier is fitted with
    early stopping on the validation AUC, out-of-fold predictions are stored
    for the validation rows, and the test predictions are averaged over the
    folds.

    Args:
        df: feature table containing at least 'TARGET' and 'SK_ID_CURR'.
        num_folds: number of KFold splits.
        debug: when true, no submission file is written.
        submission_path: destination of the submission CSV. When ``None`` the
            module-level ``submission_file_name`` is used, which is what the
            script entry point defines.

    Returns:
        DataFrame with columns ``feature``, ``importance`` and ``fold``
        holding one row per feature per fold.
    """
    # LightGBM >= 4.0 no longer accepts `verbose=` / `early_stopping_rounds=`
    # in fit(); the same behaviour is requested through callbacks. Imported
    # locally so this function does not rely on a top-of-file change.
    from lightgbm import early_stopping, log_evaluation

    # Divide in training/validation and test data
    has_target = df['TARGET'].notnull()
    train_df = df[has_target]
    test_df = df[~has_target]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    del df
    gc.collect()

    # Cross validation model
    folds = KFold(n_splits= num_folds, shuffle=True, random_state=1001)

    # Create arrays and lists to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    fold_importances = []
    # Identifiers, the label and the leftover 'index' column are not features.
    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        # (`silent` was dropped: it is not a LightGBM 4 parameter and
        # `verbose=-1` already silences the library's own logging).
        clf = LGBMClassifier(
            nthread=4,
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=34,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.041545473,
            reg_lambda=0.0735294,
            min_split_gain=0.0222415,
            min_child_weight=39.3259775,
            verbose=-1, )

        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)],
            eval_metric= 'auc',
            callbacks=[early_stopping(stopping_rounds=200), log_evaluation(period=200)])

        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        # Each fold contributes an equal share to the averaged test prediction.
        sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        fold_importances.append(fold_importance_df)
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))

    # Write submission file
    if not debug:
        if submission_path is None:
            # Historical behaviour: fall back to the name set by the script entry point.
            submission_path = submission_file_name
        # Build the output from an explicit copy; assigning into `test_df`
        # (a slice of the caller's frame) triggers SettingWithCopyWarning.
        submission = test_df[['SK_ID_CURR']].copy()
        submission['TARGET'] = sub_preds
        submission.to_csv(submission_path, index= False)

    # One concat at the end instead of growing a frame fold by fold.
    return pd.concat(fold_importances, axis=0)

'''
# Display/plot feature importance
def display_importances(feature_importance_df_):
    cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:40].index
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]
    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances01.png')
'''

def main(debug=False):
    """Build the full feature table and run the K-fold LightGBM training.

    The application table is the base; each auxiliary table is aggregated to
    one row per SK_ID_CURR by its own builder and left-joined onto the base
    in turn, releasing the intermediate frame before the next one is built.

    Args:
        debug: when true every CSV is limited to its first 1000 rows and the
            flag is passed on to ``kfold_lightgbm``.
    """
    num_rows = 1000 if debug else None
    df = application_train_test(num_rows)

    # Join order determines the column order of the final feature table.
    table_builders = (
        bureau_and_balance,
        previous_applications,
        pos_cash,
        installments_payments,
        credit_card_balance,
    )
    for build_table in table_builders:
        extra = build_table(num_rows)
        df = df.join(extra, how='left', on='SK_ID_CURR')
        del extra
        gc.collect()

    kfold_lightgbm(df, num_folds=10, debug=debug)    # , stratified= False

if __name__ == "__main__":
    # kfold_lightgbm() reads this module-level name when it writes the
    # submission CSV (only on non-debug runs).
    submission_file_name = "submission_kernel02.csv"

    main(debug=True)

Starting LightGBM. Train shape: (1000, 530), test shape: (1000, 530)
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.5	training's binary_logloss: 0.367786	valid_1's auc: 0.5	valid_1's binary_logloss: 0.367786
Early stopping, best iteration is:
[1]	training's auc: 0.5	training's binary_logloss: 0.367786	valid_1's auc: 0.5	valid_1's binary_logloss: 0.367786
Fold  1 AUC : 0.500000
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.5	training's binary_logloss: 0.360609	valid_1's auc: 0.5	valid_1's binary_logloss: 0.477113
Early stopping, best iteration is:
[1]	training's auc: 0.5	training's binary_logloss: 0.360609	valid_1's auc: 0.5	valid_1's binary_logloss: 0.477113
Fold  2 AUC : 0.500000
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.5	training's binary_logloss: 0.349555	valid_1's auc: 0.5	valid_1's binary_logloss: 0.646815
Early stopping, best iteration is:
[1]	training's auc