In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
plt.rcParams['figure.figsize'] = (10, 8)

In [2]:
# Read the training applications and lower-case every column label in one chain.
train_df = (
    pd.read_csv('./data/application_train.csv.zip')
    .rename(columns=str.lower)
)
train_df.head()

Unnamed: 0,sk_id_curr,target,name_contract_type,code_gender,flag_own_car,flag_own_realty,cnt_children,amt_income_total,amt_credit,amt_annuity,...,flag_document_18,flag_document_19,flag_document_20,flag_document_21,amt_req_credit_bureau_hour,amt_req_credit_bureau_day,amt_req_credit_bureau_week,amt_req_credit_bureau_mon,amt_req_credit_bureau_qrt,amt_req_credit_bureau_year
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Read the test applications and lower-case every column label in one chain.
test_df = (
    pd.read_csv('./data/application_test.csv.zip')
    .rename(columns=str.lower)
)
test_df.head()

Unnamed: 0,sk_id_curr,name_contract_type,code_gender,flag_own_car,flag_own_realty,cnt_children,amt_income_total,amt_credit,amt_annuity,amt_goods_price,...,flag_document_18,flag_document_19,flag_document_20,flag_document_21,amt_req_credit_bureau_hour,amt_req_credit_bureau_day,amt_req_credit_bureau_week,amt_req_credit_bureau_mon,amt_req_credit_bureau_qrt,amt_req_credit_bureau_year
0,100001,Cash loans,F,N,Y,0,135000.0,568800.0,20560.5,450000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,100005,Cash loans,M,N,Y,0,99000.0,222768.0,17370.0,180000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
2,100013,Cash loans,M,Y,Y,0,202500.0,663264.0,69777.0,630000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
3,100028,Cash loans,F,N,Y,2,315000.0,1575000.0,49018.5,1575000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
4,100038,Cash loans,M,Y,N,1,180000.0,625500.0,32067.0,625500.0,...,0,0,0,0,,,,,,


### Columns

In [4]:
# Label and row-identifier columns.
target_column = 'target'
id_column = 'sk_id_curr'

# --- numeric columns -------------------------------------------------------
child_cnt_column = 'cnt_children'
amt_columns = [
    'amt_income_total',
    'amt_credit',
    'amt_annuity',
    'amt_goods_price',
]
reg_pop_column = 'region_population_relative'
days_columns = [
    'days_birth',
    'days_employed',
    'days_registration',
    'days_id_publish',
]
car_age_column = 'own_car_age'

# --- categorical columns ---------------------------------------------------
contract_column = 'name_contract_type'
gender_column = 'code_gender'
flag_own_columns = ['flag_own_car', 'flag_own_realty']
name_columns = [
    'name_type_suite',
    'name_income_type',
    'name_education_type',
    'name_family_status',
    'name_housing_type',
]
flag_columns = [
    'flag_mobil',
    'flag_emp_phone',
    'flag_work_phone',
    'flag_cont_mobile',
    'flag_phone',
    'flag_email',
]

In [None]:
# Literal spelling of the feature lists. 'days_birth' was missing from the numeric
# list here although the list assembled from the column groups includes it; it is
# restored so that running this cell cannot silently change the feature set.
num_columns = ['cnt_children', 'amt_income_total', 'amt_credit', 'amt_annuity', 'amt_goods_price',
               'region_population_relative', 'days_birth', 'days_employed', 'days_registration',
               'days_id_publish', 'own_car_age']
cat_columns = ['name_contract_type', 'code_gender', 'flag_own_car', 'flag_own_realty',
               'name_type_suite', 'name_income_type', 'name_education_type', 'name_family_status',
               'name_housing_type', 'flag_mobil', 'flag_emp_phone', 'flag_work_phone', 'flag_cont_mobile',
               'flag_phone', 'flag_email']

In [5]:
# Split the training frame into the label vector and the feature frame;
# the label and the row identifier are both removed from the features.
y_train = train_df[target_column].copy()
X_train = train_df.drop(columns=[target_column, id_column])
X_train.shape, y_train.shape

((307511, 120), (307511,))

In [6]:
# Number of training rows whose target value is 1.
target_counts = train_df[target_column].value_counts()
y_train_count = target_counts.loc[1]
y_train_count

24825

In [7]:
# Sample-size setting; not referenced by any other cell visible in this notebook.
sample_size = 1000
# Seed handed to the under-sampler and to every XGBClassifier below.
random_state = 42

### Pipeline

In [8]:
import warnings
# Install a catch-all filter that applies Python's 'default' action to every warning.
warnings.filterwarnings(action='default')

In [19]:
from scipy.stats import randint as sp_randint

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, RobustScaler, CategoricalEncoder
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, learning_curve
from sklearn.decomposition import PCA, KernelPCA
from sklearn.metrics import roc_auc_score

from imblearn.under_sampling import RandomUnderSampler
import imblearn.pipeline as imbp

import xgboost as xgb
from xgboost import XGBClassifier

In [20]:
class ColumnExtractor(BaseEstimator, TransformerMixin):
    """Stateless transformer that selects ``columns`` from its input by indexing."""

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the configured ``columns``."""
        selected = X[self.columns]
        return selected

In [21]:
class CategoricalImputer(BaseEstimator, TransformerMixin):
    """Fill missing values with a fixed label or with each column's mode.

    Parameters
    ----------
    label : object, optional
        Value used to replace every missing entry. When ``None`` (the default)
        the first mode of each column, computed in ``fit``, is used instead.

    Attributes
    ----------
    fill_values_ : object
        Either ``label`` or the first row of ``X.mode()`` seen during ``fit``.
    """

    def __init__(self, label=None):
        self.label = label

    def fit(self, X, y=None):
        """Record the fill value(s) for ``X`` and return ``self``."""
        # Compare against None explicitly: a truthiness test would silently
        # discard legitimate falsy labels such as 0 or '' and use the mode.
        if self.label is not None:
            self.fill_values_ = self.label
        else:
            self.fill_values_ = X.mode().iloc[0]
        return self

    def transform(self, X):
        """Return the values of ``X`` with missing entries filled in."""
        return X.fillna(self.fill_values_).values

In [22]:
# Flatten the numeric column groups into one ordered list.
num_columns = [
    child_cnt_column,
    *amt_columns,
    reg_pop_column,
    *days_columns,
    car_age_column,
]
num_columns

['cnt_children',
 'amt_income_total',
 'amt_credit',
 'amt_annuity',
 'amt_goods_price',
 'region_population_relative',
 'days_birth',
 'days_employed',
 'days_registration',
 'days_id_publish',
 'own_car_age']

In [23]:
# Flatten the categorical column groups into one ordered list.
cat_columns = [
    contract_column,
    gender_column,
    *flag_own_columns,
    *name_columns,
    *flag_columns,
]
cat_columns

['name_contract_type',
 'code_gender',
 'flag_own_car',
 'flag_own_realty',
 'name_type_suite',
 'name_income_type',
 'name_education_type',
 'name_family_status',
 'name_housing_type',
 'flag_mobil',
 'flag_emp_phone',
 'flag_work_phone',
 'flag_cont_mobile',
 'flag_phone',
 'flag_email']

In [156]:
# Per-class row counts requested from the under-sampler: both classes are
# reduced to the number of rows labelled 1.
sample_sizes = {0: y_train_count, 1: y_train_count}

# Numeric branch: select columns, mean-impute, standardise.
numeric_branch = Pipeline([
    ('extract', ColumnExtractor(columns=num_columns)),
    ('impute', SimpleImputer(strategy='mean')),
    ('normalize', StandardScaler()),
])

# Categorical branch: select columns, fill missing values, one-hot encode.
# NOTE(review): CategoricalEncoder only existed in scikit-learn 0.20 development
# builds; released versions provide OneHotEncoder instead — confirm before re-running.
categorical_branch = Pipeline([
    ('extract', ColumnExtractor(columns=cat_columns)),
    ('impute', CategoricalImputer()),
    ('encoding', CategoricalEncoder('onehot-dense', handle_unknown='ignore')),
])

feature_union = FeatureUnion([
    ('numeric', numeric_branch),
    ('categorical', categorical_branch),
])

pipeline = imbp.Pipeline([
    ('features', feature_union),
    # Optional steps, left disabled:
    #('pca', PCA(n_components=0.99)),
    #('sampling', RandomUnderSampler(ratio=sample_sizes, random_state=random_state))
])

In [193]:
# Keep the labels as a plain array and fit the feature pipeline on the training frame.
y_train_trans = y_train.values
X_train_trans = pipeline.fit_transform(X_train, y_train)
X_train_trans.shape, y_train_trans.shape

((307511, 64), (307511,))

In [194]:
# Under-sample so that each class keeps `y_train_count` rows (see `sample_sizes`).
# NOTE(review): `ratio=` and `fit_sample` are legacy imbalanced-learn names; newer
# releases call them `sampling_strategy=` and `fit_resample` — confirm against the
# installed version.
rs = RandomUnderSampler(ratio=sample_sizes, random_state=random_state)
# Rebinds the transformed training arrays to the balanced subset.
X_train_trans, y_train_trans = rs.fit_sample(X_train_trans, y_train_trans)
X_train_trans.shape, y_train_trans.shape

((49650, 64), (49650,))

In [195]:
# Apply the already-fitted pipeline to the test frame (transform only, no refit).
X_test_trans = pipeline.transform(test_df)
X_test_trans.shape

(48744, 64)

In [196]:
def xbg_fit(model, X_train, y_train, useTrainCV=True, cv_folds=5, early_stopping_rounds=10, verbose=False):
    """Fit ``model`` in place, optionally tuning ``n_estimators`` by cross-validation.

    Parameters
    ----------
    model : estimator
        Object exposing ``get_xgb_params``, ``get_params``, ``set_params`` and
        ``fit`` (an ``XGBClassifier`` in this notebook).
    X_train, y_train : array-like
        Training features and labels.
    useTrainCV : bool, default True
        When true, run ``xgb.cv`` with the model's parameters and AUC early
        stopping, then set ``n_estimators`` to the number of rounds kept.
    cv_folds : int, default 5
        Number of folds passed to ``xgb.cv``.
    early_stopping_rounds : int, default 10
        Early-stopping patience passed to ``xgb.cv``.
    verbose : bool, default False
        Forwarded to ``xgb.cv`` as ``verbose_eval``.

    Returns
    -------
    None
        The model is modified in place.
    """
    if useTrainCV:
        xgb_param = model.get_xgb_params()
        xgb_train = xgb.DMatrix(X_train, y_train)

        cv_results = xgb.cv(xgb_param, xgb_train, num_boost_round=model.get_params()['n_estimators'],
                            stratified=False, nfold=cv_folds, metrics='auc',
                            early_stopping_rounds=early_stopping_rounds,
                            verbose_eval=verbose)

        print('# estimators: {}'.format(cv_results.shape[0]))
        print('Last iteration scores: \n{}'.format(cv_results.iloc[-1, :]))

        # One row per boosting round kept by early stopping.
        model.set_params(n_estimators=cv_results.shape[0])

    # Fit unconditionally: the fit used to be nested under `useTrainCV`, so
    # calling with useTrainCV=False returned without training the model.
    model.fit(X_train, y_train, eval_metric='auc')

In [181]:
# Baseline XGBoost configuration; n_estimators is an upper bound that
# xbg_fit replaces with the round count chosen by cross-validation.
xgb_params_1 = dict(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    scale_pos_weight=1,
    random_state=random_state,
    n_jobs=-1,
)
xgb_model_1 = XGBClassifier(**xgb_params_1)
xbg_fit(xgb_model_1, X_train_trans, y_train_trans, verbose=False)

# estimators: 136
Last iteration scores:
test-auc-mean     0.684864
test-auc-std      0.005248
train-auc-mean    0.755746
train-auc-std     0.001749
Name: 135, dtype: float64


In [183]:
# Coarse search over tree depth and minimum child weight.
param_grid = {
    'max_depth': range(3,10,2),
    'min_child_weight': range(1,6,2)
}
grid_search_1 = GridSearchCV(estimator = XGBClassifier(learning_rate=0.1, n_estimators=136, max_depth=5,
                        min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
                        objective='binary:logistic', n_jobs=-1, scale_pos_weight=1, random_state=random_state), 
                        param_grid=param_grid, scoring='roc_auc', n_jobs=-1, cv=5)
grid_search_1.fit(X_train_trans, y_train_trans)
# `grid_scores_` was deprecated and later removed from scikit-learn;
# `cv_results_` carries the same per-candidate mean/std test scores.
cv_results_1 = grid_search_1.cv_results_
(list(zip(cv_results_1['mean_test_score'], cv_results_1['std_test_score'], cv_results_1['params'])),
 grid_search_1.best_params_, grid_search_1.best_score_)



([mean: 0.68183, std: 0.00413, params: {'max_depth': 3, 'min_child_weight': 1},
  mean: 0.68191, std: 0.00386, params: {'max_depth': 3, 'min_child_weight': 3},
  mean: 0.68197, std: 0.00367, params: {'max_depth': 3, 'min_child_weight': 5},
  mean: 0.68470, std: 0.00430, params: {'max_depth': 5, 'min_child_weight': 1},
  mean: 0.68498, std: 0.00447, params: {'max_depth': 5, 'min_child_weight': 3},
  mean: 0.68456, std: 0.00485, params: {'max_depth': 5, 'min_child_weight': 5},
  mean: 0.68130, std: 0.00427, params: {'max_depth': 7, 'min_child_weight': 1},
  mean: 0.68088, std: 0.00496, params: {'max_depth': 7, 'min_child_weight': 3},
  mean: 0.68015, std: 0.00499, params: {'max_depth': 7, 'min_child_weight': 5},
  mean: 0.67479, std: 0.00552, params: {'max_depth': 9, 'min_child_weight': 1},
  mean: 0.67487, std: 0.00476, params: {'max_depth': 9, 'min_child_weight': 3},
  mean: 0.67547, std: 0.00390, params: {'max_depth': 9, 'min_child_weight': 5}],
 {'max_depth': 5, 'min_child_weight': 3

In [184]:
# Finer search around the best depth / child-weight found by the coarse grid.
param_grid = {
    'max_depth': [4, 5, 6],
    'min_child_weight': [2, 3, 4]
}
grid_search_2 = GridSearchCV(estimator = XGBClassifier(learning_rate=0.1, n_estimators=136, max_depth=5,
                        min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
                        objective='binary:logistic', n_jobs=-1, scale_pos_weight=1, random_state=random_state), 
                        param_grid=param_grid, scoring='roc_auc', n_jobs=-1, cv=5)
grid_search_2.fit(X_train_trans, y_train_trans)
# `grid_scores_` was deprecated and later removed from scikit-learn;
# `cv_results_` carries the same per-candidate mean/std test scores.
cv_results_2 = grid_search_2.cv_results_
(list(zip(cv_results_2['mean_test_score'], cv_results_2['std_test_score'], cv_results_2['params'])),
 grid_search_2.best_params_, grid_search_2.best_score_)



([mean: 0.68437, std: 0.00417, params: {'max_depth': 4, 'min_child_weight': 2},
  mean: 0.68485, std: 0.00405, params: {'max_depth': 4, 'min_child_weight': 3},
  mean: 0.68487, std: 0.00449, params: {'max_depth': 4, 'min_child_weight': 4},
  mean: 0.68491, std: 0.00499, params: {'max_depth': 5, 'min_child_weight': 2},
  mean: 0.68498, std: 0.00447, params: {'max_depth': 5, 'min_child_weight': 3},
  mean: 0.68505, std: 0.00459, params: {'max_depth': 5, 'min_child_weight': 4},
  mean: 0.68284, std: 0.00543, params: {'max_depth': 6, 'min_child_weight': 2},
  mean: 0.68258, std: 0.00367, params: {'max_depth': 6, 'min_child_weight': 3},
  mean: 0.68236, std: 0.00501, params: {'max_depth': 6, 'min_child_weight': 4}],
 {'max_depth': 5, 'min_child_weight': 4},
 0.68505405715131806)

In [185]:
# Search over gamma with depth and child weight fixed.
param_grid = {
    'gamma': np.linspace(0, 0.4, 5)
}
grid_search_3 = GridSearchCV(estimator = XGBClassifier(learning_rate=0.1, n_estimators=136, max_depth=5,
                        min_child_weight=4, gamma=0, subsample=0.8, colsample_bytree=0.8,
                        objective='binary:logistic', n_jobs=-1, scale_pos_weight=1, random_state=random_state), 
                        param_grid=param_grid, scoring='roc_auc', n_jobs=-1, cv=5)
grid_search_3.fit(X_train_trans, y_train_trans)
# `grid_scores_` was deprecated and later removed from scikit-learn;
# `cv_results_` carries the same per-candidate mean/std test scores.
cv_results_3 = grid_search_3.cv_results_
(list(zip(cv_results_3['mean_test_score'], cv_results_3['std_test_score'], cv_results_3['params'])),
 grid_search_3.best_params_, grid_search_3.best_score_)



([mean: 0.68505, std: 0.00459, params: {'gamma': 0.0},
  mean: 0.68511, std: 0.00423, params: {'gamma': 0.10000000000000001},
  mean: 0.68486, std: 0.00462, params: {'gamma': 0.20000000000000001},
  mean: 0.68414, std: 0.00480, params: {'gamma': 0.30000000000000004},
  mean: 0.68411, std: 0.00461, params: {'gamma': 0.40000000000000002}],
 {'gamma': 0.10000000000000001},
 0.6851091578613232)

In [187]:
# Search over row and column subsampling ratios.
param_grid = {
    'subsample': np.linspace(0.6, 0.9, 4),
    'colsample_bytree': np.linspace(0.6, 0.9, 4) 
}
grid_search_4 = GridSearchCV(estimator = XGBClassifier(learning_rate=0.1, n_estimators=136, max_depth=5,
                        min_child_weight=4, gamma=0.1, subsample=0.8, colsample_bytree=0.8,
                        objective='binary:logistic', n_jobs=-1, scale_pos_weight=1, random_state=random_state), 
                        param_grid=param_grid, scoring='roc_auc', n_jobs=-1, cv=5)
grid_search_4.fit(X_train_trans, y_train_trans)
# `grid_scores_` was deprecated and later removed from scikit-learn;
# `cv_results_` carries the same per-candidate mean/std test scores.
cv_results_4 = grid_search_4.cv_results_
(list(zip(cv_results_4['mean_test_score'], cv_results_4['std_test_score'], cv_results_4['params'])),
 grid_search_4.best_params_, grid_search_4.best_score_)



([mean: 0.68192, std: 0.00481, params: {'colsample_bytree': 0.59999999999999998, 'subsample': 0.59999999999999998},
  mean: 0.68380, std: 0.00417, params: {'colsample_bytree': 0.59999999999999998, 'subsample': 0.69999999999999996},
  mean: 0.68528, std: 0.00484, params: {'colsample_bytree': 0.59999999999999998, 'subsample': 0.80000000000000004},
  mean: 0.68490, std: 0.00375, params: {'colsample_bytree': 0.59999999999999998, 'subsample': 0.90000000000000002},
  mean: 0.68400, std: 0.00357, params: {'colsample_bytree': 0.69999999999999996, 'subsample': 0.59999999999999998},
  mean: 0.68408, std: 0.00559, params: {'colsample_bytree': 0.69999999999999996, 'subsample': 0.69999999999999996},
  mean: 0.68407, std: 0.00425, params: {'colsample_bytree': 0.69999999999999996, 'subsample': 0.80000000000000004},
  mean: 0.68563, std: 0.00348, params: {'colsample_bytree': 0.69999999999999996, 'subsample': 0.90000000000000002},
  mean: 0.68369, std: 0.00469, params: {'colsample_bytree': 0.8000000000

In [188]:
# Coarse, log-spaced search over the L1 regularisation strength.
param_grid = {
    'reg_alpha': np.logspace(-5, 2, 8),
}
grid_search_5 = GridSearchCV(estimator = XGBClassifier(learning_rate=0.1, n_estimators=136, max_depth=5,
                        min_child_weight=4, gamma=0.1, subsample=0.9, colsample_bytree=0.9,
                        objective='binary:logistic', n_jobs=-1, scale_pos_weight=1, random_state=random_state), 
                        param_grid=param_grid, scoring='roc_auc', n_jobs=-1, cv=5)
grid_search_5.fit(X_train_trans, y_train_trans)
# `grid_scores_` was deprecated and later removed from scikit-learn;
# `cv_results_` carries the same per-candidate mean/std test scores.
cv_results_5 = grid_search_5.cv_results_
(list(zip(cv_results_5['mean_test_score'], cv_results_5['std_test_score'], cv_results_5['params'])),
 grid_search_5.best_params_, grid_search_5.best_score_)



([mean: 0.68606, std: 0.00354, params: {'reg_alpha': 1.0000000000000001e-05},
  mean: 0.68606, std: 0.00354, params: {'reg_alpha': 0.0001},
  mean: 0.68574, std: 0.00344, params: {'reg_alpha': 0.001},
  mean: 0.68584, std: 0.00345, params: {'reg_alpha': 0.01},
  mean: 0.68545, std: 0.00423, params: {'reg_alpha': 0.10000000000000001},
  mean: 0.68533, std: 0.00490, params: {'reg_alpha': 1.0},
  mean: 0.68594, std: 0.00564, params: {'reg_alpha': 10.0},
  mean: 0.67701, std: 0.00432, params: {'reg_alpha': 100.0}],
 {'reg_alpha': 1.0000000000000001e-05},
 0.68605948791591487)

In [189]:
# Finer, linearly spaced search over reg_alpha between 0.5 and 1.5.
param_grid = {
    'reg_alpha': np.linspace(0.5, 1.5, 5),
}
grid_search_6 = GridSearchCV(estimator = XGBClassifier(learning_rate=0.1, n_estimators=136, max_depth=5,
                        min_child_weight=4, gamma=0.1, subsample=0.9, colsample_bytree=0.9,
                        objective='binary:logistic', n_jobs=-1, scale_pos_weight=1, random_state=random_state), 
                        param_grid=param_grid, scoring='roc_auc', n_jobs=-1, cv=5)
grid_search_6.fit(X_train_trans, y_train_trans)
# `grid_scores_` was deprecated and later removed from scikit-learn;
# `cv_results_` carries the same per-candidate mean/std test scores.
cv_results_6 = grid_search_6.cv_results_
(list(zip(cv_results_6['mean_test_score'], cv_results_6['std_test_score'], cv_results_6['params'])),
 grid_search_6.best_params_, grid_search_6.best_score_)



([mean: 0.68520, std: 0.00371, params: {'reg_alpha': 0.5},
  mean: 0.68578, std: 0.00416, params: {'reg_alpha': 0.75},
  mean: 0.68533, std: 0.00490, params: {'reg_alpha': 1.0},
  mean: 0.68612, std: 0.00411, params: {'reg_alpha': 1.25},
  mean: 0.68560, std: 0.00468, params: {'reg_alpha': 1.5}],
 {'reg_alpha': 1.25},
 0.68612221096517501)

In [197]:
# Configuration using the values selected by the grid searches; n_estimators is
# again an upper bound that xbg_fit replaces via cross-validation.
xgb_params_2 = dict(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=4,
    gamma=0.1,
    subsample=0.9,
    colsample_bytree=0.9,
    objective='binary:logistic',
    scale_pos_weight=1,
    reg_alpha=1.25,
    random_state=random_state,
    n_jobs=-1,
)
xgb_model_2 = XGBClassifier(**xgb_params_2)
xbg_fit(xgb_model_2, X_train_trans, y_train_trans, verbose=False)

# estimators: 162
Last iteration scores: 
test-auc-mean     0.686932
test-auc-std      0.005408
train-auc-mean    0.762183
train-auc-std     0.001474
Name: 161, dtype: float64


### Predict on the test dataset with the best model

In [39]:
# Row and column counts of the raw (untransformed) test frame.
test_df.shape

(48744, 121)

In [40]:
# Print the full per-column summary of the test frame.
test_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48744 entries, 0 to 48743
Data columns (total 121 columns):
sk_id_curr                      int64
name_contract_type              object
code_gender                     object
flag_own_car                    object
flag_own_realty                 object
cnt_children                    int64
amt_income_total                float64
amt_credit                      float64
amt_annuity                     float64
amt_goods_price                 float64
name_type_suite                 object
name_income_type                object
name_education_type             object
name_family_status              object
name_housing_type               object
region_population_relative      float64
days_birth                      int64
days_employed                   int64
days_registration               float64
days_id_publish                 int64
own_car_age                     float64
flag_mobil                      int64
flag_emp_phone                 

In [198]:
# Score the pipeline-transformed test matrix. The original referenced `X_test`,
# which no cell defines; `X_test_trans` is the output of `pipeline.transform(test_df)`.
y_pred = xgb_model_2.predict_proba(X_test_trans)
y_pred.shape

(48744, 2)

In [199]:
# Column index 1 of the predicted probabilities for the first ten test rows.
y_pred[:10, 1]

array([ 0.55576771,  0.4859249 ,  0.31613174,  0.36880374,  0.62004709,
        0.28780553,  0.43782464,  0.50248498,  0.35639647,  0.19418734], dtype=float32)

### Save predictions to file

In [200]:
# Destination path of the submission CSV (relative to the notebook's working directory).
submission_file = './submissions/submission.csv'

In [201]:
# Pair each test-set id with the value from column 1 of `y_pred`.
positive_scores = y_pred[:, 1]
result_df = DataFrame({'SK_ID_CURR': test_df['sk_id_curr'], 'TARGET': positive_scores})
result_df.head()

Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.555768
1,100005,0.485925
2,100013,0.316132
3,100028,0.368804
4,100038,0.620047


In [202]:
# Write the submission without the index column, then show the first lines of the file.
# NOTE(review): assumes the ./submissions directory already exists — confirm.
result_df.to_csv(submission_file, index=False)
!head {submission_file}

SK_ID_CURR,TARGET
100001,0.5557677149772644
100005,0.4859248995780945
100013,0.31613174080848694
100028,0.3688037395477295
100038,0.6200470924377441
100042,0.2878055274486542
100057,0.43782463669776917
100065,0.5024849772453308
100066,0.35639646649360657


### Submit to Kaggle

In [203]:
# Upload the submission file with the Kaggle CLI; the message passed to -m is a single space.
!kaggle competitions submit -c home-credit-default-risk -f {submission_file} -m " "

Successfully submitted to Home Credit Default Risk

In [204]:
# List submissions for the competition with the Kaggle CLI.
!kaggle competitions submissions -c home-credit-default-risk

fileName        date                 description  status    publicScore  privateScore  
--------------  -------------------  -----------  --------  -----------  ------------  
submission.csv  2018-06-03 16:26:59               complete  0.690        None          
submission.csv  2018-06-03 14:27:03               complete  0.684        None          
submission.csv  2018-06-03 12:31:14               complete  0.638        None          
submission.csv  2018-06-02 22:57:59               complete  0.650        None          
submission.csv  2018-06-02 10:05:31               complete  0.690        None          
submission.csv  2018-06-01 14:10:18               complete  0.673        None          
submission.csv  2018-06-01 13:44:23               complete  0.648        None          
submission.csv  2018-05-31 12:17:41               complete  0.663        None          
submission.csv  2018-05-31 09:39:38  Baseline     complete  0.639        None          
submission.csv  2018-