In [3]:
import pandas as pd
import numpy as np
from scipy.stats import skew
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from math import sqrt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import gc

In [4]:
NFOLDS=3
SEED=0
NROWS=None

In [39]:
data=pd.read_csv('./input/application_train.csv')
test=pd.read_csv('./input/application_test.csv')
prev=pd.read_csv('./input/previous_application.csv')

In [40]:
categorical_feats=[f for f in data.columns if data[f].dtype=='object']
for f_ in categorical_feats:
    data[f_],indexer=pd.factorize(data[f_])
    test[f_]=indexer.get_indexer(test[f_])
gc.enable()

In [41]:
y_train=data['TARGET']
del data['TARGET']

In [42]:
prev_cat_features = [f_ for f_ in prev.columns if prev[f_].dtype == 'object']
for f_ in prev_cat_features:
    prev[f_], _ = pd.factorize(prev[f_])
    
avg_prev = prev.groupby('SK_ID_CURR').mean()
cnt_prev = prev[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()
avg_prev['nb_app'] = cnt_prev['SK_ID_PREV']
del avg_prev['SK_ID_PREV']

In [43]:
x_train = data.merge(right=avg_prev.reset_index(), how='left', on='SK_ID_CURR')
x_test = test.merge(right=avg_prev.reset_index(), how='left', on='SK_ID_CURR')

x_train = x_train.fillna(0)
x_test= x_test.fillna(0)

ntrain = x_train.shape[0]
ntest = x_test.shape[0]

excluded_feats = ['SK_ID_CURR']
features = [f_ for f_ in x_train.columns if f_ not in excluded_feats]

x_train = x_train[features]
x_test = x_test[features]

kf = KFold(n_splits = NFOLDS, shuffle=True, random_state=SEED)

In [44]:
class SklearnWrapper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict_proba(x)[:,1]

class CatboostWrapper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_seed'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict_proba(x)[:,1]
        
class LightGBMWrapper(object):
    def __init__(self, clf, seed=0, params=None):
        params['feature_fraction_seed'] = seed
        params['bagging_seed'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict_proba(x)[:,1]


class XgbWrapper(object):
    def __init__(self, seed=0, params=None):
        self.param = params
        self.param['seed'] = seed
        self.nrounds = params.pop('nrounds', 250)

    def train(self, x_train, y_train):
        dtrain = xgb.DMatrix(x_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        return self.gbdt.predict(xgb.DMatrix(x))


def get_oof(clf):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        x_tr = x_train.loc[train_index]
        y_tr = y_train.loc[train_index]
        x_te = x_train.loc[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [45]:
et_params = {
    'n_jobs': 16,
    'n_estimators': 200,
    'max_features': 0.5,
    'max_depth': 12,
    'min_samples_leaf': 2,
}

rf_params = {
    'n_jobs': 16,
    'n_estimators': 200,
    'max_features': 0.2,
    'max_depth': 12,
    'min_samples_leaf': 2,
}

xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.7,
    'silent': 1,
    'subsample': 0.7,
    'learning_rate': 0.075,
    'objective': 'binary:logistic',
    'max_depth': 4,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'nrounds': 200
}

catboost_params = {
    'iterations': 200,
    'learning_rate': 0.5,
    'depth': 3,
    'l2_leaf_reg': 40,
    'bootstrap_type': 'Bernoulli',
    'subsample': 0.7,
    'scale_pos_weight': 5,
    'eval_metric': 'AUC',
    'od_type': 'Iter',
    'allow_writing_files': False
}

lightgbm_params = {
    'n_estimators':200,
    'learning_rate':0.1,
    'num_leaves':123,
    'colsample_bytree':0.8,
    'subsample':0.9,
    'max_depth':15,
    'reg_alpha':0.1,
    'reg_lambda':0.1,
    'min_split_gain':0.01,
    'min_child_weight':2    
}

In [46]:
xg = XgbWrapper(seed=SEED, params=xgb_params)
et = SklearnWrapper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
rf = SklearnWrapper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
cb = CatboostWrapper(clf= CatBoostClassifier, seed = SEED, params=catboost_params)
lg = LightGBMWrapper(clf = LGBMClassifier, seed = SEED, params = lightgbm_params)

xg_oof_train, xg_oof_test = get_oof(xg)
et_oof_train, et_oof_test = get_oof(et)
rf_oof_train, rf_oof_test = get_oof(rf)
cb_oof_train, cb_oof_test = get_oof(cb)

print("XG-CV: {}".format(sqrt(mean_squared_error(y_train, xg_oof_train))))
print("ET-CV: {}".format(sqrt(mean_squared_error(y_train, et_oof_train))))
print("RF-CV: {}".format(sqrt(mean_squared_error(y_train, rf_oof_train))))
print("RF-CV: {}".format(sqrt(mean_squared_error(y_train, cb_oof_train))))

Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


0:	total: 34.7ms	remaining: 6.9s
1:	total: 68.2ms	remaining: 

6:	total: 233ms	remaining: 6.42s
7:	total: 269ms	remaining: 6.47s
8:	total: 303ms	remaining: 6.43s
9:	total: 340ms	remaining: 6.46s
10:	total: 371ms	remaining: 6.37s
11:	total: 404ms	remaining: 6.32s
12:	total: 439ms	remaining: 6.31s
13:	total: 476ms	remaining: 6.32s
14:	total: 506ms	remaining: 6.24s
15:	total: 539ms	remaining: 6.19s
16:	total: 572ms	remaining: 6.16s
17:	total: 605ms	remaining: 6.12s
18:	total: 640ms	remaining: 6.09s
19:	total: 673ms	remaining: 6.06s
20:	total: 709ms	remaining: 6.04s
21:	total: 746ms	remaining: 6.03s
22:	total: 786ms	remaining: 6.05s
23:	total: 820ms	remaining: 6.01s
24:	total: 856ms	remaining: 5.99s
25:	total: 895ms	remaining: 5.99s
26:	total: 938ms	remaining: 6.01s
27:	total: 979ms	remaining: 6.01s
28:	total: 1.02s	remaining: 6s
29:	total: 1.05s	remaining: 5.98s
30:	total: 1.09s	remaining: 5.96s
31:	total: 1.13s	remaining: 5.95s
32:	total: 1.18s	remaining: 5.95s
33:	total: 1.21s	remaining: 5.91s
34:	total: 1.25s	remaining: 5.87s
35:	total: 1.28s	rema

50:	total: 1.82s	remaining: 5.31s
51:	total: 1.84s	remaining: 5.25s
52:	total: 1.88s	remaining: 5.22s
53:	total: 1.92s	remaining: 5.19s
54:	total: 1.96s	remaining: 5.17s
55:	total: 1.99s	remaining: 5.13s
56:	total: 2.03s	remaining: 5.1s
57:	total: 2.07s	remaining: 5.07s
58:	total: 2.11s	remaining: 5.04s
59:	total: 2.15s	remaining: 5.02s
60:	total: 2.19s	remaining: 4.99s
61:	total: 2.23s	remaining: 4.96s
62:	total: 2.27s	remaining: 4.93s
63:	total: 2.3s	remaining: 4.9s
64:	total: 2.34s	remaining: 4.86s
65:	total: 2.38s	remaining: 4.83s
66:	total: 2.42s	remaining: 4.8s
67:	total: 2.46s	remaining: 4.77s
68:	total: 2.49s	remaining: 4.74s
69:	total: 2.53s	remaining: 4.7s
70:	total: 2.57s	remaining: 4.67s
71:	total: 2.61s	remaining: 4.64s
72:	total: 2.65s	remaining: 4.6s
73:	total: 2.68s	remaining: 4.56s
74:	total: 2.72s	remaining: 4.53s
75:	total: 2.75s	remaining: 4.49s
76:	total: 2.79s	remaining: 4.46s
77:	total: 2.83s	remaining: 4.42s
78:	total: 2.86s	remaining: 4.38s
79:	total: 2.9s	rema

In [47]:
x_train = np.concatenate((xg_oof_train, et_oof_train, rf_oof_train, cb_oof_train), axis=1)
x_test = np.concatenate((xg_oof_test, et_oof_test, rf_oof_test, cb_oof_test), axis=1)

print("{},{}".format(x_train.shape, x_test.shape))

(307511, 4),(48744, 4)


In [48]:
logistic_regression = LogisticRegression()
logistic_regression.fit(x_train,y_train)

test['TARGET'] = logistic_regression.predict_proba(x_test)[:,1]
test

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,TARGET
0,100001,0,1,0,0,0,135000.0,568800.0,20560.5,450000.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.039195
1,100005,0,0,0,0,0,99000.0,222768.0,17370.0,180000.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0,0.111690
2,100013,0,0,1,0,0,202500.0,663264.0,69777.0,630000.0,...,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0,0.026765
3,100028,0,1,0,0,2,315000.0,1575000.0,49018.5,1575000.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0,0.038718
4,100038,0,0,1,1,1,180000.0,625500.0,32067.0,625500.0,...,0,0,0,,,,,,,0.160123
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48739,456221,0,1,0,0,0,121500.0,412560.0,17473.5,270000.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.040224
48740,456222,0,1,0,1,2,157500.0,622413.0,31909.5,495000.0,...,0,0,0,,,,,,,0.083475
48741,456223,0,1,1,0,1,202500.0,315000.0,33205.5,315000.0,...,0,0,0,0.0,0.0,0.0,0.0,3.0,1.0,0.030538
48742,456224,0,0,0,1,0,225000.0,450000.0,25128.0,450000.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0,0.062575


In [49]:
test[['SK_ID_CURR','TARGET']].to_csv('first_submission.csv',index=False,float_format='%.8f')