In [1]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
import xgboost as xgb
from sklearn.cross_validation import KFold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
import gc
import os
SEED = 7    # CR7 always..
subm = pd.read_csv("../submissions/sample_submission.csv")



In [2]:
#----------
input_dir = os.path.join(os.pardir, 'input')
#print('Input files:\n{}'.format(os.listdir(input_dir)))
print('Loading data sets...')

sample_size = None

Loading data sets...


In [3]:
# import data
app_train_df = pd.read_csv("../data/raw/application_train.csv")
app_test_df  = pd.read_csv("../data/raw/application_test.csv")
bureau_df  = pd.read_csv("../data/raw/bureau.csv")
bureau_balance_df  = pd.read_csv("../data/raw/bureau_balance.csv")
prev_app_df  = pd.read_csv("../data/raw/previous_application.csv")
credit_card_df  = pd.read_csv("../data/raw/credit_card_balance.csv")
pos_cash_df  = pd.read_csv("../data/raw/POS_CASH_balance.csv")
install_df  = pd.read_csv("../data/raw/installments_payments.csv")

print('Data loaded.\nMain application training data set shape = {}'.format(app_train_df.shape))
print('Main application test data set shape = {}'.format(app_test_df.shape))
print('Positive target proportion = {:.2f}'.format(app_train_df['TARGET'].mean()))

Data loaded.
Main application training data set shape = (307511, 122)
Main application test data set shape = (48744, 121)
Positive target proportion = 0.08


In [4]:
def agg_and_merge(left_df, right_df, agg_method, right_suffix):
    """ Aggregate a df by 'SK_ID_CURR' and merge it onto another.
    This method allows feature name """
    
    agg_df = right_df.groupby('SK_ID_CURR').agg(agg_method)
    merged_df = left_df.merge(agg_df, left_on='SK_ID_CURR', right_index=True, how='left',
                              suffixes=['', '_' + right_suffix + agg_method.upper()])
    return merged_df

In [5]:
def feature_engineering(app_data, bureau_df, bureau_balance_df, credit_card_df,
                        pos_cash_df, prev_app_df, install_df):
    """ 
    Process the input dataframes into a single one containing all the features. Requires
    a lot of aggregating of the supplementary datasets such that they have an entry per
    customer.
    
    Also, add any new features created from the existing ones
    """
    
    # # Add new features
    
    ###################
        
    # income ratios
    app_data["CREDIT_BY_INCOME"]      = app_data["AMT_CREDIT"]      / app_data["AMT_INCOME_TOTAL"]
    app_data["ANNUITY_BY_INCOME"]     = app_data["AMT_ANNUITY"]     / app_data["AMT_INCOME_TOTAL"]
    app_data["GOODS_PRICE_BY_INCOME"] = app_data["AMT_GOODS_PRICE"] / app_data["AMT_INCOME_TOTAL"]
    app_data["INCOME_PER_PERSON"]     = app_data["AMT_INCOME_TOTAL"] / app_data["CNT_FAM_MEMBERS"]
    
    # career ratio
    app_data["PERCENT_WORKED"] = app_data["DAYS_EMPLOYED"] / app_data["DAYS_BIRTH"]
    app_data["PERCENT_WORKED"][app_data["PERCENT_WORKED"] < 0] = None
    
    # number of adults
    app_data["CNT_ADULTS"] = app_data["CNT_FAM_MEMBERS"] - app_data["CNT_CHILDREN"]
    app_data['CHILDREN_RATIO'] = app_data['CNT_CHILDREN'] / app_data['CNT_FAM_MEMBERS']
    
    # number of overall payments
    app_data['ANNUITY LENGTH'] = app_data['AMT_CREDIT'] / app_data['AMT_ANNUITY']
    
    # external sources
    app_data["EXT_SOURCE_MEAN"] = app_data[["EXT_SOURCE_1", "EXT_SOURCE_1", "EXT_SOURCE_3"]].mean(axis = 1)
    app_data["EXT_SOURCE_SD"]   = app_data[["EXT_SOURCE_1", "EXT_SOURCE_1", "EXT_SOURCE_3"]].std(axis = 1)
    app_data["NUM_EXT_SOURCES"] = 3 - (app_data["EXT_SOURCE_1"].isnull().astype(int) +
                                   app_data["EXT_SOURCE_2"].isnull().astype(int) +
                                   app_data["EXT_SOURCE_3"].isnull().astype(int))
    
    # number of documents
    doc_vars = ["FLAG_DOCUMENT_2",  "FLAG_DOCUMENT_3",  "FLAG_DOCUMENT_4",  "FLAG_DOCUMENT_5",  "FLAG_DOCUMENT_6",
                "FLAG_DOCUMENT_7",  "FLAG_DOCUMENT_8",  "FLAG_DOCUMENT_9",  "FLAG_DOCUMENT_10", "FLAG_DOCUMENT_11",
                "FLAG_DOCUMENT_12", "FLAG_DOCUMENT_13", "FLAG_DOCUMENT_14", "FLAG_DOCUMENT_15", "FLAG_DOCUMENT_16",
                "FLAG_DOCUMENT_17", "FLAG_DOCUMENT_18", "FLAG_DOCUMENT_19", "FLAG_DOCUMENT_20", "FLAG_DOCUMENT_21"]
    app_data["NUM_DOCUMENTS"] = app_data[doc_vars].sum(axis = 1)
    
    # age ratios
    app_data["OWN_CAR_AGE_RATIO"] = app_data["OWN_CAR_AGE"] / app_data["DAYS_BIRTH"]
    app_data["DAYS_ID_PUBLISHED_RATIO"] = app_data["DAYS_ID_PUBLISH"] / app_data["DAYS_BIRTH"]
    app_data["DAYS_REGISTRATION_RATIO"] = app_data["DAYS_REGISTRATION"] / app_data["DAYS_BIRTH"]
    app_data["DAYS_LAST_PHONE_CHANGE_RATIO"] = app_data["DAYS_LAST_PHONE_CHANGE"] / app_data["DAYS_BIRTH"]
    
    # amount ratios
    bureau_df["AMT_SUM_OVERDUE_RATIO_1"] = bureau_df["AMT_CREDIT_SUM_OVERDUE"] / bureau_df["AMT_ANNUITY"]
    bureau_df["AMT_SUM_OVERDUE_RATIO_2"] = bureau_df["AMT_CREDIT_SUM_OVERDUE"] / bureau_df["AMT_CREDIT_SUM"]
    bureau_df["AMT_MAX_OVERDUE_RATIO_1"] = bureau_df["AMT_CREDIT_MAX_OVERDUE"] / bureau_df["AMT_ANNUITY"]
    bureau_df["AMT_MAX_OVERDUE_RATIO_2"] = bureau_df["AMT_CREDIT_MAX_OVERDUE"] / bureau_df["AMT_CREDIT_SUM"]
    bureau_df["AMT_SUM_DEBT_RATIO_1"]    = bureau_df["AMT_CREDIT_SUM_DEBT"] / bureau_df["AMT_CREDIT_SUM"]
    bureau_df["AMT_SUM_DEBT_RATIO_2"]    = bureau_df["AMT_CREDIT_SUM_DEBT"] / bureau_df["AMT_CREDIT_SUM_LIMIT"]
    
    # day differences
    bureau_df["DAYS_END_DIFF_1"] = bureau_df["DAYS_ENDDATE_FACT"]   - bureau_df["DAYS_CREDIT_ENDDATE"]
    bureau_df["DAYS_END_DIFF_2"] = bureau_df["DAYS_CREDIT_UPDATE"]  - bureau_df["DAYS_CREDIT_ENDDATE"]
    bureau_df["DAYS_DURATION_1"] = bureau_df["DAYS_CREDIT_ENDDATE"] - bureau_df["DAYS_CREDIT"]
    bureau_df["DAYS_DURATION_2"] = bureau_df["DAYS_ENDDATE_FACT"]   - bureau_df["DAYS_CREDIT"]
    
    # days past due and days before due (no negative values)
    install_df['DPD'] = install_df['DAYS_ENTRY_PAYMENT'] - install_df['DAYS_INSTALMENT']
    install_df['DBD'] = install_df['DAYS_INSTALMENT'] - install_df['DAYS_ENTRY_PAYMENT']
    install_df['DPD'] = install_df['DPD'].apply(lambda x: x if x > 0 else 0)
    install_df['DBD'] = install_df['DBD'].apply(lambda x: x if x > 0 else 0)
    
    # percentage and difference paid in each installment 
    install_df['PAYMENT_PERC'] = install_df['AMT_PAYMENT'] / install_df['AMT_INSTALMENT']
    install_df['PAYMENT_DIFF'] = install_df['AMT_INSTALMENT'] - install_df['AMT_PAYMENT']
    
    # installments percentage
    pos_cash_df["INSTALLMENTS_PERCENT"] = pos_cash_df["CNT_INSTALMENT_FUTURE"] / pos_cash_df["CNT_INSTALMENT"]
    
    # amount ratios
    prev_app_df["AMT_GIVEN_RATIO_1"]  = prev_app_df["AMT_CREDIT"] / prev_app_df["AMT_APPLICATION"]
    prev_app_df["AMT_GIVEN_RATIO_2"]  = prev_app_df["AMT_GOODS_PRICE"] / prev_app_df["AMT_APPLICATION"]
    prev_app_df["DOWN_PAYMENT_RATIO"] = prev_app_df["AMT_DOWN_PAYMENT"] / prev_app_df["AMT_APPLICATION"]
    
    # day differences
    prev_app_df["DAYS_DUE_DIFF_1"] = prev_app_df["DAYS_LAST_DUE_1ST_VERSION"] - prev_app_df["DAYS_FIRST_DUE"]
    prev_app_df["DAYS_DUE_DIFF_2"] = prev_app_df["DAYS_LAST_DUE"] - prev_app_df["DAYS_FIRST_DUE"]
    prev_app_df["DAYS_TERMINATION_DIFF_1"] = prev_app_df["DAYS_TERMINATION"] - prev_app_df["DAYS_FIRST_DRAWING"]
    prev_app_df["DAYS_TERMINATION_DIFF_2"] = prev_app_df["DAYS_TERMINATION"] - prev_app_df["DAYS_FIRST_DUE"]
    prev_app_df["DAYS_TERMINATION_DIFF_3"] = prev_app_df["DAYS_TERMINATION"] - prev_app_df["DAYS_LAST_DUE"]
    
    #######################
    
    # A lot of the continuous days variables have integers as missing value indicators.
    prev_app_df['DAYS_LAST_DUE'].replace(365243, np.nan, inplace=True)
    prev_app_df['DAYS_TERMINATION'].replace(365243, np.nan, inplace=True)
    prev_app_df['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace=True)
    prev_app_df['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace=True)
    prev_app_df['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan, inplace=True)
    
    # # Aggregate and merge supplementary datasets

    # Previous applications
    print('Combined train & test input shape before any merging  = {}'.format(app_data.shape))
    agg_funs = {'SK_ID_CURR': 'count', 'AMT_CREDIT': 'sum'}
    prev_apps = prev_app_df.groupby('SK_ID_CURR').agg(agg_funs)
    prev_apps.columns = ['PREV APP COUNT', 'TOTAL PREV LOAN AMT']
    merged_df = app_data.merge(prev_apps, left_on='SK_ID_CURR', right_index=True, how='left')

    # Average the rest of the previous app data
    for agg_method in ['mean', 'max', 'min']:
        merged_df = agg_and_merge(merged_df, prev_app_df, agg_method, 'PRV')
    print('Shape after merging with previous apps num data = {}'.format(merged_df.shape))
    
    # Previous app categorical features
    prev_app_df, cat_feats, _ = process_dataframe(prev_app_df)
    prev_apps_cat_avg = prev_app_df[cat_feats + ['SK_ID_CURR']].groupby('SK_ID_CURR')\
                             .agg({k: lambda x: str(x.mode().iloc[0]) for k in cat_feats})
    merged_df = merged_df.merge(prev_apps_cat_avg, left_on='SK_ID_CURR', right_index=True,
                            how='left', suffixes=['', '_BAVG'])
    print('Shape after merging with previous apps cat data = {}'.format(merged_df.shape))

    # Credit card data - numerical features
    wm = lambda x: np.average(x, weights=-1/credit_card_df.loc[x.index, 'MONTHS_BALANCE'])
    credit_card_avgs = credit_card_df.groupby('SK_ID_CURR').agg(wm)   
    merged_df = merged_df.merge(credit_card_avgs, left_on='SK_ID_CURR', right_index=True,
                                how='left', suffixes=['', '_CC_WAVG'])
    for agg_method in ['mean', 'max', 'min']:
        merged_df = agg_and_merge(merged_df, credit_card_avgs, agg_method, 'CC')
    print('Shape after merging with previous apps num data = {}'.format(merged_df.shape))
    
    # Credit card data - categorical features
    most_recent_index = credit_card_df.groupby('SK_ID_CURR')['MONTHS_BALANCE'].idxmax()
    cat_feats = credit_card_df.columns[credit_card_df.dtypes == 'object'].tolist()  + ['SK_ID_CURR']
    merged_df = merged_df.merge(credit_card_df.loc[most_recent_index, cat_feats], left_on='SK_ID_CURR', right_on='SK_ID_CURR',
                       how='left', suffixes=['', '_CCAVG'])
    print('Shape after merging with credit card data = {}'.format(merged_df.shape))

    # Credit bureau data - numerical features
    for agg_method in ['mean', 'max', 'min']:
        merged_df = agg_and_merge(merged_df, bureau_df, agg_method, 'B')
    print('Shape after merging with credit bureau data = {}'.format(merged_df.shape))
    
    # Bureau balance data
    most_recent_index = bureau_balance_df.groupby('SK_ID_BUREAU')['MONTHS_BALANCE'].idxmax()
    bureau_balance_df = bureau_balance_df.loc[most_recent_index, :]
    merged_df = merged_df.merge(bureau_balance_df, left_on='SK_ID_BUREAU', right_on='SK_ID_BUREAU',
                            how='left', suffixes=['', '_B_B'])
    print('Shape after merging with bureau balance data = {}'.format(merged_df.shape))

    # Pos cash data - weight values by recency when averaging
    wm = lambda x: np.average(x, weights=-1/pos_cash_df.loc[x.index, 'MONTHS_BALANCE'])
    f = {'CNT_INSTALMENT': wm, 'CNT_INSTALMENT_FUTURE': wm, 'SK_DPD': wm, 'SK_DPD_DEF':wm}
    cash_avg = pos_cash_df.groupby('SK_ID_CURR')['CNT_INSTALMENT','CNT_INSTALMENT_FUTURE',
                                                 'SK_DPD', 'SK_DPD_DEF'].agg(f)
    merged_df = merged_df.merge(cash_avg, left_on='SK_ID_CURR', right_index=True,
                                how='left', suffixes=['', '_CAVG'])
                                
    # Unweighted aggregations of numeric features
    for agg_method in ['mean', 'max', 'min']:
        merged_df = agg_and_merge(merged_df, pos_cash_df, agg_method, 'PC')
    
    # Pos cash data data - categorical features
    most_recent_index = pos_cash_df.groupby('SK_ID_CURR')['MONTHS_BALANCE'].idxmax()
    cat_feats = pos_cash_df.columns[pos_cash_df.dtypes == 'object'].tolist()  + ['SK_ID_CURR']
    merged_df = merged_df.merge(pos_cash_df.loc[most_recent_index, cat_feats], left_on='SK_ID_CURR', right_on='SK_ID_CURR',
                       how='left', suffixes=['', '_CAVG'])
    print('Shape after merging with pos cash data = {}'.format(merged_df.shape))

    # Installments data
    for agg_method in ['mean', 'max', 'min']:
        merged_df = agg_and_merge(merged_df, install_df, agg_method, 'I')    
    print('Shape after merging with installments data = {}'.format(merged_df.shape))
    
    # Add more value counts
    merged_df = merged_df.merge(pd.DataFrame(bureau_df['SK_ID_CURR'].value_counts()), left_on='SK_ID_CURR', 
                                right_index=True, how='left', suffixes=['', '_CNT_BUREAU'])
    merged_df = merged_df.merge(pd.DataFrame(credit_card_df['SK_ID_CURR'].value_counts()), left_on='SK_ID_CURR', 
                                right_index=True, how='left', suffixes=['', '_CNT_CRED_CARD'])
    merged_df = merged_df.merge(pd.DataFrame(pos_cash_df['SK_ID_CURR'].value_counts()), left_on='SK_ID_CURR', 
                                right_index=True, how='left', suffixes=['', '_CNT_POS_CASH'])
    merged_df = merged_df.merge(pd.DataFrame(install_df['SK_ID_CURR'].value_counts()), left_on='SK_ID_CURR', 
                                right_index=True, how='left', suffixes=['', '_CNT_INSTALL'])
    print('Shape after merging with counts data = {}'.format(merged_df.shape))

    return merged_df

In [14]:
def process_dataframe(input_df, encoder_dict=None):
    """ Process a dataframe into a form useable by LightGBM """

    # Label encode categoricals
    print('Label encoding categorical features...')
    categorical_feats = input_df.columns[input_df.dtypes == 'object']
    for feat in categorical_feats:
        encoder = LabelEncoder()
        input_df[feat] = encoder.fit_transform(input_df[feat].fillna('NULL').astype(str))
    print('Label encoding complete.')

    return input_df, categorical_feats.tolist(), encoder_dict

In [7]:
# Merge the datasets into a single one for training
len_train = len(app_train_df)
app_both = pd.concat([app_train_df, app_test_df])
print('before feat engg')
merged_df = feature_engineering(app_both, bureau_df, bureau_balance_df, credit_card_df,
                                pos_cash_df, prev_app_df, install_df)
print('after feat engg')
merged_df.to_csv('processed_input_data.csv', index=False)

before feat engg
Shape after merging with previous apps num data = (356255, 252)
Label encoding categorical features...
Label encoding complete.
Shape after merging with previous apps cat data = (356255, 268)
Shape after merging with previous apps num data = (356255, 352)
Shape after merging with credit card data = (356255, 353)
Shape after merging with credit bureau data = (356255, 428)
Shape after merging with bureau balance data = (356255, 430)
Shape after merging with pos cash data = (356255, 458)
Shape after merging with installments data = (356255, 491)
Shape after merging with counts data = (356255, 495)
after feat engg
Label encoding categorical features...


TypeError: '<' not supported between instances of 'str' and 'float'

In [16]:
# Separate metadata
meta_cols = ['SK_ID_CURR']
meta_df = merged_df[meta_cols]
merged_df.drop(columns=meta_cols, inplace=True)

# Process the data set.
merged_df, categorical_feats, encoder_dict = process_dataframe(input_df=merged_df)

# Capture other categorical features not as object data types:
non_obj_categoricals = [
    'FONDKAPREMONT_MODE', 'HOUR_APPR_PROCESS_START', 'HOUSETYPE_MODE',
    'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE',
    'NAME_INCOME_TYPE', 'NAME_TYPE_SUITE', 'OCCUPATION_TYPE',
    'ORGANIZATION_TYPE', 'STATUS', 'NAME_CONTRACT_STATUS_CAVG',
    'WALLSMATERIAL_MODE', 'WEEKDAY_APPR_PROCESS_START', 'NAME_CONTRACT_TYPE_BAVG',
    'WEEKDAY_APPR_PROCESS_START_BAVG', 'NAME_CASH_LOAN_PURPOSE', 'NAME_CONTRACT_STATUS', 
    'NAME_PAYMENT_TYPE', 'CODE_REJECT_REASON', 'NAME_TYPE_SUITE_BAVG', 
    'NAME_CLIENT_TYPE', 'NAME_GOODS_CATEGORY', 'NAME_PORTFOLIO', 
    'NAME_PRODUCT_TYPE', 'CHANNEL_TYPE', 'NAME_SELLER_INDUSTRY', 
    'NAME_YIELD_GROUP', 'PRODUCT_COMBINATION', 'NAME_CONTRACT_STATUS_CCAVG' 
]
categorical_feats = categorical_feats + non_obj_categoricals

# Re-separate into train and test
train_df = merged_df[:len_train]
test_df = merged_df[len_train:]
del merged_df, app_test_df, bureau_df, bureau_balance_df, credit_card_df, pos_cash_df, prev_app_df
gc.collect()

1085

In [None]:
""" Train the model """
target = train_df.pop('TARGET')
test_df.drop(columns='TARGET', inplace=True)
#---------------
print('data done')
data_train = train_df
data_test = test_df

data_train.fillna(-1, inplace=True)
data_test.fillna(-1, inplace=True)
cols = data_train.columns

ntrain = data_train.shape[0]
ntest = data_test.shape[0]

print(data_train.shape)

In [17]:
from sklearn.cross_validation import KFold
kf = KFold(data_train.shape[0], n_folds=5, shuffle=True, random_state=7)
NFOLDS = 5
x_train = np.array(data_train)
x_test = np.array(data_test)
y_train = np.array(target)

data done
(307511, 493)


In [18]:
# from https://www.kaggle.com/mmueller/stacking-starter?scriptVersionId=390867/code
class SklearnWrapper(object):
    def __init__(self, clf, seed=7, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        print("Training..")
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        print("Predicting..")
        return self.clf.predict_proba(x)


class XgbWrapper(object):
    def __init__(self, seed=0, params=None):
        self.param = params
        self.param['seed'] = seed
        self.nrounds = params.pop('nrounds', 250)

    def train(self, x_train, y_train):
        dtrain = xgb.DMatrix(x_train, label=y_train)
        print("Training..")
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        print("Predicting..")
        return self.gbdt.predict(xgb.DMatrix(x))


def get_oof(clf):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)[:,1]  # or [:,0]
        oof_test_skf[i, :] = clf.predict(x_test)[:,1]  # or [:,0]

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)
    
def get_oof_xgb(clf):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)  # or [:,0]
        oof_test_skf[i, :] = clf.predict(x_test)  # or [:,0]

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [19]:
et_params = {
    'n_jobs': 16,
    'n_estimators': 500,
    'max_features': 0.5,
    'max_depth': 12,
    'min_samples_leaf': 2,
}

rf_params = {
    'n_jobs': 16,
    'n_estimators': 500,
    'max_features': 0.2,
    'max_depth': 8,
    'min_samples_leaf': 2,
}

xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.7,
    'silent': 1,
    'subsample': 0.7,
    'learning_rate': 0.075,
    'objective': 'reg:linear',
    'max_depth': 7,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'eval_metric': 'auc',
    'nrounds': 3000
}

cb_params = {
    'iterations':3000,
    'learning_rate':0.1,
    'depth':6,
    'l2_leaf_reg':40,
    'bootstrap_type':'Bernoulli',
    'subsample':0.7,
    'scale_pos_weight':5,
    'eval_metric':'AUC',
    'metric_period':50,
    'od_type':'Iter',
    'od_wait':45,
    'allow_writing_files':False    
}

In [26]:
#xg = XgbWrapper(seed=SEED, params=xgb_params)
#et = SklearnWrapper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
#rf = SklearnWrapper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
#cb = SklearnWrapper(clf=CatBoostClassifier, seed=SEED, params=cb_params)

#print("xg..")
#xg_oof_train, xg_oof_test = get_oof_xgb(xg)
#print("et..")
#et_oof_train, et_oof_test = get_oof(et)
#print("rf..")
#rf_oof_train, rf_oof_test = get_oof(rf)
print("cb..")
cb_oof_train, cb_oof_test = get_oof(cb)

x_train = np.concatenate((xg_oof_train, cb_oof_train), axis=1)
x_test = np.concatenate((xg_oof_test, cb_oof_test), axis=1)

np.save('x_train', x_train)
np.save('x_test', x_test)
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test)

xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.8,
    'silent': 1,
    'subsample': 0.6,
    'learning_rate': 0.01,
    'objective': 'reg:linear',
    'max_depth': 4,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'eval_metric': 'auc',
}

print("xgb cv..")
res = xgb.cv(xgb_params, dtrain, num_boost_round=500, nfold=4, seed=SEED, stratified=False,
             early_stopping_rounds=25, verbose_eval=10, show_stdv=True)
best_nrounds = res.shape[0] - 1

print("meta xgb train..")
gbdt = xgb.train(xgb_params, dtrain, best_nrounds)
fi = gbdt.predict(dtest)
fi = np.array(fi)
np.save('fi', fi)

cb..
Training..
0:	learn: 0.7128868	total: 729ms	remaining: 36m 25s
50:	learn: 0.7814423	total: 32.5s	remaining: 31m 16s
100:	learn: 0.7956133	total: 1m 2s	remaining: 29m 49s
150:	learn: 0.8049980	total: 1m 30s	remaining: 28m 35s
200:	learn: 0.8124595	total: 1m 59s	remaining: 27m 48s
250:	learn: 0.8192672	total: 2m 29s	remaining: 27m 21s
300:	learn: 0.8255154	total: 3m	remaining: 26m 56s


KeyboardInterrupt: 

In [28]:
xg_oof_test.shape

(48744, 1)

In [29]:
subm['TARGET'] = xg_oof_test
subm.to_csv('xg_oof_test.csv', index=False)