## functions

In [2]:
import pandas as pd
import numpy as np
import os
import gc

import addict
import lightgbm as lgb

def save_dataframe(path, dataframe):
    """Write ``dataframe`` to disk as two numpy files derived from ``path``.

    ``path + ".data"`` receives ``dataframe.values`` and ``path + ".header"``
    receives ``dataframe.columns``; ``load_dataframe`` reads the same pair
    back with the ``.npy`` extension appended.
    """
    parts = (
        (".data", dataframe.values),
        (".header", dataframe.columns),
    )
    for suffix, payload in parts:
        np.save(path + suffix, payload)


def load_dataframe(path):
    """Rebuild a DataFrame from the file pair written by ``save_dataframe``.

    Parameters
    ----------
    path : str
        Same base path that was given to ``save_dataframe``; the files
        ``path + ".data.npy"`` and ``path + ".header.npy"`` are read.

    Returns
    -------
    pandas.DataFrame
        Frame holding the stored values under the stored column labels.
    """
    data = np.load(path + ".data.npy")
    # save_dataframe passes ``dataframe.columns`` straight to np.save; string
    # labels end up as an object array, which numpy stores pickled and
    # np.load rejects unless allow_pickle is enabled.
    # NOTE: unpickling can execute code - only load header files you wrote.
    header = np.load(path + ".header.npy", allow_pickle=True)
    return pd.DataFrame(data=data, columns=header)


def save_dataframe32(path, dataframe, keep=None):
    """Save ``dataframe`` as four numpy files, downcasting most columns to float32.

    Parameters
    ----------
    path : str
        Base path; ``.data64``, ``.header64``, ``.data32`` and ``.header32``
        are appended to it for the four output files.
    dataframe : pandas.DataFrame
        Frame to store. Columns not listed in ``keep`` must be castable to
        ``np.float32``.
    keep : collection of column labels, optional
        Columns stored without the float32 cast. Defaults to none, i.e.
        every column is downcast.
    """
    # None sentinel instead of a shared mutable list as the default value.
    keep = () if keep is None else keep
    col64 = [col_ for col_ in dataframe.columns if col_ in keep]
    col32 = [col_ for col_ in dataframe.columns if col_ not in keep]
    np.save(path + ".data64", dataframe[col64].values)
    np.save(path + ".header64", col64)
    np.save(path + ".data32", dataframe[col32].values.astype(np.float32))
    np.save(path + ".header32", col32)


def load_dataframe32(path, nrows=None):
    """Load the frame stored by ``save_dataframe32`` under ``path``.

    The float32 part (``.data32.npy`` / ``.header32.npy``) is read first and
    the 64-bit part (``.data64.npy`` / ``.header64.npy``) second; each part is
    only read when its data file exists, and the parts are joined column-wise
    in that order. With a positive ``nrows`` only the first ``nrows`` rows of
    the assembled frame are returned (the files are still read completely).
    """
    frame = pd.DataFrame()
    for width in ("32", "64"):
        data_file = path + ".data" + width + ".npy"
        header_file = path + ".header" + width + ".npy"
        if not os.path.exists(data_file):
            continue
        part = pd.DataFrame(data=np.load(data_file), columns=np.load(header_file))
        frame = pd.concat([frame, part], axis=1)
    return frame.head(nrows) if (nrows and nrows > 0) else frame

In [1]:
from sklearn.metrics import roc_auc_score

In [3]:
import xgboost as xgb

class XgbWrapper:
    """Small train/predict wrapper around ``xgb.train``.

    Parameters
    ----------
    seed : int
        Stored under the ``'seed'`` key of the booster parameters.
    params : dict, optional
        Booster parameters. An optional ``'nrounds'`` entry is taken out and
        used as the number of boosting rounds (default 250). The mapping
        passed in is not modified.
    """

    def __init__(self, seed=0, params=None):
        # Work on a private copy: the original code wrote 'seed' into and
        # popped 'nrounds' from the caller's dict, and failed outright when
        # params was left at its None default.
        self.param = dict(params) if params is not None else {}
        self.param['seed'] = seed
        self.nrounds = self.param.pop('nrounds', 250)

    def train(self, x_train, y_train):
        """Fit a booster on ``x_train`` / ``y_train`` and keep it in ``self.gbdt``."""
        dtrain = xgb.DMatrix(x_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        """Return the fitted booster's predictions for ``x``; call ``train`` first."""
        return self.gbdt.predict(xgb.DMatrix(x))

## check data

In [4]:
# Sanity check: walk the 5 folds, load each fold's index and feature files
# and print their shapes. Nothing is trained in this cell.
# nrows = None  
# Optional row cap applied to every load below; None (falsy) loads everything.
nrows = None

# `features` and the three *_stack lists are (re)initialised here but not
# read again inside this cell.
features = list(np.load('./neptune-features/features_246008_1174_0.npy'))
train_app = load_dataframe32("./bindata/application_train")
auc_valid_stack = []
pred_valid_stack = []
pred_test_stack = []

for i in range(5):
    train_idx_fn = "./neptune-features/train_idx_{}.npy".format(i)
    valid_idx_fn = "./neptune-features/valid_idx_{}.npy".format(i)

    # File suffixes for fold i: train = 4*i, valid = 4*i + 1, test = 4*i + 3.
    train_offset, valid_offset, test_offset = i * 4, i * 4 + 1, i * 4 + 3
    
    train_data_fn = "./neptune-features/data_246009_1174_{}".format(train_offset)
    valid_data_fn = "./neptune-features/data_61502_1174_{}".format(valid_offset)
    test_data_fn = "./neptune-features/data_48744_1174_{}".format(test_offset)
    
    # Fold 0's files carry different row counts in their names (246008 / 61503).
    if i == 0:
        train_data_fn = train_data_fn.replace("246009", "246008")
        valid_data_fn = valid_data_fn.replace("61502", "61503")
    
    gc.collect()

    train_idx = np.load(train_idx_fn)
    valid_idx = np.load(valid_idx_fn)
    
    # Keep the index arrays the same length as the truncated feature frames.
    if nrows:
        train_idx, valid_idx = train_idx[:nrows].copy(), valid_idx[:nrows].copy()

    train_data = load_dataframe32(train_data_fn, nrows)
    valid_data = load_dataframe32(valid_data_fn, nrows)
    test_data = load_dataframe32(test_data_fn, nrows)
    gc.collect()
    
    X_train = train_data.values
    X_valid = valid_data.values
    X_test = test_data.values
    # Targets are selected by index label via .loc; assumes train_app keeps a
    # default 0..n-1 index so labels equal row positions - TODO confirm.
    y_train = train_app.loc[train_idx].TARGET
    y_valid = train_app.loc[valid_idx].TARGET
    
    print(train_idx_fn, valid_idx_fn)
    print(train_idx.shape, valid_idx.shape)
    print(train_data_fn, train_data.shape)
    print(valid_data_fn, valid_data.shape)
    print(test_data_fn, test_data.shape)
    
    print(X_train.shape, y_train.shape)
    print(X_valid.shape, y_valid.shape)
    print(X_test.shape)
    

./neptune-features/train_idx_0.npy ./neptune-features/valid_idx_0.npy
(246008,) (61503,)
./neptune-features/data_246008_1174_0 (246008, 1174)
./neptune-features/data_61503_1174_1 (61503, 1174)
./neptune-features/data_48744_1174_3 (48744, 1174)
(246008, 1174) (246008,)
(61503, 1174) (61503,)
(48744, 1174)
./neptune-features/train_idx_1.npy ./neptune-features/valid_idx_1.npy
(246009,) (61502,)
./neptune-features/data_246009_1174_4 (246009, 1174)
./neptune-features/data_61502_1174_5 (61502, 1174)
./neptune-features/data_48744_1174_7 (48744, 1174)
(246009, 1174) (246009,)
(61502, 1174) (61502,)
(48744, 1174)
./neptune-features/train_idx_2.npy ./neptune-features/valid_idx_2.npy
(246009,) (61502,)
./neptune-features/data_246009_1174_8 (246009, 1174)
./neptune-features/data_61502_1174_9 (61502, 1174)
./neptune-features/data_48744_1174_11 (48744, 1174)
(246009, 1174) (246009,)
(61502, 1174) (61502,)
(48744, 1174)
./neptune-features/train_idx_3.npy ./neptune-features/valid_idx_3.npy
(246009,) (

In [5]:
train_data.head()

Unnamed: 0,annuity_income_percentage,car_to_birth_ratio,car_to_employ_ratio,children_ratio,credit_to_annuity_ratio,credit_to_goods_ratio,credit_to_income_ratio,days_employed_percentage,income_credit_percentage,income_per_child,...,NAME_HOUSING_TYPE,OCCUPATION_TYPE,ORGANIZATION_TYPE,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,WALLSMATERIAL_MODE,WEEKDAY_APPR_PROCESS_START,nan_count
0,0.121978,,,0.0,16.461103,1.158397,2.007889,0.067329,0.498036,202500.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,119.0
1,0.132217,,,0.0,36.234085,1.145199,4.79075,0.070862,0.208736,270000.0,...,1.0,2.0,2.0,1.0,1.0,1.0,1.0,2.0,2.0,139.0
2,0.1,-0.001365,-0.115556,0.0,20.0,1.0,2.0,0.011814,0.5,67500.0,...,1.0,1.0,3.0,1.0,1.0,1.0,1.0,0.0,2.0,157.0
3,0.2199,,,0.0,10.532818,1.052803,2.316167,0.159905,0.431748,135000.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,211.0
4,0.179963,,,0.0,23.461618,1.0,4.222222,0.152418,0.236842,121500.0,...,1.0,2.0,4.0,1.0,2.0,1.0,1.0,0.0,3.0,203.0


## xgb

In [4]:
# Booster settings for the xgb run. "nrounds" and "early_stopping_rounds" are
# read back from this dict by the training cell's xgb.train call.
xgb_params = addict.Dict({
    "booster": "gbtree",
    "tree_method": "hist",  # gpu_hist  # auto  hist
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "nrounds": 50000,
    "early_stopping_rounds": 100,
    "eta": 0.005,  # 0.001
    "max_leaves": 40,
    "max_depth": 16,
    "max_bin": 255,
    "subsample": 0.5,
    "colsample_bytree": 0.5,
    "colsample_bylevel": 1,
    "min_child_weight": 4,
    "lambda": 0.001,
    "alpha": 0.001,
    "scale_pos_weight": 1,
})

In [15]:
def feval_roc_auc_score(preds, xdata):
    """Return ``("roc", auc)`` where auc scores ``preds`` against ``xdata.get_label()``.

    The (name, value) return shape presumably targets xgboost's ``feval``
    hook; the training cell in this notebook passes ``feval=None``.
    """
    labels = xdata.get_label()
    score = roc_auc_score(labels, preds)
    return "roc", score

In [9]:
# 5-fold xgboost run: for every fold load the pre-split feature files, train
# with early stopping on the validation part, then keep the validation and
# test predictions plus the validation AUC.
# nrows = None  
# Optional row cap for quick runs; None (falsy) uses every row.
nrows = None
# nrows = 1000

# `features` is loaded but not read again inside this cell.
features = list(np.load('./neptune-features/features_246008_1174_0.npy'))
train_app = load_dataframe32("./bindata/application_train")
# Per-fold results, appended in fold order; read by the save/submission cells.
auc_valid_stack = []
pred_valid_stack = []
pred_test_stack = []

for i in range(5):
    train_idx_fn = "./neptune-features/train_idx_{}.npy".format(i)
    valid_idx_fn = "./neptune-features/valid_idx_{}.npy".format(i)

    # File suffixes for fold i: train = 4*i, valid = 4*i + 1, test = 4*i + 3.
    train_offset, valid_offset, test_offset = i * 4, i * 4 + 1, i * 4 + 3
    
    train_data_fn = "./neptune-features/data_246009_1174_{}".format(train_offset)
    valid_data_fn = "./neptune-features/data_61502_1174_{}".format(valid_offset)
    test_data_fn = "./neptune-features/data_48744_1174_{}".format(test_offset)
    
    # Fold 0's files carry different row counts in their names (246008 / 61503).
    if i == 0:
        train_data_fn = train_data_fn.replace("246009", "246008")
        valid_data_fn = valid_data_fn.replace("61502", "61503")
    
    gc.collect()

    train_idx = np.load(train_idx_fn)
    valid_idx = np.load(valid_idx_fn)
    
    # Keep the index arrays the same length as the truncated feature frames.
    if nrows:
        train_idx, valid_idx = train_idx[:nrows].copy(), valid_idx[:nrows].copy()

    train_data = load_dataframe32(train_data_fn, nrows)
    valid_data = load_dataframe32(valid_data_fn, nrows)
    test_data = load_dataframe32(test_data_fn, nrows)
    gc.collect()
    
    X_train = train_data.values
    X_valid = valid_data.values
    X_test = test_data.values
    # Targets are selected by index label via .loc; assumes train_app keeps a
    # default 0..n-1 index so labels equal row positions - TODO confirm.
    y_train = train_app.loc[train_idx].TARGET
    y_valid = train_app.loc[valid_idx].TARGET
    
    print(train_idx_fn, valid_idx_fn)
    print(train_idx.shape, valid_idx.shape)
    print(train_data_fn, train_data.shape)
    print(valid_data_fn, valid_data.shape)
    print(test_data_fn, test_data.shape)
    
    print(X_train.shape, y_train.shape)
    print(X_valid.shape, y_valid.shape)
    print(X_test.shape)
    
    # lgb_data_train = lgb.Dataset(data=X_train, label=y_train)
    # lgb_data_valid = lgb.Dataset(data=X_valid, label=y_valid)
    
    xgb_data_train = xgb.DMatrix(X_train, label=y_train)
    xgb_data_valid = xgb.DMatrix(X_valid, label=y_valid)
    xgb_data_test = xgb.DMatrix(X_test)

    # 'valid' is listed last in evals; the captured log shows valid-auc being
    # used for early stopping. The whole xgb_params dict (including its
    # nrounds / early_stopping_rounds keys) is also handed over as params.
    %time estimator = xgb.train(params=xgb_params, dtrain=xgb_data_train, evals=[(xgb_data_train, 'train'), (xgb_data_valid, 'valid')], num_boost_round=xgb_params.nrounds, early_stopping_rounds=xgb_params.early_stopping_rounds, verbose_eval=250, feval=None)


    # NOTE(review): ntree_limit / best_ntree_limit belong to older xgboost
    # releases (newer ones use iteration_range) - confirm the installed
    # version before re-running.
    # pred_valid = estimator.predict(xgb_data_valid, ntree_limit=estimator.best_iteration)
    pred_valid = estimator.predict(xgb_data_valid, ntree_limit=estimator.best_ntree_limit)
    pred_valid_stack.append(pred_valid)
    
    auc = roc_auc_score(y_valid, pred_valid)
    auc_valid_stack.append(auc)
    print("fold-{},auc:{}".format(i, auc))

    pred_test = estimator.predict(xgb_data_test, ntree_limit=estimator.best_ntree_limit)
    pred_test_stack.append(pred_test)
    
# Mean and standard deviation of the per-fold validation AUCs.
print("kfold-auc, avg:{:.4}, std:{:.2}".format(np.mean(auc_valid_stack), np.std(auc_valid_stack)))

./neptune-features/train_idx_0.npy ./neptune-features/valid_idx_0.npy
(246008,) (61503,)
./neptune-features/data_246008_1174_0 (246008, 1174)
./neptune-features/data_61503_1174_1 (61503, 1174)
./neptune-features/data_48744_1174_3 (48744, 1174)
(246008, 1174) (246008,)
(61503, 1174) (61503,)
(48744, 1174)
[0]	train-auc:0.727295	valid-auc:0.725722
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 100 rounds.
[250]	train-auc:0.759361	valid-auc:0.754582
[500]	train-auc:0.770651	valid-auc:0.762219
[750]	train-auc:0.784784	valid-auc:0.770906
[1000]	train-auc:0.798391	valid-auc:0.778531
[1250]	train-auc:0.809691	valid-auc:0.783584
[1500]	train-auc:0.818892	valid-auc:0.786764
[1750]	train-auc:0.82703	valid-auc:0.788997
[2000]	train-auc:0.83437	valid-auc:0.790582
[2250]	train-auc:0.841137	valid-auc:0.791965
[2500]	train-auc:0.84755	valid-auc:0.792974
[2750]	train-auc:0.853526	valid-auc:0.793744
[3000]	train-auc:0.

## save oof & submission

In [10]:
def calculate_rank(predictions):
    """Scale the ranks of ``predictions`` to ``(1 + rank) / (n + 1)``.

    Parameters
    ----------
    predictions : 1-D array-like
        Scores to rank; anything ``pd.Series`` accepts (ndarray, list,
        tuple, Series).

    Returns
    -------
    numpy.ndarray
        One value per input element. Ranks come from ``Series.rank()`` with
        its default settings and ``n`` is the number of elements.
    """
    ranks = pd.Series(predictions).rank().values
    # Use the length of the computed ranks instead of predictions.shape[0],
    # so inputs without a .shape attribute (lists, tuples) work as well.
    return (1 + ranks) / (len(ranks) + 1)

In [11]:
# Average the per-fold test predictions in rank space. Dividing by the number
# of collected folds (rather than a literal 5) keeps the mean correct if the
# training loop ran a different number of folds.
pred_target = sum([calculate_rank(p_) for p_ in pred_test_stack]) / len(pred_test_stack)

# An earlier submission file serves as the template; only its TARGET column is
# replaced (presumably the rows are in the same order as the test features -
# TODO confirm).
pred_sample = pd.read_csv("./result/submission-022-blend08.csv")
pred_sample["TARGET"] = pred_target

pred_sample.to_csv("./result/submission-035-xgb-1.csv", index=False)

In [12]:
# Assemble the out-of-fold vector: each fold's validation predictions are
# written into the rows listed in that fold's valid_idx file, then saved
# next to SK_ID_CURR.
oof_pred = np.zeros(train_app.shape[0])

for i in range(5):
    valid_idx_fn = "./neptune-features/valid_idx_{}.npy".format(i)
    valid_idx = np.load(valid_idx_fn)
    # pred_valid_stack[i] must have one entry per index in valid_idx.
    # NOTE(review): if the training cell ran with nrows set, the stored
    # predictions are shorter than valid_idx - confirm before relying on this.
    oof_pred[valid_idx] = pred_valid_stack[i]
    
oof_df = train_app[["SK_ID_CURR"]].copy()
# The ids are cast to integers before writing the CSV.
oof_df["SK_ID_CURR"] = oof_df.SK_ID_CURR.astype("int")
oof_df["oof_pred"] = oof_pred
oof_df.to_csv("./oof-result/oof-035-xgb-1.csv", index=False)

## blending

In [13]:
pred31 = pd.read_csv("./result/submission-031-bld-to-overfit75.csv")
pred35 = pd.read_csv("./result/submission-035-xgb-1.csv")

In [14]:
pred31.head()

Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.452047
1,100005,0.851916
2,100013,0.302881
3,100028,0.46901
4,100038,0.871148


In [15]:
pred35.head()

Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.340576
1,100005,0.869094
2,100013,0.270448
3,100028,0.461052
4,100038,0.900556


In [19]:
# Blend: 0.8 * submission 031 + 0.2 * xgb submission 035, saved as bld-2.
pred31 = pd.read_csv("./result/submission-031-bld-to-overfit75.csv")
pred35 = pd.read_csv("./result/submission-035-xgb-1.csv")
# Series arithmetic aligns on the row index, not on SK_ID_CURR; assumes both
# files list the same ids in the same order - TODO confirm.
pred31["TARGET"] = pred31.TARGET * 0.8 + pred35.TARGET * 0.2
pred31.to_csv("./result/submission-035-bld-2.csv", index=False)

In [20]:
pred31.head()

Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.429753
1,100005,0.855352
2,100013,0.296395
3,100028,0.467419
4,100038,0.877029


In [22]:
# Blend: 0.7 * lgb submission 031 + 0.2 * xgb submission 035 + 0.1 * keras
# submission 030, saved as bld-3.
# Here `pred31` is read from a different file than in the bld-2 blend
# ("lgb-reorder-better10" instead of "bld-to-overfit75").
pred31 = pd.read_csv("./result/submission-031-lgb-reorder-better10.csv")
pred30 = pd.read_csv("./result/submission-030-keras01-avg-025.csv")
pred35 = pd.read_csv("./result/submission-035-xgb-1.csv")
# Series arithmetic aligns on the row index, not on SK_ID_CURR; assumes all
# three files list the same ids in the same order - TODO confirm.
pred31["TARGET"] = pred31.TARGET * 0.7 + pred35.TARGET * 0.2 + pred30.TARGET * 0.1
pred31.to_csv("./result/submission-035-bld-3.csv", index=False)

## stacking-logit

In [24]:
import glob

In [25]:
glob.glob("./oof-result/*.csv")

['./oof-result\\oof-024-first.csv',
 './oof-result\\oof-024-keras01.csv',
 './oof-result\\oof-028-lgb.csv',
 './oof-result\\oof-029-keras.csv',
 './oof-result\\oof-030-keras01.csv',
 './oof-result\\oof-031-lgb-seed-0.csv',
 './oof-result\\oof-031-lgb-seed-1.csv',
 './oof-result\\oof-031-lgb-seed-10.csv',
 './oof-result\\oof-031-lgb-seed-11.csv',
 './oof-result\\oof-031-lgb-seed-12.csv',
 './oof-result\\oof-031-lgb-seed-13.csv',
 './oof-result\\oof-031-lgb-seed-14.csv',
 './oof-result\\oof-031-lgb-seed-15.csv',
 './oof-result\\oof-031-lgb-seed-16.csv',
 './oof-result\\oof-031-lgb-seed-17.csv',
 './oof-result\\oof-031-lgb-seed-18.csv',
 './oof-result\\oof-031-lgb-seed-19.csv',
 './oof-result\\oof-031-lgb-seed-2.csv',
 './oof-result\\oof-031-lgb-seed-20.csv',
 './oof-result\\oof-031-lgb-seed-21.csv',
 './oof-result\\oof-031-lgb-seed-22.csv',
 './oof-result\\oof-031-lgb-seed-23.csv',
 './oof-result\\oof-031-lgb-seed-24.csv',
 './oof-result\\oof-031-lgb-seed-25.csv',
 './oof-result\\oof-031

In [27]:
pd.read_csv("./oof-result/oof-035-xgb-1.csv")

Unnamed: 0,SK_ID_CURR,oof_pred
0,100002,0.275387
1,100003,0.019541
2,100004,0.044631
3,100006,0.029361
4,100007,0.070024
5,100008,0.048403
6,100009,0.005530
7,100010,0.016747
8,100011,0.041483
9,100012,0.053386


In [37]:
glob.glob("./oof-result/*.csv")

['./oof-result\\oof-024-first.csv',
 './oof-result\\oof-024-keras01.csv',
 './oof-result\\oof-028-lgb.csv',
 './oof-result\\oof-029-keras.csv',
 './oof-result\\oof-030-keras01.csv',
 './oof-result\\oof-031-lgb-seed-0.csv',
 './oof-result\\oof-031-lgb-seed-1.csv',
 './oof-result\\oof-031-lgb-seed-10.csv',
 './oof-result\\oof-031-lgb-seed-11.csv',
 './oof-result\\oof-031-lgb-seed-12.csv',
 './oof-result\\oof-031-lgb-seed-13.csv',
 './oof-result\\oof-031-lgb-seed-14.csv',
 './oof-result\\oof-031-lgb-seed-15.csv',
 './oof-result\\oof-031-lgb-seed-16.csv',
 './oof-result\\oof-031-lgb-seed-17.csv',
 './oof-result\\oof-031-lgb-seed-18.csv',
 './oof-result\\oof-031-lgb-seed-19.csv',
 './oof-result\\oof-031-lgb-seed-2.csv',
 './oof-result\\oof-031-lgb-seed-20.csv',
 './oof-result\\oof-031-lgb-seed-21.csv',
 './oof-result\\oof-031-lgb-seed-22.csv',
 './oof-result\\oof-031-lgb-seed-23.csv',
 './oof-result\\oof-031-lgb-seed-24.csv',
 './oof-result\\oof-031-lgb-seed-25.csv',
 './oof-result\\oof-031

In [115]:
lgb_columns = ['./oof-result\\oof-024-first.csv',
 './oof-result\\oof-031-lgb-seed-0.csv',
 './oof-result\\oof-031-lgb-seed-1.csv',
 './oof-result\\oof-031-lgb-seed-10.csv',
 './oof-result\\oof-031-lgb-seed-11.csv',
 './oof-result\\oof-031-lgb-seed-12.csv',
 './oof-result\\oof-031-lgb-seed-13.csv',
 './oof-result\\oof-031-lgb-seed-14.csv',
 './oof-result\\oof-031-lgb-seed-15.csv',
 './oof-result\\oof-031-lgb-seed-16.csv',
 './oof-result\\oof-031-lgb-seed-17.csv',
 './oof-result\\oof-031-lgb-seed-18.csv',
 './oof-result\\oof-031-lgb-seed-19.csv',
 './oof-result\\oof-031-lgb-seed-2.csv',
 './oof-result\\oof-031-lgb-seed-20.csv',
 './oof-result\\oof-031-lgb-seed-21.csv',
 './oof-result\\oof-031-lgb-seed-22.csv',
 './oof-result\\oof-031-lgb-seed-23.csv',
 './oof-result\\oof-031-lgb-seed-24.csv',
 './oof-result\\oof-031-lgb-seed-25.csv',
 './oof-result\\oof-031-lgb-seed-26.csv',
 './oof-result\\oof-031-lgb-seed-27.csv',
 './oof-result\\oof-031-lgb-seed-28.csv',
 './oof-result\\oof-031-lgb-seed-29.csv',
 './oof-result\\oof-031-lgb-seed-3.csv',
 './oof-result\\oof-031-lgb-seed-4.csv',
 './oof-result\\oof-031-lgb-seed-5.csv',
 './oof-result\\oof-031-lgb-seed-6.csv',
 './oof-result\\oof-031-lgb-seed-7.csv',
 './oof-result\\oof-031-lgb-seed-8.csv',
 './oof-result\\oof-031-lgb-seed-9.csv',
]

In [118]:
lgb_info = pd.DataFrame([
    ("oof-024-first.csv       ", 0.79453, 0.002204),
    ("oof-031-lgb-seed-0.csv  ", 0.79426, 0.002599),
    ("oof-031-lgb-seed-1.csv  ", 0.79426, 0.002599),
    ("oof-031-lgb-seed-10.csv ", 0.79446, 0.002332),
    ("oof-031-lgb-seed-11.csv ", 0.79442, 0.002256),
    ("oof-031-lgb-seed-12.csv ", 0.79457, 0.002379),
    ("oof-031-lgb-seed-13.csv ", 0.79451, 0.002476),
    ("oof-031-lgb-seed-14.csv ", 0.79454, 0.002633),
    ("oof-031-lgb-seed-15.csv ", 0.79453, 0.002396),
    ("oof-031-lgb-seed-16.csv ", 0.79414, 0.002331),
    ("oof-031-lgb-seed-17.csv ", 0.79441, 0.002233),
    ("oof-031-lgb-seed-18.csv ", 0.79458, 0.002444),
    ("oof-031-lgb-seed-19.csv ", 0.79447, 0.002104),
    ("oof-031-lgb-seed-2.csv  ", 0.79426, 0.002599),
    ("oof-031-lgb-seed-20.csv ", 0.7943, 0.002261),
    ("oof-031-lgb-seed-21.csv ", 0.79471, 0.002242),
    ("oof-031-lgb-seed-22.csv ", 0.79425, 0.002295),
    ("oof-031-lgb-seed-23.csv ", 0.79419, 0.002687),
    ("oof-031-lgb-seed-24.csv ", 0.79452, 0.002525),
    ("oof-031-lgb-seed-25.csv ", 0.79439, 0.002363),
    ("oof-031-lgb-seed-26.csv ", 0.7945, 0.002492),
    ("oof-031-lgb-seed-27.csv ", 0.79446, 0.002419),
    ("oof-031-lgb-seed-28.csv ", 0.79442, 0.00233),
    ("oof-031-lgb-seed-29.csv ", 0.79433, 0.002703),
    ("oof-031-lgb-seed-3.csv  ", 0.79447, 0.002513),
    ("oof-031-lgb-seed-4.csv  ", 0.79448, 0.002551),
    ("oof-031-lgb-seed-5.csv  ", 0.79462, 0.002398),
    ("oof-031-lgb-seed-6.csv  ", 0.79412, 0.002712),
    ("oof-031-lgb-seed-7.csv  ", 0.79418, 0.002622),
    ("oof-031-lgb-seed-8.csv  ", 0.79461, 0.002116),
    ("oof-031-lgb-seed-9.csv  ", 0.7943, 0.002533)
], columns=["file", "auc", "std"])

# Strip the padding from the bare file names and prefix them so they match
# the './oof-result\\<name>' strings used as column names of oof_df / test_df.
# The backslash separator is hard-coded, so the match relies on glob having
# produced Windows-style paths.
lgb_info["file"] = lgb_info.file.apply(lambda x: './oof-result\\' + x.strip())
lgb_info.head()

Unnamed: 0,file,auc,std
0,./oof-result\oof-024-first.csv,0.79453,0.002204
1,./oof-result\oof-031-lgb-seed-0.csv,0.79426,0.002599
2,./oof-result\oof-031-lgb-seed-1.csv,0.79426,0.002599
3,./oof-result\oof-031-lgb-seed-10.csv,0.79446,0.002332
4,./oof-result\oof-031-lgb-seed-11.csv,0.79442,0.002256


In [120]:
list(lgb_info[lgb_info.auc >= 0.7945].file.values)

array(['./oof-result\\oof-024-first.csv',
       './oof-result\\oof-031-lgb-seed-12.csv',
       './oof-result\\oof-031-lgb-seed-13.csv',
       './oof-result\\oof-031-lgb-seed-14.csv',
       './oof-result\\oof-031-lgb-seed-15.csv',
       './oof-result\\oof-031-lgb-seed-18.csv',
       './oof-result\\oof-031-lgb-seed-21.csv',
       './oof-result\\oof-031-lgb-seed-24.csv',
       './oof-result\\oof-031-lgb-seed-26.csv',
       './oof-result\\oof-031-lgb-seed-5.csv',
       './oof-result\\oof-031-lgb-seed-8.csv'], dtype=object)

In [125]:
# Add an "lgb_avg" column: the row-wise mean over the lgb files whose
# recorded auc is >= 0.7945.
# NOTE(review): the OOF side is rank-transformed with X2rank before averaging
# while the test side is averaged as stored - presumably because the
# submission files already hold rank-scaled values; confirm.
# NOTE(review): oof_df, test_df and X2rank are defined in cells that appear
# further down in this notebook, so this cell only runs after those have.
oof_df["lgb_avg"] = X2rank(oof_df[list(lgb_info[lgb_info.auc >= 0.7945].file.values)].values).mean(axis=1)
test_df["lgb_avg"] = test_df[list(lgb_info[lgb_info.auc >= 0.7945].file.values)].values.mean(axis=1)

In [124]:
oof_df.head().T

Unnamed: 0,0,1,2,3,4
./oof-result\oof-024-first.csv,0.316064,0.019087,0.041876,0.033489,0.040865
./oof-result\oof-024-keras01.csv,0.278892,0.01501,0.067596,0.043004,0.07649
./oof-result\oof-028-lgb.csv,0.309997,0.019556,0.046917,0.039269,0.068751
./oof-result\oof-029-keras.csv,0.374819,0.015967,0.050639,0.036757,0.086199
./oof-result\oof-030-keras01.csv,0.394588,0.012417,0.043042,0.028406,0.080837
./oof-result\oof-031-lgb-seed-0.csv,0.270621,0.017848,0.042343,0.028364,0.049091
./oof-result\oof-031-lgb-seed-1.csv,0.270621,0.017848,0.042343,0.028364,0.049091
./oof-result\oof-031-lgb-seed-10.csv,0.282062,0.015439,0.047114,0.036264,0.047562
./oof-result\oof-031-lgb-seed-11.csv,0.30284,0.015332,0.041685,0.032824,0.0502
./oof-result\oof-031-lgb-seed-12.csv,0.290954,0.018758,0.039045,0.032269,0.043835


In [42]:
# One column per OOF csv, named by the path string glob returns and filled
# with that file's `oof_pred` column. This rebinds `oof_df`, which the save
# cell earlier in the notebook uses for a different frame.
oof_df = pd.DataFrame({fn: pd.read_csv(fn).oof_pred
    for fn in glob.glob("./oof-result/*.csv")
})

# train_app = load_dataframe32("./bindata/application_train")
# oof_df["TARGET"] = train_app.reset_index().TARGET

In [43]:
# Test-side counterpart of oof_df: the columns keep the OOF path as their
# name, but the values are the TARGET column of the matching submission file.
# The submission path is derived from the OOF path by replacing "oof-result"
# with "result" and then "oof" with "submission".
test_df = pd.DataFrame({fn: pd.read_csv(fn.replace("oof-result", "result").replace("oof", "submission")).TARGET
    for fn in glob.glob("./oof-result/*.csv")
})


In [40]:
oof_df.head().T

Unnamed: 0,0,1,2,3,4
./oof-result\oof-024-first.csv,0.316064,0.019087,0.041876,0.033489,0.040865
./oof-result\oof-024-keras01.csv,0.278892,0.01501,0.067596,0.043004,0.07649
./oof-result\oof-028-lgb.csv,0.309997,0.019556,0.046917,0.039269,0.068751
./oof-result\oof-029-keras.csv,0.374819,0.015967,0.050639,0.036757,0.086199
./oof-result\oof-030-keras01.csv,0.394588,0.012417,0.043042,0.028406,0.080837
./oof-result\oof-031-lgb-seed-0.csv,0.270621,0.017848,0.042343,0.028364,0.049091
./oof-result\oof-031-lgb-seed-1.csv,0.270621,0.017848,0.042343,0.028364,0.049091
./oof-result\oof-031-lgb-seed-10.csv,0.282062,0.015439,0.047114,0.036264,0.047562
./oof-result\oof-031-lgb-seed-11.csv,0.30284,0.015332,0.041685,0.032824,0.0502
./oof-result\oof-031-lgb-seed-12.csv,0.290954,0.018758,0.039045,0.032269,0.043835


In [41]:
test_df.head()

Unnamed: 0,./oof-result\oof-024-first.csv,./oof-result\oof-024-keras01.csv,./oof-result\oof-028-lgb.csv,./oof-result\oof-029-keras.csv,./oof-result\oof-030-keras01.csv,./oof-result\oof-031-lgb-seed-0.csv,./oof-result\oof-031-lgb-seed-1.csv,./oof-result\oof-031-lgb-seed-10.csv,./oof-result\oof-031-lgb-seed-11.csv,./oof-result\oof-031-lgb-seed-12.csv,...,./oof-result\oof-031-lgb-seed-29.csv,./oof-result\oof-031-lgb-seed-3.csv,./oof-result\oof-031-lgb-seed-4.csv,./oof-result\oof-031-lgb-seed-5.csv,./oof-result\oof-031-lgb-seed-6.csv,./oof-result\oof-031-lgb-seed-7.csv,./oof-result\oof-031-lgb-seed-8.csv,./oof-result\oof-031-lgb-seed-9.csv,./oof-result\oof-035-xgb-0.csv,./oof-result\oof-035-xgb-1.csv
0,0.454959,0.542095,0.438022,0.575386,0.461274,0.447619,0.447619,0.422952,0.447287,0.421024,...,0.437928,0.450836,0.426017,0.437083,0.440464,0.436078,0.452104,0.463646,0.370799,0.340576
1,0.825939,0.892691,0.851648,0.896084,0.884513,0.825582,0.825582,0.839229,0.838129,0.818139,...,0.830067,0.827609,0.828713,0.828881,0.83316,0.835737,0.839856,0.841018,0.859456,0.869094
2,0.336125,0.291665,0.413236,0.19897,0.297159,0.35993,0.35993,0.418619,0.35888,0.365777,...,0.3506,0.374549,0.41633,0.360324,0.376736,0.375998,0.367135,0.372231,0.269816,0.270448
3,0.478145,0.364292,0.429763,0.294465,0.353534,0.500359,0.500359,0.470822,0.498086,0.485999,...,0.499657,0.502201,0.463317,0.478769,0.498357,0.510436,0.467412,0.477345,0.443274,0.461052
4,0.861372,0.860732,0.889121,0.851747,0.838371,0.871909,0.871909,0.879893,0.877633,0.87449,...,0.876135,0.871174,0.88096,0.889076,0.885818,0.86948,0.866953,0.878543,0.89244,0.900556


In [49]:
def X2rank(X):
    """Return ``X`` with every column replaced by ``calculate_rank`` of that column.

    ``X`` is iterated column by column through ``X.T``; the ranked columns
    are stacked and transposed back, so the output keeps the column order.
    """
    ranked_columns = list(map(calculate_rank, X.T))
    return np.transpose(np.array(ranked_columns))

In [95]:
def add_std(X):
    """Append the per-row standard deviation of the 2-D array ``X`` as a last column."""
    row_std = X.std(axis=1, keepdims=True)
    return np.concatenate((X, row_std), axis=1)

In [87]:
a = np.array([[1],[1],[1]])
b = a - 1

In [96]:
np.hstack([a, b]).std(axis=1)

array([0.5, 0.5, 0.5])

In [97]:
add_std(np.hstack([a, b]))

array([[1. , 0. , 0.5],
       [1. , 0. , 0.5],
       [1. , 0. , 0.5]])

In [51]:
from sklearn.linear_model import LogisticRegression

In [129]:
# Stacking with logistic regression on the rank-transformed OOF columns,
# leaving out the lgb files whose recorded auc is below 0.7945.
# nrows = None  
# Optional row cap on the fold indices; None (falsy) uses every row.
nrows = None
# nrows = 1000

# NOTE(review): this `features` value is replaced a few lines below by the
# oof_df column list.
features = list(np.load('./neptune-features/features_246008_1174_0.npy'))
train_app = load_dataframe32("./bindata/application_train")
auc_valid_stack2 = []
pred_test_stack2 = []

features = list(oof_df.columns)
# Keep every oof_df column except the lgb files with auc < 0.7945.
features = [col_ for col_ in features if col_ not in list(lgb_info[lgb_info.auc < 0.7945].file.values)]
# features = features0

for i in range(5):
    train_idx_fn = "./neptune-features/train_idx_{}.npy".format(i)
    valid_idx_fn = "./neptune-features/valid_idx_{}.npy".format(i)

    # The offset / *_data_fn names are computed but not read again in this
    # cell; only the index files are loaded.
    train_offset, valid_offset, test_offset = i * 4, i * 4 + 1, i * 4 + 3
    
    train_data_fn = "./neptune-features/data_246009_1174_{}".format(train_offset)
    valid_data_fn = "./neptune-features/data_61502_1174_{}".format(valid_offset)
    test_data_fn = "./neptune-features/data_48744_1174_{}".format(test_offset)
    
    if i == 0:
        train_data_fn = train_data_fn.replace("246009", "246008")
        valid_data_fn = valid_data_fn.replace("61502", "61503")
    
    gc.collect()

    train_idx = np.load(train_idx_fn)
    valid_idx = np.load(valid_idx_fn)
    
    if nrows:
        train_idx, valid_idx = train_idx[:nrows].copy(), valid_idx[:nrows].copy()

    gc.collect()
    
    # Train and validation rows are ranked separately, each within its own
    # subset. NOTE(review): the test matrix is used as stored, without X2rank -
    # presumably the submission files already hold rank-scaled values; confirm.
    X_train = X2rank(oof_df.loc[train_idx][features].values)
    X_valid = X2rank(oof_df.loc[valid_idx][features].values)
    X_test = test_df[features].values
    
    # X_train = add_std(X_train)
    # X_valid = add_std(X_valid)
    # X_test = add_std(X_test)
    
    y_train = train_app.loc[train_idx].TARGET
    y_valid = train_app.loc[valid_idx].TARGET
    
    print(train_idx_fn, valid_idx_fn)
    print(X_train.shape, y_train.shape)
    print(X_valid.shape, y_valid.shape)
    print(X_test.shape)
    
    estimator = LogisticRegression(C=0.085)
    estimator.fit(X_train, y_train)
    
    # Column 1 of predict_proba is used as the score.
    pred_valid = estimator.predict_proba(X_valid)[:, 1]
    auc = roc_auc_score(y_valid, pred_valid)
    auc_valid_stack2.append(auc)
    print("fold-{},auc:{}".format(i, auc))

    pred_test = estimator.predict_proba(X_test)[:, 1]
    pred_test_stack2.append(pred_test)
    
# Mean and standard deviation of the per-fold validation AUCs.
print("kfold-auc, avg:{:.8}, std:{:.2}".format(np.mean(auc_valid_stack2), np.std(auc_valid_stack2)))

./neptune-features/train_idx_0.npy ./neptune-features/valid_idx_0.npy
(246008, 18) (246008,)
(61503, 18) (61503,)
(48744, 18)
fold-0,auc:0.800141369508025
./neptune-features/train_idx_1.npy ./neptune-features/valid_idx_1.npy
(246009, 18) (246009,)
(61502, 18) (61502,)
(48744, 18)
fold-1,auc:0.7938189574398614
./neptune-features/train_idx_2.npy ./neptune-features/valid_idx_2.npy
(246009, 18) (246009,)
(61502, 18) (61502,)
(48744, 18)
fold-2,auc:0.794016662367688
./neptune-features/train_idx_3.npy ./neptune-features/valid_idx_3.npy
(246009, 18) (246009,)
(61502, 18) (61502,)
(48744, 18)
fold-3,auc:0.7959323948681506
./neptune-features/train_idx_4.npy ./neptune-features/valid_idx_4.npy
(246009, 18) (246009,)
(61502, 18) (61502,)
(48744, 18)
fold-4,auc:0.7942260057984825
kfold-auc, avg:0.79562708, std:0.0024


In [114]:
# Stacking with logistic regression on the rank-transformed OOF columns,
# using every column of oof_df as a feature.
# nrows = None  
# Optional row cap on the fold indices; None (falsy) uses every row.
nrows = None
# nrows = 1000

# NOTE(review): this `features` value is replaced a few lines below by the
# oof_df column list.
features = list(np.load('./neptune-features/features_246008_1174_0.npy'))
train_app = load_dataframe32("./bindata/application_train")
auc_valid_stack2 = []
pred_test_stack2 = []

features = list(oof_df.columns)
# features = features0

for i in range(5):
    train_idx_fn = "./neptune-features/train_idx_{}.npy".format(i)
    valid_idx_fn = "./neptune-features/valid_idx_{}.npy".format(i)

    # The offset / *_data_fn names are computed but not read again in this
    # cell; only the index files are loaded.
    train_offset, valid_offset, test_offset = i * 4, i * 4 + 1, i * 4 + 3
    
    train_data_fn = "./neptune-features/data_246009_1174_{}".format(train_offset)
    valid_data_fn = "./neptune-features/data_61502_1174_{}".format(valid_offset)
    test_data_fn = "./neptune-features/data_48744_1174_{}".format(test_offset)
    
    if i == 0:
        train_data_fn = train_data_fn.replace("246009", "246008")
        valid_data_fn = valid_data_fn.replace("61502", "61503")
    
    gc.collect()

    train_idx = np.load(train_idx_fn)
    valid_idx = np.load(valid_idx_fn)
    
    if nrows:
        train_idx, valid_idx = train_idx[:nrows].copy(), valid_idx[:nrows].copy()

    gc.collect()
    
    # Train and validation rows are ranked separately, each within its own
    # subset. NOTE(review): the test matrix is used as stored, without X2rank -
    # presumably the submission files already hold rank-scaled values; confirm.
    X_train = X2rank(oof_df.loc[train_idx][features].values)
    X_valid = X2rank(oof_df.loc[valid_idx][features].values)
    X_test = test_df[features].values
    
    # X_train = add_std(X_train)
    # X_valid = add_std(X_valid)
    # X_test = add_std(X_test)
    
    y_train = train_app.loc[train_idx].TARGET
    y_valid = train_app.loc[valid_idx].TARGET
    
    print(train_idx_fn, valid_idx_fn)
    print(X_train.shape, y_train.shape)
    print(X_valid.shape, y_valid.shape)
    print(X_test.shape)
    
    estimator = LogisticRegression(C=0.085)
    estimator.fit(X_train, y_train)
    
    # Column 1 of predict_proba is used as the score.
    pred_valid = estimator.predict_proba(X_valid)[:, 1]
    auc = roc_auc_score(y_valid, pred_valid)
    auc_valid_stack2.append(auc)
    print("fold-{},auc:{}".format(i, auc))

    pred_test = estimator.predict_proba(X_test)[:, 1]
    pred_test_stack2.append(pred_test)
    
# Mean and standard deviation of the per-fold validation AUCs.
print("kfold-auc, avg:{:.8}, std:{:.2}".format(np.mean(auc_valid_stack2), np.std(auc_valid_stack2)))

./neptune-features/train_idx_0.npy ./neptune-features/valid_idx_0.npy
(246008, 37) (246008,)
(61503, 37) (61503,)
(48744, 37)
fold-0,auc:0.8000124861436759
./neptune-features/train_idx_1.npy ./neptune-features/valid_idx_1.npy
(246009, 37) (246009,)
(61502, 37) (61502,)
(48744, 37)
fold-1,auc:0.793829676832402
./neptune-features/train_idx_2.npy ./neptune-features/valid_idx_2.npy
(246009, 37) (246009,)
(61502, 37) (61502,)
(48744, 37)
fold-2,auc:0.7939594780243636
./neptune-features/train_idx_3.npy ./neptune-features/valid_idx_3.npy
(246009, 37) (246009,)
(61502, 37) (61502,)
(48744, 37)
fold-3,auc:0.7958255785617564
./neptune-features/train_idx_4.npy ./neptune-features/valid_idx_4.npy
(246009, 37) (246009,)
(61502, 37) (61502,)
(48744, 37)
fold-4,auc:0.7941741223711103
kfold-auc, avg:0.79556027, std:0.0023


In [143]:
# Cache the (train_idx, valid_idx) pair of every fold so cv_score can iterate
# over them without re-reading the index files.
kfold_index_list = []

for i in range(5):
    train_idx_fn = "./neptune-features/train_idx_{}.npy".format(i)
    valid_idx_fn = "./neptune-features/valid_idx_{}.npy".format(i)

    # The offset / *_data_fn names are computed but not read again in this
    # cell; only the index files are loaded.
    train_offset, valid_offset, test_offset = i * 4, i * 4 + 1, i * 4 + 3

    train_data_fn = "./neptune-features/data_246009_1174_{}".format(train_offset)
    valid_data_fn = "./neptune-features/data_61502_1174_{}".format(valid_offset)
    test_data_fn = "./neptune-features/data_48744_1174_{}".format(test_offset)

    if i == 0:
        train_data_fn = train_data_fn.replace("246009", "246008")
        valid_data_fn = valid_data_fn.replace("61502", "61503")

    gc.collect()

    train_idx = np.load(train_idx_fn)
    valid_idx = np.load(valid_idx_fn)
    kfold_index_list.append((train_idx, valid_idx))

In [151]:
def cv_score(features, C=0.085):
    """Return the mean k-fold validation AUC of a logistic-regression stacker.

    For every ``(train_idx, valid_idx)`` pair in the notebook-level
    ``kfold_index_list``, the columns ``features`` of ``oof_df`` are selected,
    passed through ``X2rank`` and used to fit ``LogisticRegression(C=C)``
    against ``train_app.TARGET``.  The AUC of each fold is printed.

    Parameters
    ----------
    features : sequence of column labels of ``oof_df`` used as model inputs.
    C : value forwarded to ``LogisticRegression(C=...)``.  Defaults to 0.085,
        the value that was previously hard-coded, so existing calls behave
        exactly as before.

    Returns
    -------
    The mean of the per-fold validation AUC values.
    """
    # train_app, oof_df and kfold_index_list are only read here, so no
    # `global` declaration is needed.
    fold_aucs = []

    for i, (train_idx, valid_idx) in enumerate(kfold_index_list):
        gc.collect()
        # X2rank is applied to the training rows and the validation rows separately.
        X_train = X2rank(oof_df.loc[train_idx][features].values)
        X_valid = X2rank(oof_df.loc[valid_idx][features].values)

        y_train = train_app.loc[train_idx].TARGET
        y_valid = train_app.loc[valid_idx].TARGET

        estimator = LogisticRegression(C=C)
        estimator.fit(X_train, y_train)

        # Column 1 of predict_proba is taken as the score for the AUC.
        pred_valid = estimator.predict_proba(X_valid)[:, 1]
        auc = roc_auc_score(y_valid, pred_valid)
        fold_aucs.append(auc)
        print("fold-{},auc={}".format(i, auc))
    return np.mean(fold_aucs)

In [152]:
# Baseline: cross-validated AUC when every column of oof_df is used as a feature.
features = np.array(oof_df.columns)
cv_score(features)

fold-0,auc=0.8000104235253623
fold-1,auc=0.793829801517925
fold-2,auc=0.7939623636036118
fold-3,auc=0.7958293939387624
fold-4,auc=0.7941776135657563


0.7955619192302835

## stacking-logit-fs

In [154]:
# Greedy backward feature elimination for the logistic stacker.
# Each step tries dropping every remaining column in turn, re-scores the
# reduced set with cv_score, and permanently removes the column whose removal
# gives the highest score, provided that score is not below the current best.
features = np.array(oof_df.columns)
max_step = -1  # negative (or too large) => allow one step per feature

ftr_cnt = len(features)
max_step = ftr_cnt if max_step < 0 or max_step > ftr_cnt else max_step

features_prev = np.arange(ftr_cnt)  # positions into `features` that are still in use
features_removed = []
score_val_prev = 0.79556  # score a removal has to match or beat in the first step

for step_ in range(max_step):
    metrics = []
    for col_rm_ in features_prev:
        features_next = features_prev[features_prev != col_rm_]
        cv_auc = cv_score(features[features_next])
        print("kfold-auc=", cv_auc)
        metrics.append(cv_auc)
    # Keep only the removals that do not make the score worse.
    remove_trials = [(score_val, col_rm_) for score_val, col_rm_ in zip(metrics, features_prev) if score_val >= score_val_prev]

    if len(remove_trials) == 0:
        break

    # max() on (score, position) tuples picks the best score; ties go to the larger position.
    score_val_prev, col_rm_prev = max(remove_trials)
    # The old message fed (score, column) into the "step"/"score" placeholders,
    # so the log showed the score as the step and the column as the score.
    print("step={}, score={}, removed={}".format(step_, score_val_prev, col_rm_prev))
    features_prev = features_prev[features_prev != col_rm_prev]
    features_removed.append(col_rm_prev)

fold-0,auc=0.8000457302785635
fold-1,auc=0.7938342616972076
fold-2,auc=0.793948352513262
fold-3,auc=0.7958148449194415
fold-4,auc=0.7941746246756461
kfold-auc= 0.7955635628168242
fold-0,auc=0.7999795804349359
fold-1,auc=0.793779214819993
fold-2,auc=0.793949339310116
fold-3,auc=0.7957440271047804
fold-4,auc=0.7942380931693334
kfold-auc= 0.7955380509678317
fold-0,auc=0.8000087456441438
fold-1,auc=0.793830774065005
fold-2,auc=0.7939565496957932
fold-3,auc=0.7958466005409464
fold-4,auc=0.794167806158756
kfold-auc= 0.7955620952209289
fold-0,auc=0.8000031420196068
fold-1,auc=0.7938094136536811
fold-2,auc=0.7939496599300326
fold-3,auc=0.7958185747978034
fold-4,auc=0.7942926413044556
kfold-auc= 0.7955746863411159
fold-0,auc=0.7999410105411908
fold-1,auc=0.7937820184630402
fold-2,auc=0.7938110986894643
fold-3,auc=0.7956655535990023
fold-4,auc=0.7942043069550244
kfold-auc= 0.7954807976495444
fold-0,auc=0.8000121227808641
fold-1,auc=0.7938294452735735
fold-2,auc=0.7939633682126835
fold-3,auc=0.79

fold-4,auc=0.7942897984745296
kfold-auc= 0.7955765943717616
fold-0,auc=0.7999959175119394
fold-1,auc=0.7938022388924391
fold-2,auc=0.7939692034951632
fold-3,auc=0.7958073352885092
fold-4,auc=0.7942708533999097
kfold-auc= 0.7955691097175921
fold-0,auc=0.7999881693343375
fold-1,auc=0.79380887572471
fold-2,auc=0.79394104594161
fold-3,auc=0.7958090630736145
fold-4,auc=0.7942987580199733
kfold-auc= 0.7955691824188491
fold-0,auc=0.7999751630831078
fold-1,auc=0.7937908283858564
fold-2,auc=0.7939451748136455
fold-3,auc=0.7958090203242925
fold-4,auc=0.794366163013746
kfold-auc= 0.7955772699241296
fold-0,auc=0.8000005165451735
fold-1,auc=0.7938162392954584
fold-2,auc=0.7939559191432907
fold-3,auc=0.7958015178182469
fold-4,auc=0.7942835641983761
kfold-auc= 0.7955715514001092
fold-0,auc=0.7999860461555555
fold-1,auc=0.7938008816014595
fold-2,auc=0.7939443839511848
fold-3,auc=0.7958129817614826
fold-4,auc=0.794293368042933
kfold-auc= 0.7955675323025231
fold-0,auc=0.8000078158628315
fold-1,auc=0.793

fold-2,auc=0.7939524350735317
fold-3,auc=0.7958290626315154
fold-4,auc=0.7943046467391057
kfold-auc= 0.7955818026480395
fold-0,auc=0.8000137401016141
fold-1,auc=0.7938134178401934
fold-2,auc=0.7939572265600612
fold-3,auc=0.7958346236058446
fold-4,auc=0.794300977422284
kfold-auc= 0.7955839971059995
fold-0,auc=0.8000429836831929
fold-1,auc=0.7937918294324845
fold-2,auc=0.7939277473399635
fold-3,auc=0.7957844644011343
fold-4,auc=0.794263632326902
kfold-auc= 0.7955621314367354
fold-0,auc=0.8000126642627011
fold-1,auc=0.793816485104061
fold-2,auc=0.7939526060708205
fold-3,auc=0.7958352042841376
fold-4,auc=0.7943142475243823
kfold-auc= 0.7955862414492205
fold-0,auc=0.8000377291719457
fold-1,auc=0.7938088009133963
fold-2,auc=0.793954223420177
fold-3,auc=0.7958274096577238
fold-4,auc=0.7943031291381677
kfold-auc= 0.7955862584602821
fold-0,auc=0.7999958427019487
fold-1,auc=0.7938048109766579
fold-2,auc=0.7939777462347154
fold-3,auc=0.7958121944614654
fold-4,auc=0.7942966098665328
kfold-auc= 0.7

fold-0,auc=0.8000218017687006
fold-1,auc=0.7938342331976593
fold-2,auc=0.7939554061514243
fold-3,auc=0.7958239327128519
fold-4,auc=0.7942941517805066
kfold-auc= 0.7955859051222285
fold-0,auc=0.8000234369013531
fold-1,auc=0.7938408700299304
fold-2,auc=0.7939269351028417
fold-3,auc=0.7958153792859691
fold-4,auc=0.7942803971860899
kfold-auc= 0.7955774037012369
fold-0,auc=0.800070203832644
fold-1,auc=0.7938087332269694
fold-2,auc=0.7939429482864477
fold-3,auc=0.7958196791552933
fold-4,auc=0.7943032360114732
kfold-auc= 0.7955889601025655
fold-0,auc=0.8000458122133152
fold-1,auc=0.7938083769826177
fold-2,auc=0.7939698661096573
fold-3,auc=0.7958193941598121
fold-4,auc=0.7943101400270081
kfold-auc= 0.7955907178984821
fold-0,auc=0.8000763952499647
fold-1,auc=0.7937882420518634
fold-2,auc=0.7939248154489497
fold-3,auc=0.7958260879911792
fold-4,auc=0.7942802012516965
kfold-auc= 0.7955791483987307
fold-0,auc=0.8000353459393867
fold-1,auc=0.7938089683482416
fold-2,auc=0.7939605075705399
fold-3,auc=

fold-3,auc=0.7958411179403747
fold-4,auc=0.7943119069989921
kfold-auc= 0.7955930982852827
fold-0,auc=0.8000226994885882
fold-1,auc=0.7938024526390502
fold-2,auc=0.7938783504981659
fold-3,auc=0.7957518965425079
fold-4,auc=0.7942561511955177
kfold-auc= 0.795542310072766
fold-0,auc=0.8000567059729045
fold-1,auc=0.7938053168436372
fold-2,auc=0.7939661825430613
fold-3,auc=0.7958351045357192
fold-4,auc=0.7943313650654784
kfold-auc= 0.7955989349921601
fold-0,auc=0.8000567059729045
fold-1,auc=0.7938053168436372
fold-2,auc=0.7939661825430613
fold-3,auc=0.7958351045357192
fold-4,auc=0.7943313650654784
kfold-auc= 0.7955989349921601
fold-0,auc=0.8000576927523049
fold-1,auc=0.7937988510086551
fold-2,auc=0.7939652384955295
fold-3,auc=0.7958429240992375
fold-4,auc=0.7943218355290721
kfold-auc= 0.7955973083769599
fold-0,auc=0.800064204783871
fold-1,auc=0.793803396686582
fold-2,auc=0.7939695597395149
fold-3,auc=0.7958401097688595
fold-4,auc=0.7943207632335737
kfold-auc= 0.7955996068424802
fold-0,auc=0.

fold-1,auc=0.7937849111671756
fold-2,auc=0.7939656873634126
fold-3,auc=0.795828535389875
fold-4,auc=0.7944057524485432
kfold-auc= 0.7956044910467658
fold-0,auc=0.8000605426567102
fold-1,auc=0.7937597852530549
fold-2,auc=0.7939315662794131
fold-3,auc=0.7957784723711397
fold-4,auc=0.7943745026940177
kfold-auc= 0.7955809738508671
fold-0,auc=0.8000361759740447
fold-1,auc=0.7937826561404299
fold-2,auc=0.7939616012406994
fold-3,auc=0.7958305873573405
fold-4,auc=0.7944152285482967
kfold-auc= 0.7956052498521623
fold-0,auc=0.8000199885170227
fold-1,auc=0.7937755241285102
fold-2,auc=0.7939864528466694
fold-3,auc=0.7958081047763088
fold-4,auc=0.7944124819043454
kfold-auc= 0.7956005104345711
fold-0,auc=0.8000360797897711
fold-1,auc=0.7937795211901355
fold-2,auc=0.7939790500890425
fold-3,auc=0.7958246416391116
fold-4,auc=0.7944036221073203
kfold-auc= 0.7956045829630762
fold-0,auc=0.8000266786676141
fold-1,auc=0.7938146468832066
fold-2,auc=0.7939467743507843
fold-3,auc=0.7958138652474748
fold-4,auc=

fold-4,auc=0.7941548103648082
kfold-auc= 0.7954078143953813
fold-0,auc=0.7999717930711485
fold-1,auc=0.7936910906547292
fold-2,auc=0.7939870905240587
fold-3,auc=0.7957885647736216
fold-4,auc=0.794352109174074
kfold-auc= 0.7955581296395264
fold-0,auc=0.8000344695937822
fold-1,auc=0.7937802479286128
fold-2,auc=0.7939570555627724
fold-3,auc=0.7958292193790302
fold-4,auc=0.7944160372229748
kfold-auc= 0.7956034059374344
step=0.7956093009327887, score=16
fold-0,auc=0.8001095895115253
fold-1,auc=0.7937839671196438
fold-2,auc=0.7939379715528554
fold-3,auc=0.795823989711948
fold-4,auc=0.794408139285699
kfold-auc= 0.7956127314363343
fold-0,auc=0.8000631752559045
fold-1,auc=0.7937636041925045
fold-2,auc=0.7939867770290294
fold-3,auc=0.7958076487835387
fold-4,auc=0.7943935154550644
kfold-auc= 0.7956029441432083
fold-0,auc=0.8000721417676396
fold-1,auc=0.7937762579918743
fold-2,auc=0.793942114674665
fold-3,auc=0.7958448228816316
fold-4,auc=0.7943983105040375
kfold-auc= 0.7956067295639695
fold-0,auc

fold-1,auc=0.7937753816307694
fold-2,auc=0.7939743227264963
fold-3,auc=0.7958423683580489
fold-4,auc=0.7944102375649302
kfold-auc= 0.7956160474707653
fold-0,auc=0.8000811830893656
fold-1,auc=0.7937778539665696
fold-2,auc=0.7939753487102289
fold-3,auc=0.7958491120636254
fold-4,auc=0.7944024714380645
kfold-auc= 0.7956171938535708
fold-0,auc=0.8001079650660142
fold-1,auc=0.7937528064262063
fold-2,auc=0.7939436429629334
fold-3,auc=0.7957977594403373
fold-4,auc=0.7943687564726258
kfold-auc= 0.7955941860736233
fold-0,auc=0.8000604714091001
fold-1,auc=0.7937679503735943
fold-2,auc=0.7939968729939546
fold-3,auc=0.7958242782698729
fold-4,auc=0.794402895368843
kfold-auc= 0.7956104936830729
fold-0,auc=0.8000684012681077
fold-1,auc=0.7938044618571934
fold-2,auc=0.7939575685546388
fold-3,auc=0.7958308082288384
fold-4,auc=0.7944002306610928
kfold-auc= 0.7956122941139743
fold-0,auc=0.8000702572683517
fold-1,auc=0.7937609822340765
fold-2,auc=0.793990528282052
fold-3,auc=0.7958450758151214
fold-4,auc=0

fold-3,auc=0.7958108906071386
fold-4,auc=0.7943903947545442
kfold-auc= 0.7956143993835414
fold-0,auc=0.8001139641147874
fold-1,auc=0.7937718084999225
fold-2,auc=0.7939600337655522
fold-3,auc=0.7958494219962113
fold-4,auc=0.7943955923596345
kfold-auc= 0.7956181641472215
fold-0,auc=0.8000735916565059
fold-1,auc=0.7937729449194044
fold-2,auc=0.7938719772867151
fold-3,auc=0.7957633569233
fold-4,auc=0.7943363382366271
kfold-auc= 0.7955636418045106
fold-0,auc=0.8001162191016481
fold-1,auc=0.7937753887556565
fold-2,auc=0.793968394820485
fold-3,auc=0.7958436686499323
fold-4,auc=0.7944089230232727
kfold-auc= 0.7956225188701989
fold-0,auc=0.8001162191016481
fold-1,auc=0.7937753887556565
fold-2,auc=0.793968394820485
fold-3,auc=0.7958436686499323
fold-4,auc=0.7944089230232727
kfold-auc= 0.7956225188701989
fold-0,auc=0.8001161051054719
fold-1,auc=0.7937655421617773
fold-2,auc=0.7939642053869098
fold-3,auc=0.7958603337607019
fold-4,auc=0.7943975517035685
kfold-auc= 0.795620747623686
fold-0,auc=0.800

fold-1,auc=0.7938035249345488
fold-2,auc=0.7939733679916338
fold-3,auc=0.795853194623895
fold-4,auc=0.794394940432471
kfold-auc= 0.7956260842019675
fold-0,auc=0.8001076836379544
fold-1,auc=0.7938101332672713
fold-2,auc=0.7939397883990488
fold-3,auc=0.7958444096381838
fold-4,auc=0.7943803914131502
kfold-auc= 0.7956164812711217
fold-0,auc=0.8001573824084023
fold-1,auc=0.79375054071213
fold-2,auc=0.7939408642569906
fold-3,auc=0.7958542099202973
fold-4,auc=0.7943829991218042
kfold-auc= 0.7956171992839248
fold-0,auc=0.7998801935811817
fold-1,auc=0.7935964044685083
fold-2,auc=0.793851001619291
fold-3,auc=0.7956640644976123
fold-4,auc=0.7941469088650891
kfold-auc= 0.7954277146063364
fold-0,auc=0.8000543298651065
fold-1,auc=0.7936822415450346
fold-2,auc=0.7939998298220732
fold-3,auc=0.7958142036796088
fold-4,auc=0.7943367229805269
kfold-auc= 0.79557746557847
fold-0,auc=0.800117758050027
fold-1,auc=0.7937758233737656
fold-2,auc=0.7939677108313299
fold-3,auc=0.7958489695658847
fold-4,auc=0.79440

fold-4,auc=0.7943985527501966
kfold-auc= 0.7956281632565676
fold-0,auc=0.8001130343334751
fold-1,auc=0.7938153593719101
fold-2,auc=0.7939556626473575
fold-3,auc=0.7958814376760929
fold-4,auc=0.7943949974315673
kfold-auc= 0.7956320982920805
fold-0,auc=0.8001091549011036
fold-1,auc=0.7937648367979611
fold-2,auc=0.7939931502404801
fold-3,auc=0.7958991002710466
fold-4,auc=0.794398948181427
kfold-auc= 0.7956330380784036
fold-0,auc=0.8001249041853233
fold-1,auc=0.7937783883330972
fold-2,auc=0.7939645046321651
fold-3,auc=0.7958878821364137
fold-4,auc=0.7944028098701986
kfold-auc= 0.7956316978314396
fold-0,auc=0.8001040963207842
fold-1,auc=0.7938087617265175
fold-2,auc=0.793972651940487
fold-3,auc=0.7958934181736381
fold-4,auc=0.7943870781196304
kfold-auc= 0.7956332012562115
fold-0,auc=0.8001033731575413
fold-1,auc=0.7938114299967113
fold-2,auc=0.7939383420469812
fold-3,auc=0.7958827059059845
fold-4,auc=0.7943690699676553
kfold-auc= 0.7956209842149747
fold-0,auc=0.8001598155142883
fold-1,auc=0

fold-0,auc=0.8000850126484101
fold-1,auc=0.7937997095575425
fold-2,auc=0.7940014400465426
fold-3,auc=0.7958745977845413
fold-4,auc=0.7943929846509805
kfold-auc= 0.7956307489376034
fold-0,auc=0.8000965868226761
fold-1,auc=0.7938452375856815
fold-2,auc=0.7939573227460361
fold-3,auc=0.7958842021322614
fold-4,auc=0.7943915846106786
kfold-auc= 0.7956349867794666
fold-0,auc=0.800093922162057
fold-1,auc=0.7937879107446164
fold-2,auc=0.793995280581703
fold-3,auc=0.7959048037431165
fold-4,auc=0.7943966254682543
kfold-auc= 0.7956357085399495
fold-0,auc=0.8001092688972796
fold-1,auc=0.7938050211608254
fold-2,auc=0.793967657394677
fold-3,auc=0.7958898450427914
fold-4,auc=0.7943982143180626
kfold-auc= 0.7956340013627272
fold-0,auc=0.8000873246333591
fold-1,auc=0.7938435596747853
fold-2,auc=0.7939379501781944
fold-3,auc=0.7958884378776023
fold-4,auc=0.7943635873670836
kfold-auc= 0.7956241719462049
fold-0,auc=0.800144956825195
fold-1,auc=0.7937766498606612
fold-2,auc=0.7939358412116326
fold-3,auc=0.7

fold-3,auc=0.7958871660852669
fold-4,auc=0.7944120472862365
kfold-auc= 0.7956356544053128
fold-0,auc=0.800147080003977
fold-1,auc=0.7937759765588366
fold-2,auc=0.7939443697014108
fold-3,auc=0.7958954416415556
fold-4,auc=0.7944187126180554
kfold-auc= 0.7956363161047671
fold-0,auc=0.7998718077374692
fold-1,auc=0.7936311347303491
fold-2,auc=0.7938596797316967
fold-3,auc=0.7957026172613463
fold-4,auc=0.7941846529541448
kfold-auc= 0.7954499784830013
fold-0,auc=0.800036535774476
fold-1,auc=0.7937154969552597
fold-2,auc=0.7940127116178282
fold-3,auc=0.7958533193094182
fold-4,auc=0.794363327308707
kfold-auc= 0.7955962781931378
fold-0,auc=0.8001054464629962
fold-1,auc=0.7938054165920558
fold-2,auc=0.7939775253632175
fold-3,auc=0.7958895101731007
fold-4,auc=0.7944422532448115
kfold-auc= 0.7956440303672363
step=0.7956478044502304, score=25
fold-0,auc=0.8001284487539275
fold-1,auc=0.7937920289293212
fold-2,auc=0.7939894488616666
fold-3,auc=0.7958846046883787
fold-4,auc=0.7944473119146048
kfold-auc

In [155]:
# Display the column positions currently held in features_prev.
features_prev

array([ 0,  1,  2,  4,  7,  9, 10, 12, 13, 15, 19, 22, 26, 30, 33, 35, 36,
       37])

array(['./oof-result\\oof-024-first.csv',
       './oof-result\\oof-024-keras01.csv',
       './oof-result\\oof-028-lgb.csv',
       './oof-result\\oof-030-keras01.csv',
       './oof-result\\oof-031-lgb-seed-10.csv',
       './oof-result\\oof-031-lgb-seed-12.csv',
       './oof-result\\oof-031-lgb-seed-13.csv',
       './oof-result\\oof-031-lgb-seed-15.csv',
       './oof-result\\oof-031-lgb-seed-16.csv',
       './oof-result\\oof-031-lgb-seed-18.csv',
       './oof-result\\oof-031-lgb-seed-21.csv',
       './oof-result\\oof-031-lgb-seed-24.csv',
       './oof-result\\oof-031-lgb-seed-28.csv',
       './oof-result\\oof-031-lgb-seed-5.csv',
       './oof-result\\oof-031-lgb-seed-8.csv',
       './oof-result\\oof-035-xgb-0.csv',
       './oof-result\\oof-035-xgb-1.csv', 'lgb_avg'], dtype=object)

In [157]:
# Map the positions in features_prev back to their column labels in `features`.
list(features[features_prev])

['./oof-result\\oof-024-first.csv',
 './oof-result\\oof-024-keras01.csv',
 './oof-result\\oof-028-lgb.csv',
 './oof-result\\oof-030-keras01.csv',
 './oof-result\\oof-031-lgb-seed-10.csv',
 './oof-result\\oof-031-lgb-seed-12.csv',
 './oof-result\\oof-031-lgb-seed-13.csv',
 './oof-result\\oof-031-lgb-seed-15.csv',
 './oof-result\\oof-031-lgb-seed-16.csv',
 './oof-result\\oof-031-lgb-seed-18.csv',
 './oof-result\\oof-031-lgb-seed-21.csv',
 './oof-result\\oof-031-lgb-seed-24.csv',
 './oof-result\\oof-031-lgb-seed-28.csv',
 './oof-result\\oof-031-lgb-seed-5.csv',
 './oof-result\\oof-031-lgb-seed-8.csv',
 './oof-result\\oof-035-xgb-0.csv',
 './oof-result\\oof-035-xgb-1.csv',
 'lgb_avg']

## stacking-logit-fs-result

In [158]:
# Logistic-regression stacking restricted to a hand-listed subset of oof_df columns.
nrows = None  # set to a small number (e.g. 1000) to truncate each fold's index arrays

train_app = load_dataframe32("./bindata/application_train")
auc_valid_stack2 = []
pred_test_stack2 = []

# Only this explicit list is used; the former `features = list(oof_df.columns)`
# was overwritten on the very next statement and has been dropped.
features = [
    './oof-result\\oof-024-first.csv',
    './oof-result\\oof-024-keras01.csv',
    './oof-result\\oof-028-lgb.csv',
    './oof-result\\oof-030-keras01.csv',
    './oof-result\\oof-031-lgb-seed-10.csv',
    './oof-result\\oof-031-lgb-seed-12.csv',
    './oof-result\\oof-031-lgb-seed-13.csv',
    './oof-result\\oof-031-lgb-seed-15.csv',
    './oof-result\\oof-031-lgb-seed-16.csv',
    './oof-result\\oof-031-lgb-seed-18.csv',
    './oof-result\\oof-031-lgb-seed-21.csv',
    './oof-result\\oof-031-lgb-seed-24.csv',
    './oof-result\\oof-031-lgb-seed-28.csv',
    './oof-result\\oof-031-lgb-seed-5.csv',
    './oof-result\\oof-031-lgb-seed-8.csv',
    './oof-result\\oof-035-xgb-0.csv',
    './oof-result\\oof-035-xgb-1.csv',
    'lgb_avg'
]

for i in range(5):
    # The per-fold data-matrix offsets and file names built here before were
    # never read by this cell, so only the index files are resolved.
    train_idx_fn = "./neptune-features/train_idx_{}.npy".format(i)
    valid_idx_fn = "./neptune-features/valid_idx_{}.npy".format(i)

    gc.collect()

    train_idx = np.load(train_idx_fn)
    valid_idx = np.load(valid_idx_fn)

    if nrows:
        train_idx, valid_idx = train_idx[:nrows].copy(), valid_idx[:nrows].copy()

    gc.collect()

    X_train = X2rank(oof_df.loc[train_idx][features].values)
    X_valid = X2rank(oof_df.loc[valid_idx][features].values)
    # NOTE(review): unlike train/valid, the test inputs are not passed through
    # X2rank here -- confirm test_df is already on the same scale.
    X_test = test_df[features].values

    y_train = train_app.loc[train_idx].TARGET
    y_valid = train_app.loc[valid_idx].TARGET

    print(train_idx_fn, valid_idx_fn)
    print(X_train.shape, y_train.shape)
    print(X_valid.shape, y_valid.shape)
    print(X_test.shape)

    estimator = LogisticRegression(C=0.08)
    estimator.fit(X_train, y_train)

    pred_valid = estimator.predict_proba(X_valid)[:, 1]
    auc = roc_auc_score(y_valid, pred_valid)
    auc_valid_stack2.append(auc)
    print("fold-{},auc:{}".format(i, auc))

    # One test prediction vector is kept per fold.
    pred_test = estimator.predict_proba(X_test)[:, 1]
    pred_test_stack2.append(pred_test)

print("kfold-auc, avg:{:.8}, std:{:.2}".format(np.mean(auc_valid_stack2), np.std(auc_valid_stack2)))

./neptune-features/train_idx_0.npy ./neptune-features/valid_idx_0.npy
(246008, 18) (246008,)
(61503, 18) (61503,)
(48744, 18)
fold-0,auc:0.8000801713733015
./neptune-features/train_idx_1.npy ./neptune-features/valid_idx_1.npy
(246009, 18) (246009,)
(61502, 18) (61502,)
(48744, 18)
fold-1,auc:0.7938211091557452
./neptune-features/train_idx_2.npy ./neptune-features/valid_idx_2.npy
(246009, 18) (246009,)
(61502, 18) (61502,)
(48744, 18)
fold-2,auc:0.7939944647821376
./neptune-features/train_idx_3.npy ./neptune-features/valid_idx_3.npy
(246009, 18) (246009,)
(61502, 18) (61502,)
(48744, 18)
fold-3,auc:0.7958933754243159
./neptune-features/train_idx_4.npy ./neptune-features/valid_idx_4.npy
(246009, 18) (246009,)
(61502, 18) (61502,)
(48744, 18)
fold-4,auc:0.7944414944443426
kfold-auc, avg:0.79564612, std:0.0023


In [178]:
# Logistic-regression stacking on the hand-listed oof_df columns, with the
# LogisticRegression `C` value exposed as a cell-level setting.
nrows = None  # set to a small number (e.g. 1000) to truncate each fold's index arrays
C = .5

train_app = load_dataframe32("./bindata/application_train")
auc_valid_stack2 = []
pred_test_stack2 = []

# Only this explicit list is used; the former `features = list(oof_df.columns)`
# was overwritten on the very next statement and has been dropped.
features = [
    './oof-result\\oof-024-first.csv',
    './oof-result\\oof-024-keras01.csv',
    './oof-result\\oof-028-lgb.csv',
    './oof-result\\oof-030-keras01.csv',
    './oof-result\\oof-031-lgb-seed-10.csv',
    './oof-result\\oof-031-lgb-seed-12.csv',
    './oof-result\\oof-031-lgb-seed-13.csv',
    './oof-result\\oof-031-lgb-seed-15.csv',
    './oof-result\\oof-031-lgb-seed-16.csv',
    './oof-result\\oof-031-lgb-seed-18.csv',
    './oof-result\\oof-031-lgb-seed-21.csv',
    './oof-result\\oof-031-lgb-seed-24.csv',
    './oof-result\\oof-031-lgb-seed-28.csv',
    './oof-result\\oof-031-lgb-seed-5.csv',
    './oof-result\\oof-031-lgb-seed-8.csv',
    './oof-result\\oof-035-xgb-0.csv',
    './oof-result\\oof-035-xgb-1.csv',
    'lgb_avg'
]

for i in range(5):
    # The per-fold data-matrix offsets and file names built here before were
    # never read by this cell, so only the index files are resolved.
    train_idx_fn = "./neptune-features/train_idx_{}.npy".format(i)
    valid_idx_fn = "./neptune-features/valid_idx_{}.npy".format(i)

    gc.collect()

    train_idx = np.load(train_idx_fn)
    valid_idx = np.load(valid_idx_fn)

    if nrows:
        train_idx, valid_idx = train_idx[:nrows].copy(), valid_idx[:nrows].copy()

    gc.collect()

    X_train = X2rank(oof_df.loc[train_idx][features].values)
    X_valid = X2rank(oof_df.loc[valid_idx][features].values)
    # NOTE(review): unlike train/valid, the test inputs are not passed through
    # X2rank here -- confirm test_df is already on the same scale.
    X_test = test_df[features].values

    y_train = train_app.loc[train_idx].TARGET
    y_valid = train_app.loc[valid_idx].TARGET

    print(train_idx_fn, valid_idx_fn)
    print(X_train.shape, y_train.shape)
    print(X_valid.shape, y_valid.shape)
    print(X_test.shape)

    estimator = LogisticRegression(C=C)
    estimator.fit(X_train, y_train)

    pred_valid = estimator.predict_proba(X_valid)[:, 1]
    auc = roc_auc_score(y_valid, pred_valid)
    auc_valid_stack2.append(auc)
    print("fold-{},auc:{}".format(i, auc))

    # One test prediction vector is kept per fold.
    pred_test = estimator.predict_proba(X_test)[:, 1]
    pred_test_stack2.append(pred_test)

print("kfold-auc, avg:{:.8}, std:{:.2}".format(np.mean(auc_valid_stack2), np.std(auc_valid_stack2)))

./neptune-features/train_idx_0.npy ./neptune-features/valid_idx_0.npy
(246008, 18) (246008,)
(61503, 18) (61503,)
(48744, 18)
fold-0,auc:0.8001245194482286
./neptune-features/train_idx_1.npy ./neptune-features/valid_idx_1.npy
(246009, 18) (246009,)
(61502, 18) (61502,)
(48744, 18)
fold-1,auc:0.7938577203877627
./neptune-features/train_idx_2.npy ./neptune-features/valid_idx_2.npy
(246009, 18) (246009,)
(61502, 18) (61502,)
(48744, 18)
fold-2,auc:0.7939715475829969
./neptune-features/train_idx_3.npy ./neptune-features/valid_idx_3.npy
(246009, 18) (246009,)
(61502, 18) (61502,)
(48744, 18)
fold-3,auc:0.7959402536185475
./neptune-features/train_idx_4.npy ./neptune-features/valid_idx_4.npy
(246009, 18) (246009,)
(61502, 18) (61502,)
(48744, 18)
fold-4,auc:0.7945466007778489
kfold-auc, avg:0.79568813, std:0.0023


* c:.09,  kfold-auc, avg:0.79565171, std:0.0023
* c:.25,  kfold-auc, avg:0.79568481, std:0.0023
* c:.4,  kfold-auc, avg:0.79568798, std:0.0023
* c:.45,  kfold-auc, avg:0.79568796, std:0.0023
* c:.49, kfold-auc, avg:0.79568809, std:0.0023
* c:.5,  kfold-auc, avg:0.79568813, std:0.0023
* c:.51,  kfold-auc, avg:0.79568787, std:0.0023
* c:.55, kfold-auc, avg:0.79568727, std:0.0023
* c:.6,  kfold-auc, avg:0.79568722, std:0.0023
* c:.8,  kfold-auc, avg:0.79568508, std:0.0023
* c:.9,  kfold-auc, avg:0.79568455, std:0.0023
* c:1.0,  kfold-auc, avg:0.79568426, std:0.0023
* c:1.1,  kfold-auc, avg:0.79568152, std:0.0023
* c:2.0,  kfold-auc, avg:0.79567872, std:0.0023
* c:5.0,  kfold-auc, avg:0.79566086, std:0.0023
* c:10.0, kfold-auc, avg:0.79563815, std:0.0023

In [180]:
# Average the per-fold test predictions after applying calculate_rank (defined
# elsewhere in the notebook) to each of them.  The divisor is the actual number
# of stored fold predictions rather than a hard-coded 5, so the mean stays
# correct if the number of folds changes.
pred_target = sum([calculate_rank(p_) for p_ in pred_test_stack2]) / len(pred_test_stack2)

# An existing submission file is read and its TARGET column is overwritten.
pred_sample = pd.read_csv("./result/submission-022-blend08.csv")
pred_sample["TARGET"] = pred_target

pred_sample.to_csv("./result/submission-035-stacking-logit-fs-0.csv", index=False)

In [182]:
# Display the list of per-fold test prediction arrays.
pred_test_stack2

[array([0.03557907, 0.18386796, 0.02318907, ..., 0.00622359, 0.01416234,
        0.20464635]),
 array([0.03512826, 0.18277808, 0.02319036, ..., 0.00606857, 0.01350655,
        0.20539384]),
 array([0.03389851, 0.18531288, 0.02337917, ..., 0.00610189, 0.01383104,
        0.2050081 ]),
 array([0.03431867, 0.18363512, 0.02354724, ..., 0.0060875 , 0.01371794,
        0.20509959]),
 array([0.03462207, 0.18393768, 0.02300943, ..., 0.00610313, 0.01361034,
        0.2057945 ])]

## stacking-logit-fs-result-bld

In [188]:
# Average the per-fold test predictions after applying calculate_rank (defined
# elsewhere in the notebook) to each of them.  The divisor is the actual number
# of stored fold predictions rather than a hard-coded 5.
pred_target = sum([calculate_rank(p_) for p_ in pred_test_stack2]) / len(pred_test_stack2)

# bld_param is the weight of the existing blend08 submission; the stacking
# prediction receives the remaining (1 - bld_param).
bld_param = .4
pred_sample = pd.read_csv("./result/submission-022-blend08.csv")
pred_sample["TARGET"] = pred_target * (1 - bld_param) + pred_sample.TARGET * bld_param
# The weight is embedded in the file name with "." replaced by "-".
pred_sample.to_csv("./result/submission-035-stacking-logit-fs-0-bld{}.csv".format(str(bld_param).replace(".", "-")), index=False)

In [4]:
# Blend two submissions that are already on disk: the stacking result and the
# earlier blend08 file.  bld_param is the weight of the blend08 predictions.
bld_param = .5
pred_target = pd.read_csv("./result/submission-035-stacking-logit-fs-0.csv").TARGET
pred_sample = pd.read_csv("./result/submission-022-blend08.csv")
pred_sample["TARGET"] = bld_param * pred_sample.TARGET + (1 - bld_param) * pred_target
# The weight is embedded in the output file name with "." replaced by "-".
pred_sample.to_csv(
    "./result/submission-035-stacking-logit-fs-0-bld{}.csv".format(
        str(bld_param).replace(".", "-")
    ),
    index=False,
)

In [113]:
# Logistic-regression stacking using every column of oof_df as a feature.
nrows = None  # set to a small number (e.g. 1000) to truncate each fold's index arrays

train_app = load_dataframe32("./bindata/application_train")
auc_valid_stack2 = []
pred_test_stack2 = []

# `features` used to be loaded from a .npy file first and then overwritten by
# this assignment before any use; the dead load (and its file dependency) is gone.
features = list(oof_df.columns)

for i in range(5):
    # The per-fold data-matrix offsets and file names built here before were
    # never read by this cell, so only the index files are resolved.
    train_idx_fn = "./neptune-features/train_idx_{}.npy".format(i)
    valid_idx_fn = "./neptune-features/valid_idx_{}.npy".format(i)

    gc.collect()

    train_idx = np.load(train_idx_fn)
    valid_idx = np.load(valid_idx_fn)

    if nrows:
        train_idx, valid_idx = train_idx[:nrows].copy(), valid_idx[:nrows].copy()

    gc.collect()

    X_train = X2rank(oof_df.loc[train_idx][features].values)
    X_valid = X2rank(oof_df.loc[valid_idx][features].values)
    # NOTE(review): unlike train/valid, the test inputs are not passed through
    # X2rank here -- confirm test_df is already on the same scale.
    X_test = test_df[features].values

    y_train = train_app.loc[train_idx].TARGET
    y_valid = train_app.loc[valid_idx].TARGET

    print(train_idx_fn, valid_idx_fn)
    print(X_train.shape, y_train.shape)
    print(X_valid.shape, y_valid.shape)
    print(X_test.shape)

    estimator = LogisticRegression(C=0.07)
    estimator.fit(X_train, y_train)

    pred_valid = estimator.predict_proba(X_valid)[:, 1]
    auc = roc_auc_score(y_valid, pred_valid)
    auc_valid_stack2.append(auc)
    print("fold-{},auc:{}".format(i, auc))

    # One test prediction vector is kept per fold.
    pred_test = estimator.predict_proba(X_test)[:, 1]
    pred_test_stack2.append(pred_test)

print("kfold-auc, avg:{:.8}, std:{:.2}".format(np.mean(auc_valid_stack2), np.std(auc_valid_stack2)))

./neptune-features/train_idx_0.npy ./neptune-features/valid_idx_0.npy
(246008, 37) (246008,)
(61503, 37) (61503,)
(48744, 37)
fold-0,auc:0.8000226567400222
./neptune-features/train_idx_1.npy ./neptune-features/valid_idx_1.npy
(246009, 37) (246009,)
(61502, 37) (61502,)
(48744, 37)
fold-1,auc:0.7938191284371502
./neptune-features/train_idx_2.npy ./neptune-features/valid_idx_2.npy
(246009, 37) (246009,)
(61502, 37) (61502,)
(48744, 37)
fold-2,auc:0.7939546936627212
./neptune-features/train_idx_3.npy ./neptune-features/valid_idx_3.npy
(246009, 37) (246009,)
(61502, 37) (61502,)
(48744, 37)
fold-3,auc:0.7958049199518052
./neptune-features/train_idx_4.npy ./neptune-features/valid_idx_4.npy
(246009, 37) (246009,)
(61502, 37) (61502,)
(48744, 37)
fold-4,auc:0.7941663384320271
kfold-auc, avg:0.79555355, std:0.0023


./neptune-features/train_idx_0.npy ./neptune-features/valid_idx_0.npy
(246008, 37) (246008,)
(61503, 37) (61503,)
(48744, 37)
fold-0,auc:0.7996818972326608
./neptune-features/train_idx_1.npy ./neptune-features/valid_idx_1.npy
(246009, 37) (246009,)
(61502, 37) (61502,)
(48744, 37)
fold-1,auc:0.7938554689234604
./neptune-features/train_idx_2.npy ./neptune-features/valid_idx_2.npy
(246009, 37) (246009,)
(61502, 37) (61502,)
(48744, 37)
fold-2,auc:0.793828070170376
./neptune-features/train_idx_3.npy ./neptune-features/valid_idx_3.npy
(246009, 37) (246009,)
(61502, 37) (61502,)
(48744, 37)
fold-3,auc:0.7959910006264379
./neptune-features/train_idx_4.npy ./neptune-features/valid_idx_4.npy
(246009, 37) (246009,)
(61502, 37) (61502,)
(48744, 37)
fold-4,auc:0.7941152921788814
kfold-auc, avg:0.7955, std:0.0022

## stacking-lgb

In [100]:
# LightGBM parameter set for the stacking model, built in a single constructor
# call; keys and their order match the former attribute-by-attribute version.
lgb_params = addict.Dict(
    boosting_type="gbdt",
    objective="binary",
    metric="auc",
    learning_rate=0.02,
    max_bin=300,
    max_depth=-1,
    num_leaves=30,
    min_child_samples=70,
    subsample=1.0,
    subsample_freq=1,
    colsample_bytree=0.05,
    min_gain_to_split=0.5,
    reg_lambda=100,
    reg_alpha=0.0,
    scale_pos_weight=1,
    is_unbalance=False,
)

In [108]:
# Shape check of the oof_df-named columns of test_df, truncated to the first `nrows` rows.
# NOTE(review): relies on whatever value `nrows` currently has in the kernel.
test_df[list(oof_df.columns)].values[:nrows, :].shape

(1000, 37)

In [110]:
# LightGBM stacking: for each of the 5 folds, load the saved feature matrices,
# append the oof_df columns, train with early stopping, and collect the
# validation AUC plus validation/test predictions.
# nrows = None  
nrows = None
# nrows = 1000

# NOTE(review): `features` is assigned here but not read anywhere in this cell.
features = list(np.load('./neptune-features/features_246008_1174_0.npy'))
train_app = load_dataframe32("./bindata/application_train")
auc_valid_stack3 = []
pred_valid_stack3 = []
pred_test_stack3 = []

for i in range(5):
    train_idx_fn = "./neptune-features/train_idx_{}.npy".format(i)
    valid_idx_fn = "./neptune-features/valid_idx_{}.npy".format(i)

    # Data files of fold i carry the suffixes 4*i (train), 4*i + 1 (valid) and 4*i + 3 (test).
    train_offset, valid_offset, test_offset = i * 4, i * 4 + 1, i * 4 + 3
    
    train_data_fn = "./neptune-features/data_246009_1174_{}".format(train_offset)
    valid_data_fn = "./neptune-features/data_61502_1174_{}".format(valid_offset)
    test_data_fn = "./neptune-features/data_48744_1174_{}".format(test_offset)
    
    # Fold 0 uses different row counts in its train/valid file names.
    if i == 0:
        train_data_fn = train_data_fn.replace("246009", "246008")
        valid_data_fn = valid_data_fn.replace("61502", "61503")
    
    gc.collect()

    train_idx = np.load(train_idx_fn)
    valid_idx = np.load(valid_idx_fn)
    
    # Optional truncation for quick runs; skipped when nrows is None/0.
    if nrows:
        train_idx, valid_idx = train_idx[:nrows].copy(), valid_idx[:nrows].copy()

    train_data = load_dataframe32(train_data_fn, nrows)
    valid_data = load_dataframe32(valid_data_fn, nrows)
    test_data = load_dataframe32(test_data_fn, nrows)
    gc.collect()
    
    X_train = train_data.values
    X_valid = valid_data.values
    X_test = test_data.values
    
    # Append the oof_df columns to the base features. Train/valid rows go
    # through X2rank; the test_df columns are appended as-is.
    # NOTE(review): confirm test_df is already on the scale X2rank produces.
    X_train = np.hstack([X_train, X2rank(oof_df.loc[train_idx][list(oof_df.columns)].values)])
    X_valid = np.hstack([X_valid, X2rank(oof_df.loc[valid_idx][list(oof_df.columns)].values)])
    X_test = np.hstack([X_test, test_df[list(oof_df.columns)].values[:nrows, :]])
    
    y_train = train_app.loc[train_idx].TARGET
    y_valid = train_app.loc[valid_idx].TARGET
    
    print(train_idx_fn, valid_idx_fn)
    print(train_idx.shape, valid_idx.shape)
    print(train_data_fn, train_data.shape)
    print(valid_data_fn, valid_data.shape)
    print(test_data_fn, test_data.shape)
    
    print(X_train.shape, y_train.shape)
    print(X_valid.shape, y_valid.shape)
    print(X_test.shape)
    
    lgb_data_train = lgb.Dataset(data=X_train, label=y_train)
    lgb_data_valid = lgb.Dataset(data=X_valid, label=y_valid)
    
    # Both the training and the validation set are passed for evaluation.
    # NOTE(review): which set drives early stopping depends on the installed
    # LightGBM version -- confirm it is "data_valid".
    estimator = lgb.train(
        lgb_params,
        lgb_data_train,
        valid_sets=[lgb_data_train, lgb_data_valid],
        valid_names=["data_train", "data_valid"],
        num_boost_round=5000,
        early_stopping_rounds=100,
        verbose_eval=100
    )
    
    pred_valid = estimator.predict(X_valid)
    pred_valid_stack3.append(pred_valid)
    
    auc = roc_auc_score(y_valid, pred_valid)
    auc_valid_stack3.append(auc)
    print("fold-{},auc:{}".format(i, auc))

    # One test prediction vector is kept per fold.
    pred_test = estimator.predict(X_test)
    pred_test_stack3.append(pred_test)
    
print("kfold-auc, avg:{:.4}, std:{:.2}".format(np.mean(auc_valid_stack3), np.std(auc_valid_stack3)))

./neptune-features/train_idx_0.npy ./neptune-features/valid_idx_0.npy
(246008,) (61503,)
./neptune-features/data_246008_1174_0 (246008, 1174)
./neptune-features/data_61503_1174_1 (61503, 1174)
./neptune-features/data_48744_1174_3 (48744, 1174)
(246008, 1211) (246008,)
(61503, 1211) (61503,)
(48744, 1211)
Training until validation scores don't improve for 100 rounds.
[100]	data_train's auc: 0.798951	data_valid's auc: 0.798683
Early stopping, best iteration is:
[20]	data_train's auc: 0.796233	data_valid's auc: 0.7988
fold-0,auc:0.7988002009325101
./neptune-features/train_idx_1.npy ./neptune-features/valid_idx_1.npy
(246009,) (61502,)
./neptune-features/data_246009_1174_4 (246009, 1174)
./neptune-features/data_61502_1174_5 (61502, 1174)
./neptune-features/data_48744_1174_7 (48744, 1174)
(246009, 1211) (246009,)
(61502, 1211) (61502,)
(48744, 1211)
Training until validation scores don't improve for 100 rounds.
[100]	data_train's auc: 0.800736	data_valid's auc: 0.792568
Early stopping, best