## functions

In [1]:
import pandas as pd
import numpy as np
import os
import gc

import addict
import lightgbm as lgb

def save_dataframe(path, dataframe):
    """Persist a DataFrame as two ``.npy`` files next to each other.

    Writes ``<path>.data.npy`` (the value matrix) and ``<path>.header.npy``
    (the column labels).

    Parameters
    ----------
    path : str
        File name prefix; NumPy appends ``.npy`` to each of the two suffixes.
    dataframe : pandas.DataFrame
        Frame whose ``values`` and ``columns`` are stored.
    """
    np.save(path + ".data", dataframe.values)

    header = np.asarray(dataframe.columns)
    if header.dtype == object:
        # String labels arrive as an object array, which np.save would pickle.
        # load_dataframe calls np.load with its default allow_pickle=False and
        # would then refuse the file, so store the labels as plain unicode
        # (the same on-disk form save_dataframe32 produces for its headers).
        header = header.astype(str)
    np.save(path + ".header", header)


def load_dataframe(path):
    """Rebuild a DataFrame from ``<path>.data.npy`` and ``<path>.header.npy``.

    Parameters
    ----------
    path : str
        File name prefix (without the ``.data.npy`` / ``.header.npy`` suffix).

    Returns
    -------
    pandas.DataFrame
        Frame built from the stored value matrix, labelled with the stored header.
    """
    values, column_labels = (
        np.load(path + suffix) for suffix in (".data.npy", ".header.npy")
    )
    return pd.DataFrame(data=values, columns=column_labels)


def save_dataframe32(path, dataframe, keep=()):
    """Persist a DataFrame as ``.npy`` files, downcasting most columns to float32.

    Columns listed in ``keep`` are written unchanged to ``<path>.data64.npy`` /
    ``<path>.header64.npy``; every other column is cast to ``np.float32`` and
    written to ``<path>.data32.npy`` / ``<path>.header32.npy``.

    Parameters
    ----------
    path : str
        File name prefix; NumPy appends ``.npy`` to each suffix.
    dataframe : pandas.DataFrame
        Frame to store.
    keep : collection of column labels, optional
        Columns that must not be downcast. Defaults to an empty tuple (an
        immutable default, so it cannot be shared and mutated between calls).
    """
    # Split the columns in one pass, preserving their original order.
    col64, col32 = [], []
    for col_ in dataframe.columns:
        (col64 if col_ in keep else col32).append(col_)

    np.save(path + ".data64", dataframe[col64].values)
    np.save(path + ".header64", col64)
    np.save(path + ".data32", dataframe[col32].values.astype(np.float32))
    np.save(path + ".header32", col32)


def load_dataframe32(path, nrows=None):
    """Load a frame from the ``.data32``/``.data64`` ``.npy`` pairs under ``path``.

    Each block is read only when its data file exists. The float32 block comes
    first in the result, followed by the float64 block.

    Parameters
    ----------
    path : str
        File name prefix shared by the four ``.npy`` files.
    nrows : int, optional
        When truthy and positive, only the first ``nrows`` rows are returned.

    Returns
    -------
    pandas.DataFrame
        Column-wise concatenation of the blocks that were found; an empty
        frame when neither data file exists.
    """
    block_files = (
        (".data32.npy", ".header32.npy"),
        (".data64.npy", ".header64.npy"),
    )

    result = pd.DataFrame()
    for data_suffix, header_suffix in block_files:
        data_file = path + data_suffix
        if not os.path.exists(data_file):
            continue
        block = pd.DataFrame(
            data=np.load(data_file),
            columns=np.load(path + header_suffix),
        )
        result = pd.concat([result, block], axis=1)

    return result.head(nrows) if nrows and nrows > 0 else result

In [2]:
from sklearn.metrics import roc_auc_score
import xgboost as xgb

In [3]:
import glob

In [4]:
from sklearn.linear_model import LogisticRegression

## load oof & pred

### load lgb_info

In [5]:
# Cross-validation summary of each LightGBM run: out-of-fold file name, mean AUC
# and the std column recorded alongside it.
# NOTE(review): seeds 0, 1 and 2 carry identical auc/std values -- confirm these
# rows are not copy-paste duplicates.
lgb_info = pd.DataFrame([
    ("oof-024-first.csv       ", 0.79453, 0.002204),
    ("oof-031-lgb-seed-0.csv  ", 0.79426, 0.002599),
    ("oof-031-lgb-seed-1.csv  ", 0.79426, 0.002599),
    ("oof-031-lgb-seed-10.csv ", 0.79446, 0.002332),
    ("oof-031-lgb-seed-11.csv ", 0.79442, 0.002256),
    ("oof-031-lgb-seed-12.csv ", 0.79457, 0.002379),
    ("oof-031-lgb-seed-13.csv ", 0.79451, 0.002476),
    ("oof-031-lgb-seed-14.csv ", 0.79454, 0.002633),
    ("oof-031-lgb-seed-15.csv ", 0.79453, 0.002396),
    ("oof-031-lgb-seed-16.csv ", 0.79414, 0.002331),
    ("oof-031-lgb-seed-17.csv ", 0.79441, 0.002233),
    ("oof-031-lgb-seed-18.csv ", 0.79458, 0.002444),
    ("oof-031-lgb-seed-19.csv ", 0.79447, 0.002104),
    ("oof-031-lgb-seed-2.csv  ", 0.79426, 0.002599),
    ("oof-031-lgb-seed-20.csv ", 0.7943, 0.002261),
    ("oof-031-lgb-seed-21.csv ", 0.79471, 0.002242),
    ("oof-031-lgb-seed-22.csv ", 0.79425, 0.002295),
    ("oof-031-lgb-seed-23.csv ", 0.79419, 0.002687),
    ("oof-031-lgb-seed-24.csv ", 0.79452, 0.002525),
    ("oof-031-lgb-seed-25.csv ", 0.79439, 0.002363),
    ("oof-031-lgb-seed-26.csv ", 0.7945, 0.002492),
    ("oof-031-lgb-seed-27.csv ", 0.79446, 0.002419),
    ("oof-031-lgb-seed-28.csv ", 0.79442, 0.00233),
    ("oof-031-lgb-seed-29.csv ", 0.79433, 0.002703),
    ("oof-031-lgb-seed-3.csv  ", 0.79447, 0.002513),
    ("oof-031-lgb-seed-4.csv  ", 0.79448, 0.002551),
    ("oof-031-lgb-seed-5.csv  ", 0.79462, 0.002398),
    ("oof-031-lgb-seed-6.csv  ", 0.79412, 0.002712),
    ("oof-031-lgb-seed-7.csv  ", 0.79418, 0.002622),
    ("oof-031-lgb-seed-8.csv  ", 0.79461, 0.002116),
    ("oof-031-lgb-seed-9.csv  ", 0.7943, 0.002533)
], columns=["file", "auc", "std"])

# The file names are used as column keys of oof_df/test_df, which are the paths
# returned by glob.glob("./oof-result/*.csv"). Joining with os.path.join yields
# the same separator glob uses on the current platform (a backslash on Windows,
# as before), instead of a hard-coded Windows separator.
lgb_info["file"] = lgb_info.file.apply(lambda x: os.path.join("./oof-result", x.strip()))
lgb_info.head()

Unnamed: 0,file,auc,std
0,./oof-result\oof-024-first.csv,0.79453,0.002204
1,./oof-result\oof-031-lgb-seed-0.csv,0.79426,0.002599
2,./oof-result\oof-031-lgb-seed-1.csv,0.79426,0.002599
3,./oof-result\oof-031-lgb-seed-10.csv,0.79446,0.002332
4,./oof-result\oof-031-lgb-seed-11.csv,0.79442,0.002256


### load oof & test

In [6]:
# One column per base model, keyed by the path of its out-of-fold CSV.
oof_df = pd.DataFrame(dict(
    (oof_path, pd.read_csv(oof_path)["oof_pred"])
    for oof_path in glob.glob("./oof-result/*.csv")
))

# Matching test predictions: the submission file path is derived from the
# out-of-fold path, but the column key stays the out-of-fold path so both
# frames share the same column names.
test_df = pd.DataFrame(dict(
    (
        oof_path,
        pd.read_csv(
            oof_path.replace("oof-result", "result").replace("oof", "submission")
        )["TARGET"],
    )
    for oof_path in glob.glob("./oof-result/*.csv")
))

### calculate lgb_avg

In [7]:
def X2rank(X):
    """Rank-transform every column of a 2-D array independently.

    Each column is passed to ``calculate_rank``; the results are stacked back
    so the output has the same row/column layout as ``X``.
    """
    ranked_columns = list(map(calculate_rank, X.T))
    return np.transpose(np.array(ranked_columns))

def calculate_rank(predictions):
    """Map a 1-D array of scores to scaled ranks.

    Uses the default ``pandas.Series.rank`` (ties share their average rank) and
    returns ``(1 + rank) / (n + 1)`` where ``n`` is ``predictions.shape[0]``,
    so the largest score maps to exactly 1.0.
    """
    n_items = predictions.shape[0]
    ordinal = pd.Series(predictions).rank().values
    return (1 + ordinal) / (n_items + 1)

In [8]:
# Files of the LightGBM runs whose cross-validation AUC reached 0.7945.
best_lgb_files = list(lgb_info.loc[lgb_info.auc >= 0.7945, "file"].values)

# Rank-average the selected runs. The test column goes through the same rank
# transform as the out-of-fold column so that "lgb_avg" means the same thing
# (a mean of per-model ranks) in both frames; previously the test column was a
# mean of raw probabilities.
oof_df["lgb_avg"] = X2rank(oof_df[best_lgb_files].values).mean(axis=1)
test_df["lgb_avg"] = X2rank(test_df[best_lgb_files].values).mean(axis=1)

## stacking

### result 035

In [12]:
# Level-2 stacking: a logistic regression fitted on rank-transformed base-model
# predictions, evaluated on the 5 stored folds.
nrows = None  # set to an int (e.g. 1000) to truncate each fold's index arrays for a quick run
C = .5        # inverse regularisation strength passed to LogisticRegression

train_app = load_dataframe32("./bindata/application_train")
auc_valid_stack2 = []
pred_test_stack2 = []

# Column keys of oof_df/test_df are the paths returned by glob.glob, so the
# names are joined with os.path.join to get the same separator on any platform.
features = [os.path.join("./oof-result", name) for name in (
    "oof-024-first.csv",
    "oof-024-keras01.csv",
    "oof-028-lgb.csv",
    "oof-030-keras01.csv",
    "oof-031-lgb-seed-10.csv",
    "oof-031-lgb-seed-12.csv",
    "oof-031-lgb-seed-13.csv",
    "oof-031-lgb-seed-15.csv",
    "oof-031-lgb-seed-16.csv",
    "oof-031-lgb-seed-18.csv",
    "oof-031-lgb-seed-21.csv",
    "oof-031-lgb-seed-24.csv",
    "oof-031-lgb-seed-28.csv",
    "oof-031-lgb-seed-5.csv",
    "oof-031-lgb-seed-8.csv",
    "oof-035-xgb-0.csv",
    "oof-035-xgb-1.csv",
)] + ["lgb_avg"]

for i in range(5):
    train_idx_fn = "./neptune-features/train_idx_{}.npy".format(i)
    valid_idx_fn = "./neptune-features/valid_idx_{}.npy".format(i)

    # Names of the per-fold feature dumps. Nothing in this cell reads them;
    # they are kept only so the notebook namespace stays as it was.
    train_offset, valid_offset, test_offset = i * 4, i * 4 + 1, i * 4 + 3
    train_data_fn = "./neptune-features/data_246009_1174_{}".format(train_offset)
    valid_data_fn = "./neptune-features/data_61502_1174_{}".format(valid_offset)
    test_data_fn = "./neptune-features/data_48744_1174_{}".format(test_offset)
    if i == 0:
        # Fold 0 has one row fewer in train and one more in valid.
        train_data_fn = train_data_fn.replace("246009", "246008")
        valid_data_fn = valid_data_fn.replace("61502", "61503")

    gc.collect()

    train_idx = np.load(train_idx_fn)
    valid_idx = np.load(valid_idx_fn)

    if nrows:
        train_idx, valid_idx = train_idx[:nrows].copy(), valid_idx[:nrows].copy()

    # Every split is rank-transformed column by column. The test matrix must
    # go through the same transform as the matrices the model is fitted and
    # validated on; it was previously passed as raw probabilities.
    X_train = X2rank(oof_df.loc[train_idx, features].values)
    X_valid = X2rank(oof_df.loc[valid_idx, features].values)
    X_test = X2rank(test_df[features].values)

    y_train = train_app.loc[train_idx].TARGET
    y_valid = train_app.loc[valid_idx].TARGET

    print(train_idx_fn, valid_idx_fn)
    print(X_train.shape, y_train.shape)
    print(X_valid.shape, y_valid.shape)
    print(X_test.shape)

    estimator = LogisticRegression(C=C)
    estimator.fit(X_train, y_train)

    pred_valid = estimator.predict_proba(X_valid)[:, 1]
    auc = roc_auc_score(y_valid, pred_valid)
    auc_valid_stack2.append(auc)
    print("fold-{},auc:{}".format(i, auc))

    pred_test = estimator.predict_proba(X_test)[:, 1]
    pred_test_stack2.append(pred_test)

print("kfold-auc, avg:{:.8}, std:{:.2}".format(np.mean(auc_valid_stack2), np.std(auc_valid_stack2)))

./neptune-features/train_idx_0.npy ./neptune-features/valid_idx_0.npy
(246008, 18) (246008,)
(61503, 18) (61503,)
(48744, 18)
fold-0,auc:0.8001245194482286
./neptune-features/train_idx_1.npy ./neptune-features/valid_idx_1.npy
(246009, 18) (246009,)
(61502, 18) (61502,)
(48744, 18)
fold-1,auc:0.7938577203877627
./neptune-features/train_idx_2.npy ./neptune-features/valid_idx_2.npy
(246009, 18) (246009,)
(61502, 18) (61502,)
(48744, 18)
fold-2,auc:0.7939715475829969
./neptune-features/train_idx_3.npy ./neptune-features/valid_idx_3.npy
(246009, 18) (246009,)
(61502, 18) (61502,)
(48744, 18)
fold-3,auc:0.7959402536185475
./neptune-features/train_idx_4.npy ./neptune-features/valid_idx_4.npy
(246009, 18) (246009,)
(61502, 18) (61502,)
(48744, 18)
fold-4,auc:0.7945466007778489
kfold-auc, avg:0.79568813, std:0.0023


### add 038-keras01

In [9]:
# Level-2 stacking as in the previous experiment, with the 036/038 keras
# predictions added to the feature list.
nrows = None  # set to an int (e.g. 1000) to truncate each fold's index arrays for a quick run
C = .5        # inverse regularisation strength passed to LogisticRegression

train_app = load_dataframe32("./bindata/application_train")
auc_valid_stack2 = []
pred_test_stack2 = []

# Column keys of oof_df/test_df are the paths returned by glob.glob, so the
# names are joined with os.path.join to get the same separator on any platform.
features = [os.path.join("./oof-result", name) for name in (
    "oof-024-first.csv",
    "oof-024-keras01.csv",
    "oof-028-lgb.csv",
    "oof-030-keras01.csv",
    "oof-036-keras01.csv",
    "oof-038-keras01.csv",
    "oof-031-lgb-seed-10.csv",
    "oof-031-lgb-seed-12.csv",
    "oof-031-lgb-seed-13.csv",
    "oof-031-lgb-seed-15.csv",
    "oof-031-lgb-seed-16.csv",
    "oof-031-lgb-seed-18.csv",
    "oof-031-lgb-seed-21.csv",
    "oof-031-lgb-seed-24.csv",
    "oof-031-lgb-seed-28.csv",
    "oof-031-lgb-seed-5.csv",
    "oof-031-lgb-seed-8.csv",
    "oof-035-xgb-0.csv",
    "oof-035-xgb-1.csv",
)] + ["lgb_avg"]

for i in range(5):
    train_idx_fn = "./neptune-features/train_idx_{}.npy".format(i)
    valid_idx_fn = "./neptune-features/valid_idx_{}.npy".format(i)

    # Names of the per-fold feature dumps. Nothing in this cell reads them;
    # they are kept only so the notebook namespace stays as it was.
    train_offset, valid_offset, test_offset = i * 4, i * 4 + 1, i * 4 + 3
    train_data_fn = "./neptune-features/data_246009_1174_{}".format(train_offset)
    valid_data_fn = "./neptune-features/data_61502_1174_{}".format(valid_offset)
    test_data_fn = "./neptune-features/data_48744_1174_{}".format(test_offset)
    if i == 0:
        # Fold 0 has one row fewer in train and one more in valid.
        train_data_fn = train_data_fn.replace("246009", "246008")
        valid_data_fn = valid_data_fn.replace("61502", "61503")

    gc.collect()

    train_idx = np.load(train_idx_fn)
    valid_idx = np.load(valid_idx_fn)

    if nrows:
        train_idx, valid_idx = train_idx[:nrows].copy(), valid_idx[:nrows].copy()

    # Every split is rank-transformed column by column. The test matrix must
    # go through the same transform as the matrices the model is fitted and
    # validated on; it was previously passed as raw probabilities.
    X_train = X2rank(oof_df.loc[train_idx, features].values)
    X_valid = X2rank(oof_df.loc[valid_idx, features].values)
    X_test = X2rank(test_df[features].values)

    y_train = train_app.loc[train_idx].TARGET
    y_valid = train_app.loc[valid_idx].TARGET

    print(train_idx_fn, valid_idx_fn)
    print(X_train.shape, y_train.shape)
    print(X_valid.shape, y_valid.shape)
    print(X_test.shape)

    estimator = LogisticRegression(C=C)
    estimator.fit(X_train, y_train)

    pred_valid = estimator.predict_proba(X_valid)[:, 1]
    auc = roc_auc_score(y_valid, pred_valid)
    auc_valid_stack2.append(auc)
    print("fold-{},auc:{}".format(i, auc))

    pred_test = estimator.predict_proba(X_test)[:, 1]
    pred_test_stack2.append(pred_test)

print("kfold-auc, avg:{:.8}, std:{:.2}".format(np.mean(auc_valid_stack2), np.std(auc_valid_stack2)))

./neptune-features/train_idx_0.npy ./neptune-features/valid_idx_0.npy
(246008, 20) (246008,)
(61503, 20) (61503,)
(48744, 20)
fold-0,auc:0.8001663132963324
./neptune-features/train_idx_1.npy ./neptune-features/valid_idx_1.npy
(246009, 20) (246009,)
(61502, 20) (61502,)
(48744, 20)
fold-1,auc:0.7939032840403366
./neptune-features/train_idx_2.npy ./neptune-features/valid_idx_2.npy
(246009, 20) (246009,)
(61502, 20) (61502,)
(48744, 20)
fold-2,auc:0.7940744345141926
./neptune-features/train_idx_3.npy ./neptune-features/valid_idx_3.npy
(246009, 20) (246009,)
(61502, 20) (61502,)
(48744, 20)
fold-3,auc:0.7960057562674826
./neptune-features/train_idx_4.npy ./neptune-features/valid_idx_4.npy
(246009, 20) (246009,)
(61502, 20) (61502,)
(48744, 20)
fold-4,auc:0.7944858789281128
kfold-auc, avg:0.79572713, std:0.0023


### add features

In [31]:
def add_top_ratio(X, th):
    """Append one column holding, per row, the fraction of entries above ``th``.

    The fraction is the count of ``X > th`` in the row divided by the number
    of columns of ``X``. Returns a new array with one more column than ``X``.
    """
    above = np.sum(X > th, axis=1)
    share = above.astype("float").reshape((X.shape[0], 1)) / X.shape[1]
    return np.hstack([X, share])

def add_bottom_ratio(X, th):
    """Append one column holding, per row, the fraction of entries below ``th``.

    The fraction is the count of ``X < th`` in the row divided by the number
    of columns of ``X``. Returns a new array with one more column than ``X``.
    """
    below = np.sum(X < th, axis=1)
    share = below.astype("float").reshape((X.shape[0], 1)) / X.shape[1]
    return np.hstack([X, share])

def add_median(X):
    """Append one column holding the median of each row of ``X``."""
    row_median = np.median(X, axis=1, keepdims=True)
    return np.hstack([X, row_median])

In [11]:
# Small 8x3 integer matrix used to sanity-check the feature helpers by eye.
X = np.array([
   [1, 1, 3],
   [1, 1, 3],
   [1, 2, 3],
   [1, 2, 1],
   [1, 1, 1],
   [1, 1, 2],
   [1, 1, 2],
   [1, 1, 3]
])

# Appends, per row, the share of the three entries that are greater than 1.5.
add_top_ratio(X, 1.5)

array([[1.        , 1.        , 3.        , 0.33333333],
       [1.        , 1.        , 3.        , 0.33333333],
       [1.        , 2.        , 3.        , 0.66666667],
       [1.        , 2.        , 1.        , 0.33333333],
       [1.        , 1.        , 1.        , 0.        ],
       [1.        , 1.        , 2.        , 0.33333333],
       [1.        , 1.        , 2.        , 0.33333333],
       [1.        , 1.        , 3.        , 0.33333333]])

In [32]:
# Appends the per-row median of the demo matrix X as a fourth column.
add_median(X)

array([[1., 1., 3., 1.],
       [1., 1., 3., 1.],
       [1., 2., 3., 2.],
       [1., 2., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 2., 1.],
       [1., 1., 2., 1.],
       [1., 1., 3., 1.]])

In [21]:
# Level-2 stacking with two extra meta-features: the share of base models whose
# rank is above top_th and the share whose rank is below bottom_th.
nrows = None  # set to an int (e.g. 1000) to truncate each fold's index arrays for a quick run
C = .51       # inverse regularisation strength passed to LogisticRegression

train_app = load_dataframe32("./bindata/application_train")
auc_valid_stack2 = []
pred_test_stack2 = []

# Column keys of oof_df/test_df are the paths returned by glob.glob, so the
# names are joined with os.path.join to get the same separator on any platform.
features = [os.path.join("./oof-result", name) for name in (
    "oof-024-first.csv",
    "oof-024-keras01.csv",
    "oof-028-lgb.csv",
    "oof-030-keras01.csv",
    "oof-036-keras01.csv",
    "oof-038-keras01.csv",
    "oof-031-lgb-seed-10.csv",
    "oof-031-lgb-seed-12.csv",
    "oof-031-lgb-seed-13.csv",
    "oof-031-lgb-seed-15.csv",
    "oof-031-lgb-seed-16.csv",
    "oof-031-lgb-seed-18.csv",
    "oof-031-lgb-seed-21.csv",
    "oof-031-lgb-seed-24.csv",
    "oof-031-lgb-seed-28.csv",
    "oof-031-lgb-seed-5.csv",
    "oof-031-lgb-seed-8.csv",
    "oof-035-xgb-0.csv",
    "oof-035-xgb-1.csv",
)] + ["lgb_avg"]

for i in range(5):
    train_idx_fn = "./neptune-features/train_idx_{}.npy".format(i)
    valid_idx_fn = "./neptune-features/valid_idx_{}.npy".format(i)

    # Names of the per-fold feature dumps. Nothing in this cell reads them;
    # they are kept only so the notebook namespace stays as it was.
    train_offset, valid_offset, test_offset = i * 4, i * 4 + 1, i * 4 + 3
    train_data_fn = "./neptune-features/data_246009_1174_{}".format(train_offset)
    valid_data_fn = "./neptune-features/data_61502_1174_{}".format(valid_offset)
    test_data_fn = "./neptune-features/data_48744_1174_{}".format(test_offset)
    if i == 0:
        # Fold 0 has one row fewer in train and one more in valid.
        train_data_fn = train_data_fn.replace("246009", "246008")
        valid_data_fn = valid_data_fn.replace("61502", "61503")

    gc.collect()

    train_idx = np.load(train_idx_fn)
    valid_idx = np.load(valid_idx_fn)

    if nrows:
        train_idx, valid_idx = train_idx[:nrows].copy(), valid_idx[:nrows].copy()

    # Every split is rank-transformed column by column. The test matrix must
    # go through the same transform as the matrices the model is fitted and
    # validated on; it was previously passed as raw probabilities, which also
    # made the rank thresholds below meaningless for it.
    X_train = X2rank(oof_df.loc[train_idx, features].values)
    X_valid = X2rank(oof_df.loc[valid_idx, features].values)
    X_test = X2rank(test_df[features].values)

    top_th = .9
    X_train = add_top_ratio(X_train, top_th)
    X_valid = add_top_ratio(X_valid, top_th)
    X_test = add_top_ratio(X_test, top_th)

    # Note: this runs on the already-augmented matrix, so the top-ratio column
    # added above takes part in the count and in the denominator.
    bottom_th = .32
    X_train = add_bottom_ratio(X_train, bottom_th)
    X_valid = add_bottom_ratio(X_valid, bottom_th)
    X_test = add_bottom_ratio(X_test, bottom_th)

    y_train = train_app.loc[train_idx].TARGET
    y_valid = train_app.loc[valid_idx].TARGET

    print(train_idx_fn, valid_idx_fn)
    print(X_train.shape, y_train.shape)
    print(X_valid.shape, y_valid.shape)
    print(X_test.shape)

    estimator = LogisticRegression(C=C)
    estimator.fit(X_train, y_train)

    pred_valid = estimator.predict_proba(X_valid)[:, 1]
    auc = roc_auc_score(y_valid, pred_valid)
    auc_valid_stack2.append(auc)
    print("fold-{},auc:{}".format(i, auc))

    pred_test = estimator.predict_proba(X_test)[:, 1]
    pred_test_stack2.append(pred_test)

print("kfold-auc, avg:{:.8}, std:{:.2}".format(np.mean(auc_valid_stack2), np.std(auc_valid_stack2)))

./neptune-features/train_idx_0.npy ./neptune-features/valid_idx_0.npy
(246008, 22) (246008,)
(61503, 22) (61503,)
(48744, 22)
fold-0,auc:0.8003766184295409
./neptune-features/train_idx_1.npy ./neptune-features/valid_idx_1.npy
(246009, 22) (246009,)
(61502, 22) (61502,)
(48744, 22)
fold-1,auc:0.7939175658763938
./neptune-features/train_idx_2.npy ./neptune-features/valid_idx_2.npy
(246009, 22) (246009,)
(61502, 22) (61502,)
(48744, 22)
fold-2,auc:0.7942887190541441
./neptune-features/train_idx_3.npy ./neptune-features/valid_idx_3.npy
(246009, 22) (246009,)
(61502, 22) (61502,)
(48744, 22)
fold-3,auc:0.7959949086269753
./neptune-features/train_idx_4.npy ./neptune-features/valid_idx_4.npy
(246009, 22) (246009,)
(61502, 22) (61502,)
(48744, 22)
fold-4,auc:0.7946382980739596
kfold-auc, avg:0.79584322, std:0.0024


In [42]:
# Level-2 stacking with three extra meta-features: top ratio, bottom ratio and
# the per-row median.
nrows = None  # set to an int (e.g. 1000) to truncate each fold's index arrays for a quick run
C = .4        # inverse regularisation strength passed to LogisticRegression

train_app = load_dataframe32("./bindata/application_train")
auc_valid_stack2 = []
pred_test_stack2 = []

# Column keys of oof_df/test_df are the paths returned by glob.glob, so the
# names are joined with os.path.join to get the same separator on any platform.
features = [os.path.join("./oof-result", name) for name in (
    "oof-024-first.csv",
    "oof-024-keras01.csv",
    "oof-028-lgb.csv",
    "oof-030-keras01.csv",
    "oof-036-keras01.csv",
    "oof-038-keras01.csv",
    "oof-031-lgb-seed-10.csv",
    "oof-031-lgb-seed-12.csv",
    "oof-031-lgb-seed-13.csv",
    "oof-031-lgb-seed-15.csv",
    "oof-031-lgb-seed-16.csv",
    "oof-031-lgb-seed-18.csv",
    "oof-031-lgb-seed-21.csv",
    "oof-031-lgb-seed-24.csv",
    "oof-031-lgb-seed-28.csv",
    "oof-031-lgb-seed-5.csv",
    "oof-031-lgb-seed-8.csv",
    "oof-035-xgb-0.csv",
    "oof-035-xgb-1.csv",
)] + ["lgb_avg"]

for i in range(5):
    train_idx_fn = "./neptune-features/train_idx_{}.npy".format(i)
    valid_idx_fn = "./neptune-features/valid_idx_{}.npy".format(i)

    # Names of the per-fold feature dumps. Nothing in this cell reads them;
    # they are kept only so the notebook namespace stays as it was.
    train_offset, valid_offset, test_offset = i * 4, i * 4 + 1, i * 4 + 3
    train_data_fn = "./neptune-features/data_246009_1174_{}".format(train_offset)
    valid_data_fn = "./neptune-features/data_61502_1174_{}".format(valid_offset)
    test_data_fn = "./neptune-features/data_48744_1174_{}".format(test_offset)
    if i == 0:
        # Fold 0 has one row fewer in train and one more in valid.
        train_data_fn = train_data_fn.replace("246009", "246008")
        valid_data_fn = valid_data_fn.replace("61502", "61503")

    gc.collect()

    train_idx = np.load(train_idx_fn)
    valid_idx = np.load(valid_idx_fn)

    if nrows:
        train_idx, valid_idx = train_idx[:nrows].copy(), valid_idx[:nrows].copy()

    # Every split is rank-transformed column by column. The test matrix must
    # go through the same transform as the matrices the model is fitted and
    # validated on; it was previously passed as raw probabilities, which also
    # made the rank thresholds below meaningless for it.
    X_train = X2rank(oof_df.loc[train_idx, features].values)
    X_valid = X2rank(oof_df.loc[valid_idx, features].values)
    X_test = X2rank(test_df[features].values)

    top_th = .9
    X_train = add_top_ratio(X_train, top_th)
    X_valid = add_top_ratio(X_valid, top_th)
    X_test = add_top_ratio(X_test, top_th)

    # Note: each helper below runs on the already-augmented matrix, so columns
    # added by the previous helpers take part in its computation.
    bottom_th = .32
    X_train = add_bottom_ratio(X_train, bottom_th)
    X_valid = add_bottom_ratio(X_valid, bottom_th)
    X_test = add_bottom_ratio(X_test, bottom_th)

    X_train = add_median(X_train)
    X_valid = add_median(X_valid)
    X_test = add_median(X_test)

    y_train = train_app.loc[train_idx].TARGET
    y_valid = train_app.loc[valid_idx].TARGET

    print(train_idx_fn, valid_idx_fn)
    print(X_train.shape, y_train.shape)
    print(X_valid.shape, y_valid.shape)
    print(X_test.shape)

    estimator = LogisticRegression(C=C)
    estimator.fit(X_train, y_train)

    pred_valid = estimator.predict_proba(X_valid)[:, 1]
    auc = roc_auc_score(y_valid, pred_valid)
    auc_valid_stack2.append(auc)
    print("fold-{},auc:{}".format(i, auc))

    pred_test = estimator.predict_proba(X_test)[:, 1]
    pred_test_stack2.append(pred_test)

print("kfold-auc, avg:{:.8}, std:{:.2}".format(np.mean(auc_valid_stack2), np.std(auc_valid_stack2)))

./neptune-features/train_idx_0.npy ./neptune-features/valid_idx_0.npy
(246008, 23) (246008,)
(61503, 23) (61503,)
(48744, 23)
fold-0,auc:0.800379290214921
./neptune-features/train_idx_1.npy ./neptune-features/valid_idx_1.npy
(246009, 23) (246009,)
(61502, 23) (61502,)
(48744, 23)
fold-1,auc:0.7939329306952798
./neptune-features/train_idx_2.npy ./neptune-features/valid_idx_2.npy
(246009, 23) (246009,)
(61502, 23) (61502,)
(48744, 23)
fold-2,auc:0.7943042049961098
./neptune-features/train_idx_3.npy ./neptune-features/valid_idx_3.npy
(246009, 23) (246009,)
(61502, 23) (61502,)
(48744, 23)
fold-3,auc:0.7960194894872381
./neptune-features/train_idx_4.npy ./neptune-features/valid_idx_4.npy
(246009, 23) (246009,)
(61502, 23) (61502,)
(48744, 23)
fold-4,auc:0.7946462458854445
kfold-auc, avg:0.79585643, std:0.0024


### feature selection

In [43]:
# Cache the (train, valid) row-index arrays of the 5 stored folds so later
# cells can iterate over them without re-reading the .npy files.
kfold_index_list = []

for i in range(5):
    train_idx_fn = "./neptune-features/train_idx_{}.npy".format(i)
    valid_idx_fn = "./neptune-features/valid_idx_{}.npy".format(i)

    # Names of the per-fold feature dumps; nothing in this cell reads them.
    train_offset = i * 4
    valid_offset = i * 4 + 1
    test_offset = i * 4 + 3

    train_data_fn = "./neptune-features/data_246009_1174_{}".format(train_offset)
    valid_data_fn = "./neptune-features/data_61502_1174_{}".format(valid_offset)
    test_data_fn = "./neptune-features/data_48744_1174_{}".format(test_offset)

    if i == 0:
        # The first fold's dumps carry different row counts in their names.
        valid_data_fn = valid_data_fn.replace("61502", "61503")
        train_data_fn = train_data_fn.replace("246009", "246008")

    gc.collect()

    train_idx = np.load(train_idx_fn)
    valid_idx = np.load(valid_idx_fn)
    fold_indices = (train_idx, valid_idx)
    kfold_index_list.append(fold_indices)

In [47]:
def cv_score(features, C=0.4, top_th=.9, bottom_th=.32):
    """Return the mean 5-fold validation AUC of the stacker for a feature set.

    Reads the notebook globals ``oof_df``, ``train_app`` and
    ``kfold_index_list`` and prints the AUC of every fold.

    Parameters
    ----------
    features : sequence of str
        Column names of ``oof_df`` to stack, optionally mixed with the pseudo
        features ``"top"``, ``"bot"`` and ``"med"``, which switch on
        ``add_top_ratio``, ``add_bottom_ratio`` and ``add_median``.
    C : float, optional
        Inverse regularisation strength of the logistic regression.
    top_th, bottom_th : float, optional
        Rank thresholds used by the ``"top"`` / ``"bot"`` pseudo features.

    Returns
    -------
    float
        Mean of the per-fold validation AUCs.

    Raises
    ------
    ValueError
        If ``features`` contains no real model column.
    """
    # Everything below depends only on `features`, so it is resolved once
    # instead of once per fold.
    pseudo_features = ("top", "bot", "med")
    model_preds = [col_ for col_ in features if col_ not in pseudo_features]
    if not model_preds:
        raise ValueError("cv_score needs at least one model prediction column")
    use_top = "top" in features
    use_bot = "bot" in features
    use_med = "med" in features

    auc_valid_stack2 = []
    for i, (train_idx, valid_idx) in enumerate(kfold_index_list):
        gc.collect()
        X_train = X2rank(oof_df.loc[train_idx, model_preds].values)
        X_valid = X2rank(oof_df.loc[valid_idx, model_preds].values)

        y_train = train_app.loc[train_idx].TARGET
        y_valid = train_app.loc[valid_idx].TARGET

        # Order matters: each helper sees the columns added by the previous one.
        if use_top:
            X_train = add_top_ratio(X_train, top_th)
            X_valid = add_top_ratio(X_valid, top_th)

        if use_bot:
            X_train = add_bottom_ratio(X_train, bottom_th)
            X_valid = add_bottom_ratio(X_valid, bottom_th)

        if use_med:
            X_train = add_median(X_train)
            X_valid = add_median(X_valid)

        estimator = LogisticRegression(C=C)
        estimator.fit(X_train, y_train)

        pred_valid = estimator.predict_proba(X_valid)[:, 1]
        auc = roc_auc_score(y_valid, pred_valid)
        auc_valid_stack2.append(auc)
        print("fold-{},auc={}".format(i, auc))
    return np.mean(auc_valid_stack2)

In [48]:
# Greedy backward elimination: at every step try dropping each remaining
# feature, and permanently drop the one whose removal gives the best CV score,
# as long as that score is not worse than the current one.
# Column keys of oof_df are the paths returned by glob.glob, so the names are
# joined with os.path.join to get the same separator on any platform.
features = np.array(
    [os.path.join("./oof-result", name) for name in (
        "oof-024-first.csv",
        "oof-024-keras01.csv",
        "oof-028-lgb.csv",
        "oof-030-keras01.csv",
        "oof-036-keras01.csv",
        "oof-038-keras01.csv",
        "oof-031-lgb-seed-10.csv",
        "oof-031-lgb-seed-12.csv",
        "oof-031-lgb-seed-13.csv",
        "oof-031-lgb-seed-15.csv",
        "oof-031-lgb-seed-16.csv",
        "oof-031-lgb-seed-18.csv",
        "oof-031-lgb-seed-21.csv",
        "oof-031-lgb-seed-24.csv",
        "oof-031-lgb-seed-28.csv",
        "oof-031-lgb-seed-5.csv",
        "oof-031-lgb-seed-8.csv",
        "oof-035-xgb-0.csv",
        "oof-035-xgb-1.csv",
    )]
    + ["lgb_avg", "top", "bot", "med"]
)
max_step = -1  # negative (or too large) means "at most one removal per feature"

ftr_cnt = len(features)
max_step = ftr_cnt if max_step < 0 or max_step > ftr_cnt else max_step

features_prev = np.arange(ftr_cnt)  # indices into `features` that are still in use
features_removed = []
# Score to beat: the k-fold average printed by the full-feature run (0.79585643).
score_val_prev = 0.795856

for step_ in range(max_step):
    if len(features_prev) <= 1:
        # Removing the last feature would leave nothing to fit on.
        break

    metrics = []
    for col_rm_ in features_prev:
        features_next = features_prev[features_prev != col_rm_]
        cv_auc = cv_score(features[features_next])
        print("kfold-auc=", cv_auc)
        metrics.append(cv_auc)
    remove_trials = [(score_val, col_rm_) for score_val, col_rm_ in zip(metrics, features_prev) if score_val >= score_val_prev]

    if len(remove_trials) == 0:
        break

    score_val_prev, col_rm_prev = max(remove_trials)
    # The original message printed the score under "step" and the removed
    # column index under "score"; label each value correctly.
    print("step={}, score={}, removed={}".format(step_, score_val_prev, features[col_rm_prev]))
    features_prev = features_prev[features_prev != col_rm_prev]
    features_removed.append(col_rm_prev)

fold-0,auc=0.8004276388431568
fold-1,auc=0.793936553700336
fold-2,auc=0.7942862217812391
fold-3,auc=0.7959958063627415
fold-4,auc=0.7946558146087295
kfold-auc= 0.7958604070592405
fold-0,auc=0.8003463239457126
fold-1,auc=0.7939226922326137
fold-2,auc=0.7943181056507105
fold-3,auc=0.7959456435955878
fold-4,auc=0.7946356084291045
kfold-auc= 0.7958336747707458
fold-0,auc=0.8003743064445922
fold-1,auc=0.7939243202693007
fold-2,auc=0.7943025555847615
fold-3,auc=0.7960225745633233
fold-4,auc=0.7946390675617591
kfold-auc= 0.7958525648847473
fold-0,auc=0.8003143943292317
fold-1,auc=0.7938542612551084
fold-2,auc=0.7941452238293059
fold-3,auc=0.7959166702424693
fold-4,auc=0.7945833972569292
kfold-auc= 0.7957627893826089
fold-0,auc=0.800299215025893
fold-1,auc=0.7938813572004936
fold-2,auc=0.7942132629380243
fold-3,auc=0.7959780333320385
fold-4,auc=0.7946878374134978
kfold-auc= 0.7958119411819895
fold-0,auc=0.800334097855814
fold-1,auc=0.793951049283004
fold-2,auc=0.7942666034047947
fold-3,auc=0.7

fold-3,auc=0.7959187507094828
fold-4,auc=0.7946553123041936
kfold-auc= 0.7958480368971396
fold-0,auc=0.8004362740535049
fold-1,auc=0.7939176798745862
fold-2,auc=0.7942968271755874
fold-3,auc=0.7959874809322437
fold-4,auc=0.7946447639089418
kfold-auc= 0.7958566051889727
fold-0,auc=0.8003846230985393
fold-1,auc=0.7938558109180379
fold-2,auc=0.7941407066509271
fold-3,auc=0.7958881208201293
fold-4,auc=0.7945919079344897
kfold-auc= 0.7957722338844246
fold-0,auc=0.8003558390640458
fold-1,auc=0.7938848733322443
fold-2,auc=0.7942089096320475
fold-3,auc=0.7959438481240555
fold-4,auc=0.7947000744069764
kfold-auc= 0.795818708911874
fold-0,auc=0.800397123491737
fold-1,auc=0.7939468598494288
fold-2,auc=0.7942641524436553
fold-3,auc=0.7959946770681467
fold-4,auc=0.7946957496005477
kfold-auc= 0.7958597124907032
fold-0,auc=0.8004405667220155
fold-1,auc=0.7939001134656072
fold-2,auc=0.7943202538041507
fold-3,auc=0.7960059272647714
fold-4,auc=0.7946217398364956
kfold-auc= 0.7958577202186081
fold-0,auc=0

In [50]:
# Rebuild the candidate list (same order as in the elimination cell) and show
# the names that survived, i.e. the entries still indexed by features_prev.
# Names are joined with os.path.join so they match the glob-derived column keys
# of oof_df on any platform.
features = np.array(
    [os.path.join("./oof-result", name) for name in (
        "oof-024-first.csv",
        "oof-024-keras01.csv",
        "oof-028-lgb.csv",
        "oof-030-keras01.csv",
        "oof-036-keras01.csv",
        "oof-038-keras01.csv",
        "oof-031-lgb-seed-10.csv",
        "oof-031-lgb-seed-12.csv",
        "oof-031-lgb-seed-13.csv",
        "oof-031-lgb-seed-15.csv",
        "oof-031-lgb-seed-16.csv",
        "oof-031-lgb-seed-18.csv",
        "oof-031-lgb-seed-21.csv",
        "oof-031-lgb-seed-24.csv",
        "oof-031-lgb-seed-28.csv",
        "oof-031-lgb-seed-5.csv",
        "oof-031-lgb-seed-8.csv",
        "oof-035-xgb-0.csv",
        "oof-035-xgb-1.csv",
    )]
    + ["lgb_avg", "top", "bot", "med"]
)

features[features_prev]

array(['./oof-result\\oof-024-keras01.csv',
       './oof-result\\oof-028-lgb.csv',
       './oof-result\\oof-030-keras01.csv',
       './oof-result\\oof-036-keras01.csv',
       './oof-result\\oof-038-keras01.csv',
       './oof-result\\oof-031-lgb-seed-10.csv',
       './oof-result\\oof-031-lgb-seed-12.csv',
       './oof-result\\oof-031-lgb-seed-13.csv',
       './oof-result\\oof-031-lgb-seed-16.csv',
       './oof-result\\oof-031-lgb-seed-18.csv',
       './oof-result\\oof-031-lgb-seed-21.csv',
       './oof-result\\oof-031-lgb-seed-28.csv',
       './oof-result\\oof-031-lgb-seed-5.csv',
       './oof-result\\oof-031-lgb-seed-8.csv',
       './oof-result\\oof-035-xgb-0.csv',
       './oof-result\\oof-035-xgb-1.csv', 'lgb_avg', 'top', 'bot', 'med'],
      dtype='<U36')

In [49]:
# Indices (into the candidate feature array) that survived backward elimination.
features_prev

array([ 1,  2,  3,  4,  5,  6,  7,  8, 10, 11, 12, 14, 15, 16, 17, 18, 19,
       20, 21, 22])

### feature selection result

In [52]:
# Level-2 stacking on the feature set kept by backward elimination, with the
# top-ratio, bottom-ratio and median meta-features.
# (A stray list literal that opened this cell was a no-op and has been removed.)
nrows = None  # set to an int (e.g. 1000) to truncate each fold's index arrays for a quick run
C = .4        # inverse regularisation strength passed to LogisticRegression

train_app = load_dataframe32("./bindata/application_train")
auc_valid_stack2 = []
pred_test_stack2 = []

# Column keys of oof_df/test_df are the paths returned by glob.glob, so the
# names are joined with os.path.join to get the same separator on any platform.
features = [os.path.join("./oof-result", name) for name in (
    "oof-024-keras01.csv",
    "oof-028-lgb.csv",
    "oof-030-keras01.csv",
    "oof-036-keras01.csv",
    "oof-038-keras01.csv",
    "oof-031-lgb-seed-10.csv",
    "oof-031-lgb-seed-12.csv",
    "oof-031-lgb-seed-13.csv",
    "oof-031-lgb-seed-16.csv",
    "oof-031-lgb-seed-18.csv",
    "oof-031-lgb-seed-21.csv",
    "oof-031-lgb-seed-28.csv",
    "oof-031-lgb-seed-5.csv",
    "oof-031-lgb-seed-8.csv",
    "oof-035-xgb-0.csv",
    "oof-035-xgb-1.csv",
)] + ["lgb_avg"]

for i in range(5):
    train_idx_fn = "./neptune-features/train_idx_{}.npy".format(i)
    valid_idx_fn = "./neptune-features/valid_idx_{}.npy".format(i)

    # Names of the per-fold feature dumps. Nothing in this cell reads them;
    # they are kept only so the notebook namespace stays as it was.
    train_offset, valid_offset, test_offset = i * 4, i * 4 + 1, i * 4 + 3
    train_data_fn = "./neptune-features/data_246009_1174_{}".format(train_offset)
    valid_data_fn = "./neptune-features/data_61502_1174_{}".format(valid_offset)
    test_data_fn = "./neptune-features/data_48744_1174_{}".format(test_offset)
    if i == 0:
        # Fold 0 has one row fewer in train and one more in valid.
        train_data_fn = train_data_fn.replace("246009", "246008")
        valid_data_fn = valid_data_fn.replace("61502", "61503")

    gc.collect()

    train_idx = np.load(train_idx_fn)
    valid_idx = np.load(valid_idx_fn)

    if nrows:
        train_idx, valid_idx = train_idx[:nrows].copy(), valid_idx[:nrows].copy()

    # Every split is rank-transformed column by column. The test matrix must
    # go through the same transform as the matrices the model is fitted and
    # validated on; it was previously passed as raw probabilities, which also
    # made the rank thresholds below meaningless for it.
    X_train = X2rank(oof_df.loc[train_idx, features].values)
    X_valid = X2rank(oof_df.loc[valid_idx, features].values)
    X_test = X2rank(test_df[features].values)

    top_th = .9
    X_train = add_top_ratio(X_train, top_th)
    X_valid = add_top_ratio(X_valid, top_th)
    X_test = add_top_ratio(X_test, top_th)

    # Note: each helper below runs on the already-augmented matrix, so columns
    # added by the previous helpers take part in its computation.
    bottom_th = .32
    X_train = add_bottom_ratio(X_train, bottom_th)
    X_valid = add_bottom_ratio(X_valid, bottom_th)
    X_test = add_bottom_ratio(X_test, bottom_th)

    X_train = add_median(X_train)
    X_valid = add_median(X_valid)
    X_test = add_median(X_test)

    y_train = train_app.loc[train_idx].TARGET
    y_valid = train_app.loc[valid_idx].TARGET

    print(train_idx_fn, valid_idx_fn)
    print(X_train.shape, y_train.shape)
    print(X_valid.shape, y_valid.shape)
    print(X_test.shape)

    estimator = LogisticRegression(C=C)
    estimator.fit(X_train, y_train)

    pred_valid = estimator.predict_proba(X_valid)[:, 1]
    auc = roc_auc_score(y_valid, pred_valid)
    auc_valid_stack2.append(auc)
    print("fold-{},auc:{}".format(i, auc))

    pred_test = estimator.predict_proba(X_test)[:, 1]
    pred_test_stack2.append(pred_test)

print("kfold-auc, avg:{:.8}, std:{:.2}".format(np.mean(auc_valid_stack2), np.std(auc_valid_stack2)))

./neptune-features/train_idx_0.npy ./neptune-features/valid_idx_0.npy
(246008, 20) (246008,)
(61503, 20) (61503,)
(48744, 20)
fold-0,auc:0.8004363738001591
./neptune-features/train_idx_1.npy ./neptune-features/valid_idx_1.npy
(246009, 20) (246009,)
(61502, 20) (61502,)
(48744, 20)
fold-1,auc:0.7939255172503222
./neptune-features/train_idx_2.npy ./neptune-features/valid_idx_2.npy
(246009, 20) (246009,)
(61502, 20) (61502,)
(48744, 20)
fold-2,auc:0.7943274962518195
./neptune-features/train_idx_3.npy ./neptune-features/valid_idx_3.npy
(246009, 20) (246009,)
(61502, 20) (61502,)
(48744, 20)
fold-3,auc:0.7959696651522185
./neptune-features/train_idx_4.npy ./neptune-features/valid_idx_4.npy
(246009, 20) (246009,)
(61502, 20) (61502,)
(48744, 20)
fold-4,auc:0.7946682190370534
kfold-auc, avg:0.79586545, std:0.0024


### bayesian opt

In [53]:
from bayes_opt import BayesianOptimization

In [54]:
def target_function(top_th, bottom_th, C):
    """Objective for the Bayesian search: mean validation AUC of the stacker.

    For every (train_idx, valid_idx) pair in the notebook-level
    ``kfold_index_list`` a ``LogisticRegression(C=C)`` is fitted on the selected
    out-of-fold prediction columns of ``oof_df`` and scored with ROC AUC on the
    validation rows. Before fitting, the columns go through ``X2rank`` and are
    extended by ``add_top_ratio``, ``add_bottom_ratio`` and ``add_median``
    (helpers defined earlier in the notebook).

    Parameters
    ----------
    top_th : float
        Threshold forwarded to ``add_top_ratio``.
    bottom_th : float
        Threshold forwarded to ``add_bottom_ratio``.
    C : float
        Value forwarded to ``LogisticRegression(C=...)``.

    Returns
    -------
    float
        Mean of the per-fold validation AUCs.

    Notes
    -----
    Reads the notebook globals ``train_app``, ``oof_df`` and
    ``kfold_index_list``; none of them is reassigned here, so no ``global``
    declaration is required.
    """
    features = [
        './oof-result\\oof-024-keras01.csv',
        './oof-result\\oof-028-lgb.csv',
        './oof-result\\oof-030-keras01.csv',
        './oof-result\\oof-036-keras01.csv',
        './oof-result\\oof-038-keras01.csv',
        './oof-result\\oof-031-lgb-seed-10.csv',
        './oof-result\\oof-031-lgb-seed-12.csv',
        './oof-result\\oof-031-lgb-seed-13.csv',
        './oof-result\\oof-031-lgb-seed-16.csv',
        './oof-result\\oof-031-lgb-seed-18.csv',
        './oof-result\\oof-031-lgb-seed-21.csv',
        './oof-result\\oof-031-lgb-seed-28.csv',
        './oof-result\\oof-031-lgb-seed-5.csv',
        './oof-result\\oof-031-lgb-seed-8.csv',
        './oof-result\\oof-035-xgb-0.csv',
        './oof-result\\oof-035-xgb-1.csv',
        'lgb_avg'
    ]

    fold_aucs = []
    for train_idx, valid_idx in kfold_index_list:
        gc.collect()

        # Train and validation rows are passed through X2rank separately.
        X_train = X2rank(oof_df.loc[train_idx, features].values)
        X_valid = X2rank(oof_df.loc[valid_idx, features].values)

        y_train = train_app.loc[train_idx, "TARGET"]
        y_valid = train_app.loc[valid_idx, "TARGET"]

        X_train = add_top_ratio(X_train, top_th)
        X_valid = add_top_ratio(X_valid, top_th)

        X_train = add_bottom_ratio(X_train, bottom_th)
        X_valid = add_bottom_ratio(X_valid, bottom_th)

        X_train = add_median(X_train)
        X_valid = add_median(X_valid)

        estimator = LogisticRegression(C=C)
        estimator.fit(X_train, y_train)

        pred_valid = estimator.predict_proba(X_valid)[:, 1]
        fold_aucs.append(roc_auc_score(y_valid, pred_valid))

    return np.mean(fold_aucs)

In [57]:
import warnings
# Ignore every Python warning from this point on, presumably to keep the
# optimizer's progress table readable.
# NOTE(review): this is a blanket filter, so genuine warnings raised by later
# cells are hidden as well.
warnings.filterwarnings("ignore")

In [58]:
# Optimizer over the three stacker settings; each tuple is the (lower, upper)
# bound of the search interval for the matching target_function argument.
lgbBO = BayesianOptimization(
    target_function,
    dict(
        top_th=(0.6, 0.95),
        bottom_th=(0.2, 0.5),
        C=(0.2, 0.6),
    ),
    random_state=0,
)

In [59]:
# Start the search: 10 initial points (init_points) followed by 50 optimisation
# iterations (n_iter) of target_function.
lgbBO.maximize(init_points=10, n_iter=50)

[31mInitialization[0m
[94m-------------------------------------------------------------------[0m
 Step |   Time |      Value |         C |   bottom_th |    top_th | 
    1 | 00m13s | [35m   0.79473[0m | [32m   0.5914[0m | [32m     0.4375[0m | [32m   0.7921[0m | 
    2 | 00m14s | [35m   0.79568[0m | [32m   0.5197[0m | [32m     0.3587[0m | [32m   0.8503[0m | 
    3 | 00m13s |    0.79539 |    0.3846 |      0.3704 |    0.8110 | 
    4 | 00m13s |    0.79479 |    0.5122 |      0.4777 |    0.7907 | 
    5 | 00m14s |    0.79540 |    0.2473 |      0.2213 |    0.7483 | 
    6 | 00m13s | [35m   0.79574[0m | [32m   0.4560[0m | [32m     0.2261[0m | [32m   0.8261[0m | 
    7 | 00m14s |    0.79551 |    0.2573 |      0.2061 |    0.7532 | 
    8 | 00m14s | [35m   0.79580[0m | [32m   0.5779[0m | [32m     0.4498[0m | [32m   0.9121[0m | 
    9 | 00m14s |    0.79579 |    0.4087 |      0.4334 |    0.9373 | 
   10 | 00m14s |    0.79453 |    0.3659 |      0.4610 |    0.7342

In [60]:
# Run another 50 iterations on the same optimizer object. In the recorded
# output the step counter continues from the previous call and the run was
# stopped by hand (KeyboardInterrupt).
lgbBO.maximize(n_iter=50)

[31mBayesian Optimization[0m
[94m-------------------------------------------------------------------[0m
 Step |   Time |      Value |         C |   bottom_th |    top_th | 
   61 | 00m29s |    0.79575 |    0.4469 |      0.4212 |    0.8786 | 
   62 | 00m22s |    0.79572 |    0.2561 |      0.3484 |    0.8367 | 
   63 | 00m25s |    0.79578 |    0.2855 |      0.2748 |    0.8766 | 
   64 | 00m22s |    0.79580 |    0.4147 |      0.3748 |    0.9082 | 
   65 | 00m29s |    0.79546 |    0.4252 |      0.2712 |    0.7668 | 
   66 | 00m29s |    0.79568 |    0.2216 |      0.3489 |    0.8223 | 
   67 | 00m29s |    0.79576 |    0.4414 |      0.2300 |    0.8696 | 
   68 | 00m33s |    0.79579 |    0.3623 |      0.3702 |    0.9171 | 
   69 | 00m28s |    0.79580 |    0.2048 |      0.2278 |    0.9191 | 
   70 | 00m25s |    0.79582 |    0.3539 |      0.2383 |    0.9349 | 
   71 | 00m31s |    0.79578 |    0.3324 |      0.2051 |    0.9288 | 
   72 | 00m33s |    0.79492 |    0.4104 |      0.3983 |    0.793

KeyboardInterrupt: 

In [61]:
# Manual evaluation of the objective at top_th=0.9073, bottom_th=0.3277, C=0.4578.
target_function(0.9073, 0.3277, 0.4578)

0.795861087629438

In [62]:
# Same thresholds as the previous probe, with C changed to 0.4.
target_function(0.9073, 0.3277, 0.4)

0.7958606166850788

In [63]:
# Rounded thresholds: top_th=0.9, bottom_th=0.32, C=0.4.
target_function(0.9, 0.32, 0.4)

0.7958654542983146

In [64]:
# Manual evaluation with bottom_th lowered to 0.315 (top_th=0.9, C=0.4).
target_function(0.9, 0.315, 0.4)

0.7958580729562547

In [67]:
# Final 5-fold run of the LogisticRegression stacker with fixed settings.
# Per fold: fit on the training rows, store the validation AUC in
# auc_valid_stack2 and append the test-set probabilities to pred_test_stack2.
nrows = None  # e.g. 1000 to truncate each fold's index arrays for a quick run

# Stacker settings; the same values as the manual probe target_function(0.9, 0.32, 0.4).
C = .4
top_th = .9
bottom_th = .32

train_app = load_dataframe32("./bindata/application_train")
auc_valid_stack2 = []
pred_test_stack2 = []

features = [
    './oof-result\\oof-024-keras01.csv',
    './oof-result\\oof-028-lgb.csv',
    './oof-result\\oof-030-keras01.csv',
    './oof-result\\oof-036-keras01.csv',
    './oof-result\\oof-038-keras01.csv',
    './oof-result\\oof-031-lgb-seed-10.csv',
    './oof-result\\oof-031-lgb-seed-12.csv',
    './oof-result\\oof-031-lgb-seed-13.csv',
    './oof-result\\oof-031-lgb-seed-16.csv',
    './oof-result\\oof-031-lgb-seed-18.csv',
    './oof-result\\oof-031-lgb-seed-21.csv',
    './oof-result\\oof-031-lgb-seed-28.csv',
    './oof-result\\oof-031-lgb-seed-5.csv',
    './oof-result\\oof-031-lgb-seed-8.csv',
    './oof-result\\oof-035-xgb-0.csv',
    './oof-result\\oof-035-xgb-1.csv',
    'lgb_avg'
]

# The test features do not depend on the fold, so they are built once.
# NOTE(review): unlike the train/valid rows, the test columns are not passed
# through X2rank here; presumably test_df is already rank-transformed
# upstream. Confirm.
# (add_std features were tried earlier and are left disabled.)
X_test = test_df[features].values
X_test = add_top_ratio(X_test, top_th)
X_test = add_bottom_ratio(X_test, bottom_th)
X_test = add_median(X_test)

for i in range(5):
    train_idx_fn = "./neptune-features/train_idx_{}.npy".format(i)
    valid_idx_fn = "./neptune-features/valid_idx_{}.npy".format(i)

    gc.collect()

    train_idx = np.load(train_idx_fn)
    valid_idx = np.load(valid_idx_fn)

    if nrows:
        train_idx, valid_idx = train_idx[:nrows].copy(), valid_idx[:nrows].copy()

    # Train and validation rows are passed through X2rank separately.
    X_train = X2rank(oof_df.loc[train_idx, features].values)
    X_valid = X2rank(oof_df.loc[valid_idx, features].values)

    X_train = add_top_ratio(X_train, top_th)
    X_valid = add_top_ratio(X_valid, top_th)

    X_train = add_bottom_ratio(X_train, bottom_th)
    X_valid = add_bottom_ratio(X_valid, bottom_th)

    X_train = add_median(X_train)
    X_valid = add_median(X_valid)

    y_train = train_app.loc[train_idx, "TARGET"]
    y_valid = train_app.loc[valid_idx, "TARGET"]

    print(train_idx_fn, valid_idx_fn)
    print(X_train.shape, y_train.shape)
    print(X_valid.shape, y_valid.shape)
    print(X_test.shape)

    estimator = LogisticRegression(C=C)
    estimator.fit(X_train, y_train)

    pred_valid = estimator.predict_proba(X_valid)[:, 1]
    auc = roc_auc_score(y_valid, pred_valid)
    auc_valid_stack2.append(auc)
    print("fold-{},auc:{}".format(i, auc))

    pred_test = estimator.predict_proba(X_test)[:, 1]
    pred_test_stack2.append(pred_test)

print("kfold-auc, avg:{:.8}, std:{:.2}".format(np.mean(auc_valid_stack2), np.std(auc_valid_stack2)))

./neptune-features/train_idx_0.npy ./neptune-features/valid_idx_0.npy
(246008, 20) (246008,)
(61503, 20) (61503,)
(48744, 20)
fold-0,auc:0.8004363738001591
./neptune-features/train_idx_1.npy ./neptune-features/valid_idx_1.npy
(246009, 20) (246009,)
(61502, 20) (61502,)
(48744, 20)
fold-1,auc:0.7939255172503222
./neptune-features/train_idx_2.npy ./neptune-features/valid_idx_2.npy
(246009, 20) (246009,)
(61502, 20) (61502,)
(48744, 20)
fold-2,auc:0.7943274962518195
./neptune-features/train_idx_3.npy ./neptune-features/valid_idx_3.npy
(246009, 20) (246009,)
(61502, 20) (61502,)
(48744, 20)
fold-3,auc:0.7959696651522185
./neptune-features/train_idx_4.npy ./neptune-features/valid_idx_4.npy
(246009, 20) (246009,)
(61502, 20) (61502,)
(48744, 20)
fold-4,auc:0.7946682190370534
kfold-auc, avg:0.79586545, std:0.0024


### submission

In [68]:
# Average the per-fold test predictions after calculate_rank (defined earlier
# in the notebook). Dividing by the number of collected folds, rather than a
# hard-coded 5, keeps the average correct if the fold loop was cut short, and
# fails loudly if no predictions were collected.
pred_target = sum(calculate_rank(p_) for p_ in pred_test_stack2) / len(pred_test_stack2)

# An earlier submission file is used as the template; only its TARGET column
# is overwritten.
pred_sample = pd.read_csv("./result/submission-022-blend08.csv")
pred_sample["TARGET"] = pred_target

pred_sample.to_csv("./result/submission-039-stacking-logit-fs-above-bottom-median-0.csv", index=False)

### blending (the first two cells blend the wrong stacking file; see the `-fixed` cells below)

In [70]:
# Weighted blend of two saved submissions: bld_param is the weight given to
# submission-022-blend08, the remainder goes to the stacking submission.
# NOTE(review): the stacking input read here is submission-037, while the
# output file is named submission-039-...; confirm whether that is intended.
bld_param = .3
pred_sample = pd.read_csv("./result/submission-022-blend08.csv")
pred_target = pd.read_csv("./result/submission-037-stacking-logit-fs-above-bottom-0.csv")["TARGET"]
pred_sample["TARGET"] = (1 - bld_param) * pred_target + bld_param * pred_sample["TARGET"]
pred_sample.to_csv(
    "./result/submission-039-stacking-logit-fs-above-bottom-median-0-bld{}.csv".format(
        str(bld_param).replace(".", "-")
    ),
    index=False,
)

In [72]:
# Weighted blend of two saved submissions with weight bld_param = .32 on
# submission-022-blend08 and the remainder on the stacking submission.
# NOTE(review): the stacking input read here is submission-037, while the
# output file is named submission-039-...; confirm whether that is intended.
bld_param = .32
pred_sample = pd.read_csv("./result/submission-022-blend08.csv")
pred_target = pd.read_csv("./result/submission-037-stacking-logit-fs-above-bottom-0.csv")["TARGET"]
pred_sample["TARGET"] = (1 - bld_param) * pred_target + bld_param * pred_sample["TARGET"]
pred_sample.to_csv(
    "./result/submission-039-stacking-logit-fs-above-bottom-median-0-bld{}.csv".format(
        str(bld_param).replace(".", "-")
    ),
    index=False,
)

In [9]:
# Weighted blend of submission-022-blend08 (weight bld_param) with the
# submission-039 stacking result (weight 1 - bld_param), written to a file
# with the "-fixed" suffix.
bld_param = .3
pred_sample = pd.read_csv("./result/submission-022-blend08.csv")
pred_target = pd.read_csv("./result/submission-039-stacking-logit-fs-above-bottom-median-0.csv")["TARGET"]
pred_sample["TARGET"] = (1 - bld_param) * pred_target + bld_param * pred_sample["TARGET"]
pred_sample.to_csv(
    "./result/submission-039-stacking-logit-fs-above-bottom-median-0-bld{}-fixed.csv".format(
        str(bld_param).replace(".", "-")
    ),
    index=False,
)

In [13]:
# Same blend of submission-022-blend08 with the submission-039 stacking
# result, this time with weight bld_param = .25 on submission-022-blend08.
bld_param = .25
pred_sample = pd.read_csv("./result/submission-022-blend08.csv")
pred_target = pd.read_csv("./result/submission-039-stacking-logit-fs-above-bottom-median-0.csv")["TARGET"]
pred_sample["TARGET"] = (1 - bld_param) * pred_target + bld_param * pred_sample["TARGET"]
pred_sample.to_csv(
    "./result/submission-039-stacking-logit-fs-above-bottom-median-0-bld{}-fixed.csv".format(
        str(bld_param).replace(".", "-")
    ),
    index=False,
)