## functions

In [1]:
import pandas as pd
import numpy as np
import os
import gc

import addict
import lightgbm as lgb

def save_dataframe(path, dataframe):
    """Persist a DataFrame as two NumPy files next to each other.

    Writes ``<path>.data.npy`` with the frame's values and
    ``<path>.header.npy`` with its column labels (``np.save`` appends the
    ``.npy`` extension itself).

    Note: the column Index is handed to ``np.save`` unchanged; string labels
    end up as an object array, which NumPy stores pickled.
    """
    for suffix, payload in ((".data", dataframe.values), (".header", dataframe.columns)):
        np.save(path + suffix, payload)


def load_dataframe(path):
    """Rebuild a DataFrame from ``<path>.data.npy`` and ``<path>.header.npy``.

    Parameters
    ----------
    path : str
        Prefix that was passed to ``save_dataframe``.

    Returns
    -------
    pandas.DataFrame
        Frame with the stored values and column labels.
    """
    data = np.load(path + ".data.npy")
    # The header is written from a pandas Index; string labels are stored as a
    # pickled object array, which np.load refuses unless allow_pickle=True
    # (the NumPy default became False in 1.16.3).
    # NOTE(review): unpickling executes arbitrary code - only load header
    # files produced by this notebook, never files from untrusted sources.
    header = np.load(path + ".header.npy", allow_pickle=True)
    return pd.DataFrame(data=data, columns=header)


def save_dataframe32(path, dataframe, keep=()):
    """Persist a DataFrame, down-casting most columns to float32.

    Columns listed in ``keep`` are written unchanged to ``<path>.data64.npy``
    (labels in ``<path>.header64.npy``); every other column is cast to
    ``np.float32`` and written to ``<path>.data32.npy`` (labels in
    ``<path>.header32.npy``). All four files are always written, even when
    one of the two groups is empty.

    Parameters
    ----------
    path : str
        File prefix; ``np.save`` appends ``.npy``.
    dataframe : pandas.DataFrame
        Frame to store. Columns outside ``keep`` must be castable to float32.
    keep : collection of column labels, optional
        Columns that must not be down-cast. Defaults to an empty tuple (an
        immutable default, so it cannot be shared and mutated between calls).

    Notes
    -----
    The original column order is not stored: on reload the float32 group
    comes first, followed by the kept group.
    """
    # Partition the columns in a single pass, preserving their relative order.
    col64, col32 = [], []
    for col_ in dataframe.columns:
        (col64 if col_ in keep else col32).append(col_)

    np.save(path + ".data64", dataframe[col64].values)
    np.save(path + ".header64", col64)
    np.save(path + ".data32", dataframe[col32].values.astype(np.float32))
    np.save(path + ".header32", col32)


def load_dataframe32(path, nrows=None):
    """Load a frame written by ``save_dataframe32``.

    The float32 part (``<path>.data32.npy`` / ``.header32.npy``) is read
    first, then the kept part (``<path>.data64.npy`` / ``.header64.npy``);
    whichever parts exist are joined column-wise in that order. A part is
    skipped when its data file is missing, so a path with no files yields an
    empty DataFrame.

    Parameters
    ----------
    path : str
        Prefix that was passed to ``save_dataframe32``.
    nrows : int or None
        When a positive number, only the first ``nrows`` rows are returned.
        The files are still read completely before truncation.
    """
    combined = pd.DataFrame()
    for suffix in ("32", "64"):
        data_file = path + ".data" + suffix + ".npy"
        header_file = path + ".header" + suffix + ".npy"
        if not os.path.exists(data_file):
            continue
        part = pd.DataFrame(data=np.load(data_file), columns=np.load(header_file))
        combined = pd.concat([combined, part], axis=1)

    wants_head = bool(nrows) and nrows > 0
    return combined.head(nrows) if wants_head else combined

In [2]:
from sklearn.metrics import roc_auc_score

## check data

In [5]:
# Dry run of the 5-fold loading loop: read every fold's files and print their
# shapes. Nothing is trained in this cell.
# nrows caps the rows read per file for quick experiments; None loads everything.
nrows = None

# Feature-name lists; loaded here but not used further down in this cell.
features = list(np.load('./neptune-features/features_246008_1174_0.npy'))
features_nmf = list(np.load('./nmf-features/features-part2.npy'))

# Application table; only its TARGET column is read below.
train_app = load_dataframe32("./bindata/application_train")
# Result accumulators; this cell never appends to them.
auc_valid_stack = []
pred_valid_stack = []
pred_test_stack = []

for i in range(5):
    train_idx_fn = "./neptune-features/train_idx_{}.npy".format(i)
    valid_idx_fn = "./neptune-features/valid_idx_{}.npy".format(i)

    # File suffixes advance by 4 per fold: 4*i is train, 4*i+1 is valid and
    # 4*i+3 is test. Suffix 4*i+2 is not read here.
    train_offset, valid_offset, test_offset = i * 4, i * 4 + 1, i * 4 + 3
    
    train_data_fn = "./neptune-features/data_246009_1174_{}".format(train_offset)
    valid_data_fn = "./neptune-features/data_61502_1174_{}".format(valid_offset)
    test_data_fn = "./neptune-features/data_48744_1174_{}".format(test_offset)

    # Fold 0 files carry different row counts in their names (246008 / 61503).
    if i == 0:
        train_data_fn = train_data_fn.replace("246009", "246008")
        valid_data_fn = valid_data_fn.replace("61502", "61503")
        
    # The NMF feature files follow the same naming scheme in their own directory.
    train_nmf_fn = "./nmf-features/nmf_246009_1174_{}".format(train_offset)
    valid_nmf_fn = "./nmf-features/nmf_61502_1174_{}".format(valid_offset)
    test_nmf_fn = "./nmf-features/nmf_48744_1174_{}".format(test_offset)

    if i == 0:
        train_nmf_fn = train_nmf_fn.replace("246009", "246008")
        valid_nmf_fn = valid_nmf_fn.replace("61502", "61503")
    
    gc.collect()

    # Row labels of train_app that belong to this fold's train / valid split.
    train_idx = np.load(train_idx_fn)
    valid_idx = np.load(valid_idx_fn)
    
    # Keep the index arrays the same length as the truncated feature frames.
    if nrows:
        train_idx, valid_idx = train_idx[:nrows].copy(), valid_idx[:nrows].copy()

    train_data = load_dataframe32(train_data_fn, nrows)
    valid_data = load_dataframe32(valid_data_fn, nrows)
    test_data = load_dataframe32(test_data_fn, nrows)
    
    train_nmf = load_dataframe32(train_nmf_fn, nrows)
    valid_nmf = load_dataframe32(valid_nmf_fn, nrows)
    test_nmf = load_dataframe32(test_nmf_fn, nrows)
    
    gc.collect()
    
    # Final matrices: base feature columns first, NMF columns appended on the right.
    X_train = np.hstack([train_data.values, train_nmf.values])
    X_valid = np.hstack([valid_data.values, valid_nmf.values])
    X_test = np.hstack([test_data.values, test_nmf.values])
    # Labels are looked up by index label in train_app.
    # Assumes the rows of each feature file are in the same order as the
    # matching index file - TODO confirm against the code that wrote them.
    y_train = train_app.loc[train_idx].TARGET
    y_valid = train_app.loc[valid_idx].TARGET
    
    # Shape report: index arrays, base features, NMF features, stacked matrices.
    print(train_idx_fn, valid_idx_fn)
    print(train_idx.shape, valid_idx.shape)
    print(train_data_fn, train_data.shape)
    print(valid_data_fn, valid_data.shape)
    print(test_data_fn, test_data.shape)

    print(train_nmf_fn, train_nmf.shape)
    print(valid_nmf_fn, valid_nmf.shape)
    print(test_nmf_fn, test_nmf.shape)
    
    print(X_train.shape, y_train.shape)
    print(X_valid.shape, y_valid.shape)
    print(X_test.shape)

./neptune-features/train_idx_0.npy ./neptune-features/valid_idx_0.npy
(246008,) (61503,)
./neptune-features/data_246008_1174_0 (246008, 1174)
./neptune-features/data_61503_1174_1 (61503, 1174)
./neptune-features/data_48744_1174_3 (48744, 1174)
./nmf-features/nmf_246008_1174_0 (246008, 492)
./nmf-features/nmf_61503_1174_1 (61503, 492)
./nmf-features/nmf_48744_1174_3 (48744, 492)
(246008, 1666) (246008,)
(61503, 1666) (61503,)
(48744, 1666)
./neptune-features/train_idx_1.npy ./neptune-features/valid_idx_1.npy
(246009,) (61502,)
./neptune-features/data_246009_1174_4 (246009, 1174)
./neptune-features/data_61502_1174_5 (61502, 1174)
./neptune-features/data_48744_1174_7 (48744, 1174)
./nmf-features/nmf_246009_1174_4 (246009, 492)
./nmf-features/nmf_61502_1174_5 (61502, 492)
./nmf-features/nmf_48744_1174_7 (48744, 492)
(246009, 1666) (246009,)
(61502, 1666) (61502,)
(48744, 1666)
./neptune-features/train_idx_2.npy ./neptune-features/valid_idx_2.npy
(246009,) (61502,)
./neptune-features/data_2

## lightGBM

In [6]:
# LightGBM training parameters, built in one constructor call.
lgb_params = addict.Dict(
    boosting_type="gbdt",
    objective="binary",
    metric="auc",
    learning_rate=0.02,
    max_bin=300,
    max_depth=-1,           # no depth limit; tree size is bounded by num_leaves
    num_leaves=30,
    min_child_samples=70,
    subsample=1.0,          # row fraction 1.0, so no rows are dropped by bagging
    subsample_freq=1,
    colsample_bytree=0.05,  # each tree sees 5% of the feature columns
    min_gain_to_split=0.5,
    reg_lambda=100,         # L2 regularisation
    reg_alpha=0.0,          # L1 regularisation
    scale_pos_weight=1,
    is_unbalance=False,
)

In [8]:
# 5-fold LightGBM training: for every fold load the pre-computed feature files,
# fit a model with early stopping, then keep the validation and test predictions.
# nrows caps the rows read per file for quick experiments; None loads everything.
nrows = None  
# nrows = 1000

# Feature-name lists; loaded here but not used further down in this cell.
features = list(np.load('./neptune-features/features_246008_1174_0.npy'))
features_nmf = list(np.load('./nmf-features/features-part2.npy'))

# Application table; only its TARGET column is read below.
train_app = load_dataframe32("./bindata/application_train")
# Per-fold results, appended in fold order and consumed by later cells.
auc_valid_stack = []
pred_valid_stack = []
pred_test_stack = []

for i in range(5):
    train_idx_fn = "./neptune-features/train_idx_{}.npy".format(i)
    valid_idx_fn = "./neptune-features/valid_idx_{}.npy".format(i)

    # File suffixes advance by 4 per fold: 4*i is train, 4*i+1 is valid and
    # 4*i+3 is test. Suffix 4*i+2 is not read here.
    train_offset, valid_offset, test_offset = i * 4, i * 4 + 1, i * 4 + 3
    
    train_data_fn = "./neptune-features/data_246009_1174_{}".format(train_offset)
    valid_data_fn = "./neptune-features/data_61502_1174_{}".format(valid_offset)
    test_data_fn = "./neptune-features/data_48744_1174_{}".format(test_offset)

    # Fold 0 files carry different row counts in their names (246008 / 61503).
    if i == 0:
        train_data_fn = train_data_fn.replace("246009", "246008")
        valid_data_fn = valid_data_fn.replace("61502", "61503")
        
    # The NMF feature files follow the same naming scheme in their own directory.
    train_nmf_fn = "./nmf-features/nmf_246009_1174_{}".format(train_offset)
    valid_nmf_fn = "./nmf-features/nmf_61502_1174_{}".format(valid_offset)
    test_nmf_fn = "./nmf-features/nmf_48744_1174_{}".format(test_offset)

    if i == 0:
        train_nmf_fn = train_nmf_fn.replace("246009", "246008")
        valid_nmf_fn = valid_nmf_fn.replace("61502", "61503")
    
    gc.collect()

    # Row labels of train_app that belong to this fold's train / valid split.
    train_idx = np.load(train_idx_fn)
    valid_idx = np.load(valid_idx_fn)
    
    # Keep the index arrays the same length as the truncated feature frames.
    if nrows:
        train_idx, valid_idx = train_idx[:nrows].copy(), valid_idx[:nrows].copy()

    train_data = load_dataframe32(train_data_fn, nrows)
    valid_data = load_dataframe32(valid_data_fn, nrows)
    test_data = load_dataframe32(test_data_fn, nrows)
    
    train_nmf = load_dataframe32(train_nmf_fn, nrows)
    valid_nmf = load_dataframe32(valid_nmf_fn, nrows)
    test_nmf = load_dataframe32(test_nmf_fn, nrows)
    
    gc.collect()
    
    # Final matrices: base feature columns first, NMF columns appended on the right.
    X_train = np.hstack([train_data.values, train_nmf.values])
    X_valid = np.hstack([valid_data.values, valid_nmf.values])
    X_test = np.hstack([test_data.values, test_nmf.values])
    # Labels are looked up by index label in train_app.
    # Assumes the rows of each feature file are in the same order as the
    # matching index file - TODO confirm against the code that wrote them.
    y_train = train_app.loc[train_idx].TARGET
    y_valid = train_app.loc[valid_idx].TARGET
    
    # Shape report: index arrays, base features, NMF features, stacked matrices.
    print(train_idx_fn, valid_idx_fn)
    print(train_idx.shape, valid_idx.shape)
    print(train_data_fn, train_data.shape)
    print(valid_data_fn, valid_data.shape)
    print(test_data_fn, test_data.shape)

    print(train_nmf_fn, train_nmf.shape)
    print(valid_nmf_fn, valid_nmf.shape)
    print(test_nmf_fn, test_nmf.shape)
    
    print(X_train.shape, y_train.shape)
    print(X_valid.shape, y_valid.shape)
    print(X_test.shape)
    
    lgb_data_train = lgb.Dataset(data=X_train, label=y_train)
    lgb_data_valid = lgb.Dataset(data=X_valid, label=y_valid)
    
    # Both sets are evaluated every round so the log shows train and valid AUC
    # side by side; training stops after 100 rounds without improvement.
    # NOTE(review): early_stopping_rounds / verbose_eval are keyword arguments of
    # older LightGBM releases; LightGBM 4.x expects the equivalent callbacks.
    estimator = lgb.train(
        lgb_params,
        lgb_data_train,
        valid_sets=[lgb_data_train, lgb_data_valid],
        valid_names=["data_train", "data_valid"],
        num_boost_round=5000,
        early_stopping_rounds=100,
        verbose_eval=100
    )
    
    # Presumably predict() uses the best iteration found by early stopping: the
    # recorded fold AUC equals the best-iteration valid AUC in the training log.
    pred_valid = estimator.predict(X_valid)
    pred_valid_stack.append(pred_valid)
    
    auc = roc_auc_score(y_valid, pred_valid)
    auc_valid_stack.append(auc)
    print("fold-{},auc:{}".format(i, auc))

    # Test predictions of every fold model are kept and averaged in a later cell.
    pred_test = estimator.predict(X_test)
    pred_test_stack.append(pred_test)
    
# Mean and standard deviation of the per-fold validation AUC.
print("kfold-auc, avg:{:.4}, std:{:.2}".format(np.mean(auc_valid_stack), np.std(auc_valid_stack)))

./neptune-features/train_idx_0.npy ./neptune-features/valid_idx_0.npy
(246008,) (61503,)
./neptune-features/data_246008_1174_0 (246008, 1174)
./neptune-features/data_61503_1174_1 (61503, 1174)
./neptune-features/data_48744_1174_3 (48744, 1174)
./nmf-features/nmf_246008_1174_0 (246008, 492)
./nmf-features/nmf_61503_1174_1 (61503, 492)
./nmf-features/nmf_48744_1174_3 (48744, 492)
(246008, 1666) (246008,)
(61503, 1666) (61503,)
(48744, 1666)
Training until validation scores don't improve for 100 rounds.
[100]	data_train's auc: 0.769268	data_valid's auc: 0.764094
[200]	data_train's auc: 0.784729	data_valid's auc: 0.77605
[300]	data_train's auc: 0.795489	data_valid's auc: 0.782882
[400]	data_train's auc: 0.803533	data_valid's auc: 0.786865
[500]	data_train's auc: 0.810565	data_valid's auc: 0.789508
[600]	data_train's auc: 0.816922	data_valid's auc: 0.791686
[700]	data_train's auc: 0.822574	data_valid's auc: 0.793064
[800]	data_train's auc: 0.827879	data_valid's auc: 0.7941
[900]	data_train'

[1200]	data_train's auc: 0.846792	data_valid's auc: 0.789813
[1300]	data_train's auc: 0.850738	data_valid's auc: 0.790199
[1400]	data_train's auc: 0.854468	data_valid's auc: 0.790521
[1500]	data_train's auc: 0.857998	data_valid's auc: 0.790775
[1600]	data_train's auc: 0.861401	data_valid's auc: 0.790892
Early stopping, best iteration is:
[1569]	data_train's auc: 0.860353	data_valid's auc: 0.790903
fold-4,auc:0.7909028444882434
kfold-auc, avg:0.7927, std:0.0026


## save oof & submission

In [9]:
def calculate_rank(predictions):
    """Convert scores to normalised ranks.

    Each value is replaced by ``(1 + rank) / (n + 1)``, where ``rank`` is the
    1-based ascending rank (ties receive their average rank) and ``n`` is the
    number of predictions. The largest score therefore maps to exactly 1.0
    and the smallest to ``2 / (n + 1)``.

    Parameters
    ----------
    predictions : 1-D array-like
        NumPy array, pandas Series or plain Python sequence of scores.

    Returns
    -------
    numpy.ndarray
        Normalised ranks in the same positional order as the input.
    """
    ranks = pd.Series(predictions).rank().values
    # Use the length of the ranked array so inputs without ``.shape`` (for
    # example lists) are accepted as well.
    return (1 + ranks) / (len(ranks) + 1)

# Rank-normalise every fold's test predictions and average them. Divide by the
# number of folds actually present in the stack instead of a hard-coded 5, so a
# partial or re-configured run cannot silently produce a mis-scaled average
# (an empty stack now fails loudly with ZeroDivisionError).
pred_target = sum(calculate_rank(p_) for p_ in pred_test_stack) / len(pred_test_stack)

# An earlier submission file is reused as the template for ids and row order.
pred_sample = pd.read_csv("./result/submission-022-blend08.csv")
pred_sample["TARGET"] = pred_target

pred_sample.to_csv("./result/submission-028-lgb.csv", index=False)

In [10]:
# Assemble the out-of-fold (OOF) prediction vector: one slot per training row,
# filled from the validation predictions of the fold that held that row out.
oof_pred = np.zeros(train_app.shape[0])

for i in range(5):
    valid_idx_fn = "./neptune-features/valid_idx_{}.npy".format(i)
    valid_idx = np.load(valid_idx_fn)
    # Positional scatter. This requires the full index arrays, so the training
    # cell must have been run with nrows = None for the lengths to match.
    oof_pred[valid_idx] = pred_valid_stack[i]
    
# Store the predictions next to the applicant id, cast to int for the CSV.
oof_df = train_app[["SK_ID_CURR"]].copy()
oof_df["SK_ID_CURR"] = oof_df.SK_ID_CURR.astype("int")
oof_df["oof_pred"] = oof_pred
oof_df.to_csv("./oof-result/oof-028-lgb.csv", index=False)

In [14]:
# NOTE(review): in the visible notebook pred_lgb024 is only assigned further
# down, by the cell that reads ./result/submission-024-first.csv, so this
# preview depends on out-of-order execution and fails on Restart & Run All.
pred_lgb024.head()

Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.454959
1,100005,0.825939
2,100013,0.336125
3,100028,0.478145
4,100038,0.861372


In [22]:
# Load the out-of-fold predictions written by two earlier experiments.
oof_lgb024 = pd.read_csv("./oof-result/oof-024-first.csv")
# NOTE(review): the variable says 025 but the file is named oof-024-keras01,
# while the matching submission read later is submission-025-keras01.csv.
# Confirm that this is the intended file.
oof_keras025 = pd.read_csv("./oof-result/oof-024-keras01.csv")


# Put every model's OOF prediction and the label side by side in oof_df.
# Columns are assigned by index alignment; this assumes both CSV files list the
# rows in the same order as train_app - TODO confirm.
oof_df["oof_lgb028"] = oof_df["oof_pred"]
oof_df["oof_lgb024"] = oof_lgb024["oof_pred"]
oof_df["oof_keras025"] = oof_keras025["oof_pred"]
oof_df["TARGET"] = train_app["TARGET"]

In [47]:
def rank_avg_and_show_oof_auc(oof_df, columns=("oof_lgb028", "oof_lgb024", "oof_keras025"), weights=(.33, .33, .34)):
    """Print the per-fold and mean AUC of a weighted rank average of OOF columns.

    For each of the 5 folds the validation rows are selected from ``oof_df``
    by label, every listed column is rank-normalised within that fold via
    ``calculate_rank``, the ranks are combined with ``weights``, and the
    blend is scored against ``TARGET`` with ``roc_auc_score``.

    Parameters
    ----------
    oof_df : pandas.DataFrame
        Frame holding the prediction columns named in ``columns`` and a
        ``TARGET`` column.
    columns : sequence of str
        Prediction columns to blend.
    weights : sequence of float
        One weight per entry of ``columns``, in the same order.

    Raises
    ------
    ValueError
        If ``columns`` and ``weights`` differ in length (``zip`` would
        otherwise silently drop the surplus entries).

    Returns
    -------
    None
        Results are printed only.
    """
    if len(columns) != len(weights):
        raise ValueError(
            "columns and weights must have the same length, got {} and {}".format(
                len(columns), len(weights)
            )
        )

    auc_valid_stack = []
    for i in range(5):
        # Only the validation index is needed; the train index and the feature
        # file names are irrelevant for scoring stored predictions.
        valid_idx = np.load("./neptune-features/valid_idx_{}.npy".format(i))

        # Slice the fold once and reuse it for the label and every column.
        fold_data = oof_df.loc[valid_idx]
        fold_target = fold_data.TARGET

        fold_pred = np.zeros(fold_data.shape[0])
        for col_, w_ in zip(columns, weights):
            fold_pred += w_ * calculate_rank(fold_data[col_])

        auc = roc_auc_score(fold_target, fold_pred)
        auc_valid_stack.append(auc)
        print("fold-{},auc:{}".format(i, auc))

    print("kfold-auc, avg:{:.8}, std:{:.2}".format(np.mean(auc_valid_stack), np.std(auc_valid_stack)))

In [57]:
# Weights follow the default column order (oof_lgb028, oof_lgb024, oof_keras025).
# Recorded mean fold AUC below: 0.79485852, the highest among the recorded runs.
rank_avg_and_show_oof_auc(oof_df, weights=[.08, .79, .13])

fold-0,auc:0.799022735717998
fold-1,auc:0.7929873281568535
fold-2,auc:0.79350871135891
fold-3,auc:0.7953560520687457
fold-4,auc:0.7934177497073852
kfold-auc, avg:0.79485852, std:0.0022


In [54]:
# Weights for (oof_lgb028, oof_lgb024, oof_keras025); recorded mean fold AUC below: 0.79485849.
rank_avg_and_show_oof_auc(oof_df, weights=[.09, .79, .12])

fold-0,auc:0.7990244901903976
fold-1,auc:0.7929922870782282
fold-2,auc:0.7935086757344748
fold-3,auc:0.7953522402541833
fold-4,auc:0.7934147661609405
kfold-auc, avg:0.79485849, std:0.0022


In [55]:
# Weights for (oof_lgb028, oof_lgb024, oof_keras025); recorded mean fold AUC below: 0.79485833.
rank_avg_and_show_oof_auc(oof_df, weights=[.08, .8, .12])

fold-0,auc:0.7990193496753264
fold-1,auc:0.7929912450634997
fold-2,auc:0.793515000852938
fold-3,auc:0.7953511091783667
fold-4,auc:0.7934149514080032
kfold-auc, avg:0.79485833, std:0.0022


In [52]:
# Weights for (oof_lgb028, oof_lgb024, oof_keras025); recorded mean fold AUC below: 0.79485818.
rank_avg_and_show_oof_auc(oof_df, weights=[.07, .81, .12])

fold-0,auc:0.7990143480930951
fold-1,auc:0.7929885322627621
fold-2,auc:0.7935232069415779
fold-3,auc:0.7953494081115878
fold-4,auc:0.7934154252129909
kfold-auc, avg:0.79485818, std:0.0022


In [49]:
# Weights for (oof_lgb028, oof_lgb024, oof_keras025); recorded mean fold AUC below: 0.79485778.
rank_avg_and_show_oof_auc(oof_df, weights=[.06, .82, .12])

fold-0,auc:0.7990091042689893
fold-1,auc:0.7929871322224601
fold-2,auc:0.7935306560109706
fold-3,auc:0.7953474006746661
fold-4,auc:0.7934145987260951
kfold-auc, avg:0.79485778, std:0.0022


In [48]:
# Weights for (oof_lgb028, oof_lgb024, oof_keras025); recorded mean fold AUC below: 0.79485755.
rank_avg_and_show_oof_auc(oof_df, weights=[.1, .78, .12])

fold-0,auc:0.7990302042487301
fold-1,auc:0.7929919611146465
fold-2,auc:0.7934991301670729
fold-3,auc:0.7953514974847101
fold-4,auc:0.7934149781263296
kfold-auc, avg:0.79485755, std:0.0022


In [26]:
# Baseline with the default, nearly equal weights (.33, .33, .34).
# Recorded mean fold AUC below: 0.793666.
rank_avg_and_show_oof_auc(oof_df)

fold-0,auc:0.7980927905362655
fold-1,auc:0.7918409926136118
fold-2,auc:0.791884021587624
fold-3,auc:0.7943416409338011
fold-4,auc:0.7921728520393769
kfold-auc, avg:0.793666, std:0.0024


In [27]:
# Weights for (oof_lgb028, oof_lgb024, oof_keras025); recorded mean fold AUC below: 0.794849.
rank_avg_and_show_oof_auc(oof_df, weights=[.1, .8, .1])

fold-0,auc:0.7990161595635827
fold-1,auc:0.792988746009373
fold-2,auc:0.7935156794984278
fold-3,auc:0.7953263840391416
fold-4,auc:0.7933993817486151
kfold-auc, avg:0.794849, std:0.0022


In [29]:
# Weights for (oof_lgb028, oof_lgb024, oof_keras025); recorded mean fold AUC below: 0.794682.
rank_avg_and_show_oof_auc(oof_df, weights=[.2, .6, .2])

fold-0,auc:0.7989400617723905
fold-1,auc:0.7928378640579036
fold-2,auc:0.7931569360926667
fold-3,auc:0.7952314288884352
fold-4,auc:0.7932448963855289
kfold-auc, avg:0.794682, std:0.0023


In [30]:
# Weights for (oof_lgb028, oof_lgb024, oof_keras025); recorded mean fold AUC below: 0.794749.
rank_avg_and_show_oof_auc(oof_df, weights=[.1, .7, .2])

fold-0,auc:0.7989325985852291
fold-1,auc:0.7928745946317788
fold-2,auc:0.7933073139583787
fold-3,auc:0.7952965628244661
fold-4,auc:0.7933339503485504
kfold-auc, avg:0.794749, std:0.0023


In [19]:
# Preview of the combined OOF frame.
# NOTE(review): the recorded output below has no TARGET column although the cell
# that adds TARGET appears earlier in the notebook - presumably this output
# predates that assignment (execution count 19 vs 22).
oof_df.head()

Unnamed: 0,SK_ID_CURR,oof_pred,oof_lgb028,oof_lgb024,oof_keras025
0,100002,0.309997,0.309997,0.316064,0.278892
1,100003,0.019556,0.019556,0.019087,0.01501
2,100004,0.046917,0.046917,0.041876,0.067596
3,100006,0.039269,0.039269,0.033489,0.043004
4,100007,0.068751,0.068751,0.040865,0.07649


In [68]:
# Blend the three submission files with the weights that scored best in the
# recorded OOF weight search (.08 / .79 / .13).
pred_lgb028 = pd.read_csv("./result/submission-028-lgb.csv")
pred_lgb024 = pd.read_csv("./result/submission-024-first.csv")
pred_keras025 = pd.read_csv("./result/submission-025-keras01.csv")

# From here on pred_lgb028.TARGET holds the blend, not the single-model prediction.
# The weighted sum aligns rows by index, so it assumes all three files list the
# same ids in the same order - TODO confirm.
# NOTE(review): the OOF search rank-normalised every column before weighting;
# here the stored TARGET values are combined as they are. Verify that the 024
# and 025 submissions are already on a rank scale.
pred_lgb028["TARGET"] = .08 * pred_lgb028.TARGET + .79 * pred_lgb024.TARGET + .13 * pred_keras025.TARGET

pred_lgb028.to_csv("./result/submission-028-avg-l08l79k13.csv", index=False)

In [69]:
# Preview of the blended submission produced by the blend cell (In [68]).
pred_lgb028.head()

Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.464932
1,100005,0.836674
2,100013,0.336514
3,100028,0.459474
4,100038,0.863509


In [62]:
# NOTE(review): duplicate preview with an older execution count (62). Its
# recorded values differ from the In [69] preview, so this output presumably
# comes from an earlier blend and is stale.
pred_lgb028.head()

Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.451552
1,100005,0.856984
2,100013,0.397432
3,100028,0.421252
4,100038,0.88543
