## Helper functions

In [3]:
import pandas as pd
import numpy as np
import os
import gc

import addict
import lightgbm as lgb

def save_dataframe(path, dataframe):
    """Persist a DataFrame as two ``.npy`` files: cell values and column labels.

    Writes ``<path>.data.npy`` (``dataframe.values``) and ``<path>.header.npy``
    (the column labels). The row index is not stored.

    Args:
        path: Destination prefix; ``np.save`` appends ``.npy`` to each name.
        dataframe: Frame to write.

    Note:
        Column labels held in an object array (the usual case for string
        names) are written as a unicode array, so any non-string label in such
        an array is stored as its string form. Purely numeric labels keep
        their numeric dtype.
    """
    np.save(path + ".data", dataframe.values)
    header = np.asarray(dataframe.columns)
    if header.dtype == object:
        # np.save pickles object arrays, and np.load rejects pickled payloads
        # by default, which made the header unreadable by load_dataframe.
        header = header.astype(str)
    np.save(path + ".header", header)


def load_dataframe(path):
    """Rebuild a DataFrame from the ``.npy`` pair stored under ``path``.

    Reads ``<path>.data.npy`` for the cell values and ``<path>.header.npy`` for
    the column labels. No index is passed, so the frame gets pandas' default.

    Args:
        path: Prefix shared by the two files (without the ``.data.npy`` /
            ``.header.npy`` suffixes).

    Returns:
        A ``pd.DataFrame`` built from the stored values and labels.
    """
    values, labels = [np.load(path + suffix) for suffix in (".data.npy", ".header.npy")]
    return pd.DataFrame(columns=labels, data=values)


def save_dataframe32(path, dataframe, keep=[]):
    """Write a DataFrame as four ``.npy`` files, downcasting most columns.

    Columns named in ``keep`` are written unchanged to ``<path>.data64.npy``
    with their labels in ``<path>.header64.npy``. Every other column is cast
    to ``float32`` and written to ``<path>.data32.npy`` with labels in
    ``<path>.header32.npy``. The row index is not stored.

    Args:
        path: Destination prefix; ``np.save`` appends ``.npy`` to each name.
        dataframe: Frame to write.
        keep: Column labels that must not be downcast (only read, never
            modified).
    """
    wide_cols, narrow_cols = [], []
    for label in dataframe.columns:
        # Route each column to the "keep as-is" or "downcast" group,
        # preserving the frame's column order inside each group.
        (wide_cols if label in keep else narrow_cols).append(label)

    np.save(path + ".data64", dataframe[wide_cols].values)
    np.save(path + ".header64", wide_cols)
    np.save(path + ".data32", dataframe[narrow_cols].values.astype(np.float32))
    np.save(path + ".header32", narrow_cols)


def load_dataframe32(path, nrows=None):
    """Load the frame written by ``save_dataframe32`` under ``path``.

    The ``.data32``/``.header32`` pair is read first, then the
    ``.data64``/``.header64`` pair. A pair is skipped when its data file is
    absent. The pairs are concatenated column-wise in that order, so the
    result is an empty frame when neither data file exists.

    Args:
        path: Prefix shared by the ``.npy`` files.
        nrows: When truthy and positive, only the first ``nrows`` rows are
            returned. The files are still read in full before truncation.

    Returns:
        A ``pd.DataFrame`` with the 32-bit columns followed by the 64-bit ones.
    """
    frame = pd.DataFrame()
    for width in ("32", "64"):
        values_file = path + ".data" + width + ".npy"
        if not os.path.exists(values_file):
            continue
        values = np.load(values_file)
        labels = np.load(path + ".header" + width + ".npy")
        part = pd.DataFrame(data=values, columns=labels)
        frame = pd.concat([frame, part], axis=1)

    wants_head = bool(nrows) and nrows > 0
    return frame.head(nrows) if wants_head else frame

In [7]:
from sklearn.metrics import roc_auc_score

## LightGBM

In [5]:
# LightGBM hyper-parameters shared by every fold. Several keys are the
# scikit-learn style aliases that LightGBM accepts (e.g. subsample,
# colsample_bytree, reg_lambda).
lgb_params = addict.Dict(
    # Task definition: gradient-boosted trees, binary target, AUC for evaluation.
    boosting_type="gbdt",
    objective="binary",
    metric="auc",
    # Small step size; the training call relies on early stopping to pick the
    # number of rounds.
    learning_rate=0.02,
    # Tree shape: depth unlimited (-1), complexity bounded by leaf count and
    # the minimum number of samples per leaf.
    max_bin=300,
    max_depth=-1,
    num_leaves=30,
    min_child_samples=70,
    # Row sampling is set to 1.0 (all rows); each tree sees 5% of the columns.
    subsample=1.0,
    subsample_freq=1,
    colsample_bytree=0.05,
    # Regularisation: minimum split gain plus a large L2 penalty, no L1 penalty.
    min_gain_to_split=0.5,
    reg_lambda=100,
    reg_alpha=0.0,
    # No re-weighting of the positive class.
    scale_pos_weight=1,
    is_unbalance=False,
)

In [26]:
# Optional row cap for quick dry runs; None keeps every row.
nrows = None

# Loaded from the features_* file but not referenced again in this cell
# (presumably the feature names — TODO confirm).
features = list(np.load('./neptune-features/features_246008_1174_0.npy'))
# Source of the TARGET labels, which are looked up per fold below.
train_app = load_dataframe32("./bindata/application_train")
# Per-fold results, appended in fold order. The pred_* stacks are read again
# by the submission and OOF cells.
auc_valid_stack = []
pred_valid_stack = []
pred_test_stack = []

# Train one LightGBM model per pre-computed fold (5 folds).
for i in range(5):
    # Index arrays selecting this fold's train / validation rows of train_app.
    train_idx_fn = "./neptune-features/train_idx_{}.npy".format(i)
    valid_idx_fn = "./neptune-features/valid_idx_{}.npy".format(i)

    # Fold i reads file suffixes 4*i (train), 4*i+1 (valid) and 4*i+3 (test);
    # suffix 4*i+2 is not read here.
    train_offset, valid_offset, test_offset = i * 4, i * 4 + 1, i * 4 + 3
    
    # File names look like data_<rows>_<cols>_<suffix> (the printed shapes
    # match the numbers in the names).
    train_data_fn = "./neptune-features/data_246009_1174_{}".format(train_offset)
    valid_data_fn = "./neptune-features/data_61502_1174_{}".format(valid_offset)
    test_data_fn = "./neptune-features/data_48744_1174_{}".format(test_offset)
    
    # Fold 0's train/valid files are named with 246008 / 61503 instead.
    if i == 0:
        train_data_fn = train_data_fn.replace("246009", "246008")
        valid_data_fn = valid_data_fn.replace("61502", "61503")
    
    # Run a garbage-collection pass before loading this fold's arrays.
    gc.collect()

    train_idx = np.load(train_idx_fn)
    valid_idx = np.load(valid_idx_fn)
    
    # In dry-run mode keep only the first nrows indices so the labels line up
    # with the truncated feature frames loaded below.
    if nrows:
        train_idx, valid_idx = train_idx[:nrows].copy(), valid_idx[:nrows].copy()

    train_data = load_dataframe32(train_data_fn, nrows)
    valid_data = load_dataframe32(valid_data_fn, nrows)
    test_data = load_dataframe32(test_data_fn, nrows)
    gc.collect()
    
    # Plain arrays for LightGBM; labels come from train_app by label lookup.
    # NOTE(review): .loc assumes train_app's index labels equal the row
    # positions stored in the idx files — confirm.
    X_train = train_data.values
    X_valid = valid_data.values
    X_test = test_data.values
    y_train = train_app.loc[train_idx].TARGET
    y_valid = train_app.loc[valid_idx].TARGET
    
    # Sanity-check output: file names and array shapes for this fold.
    print(train_idx_fn, valid_idx_fn)
    print(train_idx.shape, valid_idx.shape)
    print(train_data_fn, train_data.shape)
    print(valid_data_fn, valid_data.shape)
    print(test_data_fn, test_data.shape)
    
    print(X_train.shape, y_train.shape)
    print(X_valid.shape, y_valid.shape)
    print(X_test.shape)
    
    lgb_data_train = lgb.Dataset(data=X_train, label=y_train)
    lgb_data_valid = lgb.Dataset(data=X_valid, label=y_valid)
    
    # Up to 5000 rounds, evaluated on both sets and logged every 100 rounds;
    # training stops once validation scores fail to improve for 100 rounds.
    # NOTE(review): early_stopping_rounds / verbose_eval were removed from
    # lgb.train in LightGBM 4.0 (replaced by callbacks) — this call needs an
    # older LightGBM.
    estimator = lgb.train(
        lgb_params,
        lgb_data_train,
        valid_sets=[lgb_data_train, lgb_data_valid],
        valid_names=["data_train", "data_valid"],
        num_boost_round=5000,
        early_stopping_rounds=100,
        verbose_eval=100
    )
    
    # Validation predictions are kept for the out-of-fold file.
    pred_valid = estimator.predict(X_valid)
    pred_valid_stack.append(pred_valid)
    
    auc = roc_auc_score(y_valid, pred_valid)
    auc_valid_stack.append(auc)
    print("fold-{},auc:{}".format(i, auc))

    # Test predictions from this fold's model; blended across folds later.
    pred_test = estimator.predict(X_test)
    pred_test_stack.append(pred_test)
    
# Summary across folds: mean and standard deviation of the validation AUCs.
print("kfold-auc, avg:{:.4}, std:{:.2}".format(np.mean(auc_valid_stack), np.std(auc_valid_stack)))

./neptune-features/train_idx_0.npy ./neptune-features/valid_idx_0.npy
(246008,) (61503,)
./neptune-features/data_246008_1174_0 (246008, 1174)
./neptune-features/data_61503_1174_1 (61503, 1174)
./neptune-features/data_48744_1174_3 (48744, 1174)
(246008, 1174) (246008,)
(61503, 1174) (61503,)
(48744, 1174)
Training until validation scores don't improve for 100 rounds.
[100]	data_train's auc: 0.768382	data_valid's auc: 0.762167
[200]	data_train's auc: 0.784172	data_valid's auc: 0.774796
[300]	data_train's auc: 0.795062	data_valid's auc: 0.781935
[400]	data_train's auc: 0.803136	data_valid's auc: 0.78613
[500]	data_train's auc: 0.810141	data_valid's auc: 0.78927
[600]	data_train's auc: 0.816243	data_valid's auc: 0.79153
[700]	data_train's auc: 0.821794	data_valid's auc: 0.793261
[800]	data_train's auc: 0.826771	data_valid's auc: 0.794482
[900]	data_train's auc: 0.831328	data_valid's auc: 0.795302
[1000]	data_train's auc: 0.835667	data_valid's auc: 0.796046
[1100]	data_train's auc: 0.839781

[300]	data_train's auc: 0.796639	data_valid's auc: 0.775895
[400]	data_train's auc: 0.804586	data_valid's auc: 0.779692
[500]	data_train's auc: 0.811405	data_valid's auc: 0.782627
[600]	data_train's auc: 0.817362	data_valid's auc: 0.784618
[700]	data_train's auc: 0.822706	data_valid's auc: 0.786339
[800]	data_train's auc: 0.827581	data_valid's auc: 0.787547
[900]	data_train's auc: 0.832184	data_valid's auc: 0.788606
[1000]	data_train's auc: 0.836237	data_valid's auc: 0.78931
[1100]	data_train's auc: 0.840314	data_valid's auc: 0.789992
[1200]	data_train's auc: 0.844042	data_valid's auc: 0.790574
[1300]	data_train's auc: 0.847626	data_valid's auc: 0.790953
[1400]	data_train's auc: 0.851027	data_valid's auc: 0.791297
[1500]	data_train's auc: 0.854333	data_valid's auc: 0.791437
[1600]	data_train's auc: 0.857415	data_valid's auc: 0.791587
[1700]	data_train's auc: 0.860425	data_valid's auc: 0.791887
[1800]	data_train's auc: 0.863273	data_valid's auc: 0.79216
[1900]	data_train's auc: 0.866075

## Save OOF predictions & submission

In [30]:
def calculate_rank(predictions):
    """Map predictions to normalised ranks.

    Each value is replaced by ``(1 + rank) / (n + 1)``, where ``rank`` comes
    from ``pd.Series.rank`` with its defaults (1-based, ties get the average
    rank) and ``n`` is ``predictions.shape[0]``.

    Args:
        predictions: 1-D array-like exposing ``.shape`` (e.g. a NumPy array).

    Returns:
        NumPy array of the same length; the largest input maps to 1.0.
    """
    ordinal = pd.Series(predictions).rank().values
    denominator = predictions.shape[0] + 1
    return (ordinal + 1) / denominator

In [35]:
# Blend the folds: turn each fold's test predictions into normalised ranks and
# average them, so every fold contributes on the same scale.
if not pred_test_stack:
    # Without this guard an empty stack would silently produce an all-zero
    # TARGET column in the submission file.
    raise RuntimeError("pred_test_stack is empty; run the training cell first")
# Divide by the number of folds actually present instead of a hard-coded 5.
pred_target = sum([calculate_rank(p_) for p_ in pred_test_stack]) / len(pred_test_stack)

# An earlier submission file is reused as the template and only its TARGET
# column is overwritten.
# NOTE(review): assumes the template's row order matches the test feature
# files — confirm.
pred_sample = pd.read_csv("./result/submission-022-blend08.csv")
pred_sample["TARGET"] = pred_target

pred_sample.to_csv("./result/submission-024-first.csv", index=False)

In [47]:
# Assemble out-of-fold predictions: each training row receives the prediction
# made by the fold model that held it out for validation.
oof_pred = np.zeros(train_app.shape[0])

# The fold count (5) must match the training loop that filled pred_valid_stack.
for i in range(5):
    valid_idx_fn = "./neptune-features/valid_idx_{}.npy".format(i)
    fold_pred = pred_valid_stack[i]
    # The training cell truncates valid_idx to nrows in dry-run mode. Trim the
    # reloaded index the same way so it always lines up with the predictions.
    # On a full run this is a no-op.
    valid_idx = np.load(valid_idx_fn)[:len(fold_pred)]
    oof_pred[valid_idx] = fold_pred
    
# Write the ids with their OOF predictions. SK_ID_CURR is cast to int so it is
# written without a decimal part.
oof_df = train_app[["SK_ID_CURR"]].copy()
oof_df["SK_ID_CURR"] = oof_df.SK_ID_CURR.astype("int")
oof_df["oof_pred"] = oof_pred
oof_df.to_csv("./oof-result/oof-024-first.csv", index=False)