## functions

In [1]:
import pandas as pd
import numpy as np
import os
import gc

import addict
import lightgbm as lgb

from sklearn.metrics import roc_auc_score
import catboost

import warnings
warnings.filterwarnings('ignore')

def save_dataframe(path, dataframe):
    """Persist a DataFrame as a pair of ``.npy`` files.

    Writes ``<path>.data.npy`` (the frame's values) and ``<path>.header.npy``
    (its column labels). The index is not stored.

    Args:
        path: Destination prefix as a string; NumPy appends the ``.npy`` suffix.
        dataframe: Frame to store.
    """
    np.save(path + ".data", dataframe.values)
    # Save the labels as a plain list rather than the pandas Index itself:
    # the Index converts to an object array, which NumPy has to pickle and
    # which np.load() rejects unless allow_pickle=True is passed. A list of
    # strings is stored as a fixed-width unicode array instead, the same way
    # save_dataframe32 stores its headers.
    np.save(path + ".header", list(dataframe.columns))


def load_dataframe(path):
    """Rebuild a DataFrame from the ``.npy`` pair stored under ``path``.

    ``<path>.data.npy`` supplies the values and ``<path>.header.npy`` the
    column labels. No index is passed, so the frame gets pandas' default one.
    """
    values = np.load(path + ".data.npy")
    column_labels = np.load(path + ".header.npy")
    frame = pd.DataFrame(values, columns=column_labels)
    return frame


def save_dataframe32(path, dataframe, keep=None):
    """Persist a DataFrame as ``.npy`` files, casting most columns to float32.

    Columns named in ``keep`` are written unchanged to ``<path>.data64.npy``
    and ``<path>.header64.npy``; all remaining columns are cast to
    ``np.float32`` and written to ``<path>.data32.npy`` and
    ``<path>.header32.npy``. Within each group the columns keep the order they
    have in the frame. The index is not stored.

    Args:
        path: Destination prefix as a string; NumPy appends ``.npy``.
        dataframe: Frame to store. Columns outside ``keep`` must be castable
            to float32.
        keep: Iterable of column names to store without the float32 cast.
            ``None`` (the default) or an empty iterable casts every column.
    """
    # Materialise ``keep`` exactly once. The None default avoids a shared
    # mutable default argument, and the set means a one-shot iterable
    # (generator, iterator) is not exhausted by the first membership test,
    # which previously let "kept" columns leak into the float32 group too.
    keep = frozenset(keep) if keep is not None else frozenset()

    col64 = [col_ for col_ in dataframe.columns if col_ in keep]
    col32 = [col_ for col_ in dataframe.columns if col_ not in keep]

    np.save(path + ".data64", dataframe[col64].values)
    np.save(path + ".header64", col64)
    np.save(path + ".data32", dataframe[col32].values.astype(np.float32))
    np.save(path + ".header32", col32)


def load_dataframe32(path, nrows=None):
    """Load the ``.npy`` files stored under ``path`` back into one DataFrame.

    The ``data32``/``header32`` pair is read first and the ``data64``/
    ``header64`` pair second; whichever data files exist are appended
    column-wise in that order, so the columns come back grouped by file rather
    than in their original order. If neither data file exists an empty
    DataFrame is returned.

    Args:
        path: Prefix the files were saved under.
        nrows: When truthy and positive, only the first ``nrows`` rows are
            returned (the files are still read in full).
    """
    frame = pd.DataFrame()
    for width in ("32", "64"):
        data_file = path + ".data" + width + ".npy"
        header_file = path + ".header" + width + ".npy"
        if not os.path.exists(data_file):
            continue
        part = pd.DataFrame(data=np.load(data_file), columns=np.load(header_file))
        frame = pd.concat([frame, part], axis=1)

    if nrows and nrows > 0:
        return frame.head(nrows)
    return frame

## check data

In [None]:
# Sanity check of the per-fold inputs: load every fold's index and feature
# files and print their shapes. No model is trained in this cell.
# Set nrows to a positive integer to look at only the first rows of each file.
nrows = None

features = list(np.load('./neptune-features/features_246008_1174_0.npy'))
train_app = load_dataframe32("./bindata/application_train")
# The fold-result lists (auc_valid_stack, pred_valid_stack, pred_test_stack)
# are intentionally NOT reset here: this cell only inspects data, and
# re-running it after training would otherwise wipe the predictions that the
# save cells at the end of the notebook read.

# Pull the label column out once so each fold slices a single Series instead
# of copying every application column for the selected rows.
train_target = train_app["TARGET"]

for i in range(5):
    train_idx_fn = "./neptune-features/train_idx_{}.npy".format(i)
    valid_idx_fn = "./neptune-features/valid_idx_{}.npy".format(i)

    # Feature files are numbered in groups of four per fold:
    # +0 is the training part, +1 the validation part, +3 the test part.
    train_offset, valid_offset, test_offset = i * 4, i * 4 + 1, i * 4 + 3

    train_data_fn = "./neptune-features/data_246009_1174_{}".format(train_offset)
    valid_data_fn = "./neptune-features/data_61502_1174_{}".format(valid_offset)
    test_data_fn = "./neptune-features/data_48744_1174_{}".format(test_offset)

    if i == 0:
        # The file names embed the row count, and fold 0 is split 246008/61503
        # instead of 246009/61502.
        train_data_fn = train_data_fn.replace("246009", "246008")
        valid_data_fn = valid_data_fn.replace("61502", "61503")

    gc.collect()

    train_idx = np.load(train_idx_fn)
    valid_idx = np.load(valid_idx_fn)

    if nrows:
        train_idx, valid_idx = train_idx[:nrows].copy(), valid_idx[:nrows].copy()

    train_data = load_dataframe32(train_data_fn, nrows)
    valid_data = load_dataframe32(valid_data_fn, nrows)
    test_data = load_dataframe32(test_data_fn, nrows)
    gc.collect()

    X_train = train_data.values
    X_valid = valid_data.values
    X_test = test_data.values
    # The saved indices are treated as row positions (the OOF cell uses them
    # to index a plain NumPy array), hence iloc. train_app carries the default
    # 0..n-1 index from load_dataframe32, so this selects the same rows that a
    # label lookup would.
    y_train = train_target.iloc[train_idx]
    y_valid = train_target.iloc[valid_idx]

    print(train_idx_fn, valid_idx_fn)
    print(train_idx.shape, valid_idx.shape)
    print(train_data_fn, train_data.shape)
    print(valid_data_fn, valid_data.shape)
    print(test_data_fn, test_data.shape)

    print(X_train.shape, y_train.shape)
    print(X_valid.shape, y_valid.shape)
    print(X_test.shape)
    

./neptune-features/train_idx_0.npy ./neptune-features/valid_idx_0.npy
(246008,) (61503,)
./neptune-features/data_246008_1174_0 (246008, 1174)
./neptune-features/data_61503_1174_1 (61503, 1174)
./neptune-features/data_48744_1174_3 (48744, 1174)
(246008, 1174) (246008,)
(61503, 1174) (61503,)
(48744, 1174)
./neptune-features/train_idx_1.npy ./neptune-features/valid_idx_1.npy
(246009,) (61502,)
./neptune-features/data_246009_1174_4 (246009, 1174)
./neptune-features/data_61502_1174_5 (61502, 1174)
./neptune-features/data_48744_1174_7 (48744, 1174)
(246009, 1174) (246009,)
(61502, 1174) (61502,)
(48744, 1174)
./neptune-features/train_idx_2.npy ./neptune-features/valid_idx_2.npy
(246009,) (61502,)
./neptune-features/data_246009_1174_8 (246009, 1174)
./neptune-features/data_61502_1174_9 (61502, 1174)
./neptune-features/data_48744_1174_11 (48744, 1174)
(246009, 1174) (246009,)
(61502, 1174) (61502,)
(48744, 1174)
./neptune-features/train_idx_3.npy ./neptune-features/valid_idx_3.npy
(246009,) (

In [4]:
# Preview the first rows of the training fold most recently loaded by the
# check-data loop above. `train_data` only exists once that loop has run in
# the current kernel; the NameError recorded under this cell comes from
# executing it while the variable was not defined.
train_data.head()

NameError: name 'train_data' is not defined

## catboost

In [2]:
# Hyper-parameters for the per-fold CatBoost models; the training loop unpacks
# them with CatBoostClassifier(**catboost_params).
# early_stopping_rounds is not set here - the training loop passes it to fit().
catboost_params = addict.Dict(
    border_count=128,
    colsample_bylevel=1,
    depth=6,
    eval_metric="AUC",
    iterations=10000,
    l2_leaf_reg=3,
    learning_rate=0.05,
    loss_function="Logloss",
    max_ctr_complexity=1,
    model_size_reg=0.5,
    od_wait=100,
    od_type='Iter',
)

In [9]:
def replace_nan(X):
    """Return a copy of ``X`` with NaN and infinite entries replaced.

    NaN becomes 0. After that, ``+inf`` becomes the largest and ``-inf`` the
    smallest finite value found anywhere in the array (a single global
    max/min, not one per column). If the array contains no finite value at
    all, infinities become 0 as well.

    Args:
        X: Numeric NumPy array of any shape, including empty. Not modified.

    Returns:
        A new array with the same shape and dtype as ``X``.
    """
    X = X.copy()
    X[np.isnan(X)] = 0

    # np.isposinf / np.isneginf instead of comparing against np.Inf: that
    # alias was removed in NumPy 2.0.
    pos_inf = np.isposinf(X)
    neg_inf = np.isneginf(X)
    if pos_inf.any() or neg_inf.any():
        # NaNs are already 0 here, so everything that is not +/-inf is finite.
        finite = X[~(pos_inf | neg_inf)]
        # Guard the reduction: max()/min() raise on a zero-size array.
        upper = finite.max() if finite.size else 0
        lower = finite.min() if finite.size else 0
        X[pos_inf] = upper
        X[neg_inf] = lower
    return X

In [10]:
# Train one CatBoost model per fold on the pre-computed feature files and keep
# each fold's validation AUC, validation predictions and test predictions.
# Set nrows to a positive integer (e.g. 5000) for a quick run on the first rows.
nrows = None

features = list(np.load('./neptune-features/features_246008_1174_0.npy'))
train_app = load_dataframe32("./bindata/application_train")
# Pull the label column out once so each fold slices a single Series instead
# of copying every application column for the selected rows.
train_target = train_app["TARGET"]
auc_valid_stack = []
pred_valid_stack = []
pred_test_stack = []

for i in range(5):
    train_idx_fn = "./neptune-features/train_idx_{}.npy".format(i)
    valid_idx_fn = "./neptune-features/valid_idx_{}.npy".format(i)

    # Feature files are numbered in groups of four per fold:
    # +0 is the training part, +1 the validation part, +3 the test part.
    train_offset, valid_offset, test_offset = i * 4, i * 4 + 1, i * 4 + 3

    train_data_fn = "./neptune-features/data_246009_1174_{}".format(train_offset)
    valid_data_fn = "./neptune-features/data_61502_1174_{}".format(valid_offset)
    test_data_fn = "./neptune-features/data_48744_1174_{}".format(test_offset)

    if i == 0:
        # The file names embed the row count, and fold 0 is split 246008/61503
        # instead of 246009/61502.
        train_data_fn = train_data_fn.replace("246009", "246008")
        valid_data_fn = valid_data_fn.replace("61502", "61503")

    gc.collect()

    train_idx = np.load(train_idx_fn)
    valid_idx = np.load(valid_idx_fn)

    if nrows:
        train_idx, valid_idx = train_idx[:nrows].copy(), valid_idx[:nrows].copy()

    train_data = load_dataframe32(train_data_fn, nrows)
    valid_data = load_dataframe32(valid_data_fn, nrows)
    test_data = load_dataframe32(test_data_fn, nrows)
    gc.collect()

    # replace_nan returns cleaned copies (no NaN / inf) of the feature matrices.
    X_train = replace_nan(train_data.values)
    X_valid = replace_nan(valid_data.values)
    X_test = replace_nan(test_data.values)
    # The saved indices are treated as row positions (the OOF cell uses them
    # to index a plain NumPy array), hence iloc. train_app carries the default
    # 0..n-1 index from load_dataframe32, so this selects the same rows that a
    # label lookup would.
    y_train = train_target.iloc[train_idx]
    y_valid = train_target.iloc[valid_idx]

    print(train_idx_fn, valid_idx_fn)
    print(train_idx.shape, valid_idx.shape)
    print(train_data_fn, train_data.shape)
    print(valid_data_fn, valid_data.shape)
    print(test_data_fn, test_data.shape)

    print(X_train.shape, y_train.shape)
    print(X_valid.shape, y_valid.shape)
    print(X_test.shape)

    valid_pool = catboost.Pool(data=X_valid, label=y_valid)
    estimator = catboost.CatBoostClassifier(**catboost_params)

    # The validation fold doubles as the early-stopping set; use_best_model
    # keeps the iteration with the best validation metric.
    estimator.fit(
        X_train, y_train,
        use_best_model=True,
        eval_set=valid_pool,
        verbose_eval=100,
        early_stopping_rounds=100
    )
    pred_valid = estimator.predict_proba(X_valid)[:, 1]
    pred_valid_stack.append(pred_valid)

    auc = roc_auc_score(y_valid, pred_valid)
    auc_valid_stack.append(auc)
    print("fold-{},auc:{}".format(i, auc))

    pred_test = estimator.predict_proba(X_test)[:, 1]
    pred_test_stack.append(pred_test)

print("kfold-auc, avg:{:.4}, std:{:.2}".format(np.mean(auc_valid_stack), np.std(auc_valid_stack)))

./neptune-features/train_idx_0.npy ./neptune-features/valid_idx_0.npy
(246008,) (61503,)
./neptune-features/data_246008_1174_0 (246008, 1174)
./neptune-features/data_61503_1174_1 (61503, 1174)
./neptune-features/data_48744_1174_3 (48744, 1174)
(246008, 1174) (246008,)
(61503, 1174) (61503,)
(48744, 1174)
0:	test: 0.7279460	best: 0.7279460 (0)	total: 1.59s	remaining: 4h 24m 37s
100:	test: 0.7758127	best: 0.7758127 (100)	total: 2m 20s	remaining: 3h 49m 17s
200:	test: 0.7822922	best: 0.7822922 (200)	total: 4m 41s	remaining: 3h 48m 31s
300:	test: 0.7855698	best: 0.7855698 (300)	total: 7m 1s	remaining: 3h 46m 24s
400:	test: 0.7871756	best: 0.7871956 (397)	total: 9m 22s	remaining: 3h 44m 22s
500:	test: 0.7882866	best: 0.7882901 (495)	total: 11m 43s	remaining: 3h 42m 14s
600:	test: 0.7892256	best: 0.7892256 (600)	total: 14m 4s	remaining: 3h 40m 12s
700:	test: 0.7899286	best: 0.7899296 (699)	total: 16m 26s	remaining: 3h 38m 7s
800:	test: 0.7904953	best: 0.7905184 (796)	total: 18m 47s	remaining

1400:	test: 0.7865974	best: 0.7865974 (1400)	total: 33m 54s	remaining: 3h 28m 9s
1500:	test: 0.7868253	best: 0.7868474 (1493)	total: 36m 40s	remaining: 3h 27m 37s
1600:	test: 0.7869691	best: 0.7869758 (1565)	total: 39m 17s	remaining: 3h 26m 7s
1700:	test: 0.7871584	best: 0.7871894 (1667)	total: 41m 53s	remaining: 3h 24m 21s
1800:	test: 0.7872126	best: 0.7872585 (1779)	total: 44m 24s	remaining: 3h 22m 11s
1900:	test: 0.7873387	best: 0.7873418 (1896)	total: 47m 4s	remaining: 3h 20m 34s
2000:	test: 0.7873599	best: 0.7873968 (1969)	total: 49m 38s	remaining: 3h 18m 27s
2100:	test: 0.7874643	best: 0.7874741 (2086)	total: 52m 3s	remaining: 3h 15m 42s
2200:	test: 0.7875163	best: 0.7875287 (2188)	total: 54m 33s	remaining: 3h 13m 20s
2300:	test: 0.7876857	best: 0.7877114 (2294)	total: 56m 59s	remaining: 3h 10m 41s
2400:	test: 0.7877725	best: 0.7877995 (2369)	total: 59m 22s	remaining: 3h 7m 53s
2500:	test: 0.7879901	best: 0.7879950 (2499)	total: 1h 1m 45s	remaining: 3h 5m 11s
2600:	test: 0.788211

## save oof & submission

In [11]:
def calculate_rank(predictions):
    """Map predictions to rank-based scores.

    Each value is replaced by ``(rank + 1) / (n + 1)``, where ``rank`` is its
    1-based pandas rank (ties share the average rank) and ``n`` is the number
    of predictions.

    Args:
        predictions: 1-D NumPy array of scores.

    Returns:
        NumPy array with one rank-based score per input value.
    """
    n_items = predictions.shape[0]
    ordinal = np.asarray(pd.Series(predictions).rank())
    return (ordinal + 1) / (n_items + 1)

In [12]:
# Blend the folds: rank-transform each fold's test predictions and average them.
if not pred_test_stack:
    # Without this guard the sum of an empty list is 0 and an all-zero
    # submission would be written without any error.
    raise RuntimeError("pred_test_stack is empty - run the training cell first")

fold_ranks = [calculate_rank(p_) for p_ in pred_test_stack]
# Divide by the number of folds actually collected instead of a literal 5, so
# the average stays correct if the number of trained folds changes.
pred_target = sum(fold_ranks) / len(fold_ranks)

# An earlier submission file serves as the template for the output columns.
# NOTE(review): this assumes its row order matches the test feature files - confirm.
pred_sample = pd.read_csv("./result/submission-022-blend08.csv")
pred_sample["TARGET"] = pred_target

pred_sample.to_csv("./result/submission-043-cat.csv", index=False)

In [13]:
# Assemble the out-of-fold predictions: every training row receives the
# prediction of the fold in which it was held out for validation.
oof_pred = np.zeros(train_app.shape[0])

for i in range(5):
    valid_idx_fn = "./neptune-features/valid_idx_{}.npy".format(i)
    valid_idx = np.load(valid_idx_fn)
    if nrows:
        # Mirror the truncation applied in the training cell; otherwise a
        # debug run with nrows set fails here with a shape mismatch. Rows
        # outside the truncated folds keep the initial 0.
        valid_idx = valid_idx[:nrows]
    oof_pred[valid_idx] = pred_valid_stack[i]

oof_df = train_app[["SK_ID_CURR"]].copy()
# Cast the id column to integer before writing it out.
oof_df["SK_ID_CURR"] = oof_df.SK_ID_CURR.astype("int")
oof_df["oof_pred"] = oof_pred
oof_df.to_csv("./oof-result/oof-043-cat.csv", index=False)