In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import roc_auc_score, precision_recall_curve, roc_curve, average_precision_score
from sklearn.model_selection import KFold
# from lightgbm import LGBMClassifier

import matplotlib.pyplot as plt
import seaborn as sns
import gc
import os

In [2]:
def save_dataframe(path, dataframe):
    np.save(path + ".data", dataframe.values)
    np.save(path + ".header", dataframe.columns)
    
def load_dataframe(path):
    data = np.load(path + ".data.npy")
    header = np.load(path + ".header.npy")
    return pd.DataFrame(data=data, columns=header)

## train blending validate

In [3]:
data = load_dataframe("./bindata/data_009")
data["SK_ID_CURR"] = data.SK_ID_CURR.astype("int")

test = load_dataframe("./bindata/test_009")
test["SK_ID_CURR"] = test.SK_ID_CURR.astype("int")

y = load_dataframe("./bindata/y_009")
y["SK_ID_CURR"] = y.SK_ID_CURR.astype("int")

In [4]:
data_train = data[data.SK_ID_CURR < 130000]
data_blending = data[(data.SK_ID_CURR >= 130000) & (data.SK_ID_CURR < 150000)]
data_val = data[(data.SK_ID_CURR >= 150000) & (data.SK_ID_CURR < 170000)]

ydata_train = y[y.SK_ID_CURR < 130000]
ydata_blending = y[(y.SK_ID_CURR >= 130000) & (y.SK_ID_CURR < 150000)]
ydata_val = y[(y.SK_ID_CURR >= 150000) & (y.SK_ID_CURR < 170000)]

In [5]:
data_train.shape, data_blending.shape, data_val.shape

((25803, 380), (17392, 380), (17182, 380))

## single xgbs

In [6]:
import xgboost as xgb

class XgbWrapper:
    def __init__(self, seed=0, params=None):
        self.param = params
        self.param['seed'] = seed
        self.nrounds = params.pop('nrounds', 250)

    def train(self, x_train, y_train):
        dtrain = xgb.DMatrix(x_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        return self.gbdt.predict(xgb.DMatrix(x))

### xgb0: data

In [10]:
data_train = data[data.SK_ID_CURR < 130000]
data_blending = data[(data.SK_ID_CURR >= 130000) & (data.SK_ID_CURR < 150000)]
data_val = data[(data.SK_ID_CURR >= 150000) & (data.SK_ID_CURR < 170000)]

features = list(data_train.columns)
features.remove("SK_ID_CURR")
print(len(features))

X_num_train = data_train[features].values
X_num_blending = data_blending[features].values
X_num_val = data_val[features].values

y_train = ydata_train.TARGET.values
y_blending = ydata_blending.TARGET.values
y_val = ydata_val.TARGET.values

xgb_params = {
     "objective": "binary:logistic",
     "booster": "gbtree",
     "eval_metric": "auc",
     "nthread": 8,
     "eta": 0.025,
     "max_depth": 6,
     "min_child_weight": 19,
     "gamma": 0,
     "subsample": 0.8,
     "colsample_bytree": 0.632,
     "alpha": 0,
     "lambda": 0.05,
     "nrounds": 250
}

xgb_clf = XgbWrapper(params=xgb_params)
xgb_clf.train(X_num_train, y_train)
pred_blending0 = xgb_clf.predict(X_num_blending)
pred_val0 = xgb_clf.predict(X_num_val)

print("blending:", roc_auc_score(y_blending, pred_blending0))
print("validate:", roc_auc_score(y_val, pred_val0))

379
blending: 0.761223717838
validate: 0.758524498285


### xgb1: data & nmf

In [15]:
data_train = data[data.SK_ID_CURR < 130000]
data_blending = data[(data.SK_ID_CURR >= 130000) & (data.SK_ID_CURR < 150000)]
data_val = data[(data.SK_ID_CURR >= 150000) & (data.SK_ID_CURR < 170000)]

for source_name in (
    "buro_wide_009",
    "buro_full_wide_009",
    "pos_bal_wide_009",
    "cc_bal_wide_009",
    "buro_bal_timestep1_009",
    "buro_bal_timestep2_009",
    "pos_bal_timestep1_009",
    "pos_bal_timestep2_009",
    "cc_bal_timestep1_009",
    "cc_bal_timestep2_009"
):
    target_name = "./bindata/" + source_name + "_nmf5"
    nmf_data = load_dataframe(target_name)
    nmf_data["SK_ID_CURR"] = nmf_data.SK_ID_CURR.astype("int")
    nmf_data.columns = ["{}_{}".format(source_name, col_) if col_ != "SK_ID_CURR" else col_ for col_ in nmf_data.columns]
    data_train = data_train.merge(right=nmf_data, how="left", on="SK_ID_CURR")
    data_blending = data_blending.merge(right=nmf_data, how="left", on="SK_ID_CURR")
    data_val = data_val.merge(right=nmf_data, how="left", on="SK_ID_CURR")

features = list(data_train.columns)
features.remove("SK_ID_CURR")
print(len(features))

X_num_train = data_train[features].values
X_num_blending = data_blending[features].values
X_num_val = data_val[features].values

print(X_num_train.shape)
print(X_num_blending.shape)
print(X_num_val.shape)

y_train = ydata_train.TARGET.values
y_blending = ydata_blending.TARGET.values
y_val = ydata_val.TARGET.values

xgb_params = {
     "objective": "binary:logistic",
     "booster": "gbtree",
     "eval_metric": "auc",
     "nthread": 8,
     "eta": 0.025,
     "max_depth": 6,
     "min_child_weight": 19,
     "gamma": 0,
     "subsample": 0.8,
     "colsample_bytree": 0.632,
     "alpha": 0,
     "lambda": 0.05,
     "nrounds": 250
}

xgb_clf = XgbWrapper(params=xgb_params)
xgb_clf.train(X_num_train, y_train)
pred_blending1 = xgb_clf.predict(X_num_blending)
pred_val1 = xgb_clf.predict(X_num_val)

print("blending:", roc_auc_score(y_blending, pred_blending1))
print("validate:", roc_auc_score(y_val, pred_val1))

429
(25803, 429)
(17392, 429)
(17182, 429)
blending: 0.761760124161
validate: 0.76009622081


### xgb2: data & 4wide

In [16]:
data_train = data[data.SK_ID_CURR < 130000]
data_blending = data[(data.SK_ID_CURR >= 130000) & (data.SK_ID_CURR < 150000)]
data_val = data[(data.SK_ID_CURR >= 150000) & (data.SK_ID_CURR < 170000)]

for source_name in (
    "buro_wide_009",
    "buro_full_wide_009",
    "pos_bal_wide_009",
    "cc_bal_wide_009"
):
    target_name = "./bindata/" + source_name
    nmf_data = load_dataframe(target_name)
    nmf_data["SK_ID_CURR"] = nmf_data.SK_ID_CURR.astype("int")
    nmf_data.columns = ["{}_{}".format(source_name, col_) if col_ != "SK_ID_CURR" else col_ for col_ in nmf_data.columns]
    data_train = data_train.merge(right=nmf_data, how="left", on="SK_ID_CURR")
    data_blending = data_blending.merge(right=nmf_data, how="left", on="SK_ID_CURR")
    data_val = data_val.merge(right=nmf_data, how="left", on="SK_ID_CURR")

features = list(data_train.columns)
features.remove("SK_ID_CURR")
print(len(features))

X_num_train = data_train[features].values
X_num_blending = data_blending[features].values
X_num_val = data_val[features].values

print(X_num_train.shape)
print(X_num_blending.shape)
print(X_num_val.shape)

y_train = ydata_train.TARGET.values
y_blending = ydata_blending.TARGET.values
y_val = ydata_val.TARGET.values

xgb_params = {
     "objective": "binary:logistic",
     "booster": "gbtree",
     "eval_metric": "auc",
     "nthread": 8,
     "eta": 0.025,
     "max_depth": 6,
     "min_child_weight": 19,
     "gamma": 0,
     "subsample": 0.8,
     "colsample_bytree": 0.632,
     "alpha": 0,
     "lambda": 0.05,
     "nrounds": 250
}

xgb_clf = XgbWrapper(params=xgb_params)
xgb_clf.train(X_num_train, y_train)
pred_blending2 = xgb_clf.predict(X_num_blending)
pred_val2 = xgb_clf.predict(X_num_val)

print("blending:", roc_auc_score(y_blending, pred_blending2))
print("validate:", roc_auc_score(y_val, pred_val2))

906
(25803, 906)
(17392, 906)
(17182, 906)
blending: 0.763302570922
validate: 0.761314749364


### xgb3: data & timestep1

In [17]:
data_train = data[data.SK_ID_CURR < 130000]
data_blending = data[(data.SK_ID_CURR >= 130000) & (data.SK_ID_CURR < 150000)]
data_val = data[(data.SK_ID_CURR >= 150000) & (data.SK_ID_CURR < 170000)]

for source_name in (
    "buro_bal_timestep1_009",
    "pos_bal_timestep1_009",
    "cc_bal_timestep1_009"
):
    target_name = "./bindata/" + source_name
    nmf_data = load_dataframe(target_name)
    nmf_data["SK_ID_CURR"] = nmf_data.SK_ID_CURR.astype("int")
    nmf_data.columns = ["{}_{}".format(source_name, col_) if col_ != "SK_ID_CURR" else col_ for col_ in nmf_data.columns]
    data_train = data_train.merge(right=nmf_data, how="left", on="SK_ID_CURR")
    data_blending = data_blending.merge(right=nmf_data, how="left", on="SK_ID_CURR")
    data_val = data_val.merge(right=nmf_data, how="left", on="SK_ID_CURR")

features = list(data_train.columns)
features.remove("SK_ID_CURR")
print(len(features))

X_num_train = data_train[features].values
X_num_blending = data_blending[features].values
X_num_val = data_val[features].values

print(X_num_train.shape)
print(X_num_blending.shape)
print(X_num_val.shape)

y_train = ydata_train.TARGET.values
y_blending = ydata_blending.TARGET.values
y_val = ydata_val.TARGET.values

xgb_params = {
     "objective": "binary:logistic",
     "booster": "gbtree",
     "eval_metric": "auc",
     "nthread": 8,
     "eta": 0.025,
     "max_depth": 6,
     "min_child_weight": 19,
     "gamma": 0,
     "subsample": 0.8,
     "colsample_bytree": 0.632,
     "alpha": 0,
     "lambda": 0.05,
     "nrounds": 250
}

xgb_clf = XgbWrapper(params=xgb_params)
xgb_clf.train(X_num_train, y_train)
pred_blending3 = xgb_clf.predict(X_num_blending)
pred_val3 = xgb_clf.predict(X_num_val)

print("blending:", roc_auc_score(y_blending, pred_blending3))
print("validate:", roc_auc_score(y_val, pred_val3))

673
(25803, 673)
(17392, 673)
(17182, 673)
blending: 0.761386169203
validate: 0.758360127963


### xgb4: data & timestep2

In [18]:
data_train = data[data.SK_ID_CURR < 130000]
data_blending = data[(data.SK_ID_CURR >= 130000) & (data.SK_ID_CURR < 150000)]
data_val = data[(data.SK_ID_CURR >= 150000) & (data.SK_ID_CURR < 170000)]

for source_name in (
    "buro_bal_timestep2_009",
    "pos_bal_timestep2_009",
    "cc_bal_timestep2_009"
):
    target_name = "./bindata/" + source_name
    nmf_data = load_dataframe(target_name)
    nmf_data["SK_ID_CURR"] = nmf_data.SK_ID_CURR.astype("int")
    nmf_data.columns = ["{}_{}".format(source_name, col_) if col_ != "SK_ID_CURR" else col_ for col_ in nmf_data.columns]
    data_train = data_train.merge(right=nmf_data, how="left", on="SK_ID_CURR")
    data_blending = data_blending.merge(right=nmf_data, how="left", on="SK_ID_CURR")
    data_val = data_val.merge(right=nmf_data, how="left", on="SK_ID_CURR")

features = list(data_train.columns)
features.remove("SK_ID_CURR")
print(len(features))

X_num_train = data_train[features].values
X_num_blending = data_blending[features].values
X_num_val = data_val[features].values

print(X_num_train.shape)
print(X_num_blending.shape)
print(X_num_val.shape)

y_train = ydata_train.TARGET.values
y_blending = ydata_blending.TARGET.values
y_val = ydata_val.TARGET.values

xgb_params = {
     "objective": "binary:logistic",
     "booster": "gbtree",
     "eval_metric": "auc",
     "nthread": 8,
     "eta": 0.025,
     "max_depth": 6,
     "min_child_weight": 19,
     "gamma": 0,
     "subsample": 0.8,
     "colsample_bytree": 0.632,
     "alpha": 0,
     "lambda": 0.05,
     "nrounds": 250
}

xgb_clf = XgbWrapper(params=xgb_params)
xgb_clf.train(X_num_train, y_train)
pred_blending4 = xgb_clf.predict(X_num_blending)
pred_val4 = xgb_clf.predict(X_num_val)

print("blending:", roc_auc_score(y_blending, pred_blending4))
print("validate:", roc_auc_score(y_val, pred_val4))

2461
(25803, 2461)
(17392, 2461)
(17182, 2461)
blending: 0.762032413188
validate: 0.761070800523


## stacking

https://www.kaggle.com/eliotbarr/stacking-test-sklearn-xgboost-catboost-lightgbm

In [19]:
blending_stack = (
    pred_blending0,
    pred_blending1,
    pred_blending2,
    pred_blending3,
    pred_blending4
)

val_stack = (
    pred_val0,
    pred_val1,
    pred_val2,
    pred_val3,
    pred_val4
)

blending_aucs = [roc_auc_score(y_blending, pred_) for pred_ in blending_stack]
val_aucs = [roc_auc_score(y_val, pred_) for pred_ in val_stack]

In [26]:
metric_data = pd.DataFrame({
    "blending": blending_aucs,
    "val": val_aucs
})

metric_data.loc["mean"] = {
    "blending": roc_auc_score(y_blending, sum(blending_stack)),
    "val": roc_auc_score(y_val, sum(val_stack))
}

metric_data

Unnamed: 0,blending,val
0,0.761224,0.758524
1,0.76176,0.760096
2,0.763303,0.761315
3,0.761386,0.75836
4,0.762032,0.761071
mean,0.764013,0.761811


In [30]:
from sklearn.linear_model import LogisticRegression
# from sklearn.preprocessing import StandardScaler

X_pred_bld = np.array(blending_stack).T
X_pred_val = np.array(val_stack).T

clf = LogisticRegression()
clf.fit(X_pred_bld, y_blending)
pred_bld_val = clf.predict_proba(X_pred_val)[:, 1]
print(roc_auc_score(y_val, pred_bld_val))

0.762250054132


In [32]:
pred_weight = clf.coef_[0]
pred_weight

array([ 0.74069456,  1.90968882,  2.18610268,  1.44764993,  3.00154614])

## submmit

### xgb0

In [34]:
gc.collect()

data_train = data
data_val = test

features = list(data_train.columns)
features.remove("SK_ID_CURR")
print(len(features))

X_num_train = data_train[features].values
X_num_val = data_val[features].values
y_train = y.TARGET.values

xgb_params = {
     "objective": "binary:logistic",
     "booster": "gbtree",
     "eval_metric": "auc",
     "nthread": 8,
     "eta": 0.025,
     "max_depth": 6,
     "min_child_weight": 19,
     "gamma": 0,
     "subsample": 0.8,
     "colsample_bytree": 0.632,
     "alpha": 0,
     "lambda": 0.05,
     "nrounds": 2000
}

xgb_clf = XgbWrapper(params=xgb_params)
xgb_clf.train(X_num_train, y_train)
pred_sub0 = xgb_clf.predict(X_num_val)

gc.collect()

379


21

In [35]:
np.save("./result/submission-010-sub0", pred_sub0)

### xgb1

In [36]:
gc.collect()

data_train = data
data_val = test

for source_name in (
    "buro_wide_009",
    "buro_full_wide_009",
    "pos_bal_wide_009",
    "cc_bal_wide_009",
    "buro_bal_timestep1_009",
    "buro_bal_timestep2_009",
    "pos_bal_timestep1_009",
    "pos_bal_timestep2_009",
    "cc_bal_timestep1_009",
    "cc_bal_timestep2_009"
):
    target_name = "./bindata/" + source_name + "_nmf5"
    nmf_data = load_dataframe(target_name)
    nmf_data["SK_ID_CURR"] = nmf_data.SK_ID_CURR.astype("int")
    nmf_data.columns = ["{}_{}".format(source_name, col_) if col_ != "SK_ID_CURR" else col_ for col_ in nmf_data.columns]
    data_train = data_train.merge(right=nmf_data, how="left", on="SK_ID_CURR")
    data_val = data_val.merge(right=nmf_data, how="left", on="SK_ID_CURR")

features = list(data_train.columns)
features.remove("SK_ID_CURR")
print(len(features))

X_num_train = data_train[features].values
X_num_val = data_val[features].values
y_train = y.TARGET.values

xgb_params = {
     "objective": "binary:logistic",
     "booster": "gbtree",
     "eval_metric": "auc",
     "nthread": 8,
     "eta": 0.025,
     "max_depth": 6,
     "min_child_weight": 19,
     "gamma": 0,
     "subsample": 0.8,
     "colsample_bytree": 0.632,
     "alpha": 0,
     "lambda": 0.05,
     "nrounds": 2000
}

xgb_clf = XgbWrapper(params=xgb_params)
xgb_clf.train(X_num_train, y_train)
pred_sub1 = xgb_clf.predict(X_num_val)

np.save("./result/submission-010-sub1", pred_sub1)

gc.collect()

429


352

### xgb2

In [37]:
gc.collect()

data_train = data
data_val = test

for source_name in (
    "buro_wide_009",
    "buro_full_wide_009",
    "pos_bal_wide_009",
    "cc_bal_wide_009"
):
    target_name = "./bindata/" + source_name
    nmf_data = load_dataframe(target_name)
    nmf_data["SK_ID_CURR"] = nmf_data.SK_ID_CURR.astype("int")
    nmf_data.columns = ["{}_{}".format(source_name, col_) if col_ != "SK_ID_CURR" else col_ for col_ in nmf_data.columns]
    data_train = data_train.merge(right=nmf_data, how="left", on="SK_ID_CURR")
    data_val = data_val.merge(right=nmf_data, how="left", on="SK_ID_CURR")

features = list(data_train.columns)
features.remove("SK_ID_CURR")
print(len(features))

X_num_train = data_train[features].values
X_num_val = data_val[features].values
y_train = y.TARGET.values

xgb_params = {
     "objective": "binary:logistic",
     "booster": "gbtree",
     "eval_metric": "auc",
     "nthread": 8,
     "eta": 0.025,
     "max_depth": 6,
     "min_child_weight": 19,
     "gamma": 0,
     "subsample": 0.8,
     "colsample_bytree": 0.632,
     "alpha": 0,
     "lambda": 0.05,
     "nrounds": 2000
}

xgb_clf = XgbWrapper(params=xgb_params)
xgb_clf.train(X_num_train, y_train)
pred_sub2 = xgb_clf.predict(X_num_val)

np.save("./result/submission-010-sub2", pred_sub2)

gc.collect()

906


255

### xgb3

In [38]:
gc.collect()

data_train = data
data_val = test

for source_name in (
    "buro_bal_timestep1_009",
    "pos_bal_timestep1_009",
    "cc_bal_timestep1_009"
):
    target_name = "./bindata/" + source_name
    nmf_data = load_dataframe(target_name)
    nmf_data["SK_ID_CURR"] = nmf_data.SK_ID_CURR.astype("int")
    nmf_data.columns = ["{}_{}".format(source_name, col_) if col_ != "SK_ID_CURR" else col_ for col_ in nmf_data.columns]
    data_train = data_train.merge(right=nmf_data, how="left", on="SK_ID_CURR")
    data_val = data_val.merge(right=nmf_data, how="left", on="SK_ID_CURR")

features = list(data_train.columns)
features.remove("SK_ID_CURR")
print(len(features))

X_num_train = data_train[features].values
X_num_val = data_val[features].values
y_train = y.TARGET.values

xgb_params = {
     "objective": "binary:logistic",
     "booster": "gbtree",
     "eval_metric": "auc",
     "nthread": 8,
     "eta": 0.025,
     "max_depth": 6,
     "min_child_weight": 19,
     "gamma": 0,
     "subsample": 0.8,
     "colsample_bytree": 0.632,
     "alpha": 0,
     "lambda": 0.05,
     "nrounds": 2000
}

xgb_clf = XgbWrapper(params=xgb_params)
xgb_clf.train(X_num_train, y_train)
pred_sub3 = xgb_clf.predict(X_num_val)

np.save("./result/submission-010-sub3", pred_sub3)

gc.collect()

673


193

In [42]:
np.save("./result/X_pred_bld-010", X_pred_bld)

In [43]:
np.save("./result/X_pred_val-010", X_pred_val)

In [51]:
del data_train
del data_val
gc.collect()

NameError: name 'data_train' is not defined

In [49]:
data.SK_ID_CURR.nunique()

307511

### xgb4

In [None]:
gc.collect()

data_train = data.astype(np.float32)
del data
gc.collect()

for source_name in (
    "buro_bal_timestep2_009",
    "pos_bal_timestep2_009",
    "cc_bal_timestep2_009"
):
    target_name = "./bindata/" + source_name
    nmf_data = load_dataframe(target_name).astype(np.float32)
    nmf_data["SK_ID_CURR"] = nmf_data.SK_ID_CURR.astype("int")
    nmf_data.columns = ["{}_{}".format(source_name, col_) if col_ != "SK_ID_CURR" else col_ for col_ in nmf_data.columns]
    data_train = data_train.merge(right=nmf_data, how="left", on="SK_ID_CURR")

features = list(data_train.columns)
features.remove("SK_ID_CURR")
print(len(features))

X_num_train = data_train[features].values
del data_train
gc.collect()
y_train = y.TARGET.values

xgb_params = {
     "objective": "binary:logistic",
     "booster": "gbtree",
     "eval_metric": "auc",
     "nthread": 8,
     "eta": 0.025,
     "max_depth": 6,
     "min_child_weight": 19,
     "gamma": 0,
     "subsample": 0.8,
     "colsample_bytree": 0.632,
     "alpha": 0,
     "lambda": 0.05,
     "nrounds": 2000
}

xgb_clf = XgbWrapper(params=xgb_params)
xgb_clf.train(X_num_train, y_train)

del X_num_train
gc.collect()

data_val = test.astype(np.float32)
del test
gc.collect()
for source_name in (
    "buro_bal_timestep2_009",
    "pos_bal_timestep2_009",
    "cc_bal_timestep2_009"
):
    target_name = "./bindata/" + source_name
    nmf_data = load_dataframe(target_name).astype(np.float32)
    nmf_data["SK_ID_CURR"] = nmf_data.SK_ID_CURR.astype("int")
    nmf_data.columns = ["{}_{}".format(source_name, col_) if col_ != "SK_ID_CURR" else col_ for col_ in nmf_data.columns]
    data_val = data_val.merge(right=nmf_data, how="left", on="SK_ID_CURR")

X_num_val = data_val[features].values
pred_sub4 = xgb_clf.predict(X_num_val)
np.save("./result/submission-010-sub4", pred_sub4)

gc.collect()