In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import roc_auc_score, precision_recall_curve, roc_curve, average_precision_score
from sklearn.model_selection import KFold
# from lightgbm import LGBMClassifier

import matplotlib.pyplot as plt
import seaborn as sns
import gc
import os
import sys

## save & load

In [2]:
def save_dataframe(path, dataframe):
    """Persist a DataFrame as two ``.npy`` files: its values and its header.

    Writes ``<path>.data.npy`` (``dataframe.values``) and
    ``<path>.header.npy`` (the column labels). ``np.save`` appends the
    ``.npy`` suffix itself. The index is not stored.

    Args:
        path: Destination path prefix, without extension.
        dataframe: Frame to store.
    """
    labels = list(dataframe.columns)
    if all(isinstance(label, str) for label in labels):
        # A pandas Index of strings becomes an object array, which np.save
        # pickles and np.load (allow_pickle=False by default) refuses to
        # read back. A fixed-width unicode array needs no pickle.
        header = np.asarray(labels, dtype=str)
    else:
        # Non-string labels keep their native array representation.
        header = np.asarray(dataframe.columns)
    np.save(path + ".data", dataframe.values)
    np.save(path + ".header", header)
    
def load_dataframe(path):
    """Rebuild a DataFrame from ``<path>.data.npy`` and ``<path>.header.npy``.

    Args:
        path: Path prefix, without extension, as passed to ``save_dataframe``.

    Returns:
        A DataFrame with the stored values and column labels and a default
        integer index.

    Note:
        Column labels (and mixed-dtype values) may be stored as object
        arrays, which NumPy serialises with pickle, so loading opts in to
        ``allow_pickle=True``. Unpickling can execute arbitrary code: only
        load files that you wrote yourself.
    """
    # SECURITY: allow_pickle=True is required for object-dtype arrays; never
    # point this at files from an untrusted source.
    data = np.load(path + ".data.npy", allow_pickle=True)
    header = np.load(path + ".header.npy", allow_pickle=True)
    return pd.DataFrame(data=data, columns=header)

def save_dataframe32(path, dataframe, keep=[]):
    """Store a DataFrame as float32, except for the columns named in ``keep``.

    Columns listed in ``keep`` are written unchanged to
    ``<path>.data64.npy`` / ``<path>.header64.npy``. Every other column is
    cast to ``np.float32`` and written to ``<path>.data32.npy`` /
    ``<path>.header32.npy``. Column order within each group follows the frame.

    Args:
        path: Destination path prefix, without extension.
        dataframe: Frame to store. The index is not saved.
        keep: Column names that must not be down-cast.
    """
    untouched_names = []
    downcast_names = []
    for name in dataframe.columns:
        if name in keep:
            untouched_names.append(name)
        else:
            downcast_names.append(name)

    # Columns kept at their original precision.
    np.save(path + ".data64", dataframe[untouched_names].values)
    np.save(path + ".header64", untouched_names)

    # Remaining columns, narrowed to single precision.
    narrowed = dataframe[downcast_names].values.astype(np.float32)
    np.save(path + ".data32", narrowed)
    np.save(path + ".header32", downcast_names)

def load_dataframe32(path):
    """Load the frame stored under ``path`` by ``save_dataframe32``.

    Reads the ``.data32``/``.header32`` pair and then the
    ``.data64``/``.header64`` pair, skipping a pair whose data file does not
    exist, and joins them column-wise in that order.

    Args:
        path: Path prefix, without extension.

    Returns:
        The combined DataFrame; an empty DataFrame if neither data file exists.
    """
    file_pairs = (
        (path + ".data32.npy", path + ".header32.npy"),
        (path + ".data64.npy", path + ".header64.npy"),
    )
    result = pd.DataFrame()
    for values_file, names_file in file_pairs:
        if not os.path.exists(values_file):
            continue
        values = np.load(values_file)
        names = np.load(names_file)
        part = pd.DataFrame(data=values, columns=names)
        result = pd.concat([result, part], axis=1)
    return result

In [3]:
# Read the three cached frames (training data, test data, labels) from ./bindata.
data, test, y = (
    load_dataframe32(prefix)
    for prefix in ("./bindata/data_009", "./bindata/test_009", "./bindata/y_009")
)

In [13]:
# Print what sys.getsizeof reports for each frame, converted from bytes to MiB.
for label, frame in (("data:", data), ("test:", test), ("   y:", y)):
    print(label, round(sys.getsizeof(frame) / 1024 / 1024, 2), "mb")

data: 445.76 mb
test: 70.66 mb
   y: 2.35 mb


## before xgbA

In [14]:
import xgboost as xgb

class XgbWrapper:
    """Small convenience wrapper around ``xgb.train`` and ``Booster.predict``.

    Args:
        seed: Value stored under the ``'seed'`` key of the booster parameters.
        params: Booster parameters. An optional ``'nrounds'`` entry (default
            250) is taken out and used as the number of boosting rounds; the
            remaining entries are passed to ``xgb.train``. The caller's dict
            is left untouched. ``None`` means no extra parameters.
    """

    def __init__(self, seed=0, params=None):
        # Copy so the caller's dict is neither stripped of 'nrounds' nor given
        # a 'seed' key, and so that params=None (the default) does not crash.
        self.param = dict(params) if params is not None else {}
        self.nrounds = self.param.pop('nrounds', 250)
        self.param['seed'] = seed

    def train(self, x_train, y_train):
        """Fit a booster on ``x_train`` with labels ``y_train``."""
        dtrain = xgb.DMatrix(x_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        """Return the trained booster's predictions for ``x``."""
        return self.gbdt.predict(xgb.DMatrix(x))

In [15]:
# Hold-out split on SK_ID_CURR: ids below 130000 train, ids in [130000, 150000) validate.
data_train = data.loc[data["SK_ID_CURR"] < 130000]
data_val = data.loc[data["SK_ID_CURR"].ge(130000) & data["SK_ID_CURR"].lt(150000)]

# The label frame is cut with the same id thresholds.
ydata_train = y.loc[y["SK_ID_CURR"] < 130000]
ydata_val = y.loc[y["SK_ID_CURR"].ge(130000) & y["SK_ID_CURR"].lt(150000)]

In [16]:
# Model inputs: every column of the training slice except the SK_ID_CURR id column.
features = data_train.columns.tolist()
features.remove("SK_ID_CURR")
print(len(features))

379


## xgbA

In [17]:
# Booster settings for model A. "nrounds" is consumed by XgbWrapper; the other
# entries are handed to xgb.train.
xgb_params = {
    "objective": "binary:logistic",
    "booster": "gbtree",
    "eval_metric": "auc",
    "nthread": 8,
    "eta": 0.025,
    "max_depth": 6,
    "min_child_weight": 19,
    "gamma": 0,
    "subsample": 0.8,
    "colsample_bytree": 0.632,
    "alpha": 0,
    "lambda": 0.05,
    "nrounds": 250,
}

# Feature matrices and label vectors as plain NumPy arrays.
X_num_train = data_train.loc[:, features].values
X_num_val = data_val.loc[:, features].values
y_train = ydata_train["TARGET"].values
y_val = ydata_val["TARGET"].values

# Fit on the training slice, then score the validation slice.
xgb_clf = XgbWrapper(params=xgb_params)
xgb_clf.train(X_num_train, y_train)
pred_val = xgb_clf.predict(X_num_val)

In [18]:
# Validation ROC AUC of model A.
roc_auc_score(y_true=y_val, y_score=pred_val)

0.7612237178381257

In [47]:
# Score the test set with model A.
pred_sub1 = xgb_clf.predict(test.loc[:, features].values)

# Store the integer id and the prediction on `test`, then export just those two columns.
test["SK_ID_CURR"] = test["SK_ID_CURR"].astype("int")
test["TARGET"] = pred_sub1
test.to_csv(
    "./result/submission-011-A.csv",
    columns=["SK_ID_CURR", "TARGET"],
    index=False,
)

## before xgbB

In [40]:
# Columns whose training values are capped at the largest value found in `test`.
bounded_columns = [
    "AMT_INCOME_TOTAL",
    "AMT_CREDIT_x",
    "AMT_ANNUITY",
    "AMT_REQ_CREDIT_BUREAU_HOUR",
    "AMT_REQ_CREDIT_BUREAU_DAY",
    "AMT_REQ_CREDIT_BUREAU_WEEK",
    "AMT_REQ_CREDIT_BUREAU_MON",
    "AMT_REQ_CREDIT_BUREAU_QRT",
    "AMT_REQ_CREDIT_BUREAU_YEAR",
]

# Keep a training row only if every bounded column is <= the test maximum.
# NOTE(review): a NaN never satisfies `<=`, so rows with a missing value in
# any of these columns are dropped as well — confirm that this is intended.
within_test_range = (
    data[bounded_columns]
    .le(test[bounded_columns].max(), axis="columns")
    .all(axis=1)
)
data2 = data[within_test_range]

# Labels for the surviving rows, matched on SK_ID_CURR.
y2 = y.loc[y["SK_ID_CURR"].isin(data2["SK_ID_CURR"])]

In [41]:
# Same SK_ID_CURR split as before, applied to the filtered frame:
# ids below 130000 train, ids in [130000, 150000) validate.
data2_train = data2.loc[data2["SK_ID_CURR"] < 130000]
data2_val = data2.loc[data2["SK_ID_CURR"].ge(130000) & data2["SK_ID_CURR"].lt(150000)]

# The filtered label frame is cut with the same id thresholds.
y2data_train = y2.loc[y2["SK_ID_CURR"] < 130000]
y2data_val = y2.loc[y2["SK_ID_CURR"].ge(130000) & y2["SK_ID_CURR"].lt(150000)]

## xgbB

In [42]:
# Booster settings for model B, with the same values as model A. "nrounds" is
# consumed by XgbWrapper; the other entries are handed to xgb.train.
xgb_params = {
    "objective": "binary:logistic",
    "booster": "gbtree",
    "eval_metric": "auc",
    "nthread": 8,
    "eta": 0.025,
    "max_depth": 6,
    "min_child_weight": 19,
    "gamma": 0,
    "subsample": 0.8,
    "colsample_bytree": 0.632,
    "alpha": 0,
    "lambda": 0.05,
    "nrounds": 250,
}

# Feature matrices and label vectors of the filtered data as NumPy arrays.
X2_num_train = data2_train.loc[:, features].values
X2_num_val = data2_val.loc[:, features].values
y2_train = y2data_train["TARGET"].values
y2_val = y2data_val["TARGET"].values

# Fit on the filtered training slice, then score the filtered validation slice.
xgb_clf2 = XgbWrapper(params=xgb_params)
xgb_clf2.train(X2_num_train, y2_train)
pred_val2 = xgb_clf2.predict(X2_num_val)

In [44]:
# ROC AUC of model B on the filtered validation slice.
roc_auc_score(y_true=y2_val, y_score=pred_val2)

0.76358417247233767

In [48]:
# Score the test set with model B.
pred_sub2 = xgb_clf2.predict(test.loc[:, features].values)

# Store the integer id and the prediction on `test`, then export just those two columns.
test["SK_ID_CURR"] = test["SK_ID_CURR"].astype("int")
test["TARGET"] = pred_sub2
test.to_csv(
    "./result/submission-011-B.csv",
    columns=["SK_ID_CURR", "TARGET"],
    index=False,
)

## A better validation

In [50]:
# Score model A (trained on the unfiltered data) on the filtered validation
# slice, so that both models are compared on the same rows.
pred_val_less = xgb_clf.predict(X2_num_val)
roc_auc_score(y_true=y2_val, y_score=pred_val_less)

0.76548477574764084