## functions

In [1]:
import pandas as pd
import numpy as np
import os
import gc

import addict
import lightgbm as lgb

def save_dataframe(path, dataframe):
    """Persist a DataFrame as two ``.npy`` files: its values and its header.

    Writes ``path + ".data.npy"`` (``dataframe.values``) and
    ``path + ".header.npy"`` (the column labels).

    The column labels are converted to a plain Python list before saving,
    exactly as ``save_dataframe32`` does for its headers. Saving the pandas
    ``Index`` object directly produces an object-dtype array that NumPy has
    to pickle, and ``np.load`` with its default settings (as used by
    ``load_dataframe``) refuses to read pickled arrays on current NumPy.

    Parameters
    ----------
    path : str
        Path prefix; the ``.data`` / ``.header`` suffixes are appended to it.
    dataframe : pandas.DataFrame
        Frame to store.
    """
    np.save(path + ".data", dataframe.values)
    # A list of str labels is stored as a fixed-width unicode array (no pickle).
    np.save(path + ".header", dataframe.columns.tolist())


def load_dataframe(path):
    """Rebuild a DataFrame from the ``.data.npy`` / ``.header.npy`` file pair.

    Parameters
    ----------
    path : str
        Path prefix that was used when the two files were written.

    Returns
    -------
    pandas.DataFrame
        Frame whose values come from ``path + ".data.npy"`` and whose column
        labels come from ``path + ".header.npy"``.
    """
    values_file = path + ".data.npy"
    columns_file = path + ".header.npy"
    values = np.load(values_file)
    column_labels = np.load(columns_file)
    frame = pd.DataFrame(data=values, columns=column_labels)
    return frame


def save_dataframe32(path, dataframe, keep=None):
    """Persist a DataFrame as ``.npy`` files, downcasting most columns to float32.

    Columns named in ``keep`` are stored with their original values in
    ``path + ".data64.npy"`` / ``path + ".header64.npy"``; every other column
    is cast to ``np.float32`` and stored in ``path + ".data32.npy"`` /
    ``path + ".header32.npy"``. Column order inside each group follows the
    order of ``dataframe.columns``.

    Parameters
    ----------
    path : str
        Path prefix; the four suffixes above are appended to it.
    dataframe : pandas.DataFrame
        Frame to store.
    keep : iterable of column labels, optional
        Columns that must NOT be downcast. Defaults to no columns.
        (``None`` replaces the former mutable ``[]`` default.)
    """
    # A set gives O(1) membership tests instead of scanning a list per column.
    keep_names = set(keep) if keep is not None else set()

    col64 = [col_ for col_ in dataframe.columns if col_ in keep_names]
    col32 = [col_ for col_ in dataframe.columns if col_ not in keep_names]

    np.save(path + ".data64", dataframe[col64].values)
    np.save(path + ".header64", col64)
    np.save(path + ".data32", dataframe[col32].values.astype(np.float32))
    np.save(path + ".header32", col32)


def load_dataframe32(path, nrows=None):
    """Load a frame written by ``save_dataframe32``.

    The ``.data32`` / ``.header32`` pair and the ``.data64`` / ``.header64``
    pair are each optional: a pair is read only when its data file exists.
    In the result the float32 columns come first, followed by the "64"
    columns.

    Row limiting is applied to the raw arrays *before* any DataFrame is
    built, and the parts are concatenated once, so no full-size intermediate
    copies are made on the way to a small ``nrows`` sample.

    Parameters
    ----------
    path : str
        Path prefix that was passed to ``save_dataframe32``.
    nrows : int, optional
        When truthy and positive, only the first ``nrows`` rows are returned.

    Returns
    -------
    pandas.DataFrame
        The loaded frame; an empty DataFrame when neither data file exists.
    """
    row_limit = nrows if (nrows and nrows > 0) else None

    parts = []
    for suffix in ("32", "64"):  # order fixes the column order of the result
        data_path = path + ".data" + suffix + ".npy"
        header_path = path + ".header" + suffix + ".npy"
        if not os.path.exists(data_path):
            continue
        data = np.load(data_path)
        header = np.load(header_path)
        if row_limit is not None:
            data = data[:row_limit]
        parts.append(pd.DataFrame(data=data, columns=header))

    if not parts:
        # Neither file pair exists: keep the original "empty frame" contract.
        return pd.DataFrame()
    if len(parts) == 1:
        return parts[0]
    return pd.concat(parts, axis=1)

In [2]:
from sklearn.metrics import roc_auc_score

In [3]:
from keras.layers import Input, Dropout, Dense, BatchNormalization, Activation, concatenate, GRU, Embedding, Flatten, BatchNormalization
from keras.models import Model
from string import punctuation
from os import listdir
from numpy import array
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.optimizers import SGD
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf
import numpy as np

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [4]:
import pandas as pd
import numpy as np

from sklearn.metrics import roc_auc_score, precision_recall_curve, roc_curve, average_precision_score
from sklearn.model_selection import KFold
from sklearn.datasets import load_iris
from lightgbm import LGBMClassifier

import matplotlib.pyplot as plt
import seaborn as sns
import gc
import os
import sys
import time
import lightgbm as lgb

from bayes_opt import BayesianOptimization
from scipy.stats import pearsonr

import warnings
import lightgbm as lgb

warnings.filterwarnings("ignore")

## feature group & model structure

In [51]:
def get_header(feature_name, length=4):
    """Derive a grouping key ("header") from a feature name.

    All decimal digits are removed from ``feature_name`` and the remainder is
    split on ``"_"``. Names that yield ``length`` tokens or fewer are mapped
    to the catch-all key ``"short"``; otherwise the first ``length`` tokens,
    re-joined with ``"_"``, form the key.
    """
    digitless = feature_name.translate(str.maketrans("", "", "0123456789"))
    tokens = digitless.split("_")
    if len(tokens) <= length:
        return "short"
    return "_".join(tokens[:length])

def create_feature_info(features):
    """Tabulate features together with the group each one belongs to.

    Returns a DataFrame with one row per feature and the columns
    ``feature_idx`` (position in ``features``), ``feature_name``,
    ``feature_header`` (group key from ``get_header``) and ``group_size``
    (number of features sharing that key). Groups with fewer than 10 members
    are folded into the ``"short"`` group, after which ``group_size`` is
    recomputed.
    """
    info = pd.DataFrame({
        "feature_idx": np.arange(len(features)),
        "feature_name": features
    })
    info["feature_header"] = info.feature_name.apply(get_header)

    # First pass: sizes of the raw groups, used to spot the small ones.
    raw_sizes = info.feature_header.value_counts()
    info["group_size"] = info.feature_header.map(raw_sizes)
    info.loc[info.group_size < 10, "feature_header"] = "short"

    # Second pass: sizes after the small groups were merged into "short".
    merged_sizes = info.feature_header.value_counts()
    info["group_size"] = info.feature_header.map(merged_sizes)
    return info

In [52]:
def create_scaler_stack(features_info):
    """Return a dict with one new, unfitted ``StandardScaler`` per feature header.

    Keys are the distinct values of ``features_info.feature_header``, in
    order of first appearance.
    """
    scalers = {}
    for header in features_info.feature_header.unique():
        scalers[header] = StandardScaler()
    return scalers

def create_feature_stack(feature_info):
    """Map each feature header to the list of feature names in that group.

    Parameters
    ----------
    feature_info : pandas.DataFrame
        Table with ``feature_header`` and ``feature_name`` columns (the
        output of ``create_feature_info``).

    Returns
    -------
    dict
        ``{header: [feature_name, ...]}``; headers appear in order of first
        occurrence and names keep their row order.

    Notes
    -----
    The previous body ignored its argument and read the notebook-level
    global ``features_info`` instead, so it only worked when that global
    happened to exist and silently used it even when a different table was
    passed in.
    """
    headers = feature_info.feature_header
    names = feature_info.feature_name
    return {key: list(names[headers == key]) for key in headers.unique()}

def replace_nan(X):
    """Return a copy of ``X`` with NaN and infinite entries replaced.

    * NaN            -> 0
    * ``+inf``       -> largest finite value in the whole array
    * ``-inf``       -> smallest finite value in the whole array

    The extrema are taken over the entire array, not per column. When the
    array holds no finite value at all (or is empty) the infinities are
    replaced by 0, instead of failing on the reduction of an empty selection.

    Uses ``np.isposinf`` / ``np.isneginf`` rather than the ``np.Inf`` alias,
    which no longer exists in NumPy 2.0.

    Parameters
    ----------
    X : numpy.ndarray
        Numeric array; it is not modified.

    Returns
    -------
    numpy.ndarray
        Cleaned copy with the same shape as ``X``.
    """
    X = X.copy()
    X[np.isnan(X)] = 0

    finite_values = X[np.isfinite(X)]
    if finite_values.size:
        upper, lower = finite_values.max(), finite_values.min()
    else:
        upper = lower = 0

    X[np.isposinf(X)] = upper
    X[np.isneginf(X)] = lower
    return X

In [58]:
def less_ftr(input_dim):
    """Width of the reduced representation for a group of ``input_dim`` features.

    Computes ``int(log(input_dim) * sqrt(input_dim))`` and clamps the result
    to at least 1. Without the clamp the formula yields 0 for groups of one
    or two features, which would ask for a layer with zero units.
    """
    return max(1, int(np.log(input_dim) * np.sqrt(input_dim)))

def create_model(feature_stack):
    """Build and compile the multi-input network.

    Every entry ``name -> feature list`` of ``feature_stack`` becomes its own
    named ``Input`` followed by a ReLU ``Dense`` layer of ``less_ftr(n)``
    units. The branch outputs are concatenated, passed through
    ``Dropout(0.75)`` and a 128-unit ReLU ``Dense`` layer, and end in a
    single sigmoid unit. The model is compiled with binary cross-entropy
    loss and the Adam optimizer.
    """
    # All inputs are created first, then all branch layers, in dict order.
    branch_inputs = []
    for group_name, group_features in feature_stack.items():
        branch_inputs.append(Input(shape=[len(group_features)], name=group_name))

    branch_outputs = []
    for branch_input, group_features in zip(branch_inputs, feature_stack.values()):
        n_units = less_ftr(len(group_features))
        branch_outputs.append(Dense(n_units, activation="relu")(branch_input))

    merged = concatenate(branch_outputs)
    merged = Dropout(.75)(merged)
    hidden = Dense(128, activation="relu")(merged)
    prediction = Dense(1, activation="sigmoid")(hidden)

    network = Model(branch_inputs, prediction)
    network.compile(loss="binary_crossentropy", optimizer="adam")
    return network

In [66]:
from functools import partial

# Load the saved array of feature names and derive the grouping tables:
# features_info has one row per feature, features_stack maps each group
# header to the list of feature names in that group.
features = list(np.load('./neptune-features/features_246008_1174_0.npy'))
features_info = create_feature_info(features)
features_stack = create_feature_stack(features_info)
# Bind the feature stack as create_model's only argument so the result can be
# called with no arguments (it is passed as build_fn to KerasClassifier in the
# training cell).
create_model_fixed = partial(create_model, features_stack)

In [59]:
# Same three loading steps as the cell that defines create_model_fixed; they
# are repeated here so this cell can build the model by itself.
features = list(np.load('./neptune-features/features_246008_1174_0.npy'))
features_info = create_feature_info(features)
features_stack = create_feature_stack(features_info)

# Build one model instance only to print its layer-by-layer summary.
model = create_model(features_stack)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
short (InputLayer)              (None, 226)          0                                            
__________________________________________________________________________________________________
NAME_EDUCATION_TYPE_CODE (Input (None, 90)           0                                            
__________________________________________________________________________________________________
NAME_FAMILY_STATUS_NAME (InputL (None, 90)           0                                            
__________________________________________________________________________________________________
NAME_FAMILY_STATUS_CODE (InputL (None, 90)           0                                            
__________________________________________________________________________________________________
NAME_EDUCA

## early stop

In [62]:
from keras.models import load_model
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint, CSVLogger, Callback
from keras.wrappers.scikit_learn import KerasClassifier

In [81]:
class roc_auc_callback(Callback):
    """Keras callback that records ROC-AUC and normalized Gini every epoch.

    At the end of each epoch the model predicts on the training and the
    validation data given at construction time and stores four values in the
    epoch ``logs`` dict:

    * ``roc_auc``, ``norm_gini``          -- on the training data
    * ``roc_auc_val``, ``norm_gini_val``  -- on the validation data

    where ``norm_gini = 2 * roc_auc - 1``.

    Parameters
    ----------
    training_data : tuple
        ``(x, y)`` used for the training-set metrics.
    validation_data : tuple
        ``(x_val, y_val)`` used for the validation-set metrics.
    """

    def __init__(self, training_data, validation_data):
        # Let the Keras base class set up its own state first.
        super(roc_auc_callback, self).__init__()
        self.x = training_data[0]
        self.y = training_data[1]
        self.x_val = validation_data[0]
        self.y_val = validation_data[1]

    def on_train_begin(self, logs=None):
        return

    def on_train_end(self, logs=None):
        return

    def on_epoch_begin(self, epoch, logs=None):
        return

    def on_epoch_end(self, epoch, logs=None):
        """Compute both metric pairs once each and write them into ``logs``."""
        if logs is None:
            logs = {}

        # Each AUC is computed a single time and reused for the Gini value.
        y_pred = self.model.predict(self.x, verbose=0)
        roc = roc_auc_score(self.y, y_pred)
        logs['roc_auc'] = roc
        logs['norm_gini'] = (roc * 2) - 1

        y_pred_val = self.model.predict(self.x_val, verbose=0)
        roc_val = roc_auc_score(self.y_val, y_pred_val)
        logs['roc_auc_val'] = roc_val
        logs['norm_gini_val'] = (roc_val * 2) - 1
        return

    def on_batch_begin(self, batch, logs=None):
        return

    def on_batch_end(self, batch, logs=None):
        return

## keras

In [83]:
# nrows = 1000  # debugging switch: limit every fold to its first 1000 rows
nrows = None

# Feature names and their grouping into the named inputs of the model.
features = list(np.load('./neptune-features/features_246008_1174_0.npy'))
features_info = create_feature_info(features)
features_stack = create_feature_stack(features_info)

# Application table; this cell only reads its TARGET column.
train_app = load_dataframe32("./bindata/application_train")
auc_valid_stack = []    # validation AUC per fold
pred_valid_stack = []   # validation predictions per fold
pred_test_stack = []    # test predictions per fold

epoch_log_fn = 'keras-5fold-run-01-v1-epochs.log'

run = 0
for i in range(5):
    train_idx_fn = "./neptune-features/train_idx_{}.npy".format(i)
    valid_idx_fn = "./neptune-features/valid_idx_{}.npy".format(i)

    # Each fold has its own numeric suffixes in the data file names.
    train_offset, valid_offset, test_offset = i * 4, i * 4 + 1, i * 4 + 3

    train_data_fn = "./neptune-features/data_246009_1174_{}".format(train_offset)
    valid_data_fn = "./neptune-features/data_61502_1174_{}".format(valid_offset)
    test_data_fn = "./neptune-features/data_48744_1174_{}".format(test_offset)

    if i == 0:
        # The first fold's files carry different row counts in their names.
        train_data_fn = train_data_fn.replace("246009", "246008")
        valid_data_fn = valid_data_fn.replace("61502", "61503")

    gc.collect()

    train_idx = np.load(train_idx_fn)
    valid_idx = np.load(valid_idx_fn)

    if nrows:
        train_idx, valid_idx = train_idx[:nrows].copy(), valid_idx[:nrows].copy()

    train_data = load_dataframe32(train_data_fn, nrows)
    valid_data = load_dataframe32(valid_data_fn, nrows)
    test_data = load_dataframe32(test_data_fn, nrows)
    gc.collect()

    print(train_idx_fn, valid_idx_fn)
    print(train_idx.shape, valid_idx.shape)
    print(train_data_fn, train_data.shape)
    print(valid_data_fn, valid_data.shape)
    print(test_data_fn, test_data.shape)

    # One scaler per feature group, fitted on this fold's training rows only
    # and then applied unchanged to the validation and test rows.
    scalar_stack = create_scaler_stack(features_info)
    X_train_mult = {key: scalar_stack[key].fit_transform(replace_nan(train_data[items].values)) for key, items in features_stack.items()}
    X_valid_mult = {key: scalar_stack[key].transform(replace_nan(valid_data[items].values)) for key, items in features_stack.items()}
    X_test_mult = {key: scalar_stack[key].transform(replace_nan(test_data[items].values)) for key, items in features_stack.items()}

    y_train = train_app.loc[train_idx].TARGET
    y_valid = train_app.loc[valid_idx].TARGET
    gc.collect()

    # Built once and used for both ModelCheckpoint and load_model below.
    checkpoint_fn = '025-keras-5fold-run-01-v1-fold-%02d-run-%02d.check' % (i + 1, run + 1)

    callbacks = [
        # Must come first: it writes the 'norm_gini_val' entry that
        # EarlyStopping and ModelCheckpoint monitor.
        roc_auc_callback(
            training_data=(X_train_mult, y_train),
            validation_data=(X_valid_mult, y_valid)
        ),
        EarlyStopping(monitor='norm_gini_val', patience=20, mode='max', verbose=1),
        # Start a fresh log on the first fold and append on later folds;
        # with append=False every fold truncated the file, so only the last
        # fold's epochs survived. The epoch column restarts at 0 per fold.
        CSVLogger(epoch_log_fn, separator=',', append=(i > 0)),
        ModelCheckpoint(
            checkpoint_fn,
            monitor='norm_gini_val', mode='max', # mode must be set to max or Keras will be confused
            save_best_only=True,
            verbose=0
        )
    ]

    np.random.seed(i)
    # create_model_fixed is the zero-argument factory bound to features_stack
    # in the earlier cell (functools.partial over create_model).
    estimator = KerasClassifier(
        build_fn=create_model_fixed,
        epochs=5000,
        batch_size=500,
        validation_data=(X_valid_mult, y_valid),
        verbose=0,
        shuffle=True,
        callbacks=callbacks
    )
    estimator.fit(X_train_mult, y_train)

    # Drop the wrapper and reload the best checkpoint saved for this fold.
    del estimator
    estimator = load_model(checkpoint_fn)

    pred_valid = estimator.predict(X_valid_mult)
    pred_valid_stack.append(pred_valid)

    auc = roc_auc_score(y_valid, pred_valid)
    auc_valid_stack.append(auc)
    print("fold-{},auc:{}".format(i, auc))

    pred_test = estimator.predict(X_test_mult)
    pred_test_stack.append(pred_test)

    # break

print("kfold-auc, avg:{:.4}, std:{:.2}".format(np.mean(auc_valid_stack), np.std(auc_valid_stack)))

./neptune-features/train_idx_0.npy ./neptune-features/valid_idx_0.npy
(246008,) (61503,)
./neptune-features/data_246008_1174_0 (246008, 1174)
./neptune-features/data_61503_1174_1 (61503, 1174)
./neptune-features/data_48744_1174_3 (48744, 1174)
Epoch 00087: early stopping
fold-0,auc:0.7866254609675846
./neptune-features/train_idx_1.npy ./neptune-features/valid_idx_1.npy
(246009,) (61502,)
./neptune-features/data_246009_1174_4 (246009, 1174)
./neptune-features/data_61502_1174_5 (61502, 1174)
./neptune-features/data_48744_1174_7 (48744, 1174)
Epoch 00073: early stopping
fold-1,auc:0.7801398565450309
./neptune-features/train_idx_2.npy ./neptune-features/valid_idx_2.npy
(246009,) (61502,)
./neptune-features/data_246009_1174_8 (246009, 1174)
./neptune-features/data_61502_1174_9 (61502, 1174)
./neptune-features/data_48744_1174_11 (48744, 1174)
Epoch 00078: early stopping
fold-2,auc:0.7802676912681713
./neptune-features/train_idx_3.npy ./neptune-features/valid_idx_3.npy
(246009,) (61502,)
./ne

## save oof & submission

In [84]:
def calculate_rank(predictions):
    """Convert a 1-D array of scores into normalized ranks.

    Each score is replaced by ``(1 + rank) / (n + 1)``, where ``rank`` is the
    pandas rank of the score (1-based, ties averaged) and ``n`` is
    ``predictions.shape[0]``.
    """
    n_scores = predictions.shape[0]
    ranks = pd.Series(predictions).rank().values
    return (1 + ranks) / (n_scores + 1)

In [86]:
# Inspect the per-fold test-set predictions collected by the training loop.
pred_test_stack

[array([[0.05016068],
        [0.15800135],
        [0.02343937],
        ...,
        [0.01606217],
        [0.04193696],
        [0.19432412]], dtype=float32), array([[0.06478611],
        [0.17871755],
        [0.05090065],
        ...,
        [0.01883596],
        [0.04330845],
        [0.2660742 ]], dtype=float32), array([[0.06531779],
        [0.16371636],
        [0.03275765],
        ...,
        [0.01619415],
        [0.05017608],
        [0.208277  ]], dtype=float32), array([[0.06209571],
        [0.1564854 ],
        [0.03136708],
        ...,
        [0.0152378 ],
        [0.04115357],
        [0.19500324]], dtype=float32), array([[0.05819519],
        [0.15253244],
        [0.02287433],
        ...,
        [0.01154504],
        [0.03888638],
        [0.19625895]], dtype=float32)]

In [88]:
# An earlier submission file is used as a template: its TARGET column is
# overwritten below and everything else is written back unchanged.
pred_sample = pd.read_csv("./result/submission-022-blend08.csv")

# Rank-average the folds: each fold's test predictions are flattened to one
# value per template row, converted to normalized ranks, and the ranks are
# averaged. The divisor is the number of folds actually present in
# pred_test_stack instead of a hard-coded 5, so the mean stays correct when
# the training loop was stopped early (e.g. via its debugging `break`).
pred_target = sum([calculate_rank(p_.reshape(pred_sample.shape[0])) for p_ in pred_test_stack]) / len(pred_test_stack)

pred_sample["TARGET"] = pred_target

pred_sample.to_csv("./result/submission-025-keras01.csv", index=False)

In [89]:
# Out-of-fold predictions: one slot per row of train_app, filled fold by fold
# with the validation predictions stored during training.
oof_pred = np.zeros(train_app.shape[0])

for i in range(5):
    valid_idx_fn = "./neptune-features/valid_idx_{}.npy".format(i)
    valid_idx = np.load(valid_idx_fn)
    # valid_idx is used as positions into oof_pred; the fold's predictions are
    # flattened to one value per validation row before being scattered.
    oof_pred[valid_idx] = pred_valid_stack[i].reshape(len(valid_idx))
    
# Write the id column (cast to int) next to the out-of-fold prediction.
oof_df = train_app[["SK_ID_CURR"]].copy()
oof_df["SK_ID_CURR"] = oof_df.SK_ID_CURR.astype("int")
oof_df["oof_pred"] = oof_pred
# NOTE(review): this file is prefixed "024" while the checkpoints and the
# submission written by this notebook use "025" -- confirm which is intended.
oof_df.to_csv("./oof-result/oof-024-keras01.csv", index=False)