In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import random
from sklearn.pipeline import Pipeline


import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout, Embedding,  Flatten
from tensorflow.keras.models import Model, Sequential
from keras.callbacks import ReduceLROnPlateau
from keras.optimizers import RMSprop
import keras_tuner as kt

from tensorflow.data import Dataset
from sklearn.preprocessing import QuantileTransformer,  KBinsDiscretizer
from sklearn.model_selection import StratifiedKFold
from tensorflow import keras
from sklearn import metrics
from sklearn.impute import SimpleImputer

from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_auc_score

import gc
import warnings
warnings.simplefilter('ignore')

# Parameters

In [2]:
target = 'claim'

DEBUG = False

if DEBUG:
    N_ESTIMATORS = 1
    N_SPLITS = 2
    SEED = 2017
    CVSEED = 2017
    EARLY_STOPPING_ROUNDS = 1
    VERBOSE = 100
    BINS = 128
    #N_ITERS = 2
else:
    N_SPLITS = 10
    N_ESTIMATORS = 20000
    EARLY_STOPPING_ROUNDS = 300
    VERBOSE = 1000
    SEED = 2026
    CVSEED = 2017
    BINS = 128
    #N_ITERS = 10

In [3]:
def set_seed(seed=2017):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    tf.compat.v1.set_random_seed(seed)
    
set_seed(SEED)

# Load Dataset

In [4]:
train = pd.read_csv('../input/tabular-playground-series-sep-2021/train.csv')
test  = pd.read_csv('../input/tabular-playground-series-sep-2021/test.csv')
sub   = pd.read_csv('../input/tabular-playground-series-sep-2021/sample_solution.csv')

In [5]:
train['lgb_pred'] = np.load("../input/tps-sep-lv0-base-trees/agg_lgb_oof.npy")
test['lgb_pred'] = np.load("../input/tps-sep-lv0-base-trees/agg_lgb_pred.npy")

train['lgb2_pred'] = np.load("../input/tps-sep-lv0-base-trees/agg_lgb2_oof.npy")
test['lgb2_pred'] = np.load("../input/tps-sep-lv0-base-trees/agg_lgb2_pred.npy")

train['xgb_pred'] = np.load("../input/tps-sep-lv0-base-trees/agg_xgb_oof.npy")
test['xgb_pred'] = np.load("../input/tps-sep-lv0-base-trees/agg_xgb_pred.npy")

train['lgb_bizen_pred'] = np.load("../input/tps-sep-lv0-base-trees/lgb_bizen_oof.npy")
test['lgb_bizen_pred'] = np.load("../input/tps-sep-lv0-base-trees/lgb_bizen_pred.npy")

train['lgb_dmitry_pred'] = np.load("../input/tps-sep-lv0-base-trees/lgb_dmitry_oof.npy")
test['lgb_dmitry_pred'] = np.load("../input/tps-sep-lv0-base-trees/lgb_dmitry_pred.npy")

train['xgb_dmitry_pred'] = np.load("../input/tps-sep-lv0-base-trees/xgb_dmitry_oof.npy")
test['xgb_dmitry_pred'] = np.load("../input/tps-sep-lv0-base-trees/xgb_dmitry_pred.npy")

train['lgb_manav_pred'] = np.load("../input/tps-sep-lv0-base-trees/lgb_manav_oof.npy")
test['lgb_manav_pred'] = np.load("../input/tps-sep-lv0-base-trees/lgb_manav_pred.npy")

train['xgb_manav_pred'] = np.load("../input/tps-sep-lv0-base-trees/xgb_manav_oof.npy")
test['xgb_manav_pred'] = np.load("../input/tps-sep-lv0-base-trees/xgb_manav_pred.npy")

In [6]:
train['ridge_pred'] = np.load("../input/tps-sep-lv0-base-trees/ridge_oof.npy")
test['ridge_pred'] = np.load("../input/tps-sep-lv0-base-trees/ridge_pred.npy")

train['nn_pred'] = np.load("../input/tps-sep-lv0-base-trees/agg_nn_oof.npy")
test['nn_pred'] = np.load("../input/tps-sep-lv0-base-trees/agg_nn_pred.npy")

# Preprocessing

In [7]:
features = [col for col in train.columns if 'pred' in col]

pipe = Pipeline([
       # ('imputer', SimpleImputer(strategy='median',missing_values=np.nan)),
        ("scaler", QuantileTransformer(n_quantiles=BINS,output_distribution='normal')),
        ('bin', KBinsDiscretizer(n_bins=BINS, encode='ordinal',strategy='uniform'))
        ])
#train[features] = pipe.fit_transform(train[features])
#test[features] = pipe.transform(test[features])

In [8]:
test[features]

Unnamed: 0,lgb_pred,lgb2_pred,xgb_pred,lgb_bizen_pred,lgb_dmitry_pred,xgb_dmitry_pred,lgb_manav_pred,xgb_manav_pred,ridge_pred,nn_pred
0,0.558734,0.578699,0.576973,0.561836,0.566536,0.575204,0.564575,0.564908,0.553440,0.564283
1,0.130124,0.120648,0.124073,0.119917,0.126946,0.126340,0.121656,0.120459,0.182040,0.120549
2,0.634854,0.628457,0.630655,0.629326,0.627686,0.635330,0.626681,0.633499,0.631720,0.569874
3,0.121661,0.125785,0.126651,0.128635,0.122506,0.123506,0.128684,0.131022,0.184237,0.120917
4,0.151700,0.150161,0.149483,0.153291,0.146343,0.143437,0.152862,0.153132,0.203202,0.124316
...,...,...,...,...,...,...,...,...,...,...
493469,0.833647,0.829336,0.829659,0.827457,0.825696,0.827184,0.831222,0.816129,0.790480,0.587796
493470,0.114116,0.116351,0.117947,0.111374,0.112238,0.115928,0.111981,0.109487,0.174385,0.120834
493471,0.771955,0.760704,0.758474,0.758873,0.764026,0.754300,0.756039,0.774203,0.751130,0.580247
493472,0.129603,0.132275,0.129645,0.133944,0.135123,0.132036,0.135098,0.136081,0.184900,0.122772


In [9]:
train[target]

0         1
1         0
2         1
3         1
4         1
         ..
957914    0
957915    1
957916    0
957917    1
957918    0
Name: claim, Length: 957919, dtype: int64

# Model

In [10]:
def make_model():
    
    lr = 0.005705572830883387
    dropout =0.5684029315925301
    embed_dim =12
    hidden_dim = 312
    n_layers = 1
    act = 'relu'
    #dstep = hp.Int('decay_steps', min_value=2000, max_value=4000, step=200)
    drate = 0.6620490386166048
    eps =8.956803803898012e-08
    
    
    inputs = Input(train[features].shape[1:])
    X = Embedding(input_dim=BINS, output_dim=embed_dim, embeddings_initializer = "glorot_normal")(inputs)
    X = Dropout(dropout)(X)
    #X = BatchNormalization()(X)
    X = Flatten()(X)
    
    for i in range(n_layers):
        #units = hp.Int('units_{i}'.format(i=i), min_value=8, max_value=256, step=8)
        X = layers.Dense(hidden_dim/(2**i), activation=act, kernel_initializer=tf.keras.initializers.GlorotNormal())(X)
        X = Dropout(dropout)(X)
        #X = BatchNormalization()(X)
    outputs = layers.Dense(1, activation='sigmoid', kernel_initializer=tf.keras.initializers.GlorotNormal())(X)
    model = keras.Model(inputs, outputs)
    
    #learning_rate = hp.Float('learning_rate', min_value=3e-4, max_value=3e-3)
    lr_schedule = keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=lr,
        decay_steps = 450,
        decay_rate= drate)

    optimizer = keras.optimizers.Adam(learning_rate=lr_schedule, epsilon=eps)
    model.compile(loss=keras.losses.binary_crossentropy,
                  optimizer=optimizer,
                  metrics=[tf.keras.metrics.AUC(name='aucroc')])
    #model.summary()
    return model

# NN

In [11]:
def prediction(x, y, batch_size=1024, epochs=100):
    cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=CVSEED)
    val_losses = np.zeros(x.shape[0])
    nn_pred = np.zeros(test.shape[0])
    
    for fold, (train_indices, val_indices) in enumerate(cv.split(x, y)):
      print(f"===== fold {fold} =====")
      x_train, x_valid = x.iloc[train_indices], x.iloc[val_indices]
      y_train, y_valid = y.iloc[train_indices], y.iloc[val_indices]
    
      gc.collect()
        
      x_train[features] = pipe.fit_transform(x_train[features])
      x_valid[features] = pipe.transform(x_valid[features])
      test[features] = pipe.transform(test[features])
        
      
    
      model = make_model()
      model.fit( x_train[features], y_train, 
                validation_data=(x_valid[features] , y_valid),
                shuffle=True,
                verbose=0,
                #callbacks=[model_checkpoint_callback],
                callbacks=[
                #tf.keras.callbacks.ReduceLROnPlateau(monitor='val_aucroc', mode='max', patience=2),
                tf.keras.callbacks.EarlyStopping(monitor='val_aucroc', mode='max', patience=5)  ],
                batch_size=batch_size, 
                epochs=epochs)
      val_losses[val_indices] += model.predict(x_valid[features] )[:,-1]
      nn_pred += model.predict(test[features] )[:,-1]

      auc = roc_auc_score(y_valid, val_losses[val_indices])
      print(f"fold {fold} - nn auc: {auc:.6f}")   
    
      del model
      gc.collect()
     
    nn_pred /= N_SPLITS
    print(f"oof nn_auc = {roc_auc_score(y, val_losses)}")
    
    np.save("nn_oof.npy", val_losses)
    np.save("nn_pred.npy", nn_pred)

    gc.collect()
    return nn_pred

In [12]:
nn_pred = prediction(x=train[features], y=train[target], 
             batch_size=1024, 
             epochs=100,
             #validation_data=(x_val, y_val),
            )

===== fold 0 =====
fold 0 - nn auc: 0.817064
===== fold 1 =====
fold 1 - nn auc: 0.815994
===== fold 2 =====
fold 2 - nn auc: 0.817595
===== fold 3 =====
fold 3 - nn auc: 0.816380
===== fold 4 =====
fold 4 - nn auc: 0.817836
===== fold 5 =====
fold 5 - nn auc: 0.815393
===== fold 6 =====
fold 6 - nn auc: 0.819374
===== fold 7 =====
fold 7 - nn auc: 0.815357
===== fold 8 =====
fold 8 - nn auc: 0.817060
===== fold 9 =====
fold 9 - nn auc: 0.816257
oof nn_auc = 0.8167795482871213


In [13]:
sub[target]=nn_pred
sub.to_csv('submission.csv', index=False)

sub

Unnamed: 0,id,claim
0,957919,0.918941
1,957920,0.874256
2,957921,0.923955
3,957922,0.874321
4,957923,0.877284
...,...,...
493469,1451388,0.944397
493470,1451389,0.873260
493471,1451390,0.937294
493472,1451391,0.874930


# Log

/////// 8 trees + ridge /////////

/// 128 quant normal kbins uniform with dropout, dstep=450, decreasing hidden units ///
2017 ver2
2018 ver3
2019 ver4
2020 ver5
2021 ver6
2022 ver7
2023 ver8
2024 ver9
2025 ver10
2026 ver11

/////// 8 trees + ridge + nn /////////

2017 ver12
2018 ver13
2019 ver14
2020 ver15
2021 ver16
2022 ver17
2023 ver18
2024 ver19
2025 ver20
2026 ver21
