In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold
import tensorflow as tf
import seaborn as sns
import gc

In [2]:
# Read the train and test tables from CSV files in the working directory.
# (File names suggest per-value count features were added upstream - TODO confirm.)
train_csv_path = 'train_with_counts.csv'
test_csv_path = 'test_with_counts.csv'
df_train = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)

In [3]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import Adam, SGD
from keras.layers import Dropout, BatchNormalization
from keras.layers.advanced_activations import LeakyReLU, PReLU, ELU
from keras.callbacks import EarlyStopping, ModelCheckpoint

Using TensorFlow backend.


In [4]:
# Returns the grid used to plot a model's predictions (originally written for LGBM results)
def get_df_grid(train, var_idx, step=50, stepB=15):
    """Build a regular 2-D grid spanning a variable and its ``_FE`` companion.

    Parameters
    ----------
    train : numpy.ndarray
        2-D array indexed as ``train[:, 0]`` (the variable) and ``train[:, 1]``
        (its ``_FE`` feature). Only the min/max of each column are used.
    var_idx : int or str
        Suffix used to name the output columns (``var_<var_idx>``).
    step : int, default 50
        Number of bins along the variable axis; grid points are bin centres.
    stepB : int, default 15
        Number of evenly spaced levels along the ``_FE`` axis (min and max included).

    Returns
    -------
    pandas.DataFrame
        ``step * stepB`` rows with columns ``var_<idx>``, ``var_<idx>_FE`` and
        ``pred`` (initialised to 0). Rows are ordered level by level: the first
        ``step`` rows share the lowest ``_FE`` level, and so on.

    Raises
    ------
    ValueError
        If ``step`` or ``stepB`` is smaller than 1.
    """
    if step < 1 or stepB < 1:
        raise ValueError('step and stepB must both be >= 1')

    var_i = 'var_' + str(var_idx)
    mn, mx = train[:, 0].min(), train[:, 0].max()
    mnFE, mxFE = train[:, 1].min(), train[:, 1].max()

    # Bin centres along the variable axis.
    w = (mx - mn) / step
    x = w * (np.arange(0, step) + 0.5) + mn

    # Repeat the same bin centres once per _FE level.
    df_grid = pd.DataFrame({var_i: np.tile(x, stepB)})

    # Level index of every row (0 for the first `step` rows, 1 for the next, ...).
    fe_level = np.arange(step * stepB) // step
    # With a single level there is no spacing to compute; stay at the minimum.
    fe_spacing = (mxFE - mnFE) / (stepB - 1) if stepB > 1 else 0.0
    df_grid[var_i + '_FE'] = mnFE + fe_spacing * fe_level

    df_grid['pred'] = 0
    return df_grid

In [5]:
def auc(y_true, y_pred):
    """Keras metric: ROC-AUC of a batch, computed by scikit-learn.

    Returns a float64 zero when the labels sum to 0 or when ``1 - y_true``
    sums to 0 (i.e. only one class appears to be present); otherwise wraps
    ``roc_auc_score`` with ``tf.py_func``.
    """
    def _zero():
        return tf.constant(0, dtype=tf.float64)

    def _sklearn_auc():
        return tf.py_func(roc_auc_score, (y_true, y_pred), tf.double)

    positives = tf.reduce_sum(y_true)
    negatives = tf.reduce_sum(tf.subtract(tf.ones_like(y_true), y_true))
    no_positives = tf.equal(positives, tf.constant(0, dtype=tf.float32))
    no_negatives = tf.equal(negatives, tf.constant(0, dtype=tf.float32))

    return tf.case([(no_positives, _zero), (no_negatives, _zero)], default=_sklearn_auc)

In [6]:
def get_model(input_shape, lr=0.001, decay=1e-3, hidden_units=100):
    """Create and compile a one-hidden-layer binary classifier.

    Architecture: Dense(hidden_units) -> PReLU -> Dense(1, sigmoid).
    Compiled with Adam(lr, decay), binary cross-entropy loss and the custom
    ``auc`` metric.

    Parameters
    ----------
    input_shape : int
        Number of input features (used as ``(input_shape,)``).
    lr, decay : float
        Passed straight to the Adam optimizer.
    hidden_units : int
        Width of the hidden Dense layer.
    """
    layers = [
        Dense(hidden_units, input_shape=(input_shape,)),
        PReLU(),
        # input_shape is kept on the output layer exactly as in the original definition.
        Dense(1, input_shape=(input_shape,), activation='sigmoid'),
    ]
    model = Sequential(layers)
    optimizer = Adam(lr=lr, decay=decay)
    model.compile(optimizer, loss='binary_crossentropy', metrics=[auc])
    return model

In [7]:
# Checkpoint path: train_var's ModelCheckpoint saves the best weights here and
# reloads them after fitting. The same file is reused (overwritten) for every fold.
best_model_file_name = 'best_model.h5'

In [8]:
# var_0   Valid aucroc: 0.548506243183567
# var_198 Valid aucroc: 0.55081

In [9]:
def train_var(X_train, X_test, y_train, var_, verbose=1, n_splits=5):
    """Train one network per stratified fold and return averaged test / OOF predictions.

    For every fold a fresh model from ``get_model`` is fitted with early
    stopping on ``val_auc``; the best epoch is checkpointed to
    ``best_model_file_name`` and reloaded before predicting.

    Parameters
    ----------
    X_train, X_test : numpy.ndarray
        Feature matrices (rows are indexed with integer arrays; ``X_train.shape[1]``
        is the model input size).
    y_train : numpy.ndarray
        Binary targets aligned with ``X_train``.
    var_ : any
        Unused; kept so existing callers keep working.
    verbose : int, default 1
        Verbosity forwarded to the EarlyStopping / ModelCheckpoint callbacks.
    n_splits : int, default 5
        Number of stratified folds; also the divisor used to average test predictions.

    Returns
    -------
    (test_preds, val_predictions) : tuple of 1-D numpy.ndarray
        Fold-averaged test predictions and out-of-fold predictions for the training rows.
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    val_predictions = np.zeros(len(y_train))
    test_preds = np.zeros(len(X_test))
    for fold, (trn_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
        print(f'################### Fold {fold+1} ########################')
        es = EarlyStopping(monitor='val_auc', patience=10, mode='max', verbose=verbose)
        mc = ModelCheckpoint(best_model_file_name, monitor='val_auc', mode='max', verbose=verbose, save_best_only=True)
        model = get_model(X_train.shape[1], lr=0.001, decay=0, hidden_units=1000)
        model.fit(X_train[trn_idx], y_train[trn_idx],
                  epochs=50, batch_size=400,
                  verbose=0,
                  callbacks=[es, mc],
                  validation_data=(X_train[val_idx], y_train[val_idx]))
        # Restore the best epoch (by val_auc) rather than the last one trained.
        model.load_weights(best_model_file_name)
        # `predict` is used everywhere (the test predictions already used it); for a
        # sigmoid output it yields the same probabilities as `predict_proba`.
        fold_val_preds = model.predict(X_train[val_idx], verbose=1)
        fold_train_preds = model.predict(X_train[trn_idx], verbose=1)
        val_predictions[val_idx] = fold_val_preds.reshape(-1)
        print('Train auc', roc_auc_score(y_train[trn_idx], fold_train_preds))
        print('Valid auc', roc_auc_score(y_train[val_idx], fold_val_preds))
        # Average over the actual number of folds instead of a hard-coded 5.
        test_preds += model.predict(X_test).reshape(-1) / n_splits
        del model, es, mc
        gc.collect()
    print('Valid aucroc:', roc_auc_score(y_train, val_predictions))
    return test_preds, val_predictions

In [10]:
# Normalize: standardise every feature with the TRAINING mean/std and apply the
# same statistics to the test set, so both sets live on the same scale for the
# network (scaling the test set with its own statistics would shift it relative to train).
y_train = df_train[['target']].values
df_train_X = df_train.drop(columns=['ID_code', 'target'])
train_mean = df_train_X.mean(axis=0)
train_std = df_train_X.std(axis=0)
df_train_X_normalized = (df_train_X - train_mean) / train_std
# Statistics are aligned to the test frame by column name.
# NOTE(review): assumes test has the same feature columns as train - confirm.
df_test_X = df_test.drop(columns=['ID_code'])
df_test_X_normalized = (df_test_X - train_mean) / train_std

In [11]:
# test_preds, val_predictions = train_var(df_train_X_normalized, df_test_X_normalized, y_train, 198, verbose=1)

In [17]:
all_off_filename = 'all_oof_NN_v2.npy'
all_test_pred_filename = 'all_test_preds_NN_v2.npy'


def find_duplicated_positions(values):
    """Return the positions of a 1-D array whose value occurs more than once."""
    _, inverse, counts = np.unique(values, return_inverse=True, return_counts=True)
    return np.where(counts[inverse] > 1)[0]


# Sanity check on cached out-of-fold predictions: columns (variables) sharing the
# exact same value in the first row are flagged so the training loop recomputes them.
# NOTE(review): presumably identical values mean one variable's predictions were
# stored under another's column - confirm. Untrained all-zero columns are flagged too.
try:
    all_oof = np.load(all_off_filename)
except FileNotFoundError:
    # Fresh run with no cache yet: nothing to flag.
    errors = np.array([], dtype=int)
else:
    errors = find_duplicated_positions(all_oof[0, :])
print(errors)

[ 43  44  73  74 150 151]


In [13]:
# [ 61  62  63  83  84  92  93  97  98  99 100 104 105 149 150 165 166]

In [18]:
import os #34, 43, #73

In [19]:
num_vars = 200
# SAVE OUT-OF-FOLD PREDICTIONS
if os.path.exists(all_off_filename):
    all_oof = np.load(all_off_filename)
else:
    all_oof = np.zeros((len(df_train),num_vars))
# SAVE TEST PREDICTIONS
if os.path.exists(all_test_pred_filename):
    all_test_preds = np.load(all_test_pred_filename)
else:
    all_test_preds = np.zeros((len(df_test),num_vars))


for i in range(num_vars):
    if all_oof[:, i].sum() == 0 or i in errors:
        print(f'################################################################')
        print(f'########################### VAR {i} ##############################')
        print(f'################################################################')
        print()
        features = [f'var_{i}', f'var_{i}_FE']
        X_train = df_train_X_normalized[features].values
        X_test = df_test_X_normalized[features].values
        %time test_preds, val_predictions = train_var(X_train, X_test, y_train, i, verbose=0)
        all_oof[:, i] = val_predictions
        all_test_preds[:, i] = test_preds
        np.save(all_off_filename, all_oof)
        np.save(all_test_pred_filename, all_test_preds)
        print()
    else: 
        print(f'VAR {i} Already calculated', end = ' / ')

VAR 0 Already calculated / VAR 1 Already calculated / VAR 2 Already calculated / VAR 3 Already calculated / VAR 4 Already calculated / VAR 5 Already calculated / VAR 6 Already calculated / VAR 7 Already calculated / VAR 8 Already calculated / VAR 9 Already calculated / VAR 10 Already calculated / VAR 11 Already calculated / VAR 12 Already calculated / VAR 13 Already calculated / VAR 14 Already calculated / VAR 15 Already calculated / VAR 16 Already calculated / VAR 17 Already calculated / VAR 18 Already calculated / VAR 19 Already calculated / VAR 20 Already calculated / VAR 21 Already calculated / VAR 22 Already calculated / VAR 23 Already calculated / VAR 24 Already calculated / VAR 25 Already calculated / VAR 26 Already calculated / VAR 27 Already calculated / VAR 28 Already calculated / VAR 29 Already calculated / VAR 30 Already calculated / VAR 31 Already calculated / VAR 32 Already calculated / VAR 33 Already calculated / VAR 34 Already calculated / VAR 35 Already calculated / VA

In [16]:
# Select one variable and its `_FE` companion, then standardise both columns
# with numpy (column-wise mean and std of the raw array).
var_ = 0
base_name = 'var_' + str(var_)
features = [base_name, base_name + '_FE']
X_train_unnorm = df_train[features].values
X_train = X_train_unnorm - X_train_unnorm.mean(axis=0)
X_train = X_train / X_train_unnorm.std(axis=0)

In [None]:
# Axis ranges of the two (standardised) features.
mn, mx = X_train[:, 0].min(), X_train[:, 0].max()
mnFE = X_train[:, 1].min()
mxFE = X_train[:, 1].max()
# Grid resolution; must match get_df_grid (50 bins x 15 levels).
step = 50
stepB = 15
# Build the grid explicitly: `df_grid` was never assigned anywhere in the notebook,
# so this cell raised NameError on a fresh kernel.
# NOTE(review): 'pred' is all zeros as returned by get_df_grid; the step that fills
# it with model predictions on the grid is not present in this notebook.
df_grid = get_df_grid(X_train, var_)
# Arrange predictions as (levels, bins) and flip so the highest level is the top row.
x = df_grid['pred'].values
x = np.reshape(x, (stepB, step))
x = np.flip(x, axis=0)

In [None]:
# Quick look at the prediction matrix: diverging palette centred on 0.
# No title, axis labels or ticks are set in this cell.
sns.heatmap(data=x, center=0.0, cmap='RdBu_r')

In [None]:
from matplotlib import pyplot as plt
# Heatmap of the prediction grid with a title, axis labels and ticks.
# The previously commented-out labelling could not run as written
# (`var_ + '...'` adds an int to a str), so it is rebuilt on the Axes object.
# NOTE(review): assumes `x` was built from a grid over X_train with the same
# step / stepB and flipped along axis 0, as in the preceding cell.
ax = sns.heatmap(x, cmap='RdBu_r', center=0.0)
ax.set_title(f'var_{var_} Predictions with Magic', fontsize=16)
# Columns: cell edges 0..step span [mn, mx] of the (standardised) variable.
ax.set_xticks(np.linspace(0, step, 5))
ax.set_xticklabels(np.round(np.linspace(mn, mx, 5), 1), rotation=0)
ax.set_xlabel(f'var_{var_}')
# Rows: after the flip the top row is the highest `_FE` level; label cell centres.
ax.set_yticks(np.arange(stepB) + 0.5)
ax.set_yticklabels(np.round(np.linspace(mxFE, mnFE, stepB), 1), rotation=0)
ax.set_ylabel(f'var_{var_}_FE')
plt.show()

In [None]:
# Display the grid DataFrame (bare last expression -> notebook rich display).
df_grid