# Description

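This notebook trains a feed-forward neural network (Keras) on the merged feature tables, using 10-fold cross-validation, and writes the averaged test predictions to a submission file.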

In [None]:
import numpy as np
import pandas as pd

import os
import warnings
warnings.filterwarnings('ignore')

import gc

In [None]:
# Directly load the saved dataframes; both files live in the same input folder
DATA_DIR = '../input/home-credit-merged'
train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

# Submission dataframe: one row per test applicant, identified by SK_ID_CURR.
# Take an explicit copy so that adding the prediction column later writes to an
# independent frame instead of to a selection derived from `test`
# (the chained-assignment pattern pandas warns about).
submit = test[['SK_ID_CURR']].copy()

In [None]:
# Set the labels aside first: the inner join below keeps only columns that
# exist in both frames, so TARGET is lost from train if test does not have it
train_labels = train['TARGET']

# Column-wise inner alignment -> train and test end up with the same columns
train, test = train.align(test, join = 'inner', axis = 1)

# Put the label column back on the aligned training frame
train['TARGET'] = train_labels

# Report the resulting shapes (train includes the restored TARGET column)
for caption, shape in (('Training Features shape: ', train.shape),
                       ('Testing Features shape: ', test.shape)):
    print(caption, shape)

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer

# Drop the target from the training data (guarded so the cell can be re-run)
if 'TARGET' in train:
    train = train.drop(columns = ['TARGET'])

# Feature names; test is selected with the same list so both frames share
# the same columns in the same order
features = list(train.columns)

# NOTE(review): every aligned column is used as a model input, which includes
# SK_ID_CURR if it is present in both files -- confirm that is intended.

# A feature with no observed training value has no median to impute with;
# fail early with the offending names instead of a shape error further down
empty_features = train.columns[train.isna().all()].tolist()
if empty_features:
    raise ValueError('Features with no observed training values: %s' % empty_features)

# Median imputation of missing values
imputer = SimpleImputer(strategy = 'median')

# Scale each feature to 0-1
scaler = MinMaxScaler(feature_range = (0, 1))

# Both estimators compute their statistics independently per column, so a
# single fit over the whole frame replaces fitting a new imputer/scaler pair
# for every feature. Statistics come from the training data only; the test
# data is transformed with them (test values may therefore fall outside 0-1).
train_values = scaler.fit_transform(imputer.fit_transform(train[features]))
test_values = scaler.transform(imputer.transform(test[features]))

# Rebuild the frames so downstream cells keep using column names and .iloc
train = pd.DataFrame(train_values, columns = features, index = train.index)
test = pd.DataFrame(test_values, columns = features, index = test.index)

print('Training data shape: ', train.shape)
print('Testing data shape: ', test.shape)

# Release the fitted estimators and the temporary array references
del imputer, scaler, train_values, test_values
gc.collect()

In [None]:
# Use keras to build the neural network
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout, BatchNormalization

from sklearn.metrics import roc_auc_score

# Cross-validation splitter: 10 shuffled folds, seeded so that the
# assignment of rows to folds is the same on every run
from sklearn.model_selection import KFold

folds = KFold(
    n_splits=10,
    shuffle=True,
    random_state=233,
)

In [None]:
from keras import backend as K


def build_model(input_dim, hidden_units=(400, 100, 20), dropout_rate=.1):
    """Build and compile a fresh fully-connected binary classifier.

    Each hidden layer is Dense(relu) -> BatchNormalization -> Dropout; the
    output is a single sigmoid unit trained with binary cross-entropy and Adam.

    Parameters
    ----------
    input_dim : int
        Number of input features.
    hidden_units : sequence of int
        Width of each hidden layer, in order.
    dropout_rate : float
        Dropout rate applied after every hidden layer.

    Returns
    -------
    keras.models.Sequential
        The compiled, untrained model.
    """
    model = Sequential()
    for position, units in enumerate(hidden_units):
        # Only the first layer needs to be told the input size
        first_layer_args = {'input_dim': input_dim} if position == 0 else {}
        model.add(Dense(units=units, kernel_initializer='normal', activation='relu',
                        **first_layer_args))
        model.add(BatchNormalization())
        model.add(Dropout(dropout_rate))
    model.add(Dense(units=1, kernel_initializer='normal', activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model


# Out-of-fold predictions for the training rows, and the fold-averaged
# predictions for the test rows
oof_preds = np.zeros(train.shape[0])
sub_preds = np.zeros(test.shape[0])

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train)):
    trn_x, trn_y = train.iloc[trn_idx], train_labels.iloc[trn_idx]
    val_x, val_y = train.iloc[val_idx], train_labels.iloc[val_idx]

    # A new, independently initialised network for every fold
    nn = build_model(train.shape[1])
    nn.fit(trn_x, trn_y, epochs=10, verbose=2)

    # Each training row is predicted exactly once, by the model that did not see it
    oof_preds[val_idx] = nn.predict(val_x).flatten()
    # Every fold contributes an equal share to the final test prediction
    sub_preds += nn.predict(test).flatten() / folds.n_splits
    print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(val_y, oof_preds[val_idx])))

    del nn, trn_x, trn_y, val_x, val_y
    # Deleting the Python object does not release the backend state Keras keeps
    # for the model; clear it so memory does not grow with every fold
    K.clear_session()
    gc.collect()

# Overall cross-validated score across all out-of-fold predictions
print('Full AUC score %.6f' % roc_auc_score(train_labels, oof_preds))

In [None]:
# Store the fold-averaged test predictions as the TARGET column of the submission
submit.loc[:, 'TARGET'] = sub_preds

# Preview the first rows of the submission
submit.head()

In [None]:
# Write the submission frame to disk as CSV, leaving out the row index
submit.to_csv(path_or_buf='nn_all_table.csv', index=False)

Leaderboard results for this submission: Private Score: 0.77026, Public Score: 0.77076